
refactor(mm): new abstractions for the page table and related code

greatbridf, 8 months ago
parent
revision
dc97e018fa
87 changed files with 3921 additions and 2507 deletions
  1. Cargo.lock (+24 -11)
  2. Cargo.toml (+6 -2)
  3. arch/Cargo.toml (+2 -1)
  4. arch/src/x86_64/fence.rs (+34 -0)
  5. arch/src/x86_64/mm.rs (+218 -0)
  6. arch/src/x86_64/mod.rs (+12 -8)
  7. crates/buddy_allocator/Cargo.toml (+9 -0)
  8. crates/buddy_allocator/src/free_area.rs (+47 -0)
  9. crates/buddy_allocator/src/lib.rs (+73 -0)
  10. crates/buddy_allocator/src/zone.rs (+122 -0)
  11. crates/eonix_log/Cargo.toml (+0 -1)
  12. crates/eonix_log/src/lib.rs (+0 -1)
  13. crates/eonix_mm/Cargo.toml (+7 -0)
  14. crates/eonix_mm/src/address.rs (+14 -0)
  15. crates/eonix_mm/src/address/addr.rs (+64 -0)
  16. crates/eonix_mm/src/address/addr_range.rs (+190 -0)
  17. crates/eonix_mm/src/address/error.rs (+4 -0)
  18. crates/eonix_mm/src/address/paddr.rs (+65 -0)
  19. crates/eonix_mm/src/address/vaddr.rs (+60 -0)
  20. crates/eonix_mm/src/lib.rs (+5 -0)
  21. crates/eonix_mm/src/page_table.rs (+9 -0)
  22. crates/eonix_mm/src/page_table/page_table.rs (+132 -0)
  23. crates/eonix_mm/src/page_table/paging_mode.rs (+38 -0)
  24. crates/eonix_mm/src/page_table/pte.rs (+52 -0)
  25. crates/eonix_mm/src/page_table/pte_iterator.rs (+177 -0)
  26. crates/eonix_mm/src/paging.rs (+9 -0)
  27. crates/eonix_mm/src/paging/page.rs (+219 -0)
  28. crates/eonix_mm/src/paging/page_alloc.rs (+31 -0)
  29. crates/eonix_mm/src/paging/pfn.rs (+65 -0)
  30. crates/eonix_mm/src/paging/raw_page.rs (+97 -0)
  31. crates/eonix_percpu/Cargo.toml (+12 -0)
  32. crates/eonix_percpu/src/arch.rs (+24 -0)
  33. crates/eonix_percpu/src/lib.rs (+181 -0)
  34. crates/eonix_runtime/Cargo.toml (+0 -1)
  35. crates/eonix_runtime/src/executor/builder.rs (+1 -1)
  36. crates/eonix_runtime/src/executor/stack.rs (+3 -1)
  37. crates/eonix_runtime/src/scheduler.rs (+1 -2)
  38. crates/eonix_spin_irq/Cargo.toml (+0 -8)
  39. crates/eonix_sync/Cargo.toml (+1 -0)
  40. crates/eonix_sync/src/lib.rs (+4 -1)
  41. crates/eonix_sync/src/spin.rs (+14 -0)
  42. crates/eonix_sync/src/spin/spin_irq.rs (+6 -34)
  43. crates/eonix_sync/src/wait_list.rs (+7 -4)
  44. crates/eonix_sync/src/wait_list/wait_handle.rs (+2 -2)
  45. crates/eonix_sync/src/wait_list/wait_object.rs (+7 -3)
  46. crates/intrusive_list/Cargo.toml (+6 -0)
  47. crates/intrusive_list/src/lib.rs (+59 -0)
  48. src/driver/ahci/command.rs (+3 -5)
  49. src/driver/ahci/command_table.rs (+48 -0)
  50. src/driver/ahci/control.rs (+11 -8)
  51. src/driver/ahci/defs.rs (+4 -1)
  52. src/driver/ahci/mod.rs (+63 -65)
  53. src/driver/ahci/port.rs (+96 -209)
  54. src/driver/ahci/register.rs (+58 -0)
  55. src/driver/ahci/slot.rs (+94 -0)
  56. src/driver/ahci/stats.rs (+46 -0)
  57. src/driver/e1000e.rs (+434 -434)
  58. src/driver/serial.rs (+0 -1)
  59. src/elf.rs (+11 -11)
  60. src/fs/fat32/file.rs (+10 -4)
  61. src/fs/procfs.rs (+9 -6)
  62. src/io.rs (+40 -4)
  63. src/kernel/block.rs (+10 -5)
  64. src/kernel/cpu.rs (+8 -5)
  65. src/kernel/interrupt.rs (+0 -1)
  66. src/kernel/mem.rs (+2 -5)
  67. src/kernel/mem/access.rs (+158 -0)
  68. src/kernel/mem/address.rs (+17 -398)
  69. src/kernel/mem/mm_area.rs (+60 -56)
  70. src/kernel/mem/mm_list.rs (+318 -145)
  71. src/kernel/mem/mm_list/mapping.rs (+39 -0)
  72. src/kernel/mem/mm_list/page_fault.rs (+9 -8)
  73. src/kernel/mem/page_alloc.rs (+92 -403)
  74. src/kernel/mem/page_table.rs (+0 -316)
  75. src/kernel/mem/paging.rs (+52 -163)
  76. src/kernel/mem/phys.rs (+0 -80)
  77. src/kernel/smp.rs (+7 -8)
  78. src/kernel/syscall/mm.rs (+14 -18)
  79. src/kernel/syscall/procops.rs (+11 -8)
  80. src/kernel/task/kernel_stack.rs (+11 -7)
  81. src/kernel/task/process_list.rs (+3 -9)
  82. src/kernel/task/signal/signal_action.rs (+10 -10)
  83. src/kernel/task/thread.rs (+5 -8)
  84. src/kernel/vfs/file.rs (+14 -21)
  85. src/lib.rs (+6 -4)
  86. src/sync.rs (+1 -0)
  87. src/sync/fence.rs (+34 -0)

+ 24 - 11
Cargo.lock

@@ -16,6 +16,7 @@ name = "arch"
 version = "0.1.0"
 dependencies = [
  "cfg-if",
+ "eonix_mm",
  "percpu-macros",
 ]
 
@@ -55,6 +56,15 @@ version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
 
+[[package]]
+name = "buddy_allocator"
+version = "0.1.0"
+dependencies = [
+ "eonix_mm",
+ "eonix_sync",
+ "intrusive_list",
+]
+
 [[package]]
 name = "cexpr"
 version = "0.6.0"
@@ -91,10 +101,16 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 name = "eonix_log"
 version = "0.1.0"
 dependencies = [
- "eonix_spin_irq",
  "eonix_sync",
 ]
 
+[[package]]
+name = "eonix_mm"
+version = "0.1.0"
+dependencies = [
+ "intrusive_list",
+]
+
 [[package]]
 name = "eonix_preempt"
 version = "0.1.0"
@@ -110,24 +126,16 @@ dependencies = [
  "atomic_unique_refcell",
  "eonix_log",
  "eonix_preempt",
- "eonix_spin_irq",
  "eonix_sync",
  "intrusive-collections",
  "pointers",
 ]
 
-[[package]]
-name = "eonix_spin_irq"
-version = "0.1.0"
-dependencies = [
- "arch",
- "eonix_sync",
-]
-
 [[package]]
 name = "eonix_sync"
 version = "0.1.0"
 dependencies = [
+ "arch",
  "eonix_preempt",
  "intrusive-collections",
 ]
@@ -140,10 +148,11 @@ dependencies = [
  "atomic_unique_refcell",
  "bindgen",
  "bitflags",
+ "buddy_allocator",
  "eonix_log",
+ "eonix_mm",
  "eonix_preempt",
  "eonix_runtime",
- "eonix_spin_irq",
  "eonix_sync",
  "intrusive-collections",
  "itertools",
@@ -166,6 +175,10 @@ dependencies = [
  "memoffset",
 ]
 
+[[package]]
+name = "intrusive_list"
+version = "0.1.0"
+
 [[package]]
 name = "itertools"
 version = "0.13.0"

+ 6 - 2
Cargo.toml

@@ -11,9 +11,10 @@ arch = { path = "./arch" }
 atomic_unique_refcell = { path = "./crates/atomic_unique_refcell", features = [
     "no_std",
 ] }
+buddy_allocator = { path = "./crates/buddy_allocator" }
+eonix_mm = { path = "./crates/eonix_mm" }
 eonix_preempt = { path = "./crates/eonix_preempt" }
 eonix_runtime = { path = "./crates/eonix_runtime" }
-eonix_spin_irq = { path = "./crates/eonix_spin_irq" }
 eonix_sync = { path = "./crates/eonix_sync" }
 eonix_log = { path = "./crates/eonix_log" }
 pointers = { path = "./crates/pointers" }
@@ -47,7 +48,10 @@ opt-level = 2
 opt-level = 0
 
 [profile.dev.package.eonix_sync]
-opt-level = 0
+opt-level = 2
+
+[profile.dev.package.intrusive_list]
+opt-level = 2
 
 [profile.dev.package."*"]
 opt-level = "s"

+ 2 - 1
arch/Cargo.toml

@@ -4,5 +4,6 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-percpu-macros = { path="./percpu-macros" }
+eonix_mm = { path = "../crates/eonix_mm" }
+percpu-macros = { path = "./percpu-macros" }
 cfg-if = "1.0"

+ 34 - 0
arch/src/x86_64/fence.rs

@@ -0,0 +1,34 @@
+use core::arch::asm;
+
+#[doc(hidden)]
+/// Issues a full memory barrier.
+///
+/// Note that this acts as a low-level operation **ONLY** and should be used with caution.
+/// **NO COMPILER BARRIERS** are emitted by this function.
+pub fn memory_barrier() {
+    unsafe {
+        asm!("mfence", options(nostack, nomem, preserves_flags));
+    }
+}
+
+#[doc(hidden)]
+/// Issues a read memory barrier.
+///
+/// Note that this acts as a low-level operation **ONLY** and should be used with caution.
+/// **NO COMPILER BARRIERS** are emitted by this function.
+pub fn read_memory_barrier() {
+    unsafe {
+        asm!("lfence", options(nostack, nomem, preserves_flags));
+    }
+}
+
+#[doc(hidden)]
+/// Issues a write memory barrier.
+///
+/// Note that this acts as a low-level operation **ONLY** and should be used with caution.
+/// **NO COMPILER BARRIERS** are emitted by this function.
+pub fn write_memory_barrier() {
+    unsafe {
+        asm!("sfence", options(nostack, nomem, preserves_flags));
+    }
+}
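
The three barriers above deliberately emit no compiler barriers. A minimal sketch (not part of this commit) of how a caller could pair them with `core::sync::atomic::compiler_fence` when both a CPU barrier and a compiler barrier are needed:

use core::sync::atomic::{compiler_fence, Ordering};

/// Hypothetical helper: a write barrier that also stops the compiler from
/// reordering memory accesses across it.
pub fn write_barrier_with_compiler_fence() {
    // Prevent the compiler from moving loads/stores across this point.
    compiler_fence(Ordering::SeqCst);
    // Drain the CPU store buffer (the raw `sfence` wrapper defined above).
    write_memory_barrier();
}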

+ 218 - 0
arch/src/x86_64/mm.rs

@@ -1 +1,219 @@
+use core::{marker::PhantomData, ptr::NonNull};
+use eonix_mm::{
+    address::{Addr as _, PAddr},
+    page_table::{PageAttribute, PageTableLevel, PagingMode, RawPageTable, PTE},
+    paging::{PageBlock, PFN},
+};
+
 pub const PAGE_SIZE: usize = 0x1000;
+
+const KERNEL_PML4_PFN: PFN = PFN::from_val(0x2000 >> 12);
+
+const PA_P: u64 = 0x001;
+const PA_RW: u64 = 0x002;
+const PA_US: u64 = 0x004;
+#[allow(dead_code)]
+const PA_PWT: u64 = 0x008;
+#[allow(dead_code)]
+const PA_PCD: u64 = 0x010;
+const PA_A: u64 = 0x020;
+const PA_D: u64 = 0x040;
+#[allow(dead_code)]
+const PA_PS: u64 = 0x080;
+const PA_G: u64 = 0x100;
+const PA_COW: u64 = 0x200;
+const PA_MMAP: u64 = 0x400;
+const PA_ANON: u64 = 0x800;
+const PA_NXE: u64 = 0x8000_0000_0000_0000;
+const PA_MASK: u64 = 0xfff0_0000_0000_0fff;
+
+#[repr(transparent)]
+pub struct PTE64(u64);
+
+#[derive(Clone, Copy)]
+pub struct PageAttribute64(u64);
+
+pub struct RawPageTable4Levels<'a>(NonNull<PTE64>, PhantomData<&'a ()>);
+
+pub struct PagingMode4Levels;
+
+impl PTE for PTE64 {
+    type Attr = PageAttribute64;
+
+    fn set(&mut self, pfn: PFN, attr: Self::Attr) {
+        let paddr = PAddr::from(pfn).addr();
+
+        self.0 = (paddr as u64 & !PA_MASK) | (attr.0 & PA_MASK);
+    }
+
+    fn get(&self) -> (PFN, Self::Attr) {
+        (
+            PFN::from(PAddr::from((self.0 & !PA_MASK) as usize)),
+            PageAttribute64(self.0 & PA_MASK),
+        )
+    }
+
+    fn take(&mut self) -> (PFN, Self::Attr) {
+        let pfn_attr = self.get();
+        self.0 = 0;
+        pfn_attr
+    }
+}
+
+impl PagingMode for PagingMode4Levels {
+    type Entry = PTE64;
+    type RawTable<'a> = RawPageTable4Levels<'a>;
+
+    const LEVELS: &'static [PageTableLevel] = &[
+        PageTableLevel::new(39, 9),
+        PageTableLevel::new(30, 9),
+        PageTableLevel::new(21, 9),
+        PageTableLevel::new(12, 9),
+    ];
+
+    const KERNEL_ROOT_TABLE_PFN: PFN = KERNEL_PML4_PFN;
+}
+
+impl<'a> RawPageTable<'a> for RawPageTable4Levels<'a> {
+    type Entry = PTE64;
+
+    fn index(&self, index: u16) -> &'a Self::Entry {
+        unsafe { &self.0.cast::<[PTE64; 512]>().as_ref()[index as usize] }
+    }
+
+    fn index_mut(&mut self, index: u16) -> &'a mut Self::Entry {
+        unsafe { &mut self.0.cast::<[PTE64; 512]>().as_mut()[index as usize] }
+    }
+
+    unsafe fn from_ptr(ptr: NonNull<PageBlock>) -> Self {
+        Self(ptr.cast(), PhantomData)
+    }
+}
+
+impl PageAttribute for PageAttribute64 {
+    fn new() -> Self {
+        Self(PA_NXE)
+    }
+
+    fn present(self, present: bool) -> Self {
+        if present {
+            Self(self.0 | PA_P)
+        } else {
+            Self(self.0 & !PA_P)
+        }
+    }
+
+    fn write(self, write: bool) -> Self {
+        if write {
+            Self(self.0 | PA_RW)
+        } else {
+            Self(self.0 & !PA_RW)
+        }
+    }
+
+    fn execute(self, execute: bool) -> Self {
+        if execute {
+            Self(self.0 & !PA_NXE)
+        } else {
+            Self(self.0 | PA_NXE)
+        }
+    }
+
+    fn user(self, user: bool) -> Self {
+        if user {
+            Self(self.0 | PA_US)
+        } else {
+            Self(self.0 & !PA_US)
+        }
+    }
+
+    fn accessed(self, accessed: bool) -> Self {
+        if accessed {
+            Self(self.0 | PA_A)
+        } else {
+            Self(self.0 & !PA_A)
+        }
+    }
+
+    fn dirty(self, dirty: bool) -> Self {
+        if dirty {
+            Self(self.0 | PA_D)
+        } else {
+            Self(self.0 & !PA_D)
+        }
+    }
+
+    fn global(self, global: bool) -> Self {
+        if global {
+            Self(self.0 | PA_G)
+        } else {
+            Self(self.0 & !PA_G)
+        }
+    }
+
+    fn copy_on_write(self, cow: bool) -> Self {
+        if cow {
+            Self(self.0 | PA_COW)
+        } else {
+            Self(self.0 & !PA_COW)
+        }
+    }
+
+    fn mapped(self, mmap: bool) -> Self {
+        if mmap {
+            Self(self.0 | PA_MMAP)
+        } else {
+            Self(self.0 & !PA_MMAP)
+        }
+    }
+
+    fn anonymous(self, anon: bool) -> Self {
+        if anon {
+            Self(self.0 | PA_ANON)
+        } else {
+            Self(self.0 & !PA_ANON)
+        }
+    }
+
+    fn is_present(&self) -> bool {
+        self.0 & PA_P != 0
+    }
+
+    fn is_write(&self) -> bool {
+        self.0 & PA_RW != 0
+    }
+
+    fn is_execute(&self) -> bool {
+        self.0 & PA_NXE == 0
+    }
+
+    fn is_user(&self) -> bool {
+        self.0 & PA_US != 0
+    }
+
+    fn is_accessed(&self) -> bool {
+        self.0 & PA_A != 0
+    }
+
+    fn is_dirty(&self) -> bool {
+        self.0 & PA_D != 0
+    }
+
+    fn is_global(&self) -> bool {
+        self.0 & PA_G != 0
+    }
+
+    fn is_copy_on_write(&self) -> bool {
+        self.0 & PA_COW != 0
+    }
+
+    fn is_mapped(&self) -> bool {
+        self.0 & PA_MMAP != 0
+    }
+
+    fn is_anonymous(&self) -> bool {
+        self.0 & PA_ANON != 0
+    }
+}
+
+pub type DefaultPagingMode = PagingMode4Levels;
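
A minimal sketch (not part of the commit) of how the builder-style `PageAttribute64` API composes with the `PTE` trait, assuming it sits next to the types defined in this module:

use eonix_mm::page_table::{PageAttribute as _, PTE as _};
use eonix_mm::paging::PFN;

fn map_user_data_page(pte: &mut PTE64, pfn: PFN) {
    // Present, writable, user-accessible; non-executable because `new()`
    // starts with PA_NXE set and we never call `.execute(true)`.
    let attr = PageAttribute64::new()
        .present(true)
        .write(true)
        .user(true);
    pte.set(pfn, attr);
}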

+ 12 - 8
arch/src/x86_64/mod.rs

@@ -1,4 +1,5 @@
 mod context;
+mod fence;
 mod gdt;
 mod init;
 mod interrupt;
@@ -7,18 +8,21 @@ mod mm;
 mod percpu;
 mod user;
 
+use core::arch::asm;
+use eonix_mm::address::{Addr as _, PAddr, VAddr};
+use eonix_mm::paging::PFN;
+
 pub use self::context::*;
 pub use self::gdt::*;
 pub use self::init::*;
 pub use self::interrupt::*;
 pub use self::io::*;
 pub use self::user::*;
+pub use fence::*;
 pub use mm::*;
 pub use percpu::*;
 pub use percpu_macros::{define_percpu, define_percpu_shared};
 
-use core::arch::asm;
-
 #[inline(always)]
 pub fn flush_tlb(vaddr: usize) {
     unsafe {
@@ -43,7 +47,7 @@ pub fn flush_tlb_all() {
 }
 
 #[inline(always)]
-pub fn get_root_page_table() -> usize {
+pub fn get_root_page_table_pfn() -> PFN {
     let cr3: usize;
     unsafe {
         asm!(
@@ -52,22 +56,22 @@ pub fn get_root_page_table() -> usize {
             options(att_syntax)
         );
     }
-    cr3
+    PFN::from(PAddr::from(cr3))
 }
 
 #[inline(always)]
-pub fn set_root_page_table(pfn: usize) {
+pub fn set_root_page_table_pfn(pfn: PFN) {
     unsafe {
         asm!(
             "mov {0}, %cr3",
-            in(reg) pfn,
+            in(reg) PAddr::from(pfn).addr(),
             options(att_syntax)
         );
     }
 }
 
 #[inline(always)]
-pub fn get_page_fault_address() -> usize {
+pub fn get_page_fault_address() -> VAddr {
     let cr2: usize;
     unsafe {
         asm!(
@@ -76,7 +80,7 @@ pub fn get_page_fault_address() -> usize {
             options(att_syntax)
         );
     }
-    cr2
+    VAddr::from(cr2)
 }
 
 #[inline(always)]
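
A sketch (not in the commit) of how the typed CR3 wrappers above might be used when switching address spaces; `switch_address_space` is a hypothetical helper living in the same module:

use eonix_mm::paging::PFN;

fn switch_address_space(new_root: PFN) -> PFN {
    let old_root = get_root_page_table_pfn(); // read CR3 as a PFN
    set_root_page_table_pfn(new_root);        // load the new root table (flushes non-global TLB entries)
    old_root
}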

+ 9 - 0
crates/buddy_allocator/Cargo.toml

@@ -0,0 +1,9 @@
+[package]
+name = "buddy_allocator"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+eonix_mm = { path = "../eonix_mm" }
+eonix_sync = { path = "../eonix_sync" }
+intrusive_list = { path = "../intrusive_list" }

+ 47 - 0
crates/buddy_allocator/src/free_area.rs

@@ -0,0 +1,47 @@
+use core::marker::{Send, Sync};
+use eonix_mm::paging::{PageFlags, RawPage, RawPagePtr};
+use intrusive_list::{container_of, Link};
+
+pub struct FreeArea {
+    free_list: Link,
+    count: usize,
+}
+
+unsafe impl Send for FreeArea {}
+unsafe impl Sync for FreeArea {}
+
+impl FreeArea {
+    pub const fn new() -> Self {
+        Self {
+            free_list: Link::new(),
+            count: 0,
+        }
+    }
+
+    pub fn get_free_pages(&mut self) -> Option<RawPagePtr> {
+        self.free_list.next_mut().map(|pages_link| {
+            assert_ne!(self.count, 0);
+
+            let pages_ptr = unsafe { container_of!(pages_link, RawPage, link) };
+            let pages_ptr = RawPagePtr::new(pages_ptr);
+
+            self.count -= 1;
+            pages_link.remove();
+
+            pages_ptr
+        })
+    }
+
+    pub fn add_pages(&mut self, pages_ptr: RawPagePtr) {
+        self.count += 1;
+        pages_ptr.as_mut().flags.set(PageFlags::FREE);
+        self.free_list.insert(&mut pages_ptr.as_mut().link)
+    }
+
+    pub fn del_pages(&mut self, pages_ptr: RawPagePtr) {
+        assert!(self.count >= 1 && pages_ptr.as_ref().flags.has(PageFlags::FREE));
+        self.count -= 1;
+        pages_ptr.as_mut().flags.clear(PageFlags::FREE);
+        pages_ptr.as_mut().link.remove();
+    }
+}

+ 73 - 0
crates/buddy_allocator/src/lib.rs

@@ -0,0 +1,73 @@
+#![no_std]
+
+mod free_area;
+mod zone;
+
+use core::sync::atomic::Ordering;
+use eonix_mm::{
+    address::PAddr,
+    paging::{PageAlloc, PageFlags, RawPagePtr, PFN},
+};
+use eonix_sync::Spin;
+use zone::Zone;
+
+pub use free_area::FreeArea;
+
+const MAX_ORDER: u32 = 10;
+const ZONE_AREAS: usize = const { MAX_ORDER as usize + 1 };
+
+static BUDDY_ALLOCATOR: BuddyAllocator = BuddyAllocator::new();
+
+pub struct BuddyAllocator {
+    zone: Spin<Zone<ZONE_AREAS>>,
+}
+
+impl BuddyAllocator {
+    const fn new() -> Self {
+        Self {
+            zone: Spin::new(Zone::new()),
+        }
+    }
+
+    pub fn create_pages(start: PAddr, end: PAddr) {
+        BUDDY_ALLOCATOR.zone.lock().create_pages(start, end);
+    }
+}
+
+impl PageAlloc for BuddyAllocator {
+    fn alloc_order(order: u32) -> Option<RawPagePtr> {
+        let pages_ptr = BUDDY_ALLOCATOR.zone.lock().get_free_pages(order);
+
+        if let Some(pages_ptr) = pages_ptr {
+            // SAFETY: The memory order can be `Relaxed` here for the same reason as
+            // in the copy constructor of `std::shared_ptr`.
+            pages_ptr.refcount().fetch_add(1, Ordering::Relaxed);
+            pages_ptr.flags().clear(PageFlags::FREE);
+        }
+
+        pages_ptr
+    }
+
+    unsafe fn dealloc(page_ptr: RawPagePtr) {
+        BUDDY_ALLOCATOR.zone.lock().free_pages(page_ptr);
+    }
+
+    unsafe fn has_management_over(page_ptr: RawPagePtr) -> bool {
+        !page_ptr.flags().has(PageFlags::FREE) && page_ptr.flags().has(PageFlags::BUDDY)
+    }
+}
+
+pub(self) trait BuddyPFNOps {
+    fn buddy_pfn(self, order: u32) -> PFN;
+    fn combined_pfn(self, buddy_pfn: PFN) -> PFN;
+}
+
+impl BuddyPFNOps for PFN {
+    fn buddy_pfn(self, order: u32) -> PFN {
+        PFN::from(usize::from(self) ^ (1 << order))
+    }
+
+    fn combined_pfn(self, buddy_pfn: PFN) -> PFN {
+        PFN::from(usize::from(self) & usize::from(buddy_pfn))
+    }
+}
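
A quick illustration (not in the commit) of the buddy arithmetic behind `buddy_pfn` and `combined_pfn`, written against plain integers so it stands alone:

fn main() {
    let pfn: usize = 0b1100; // PFN 12, the start of an order-2 block (4 pages)
    let order = 2;

    // The buddy of a block is found by flipping bit `order` of its PFN.
    let buddy = pfn ^ (1 << order); // 0b1000 = PFN 8

    // If both blocks are free, they merge into the block whose PFN is the
    // lower of the two, which becomes an order-3 block.
    let combined = pfn & buddy; // 0b1000 = PFN 8

    assert_eq!(buddy, 8);
    assert_eq!(combined, 8);
}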

+ 122 - 0
crates/buddy_allocator/src/zone.rs

@@ -0,0 +1,122 @@
+use crate::BuddyPFNOps as _;
+
+use super::free_area::FreeArea;
+use core::sync::atomic::Ordering;
+use eonix_mm::{
+    address::{AddrOps as _, PAddr},
+    paging::{PageFlags, RawPagePtr, PFN},
+};
+
+pub(super) struct Zone<const AREAS: usize> {
+    free_areas: [FreeArea; AREAS],
+}
+
+impl<const AREAS: usize> Zone<AREAS> {
+    pub const fn new() -> Self {
+        Self {
+            free_areas: [const { FreeArea::new() }; AREAS],
+        }
+    }
+
+    pub fn get_free_pages(&mut self, order: u32) -> Option<RawPagePtr> {
+        for current_order in order..AREAS as u32 {
+            let pages_ptr = self.free_areas[current_order as usize].get_free_pages();
+            let Some(pages_ptr) = pages_ptr else { continue };
+
+            pages_ptr.as_mut().order = order;
+
+            if current_order > order {
+                self.expand(pages_ptr, current_order, order);
+            }
+            assert!(pages_ptr.flags().has(PageFlags::PRESENT | PageFlags::FREE));
+
+            return Some(pages_ptr);
+        }
+        None
+    }
+
+    fn expand(&mut self, pages_ptr: RawPagePtr, order: u32, target_order: u32) {
+        let mut offset = 1 << order;
+
+        for order in (target_order..order).rev() {
+            offset >>= 1;
+            let split_pages_ptr = pages_ptr.offset(offset);
+            split_pages_ptr.as_mut().order = order;
+            split_pages_ptr.flags().set(PageFlags::BUDDY);
+            self.free_areas[order as usize].add_pages(split_pages_ptr);
+        }
+    }
+
+    pub fn free_pages(&mut self, mut pages_ptr: RawPagePtr) {
+        assert_eq!(pages_ptr.refcount().load(Ordering::Relaxed), 0);
+
+        let mut pfn = PFN::from(pages_ptr);
+        let mut current_order = pages_ptr.order();
+
+        while current_order < (AREAS - 1) as u32 {
+            let buddy_pfn = pfn.buddy_pfn(current_order);
+            let buddy_pages_ptr = RawPagePtr::from(buddy_pfn);
+
+            if !self.buddy_check(buddy_pages_ptr, current_order) {
+                break;
+            }
+
+            pages_ptr.flags().clear(PageFlags::BUDDY);
+            buddy_pages_ptr.flags().clear(PageFlags::BUDDY);
+            self.free_areas[current_order as usize].del_pages(buddy_pages_ptr);
+
+            pages_ptr = RawPagePtr::from(pfn.combined_pfn(buddy_pfn));
+            pfn = pfn.combined_pfn(buddy_pfn);
+
+            pages_ptr.flags().set(PageFlags::BUDDY);
+            current_order += 1;
+        }
+
+        pages_ptr.as_mut().order = current_order;
+        self.free_areas[current_order as usize].add_pages(pages_ptr);
+    }
+
+    /// Checks whether a page is free and is a buddy.
+    /// We can coalesce a page and its buddy if:
+    /// - the buddy is valid (present), and
+    /// - the buddy is currently in the free areas, and
+    /// - the page and its buddy have the same order, and
+    /// - the page and its buddy are in the same zone.    // check once SMP is supported
+    fn buddy_check(&self, pages_ptr: RawPagePtr, order: u32) -> bool {
+        if !pages_ptr.flags().has(PageFlags::PRESENT) {
+            return false;
+        }
+        if !pages_ptr.flags().has(PageFlags::FREE) {
+            return false;
+        }
+        if pages_ptr.flags().has(PageFlags::LOCAL) {
+            return false;
+        }
+        if pages_ptr.as_ref().order != order {
+            return false;
+        }
+
+        assert_eq!(pages_ptr.refcount().load(Ordering::Relaxed), 0);
+        true
+    }
+
+    /// Only used on buddy initialization
+    pub fn create_pages(&mut self, start: PAddr, end: PAddr) {
+        let mut start_pfn = PFN::from(start.ceil());
+        let end_pfn = PFN::from(end.floor());
+
+        while start_pfn < end_pfn {
+            let mut order = usize::from(start_pfn)
+                .trailing_zeros()
+                .min((AREAS - 1) as u32);
+
+            while start_pfn + order as usize > end_pfn {
+                order -= 1;
+            }
+            let page_ptr: RawPagePtr = start_pfn.into();
+            page_ptr.flags().set(PageFlags::BUDDY);
+            self.free_areas[order as usize].add_pages(page_ptr);
+            start_pfn = start_pfn + (1 << order) as usize;
+        }
+    }
+}

+ 0 - 1
crates/eonix_log/Cargo.toml

@@ -4,5 +4,4 @@ version = "0.1.0"
 edition = "2024"
 
 [dependencies]
-eonix_spin_irq = { path = "../eonix_spin_irq" }
 eonix_sync = { path = "../eonix_sync" }

+ 0 - 1
crates/eonix_log/src/lib.rs

@@ -3,7 +3,6 @@
 use core::fmt::{self, Write};
 
 use alloc::sync::Arc;
-use eonix_spin_irq::SpinIrq as _;
 use eonix_sync::Spin;
 
 extern crate alloc;

+ 7 - 0
crates/eonix_mm/Cargo.toml

@@ -0,0 +1,7 @@
+[package]
+name = "eonix_mm"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+intrusive_list = { path = "../intrusive_list" }

+ 14 - 0
crates/eonix_mm/src/address.rs

@@ -0,0 +1,14 @@
+mod addr;
+mod addr_range;
+mod error;
+mod paddr;
+mod vaddr;
+
+pub use addr::{Addr, AddrOps};
+pub use addr_range::AddrRange;
+pub use error::AddressError;
+pub use paddr::PAddr;
+pub use vaddr::VAddr;
+
+pub type PRange = AddrRange<PAddr>;
+pub type VRange = AddrRange<VAddr>;

+ 64 - 0
crates/eonix_mm/src/address/addr.rs

@@ -0,0 +1,64 @@
+use crate::paging::PAGE_SIZE;
+use core::ops::{Add, Sub};
+
+pub trait Addr:
+    Sized
+    + Copy
+    + Clone
+    + Ord
+    + PartialOrd
+    + Eq
+    + PartialEq
+    + Sub<Output = usize>
+    + Sub<usize, Output = Self>
+    + Add<usize, Output = Self>
+    + From<usize>
+{
+    fn addr(self) -> usize;
+}
+
+pub trait AddrOps: Sized {
+    fn offset_in(self, size: usize) -> usize;
+
+    fn is_aligned_to(self, size: usize) -> bool;
+
+    /// Aligns the address to the nearest lower multiple of `size`.
+    fn floor_to(self, size: usize) -> Self;
+
+    /// Aligns the address to the nearest higher multiple of `size`.
+    fn ceil_to(self, size: usize) -> Self;
+
+    fn page_offset(self) -> usize {
+        self.offset_in(PAGE_SIZE)
+    }
+
+    fn is_page_aligned(self) -> bool {
+        self.is_aligned_to(PAGE_SIZE)
+    }
+
+    fn floor(self) -> Self {
+        self.floor_to(PAGE_SIZE)
+    }
+
+    fn ceil(self) -> Self {
+        self.ceil_to(PAGE_SIZE)
+    }
+}
+
+impl<A: Addr> AddrOps for A {
+    fn offset_in(self, size: usize) -> usize {
+        self.addr() % size
+    }
+
+    fn is_aligned_to(self, size: usize) -> bool {
+        self.offset_in(size) == 0
+    }
+
+    fn floor_to(self, size: usize) -> Self {
+        Self::from(self.addr() / size * size)
+    }
+
+    fn ceil_to(self, size: usize) -> Self {
+        Self::from(self.addr().div_ceil(size) * size)
+    }
+}
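
A small sketch (not part of the commit) exercising the `AddrOps` helpers on `PAddr`:

use eonix_mm::address::{Addr as _, AddrOps as _, PAddr};

fn main() {
    let addr = PAddr::from(0x1234usize);

    assert_eq!(addr.page_offset(), 0x234);   // offset within the 4 KiB page
    assert_eq!(addr.floor().addr(), 0x1000); // round down to a page boundary
    assert_eq!(addr.ceil().addr(), 0x2000);  // round up to a page boundary
    assert!(!addr.is_page_aligned());
}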

+ 190 - 0
crates/eonix_mm/src/address/addr_range.rs

@@ -0,0 +1,190 @@
+use super::addr::Addr;
+use core::{cmp::Ordering, fmt, ops::RangeBounds};
+
+#[derive(Clone, Copy)]
+/// A range of addresses.
+///
+/// The range is defined by two addresses, `start` and `end` and is inclusive
+/// on the start and exclusive on the end.
+///
+/// # Relations
+///
+/// ## Comparison
+///
+/// ### Equal
+/// Any two ranges that have one of them **containing** the other are considered equal.
+///
+/// ### Less
+/// If the two are not equal, the one that has the **smallest** start address is considered less.
+///
+/// ### Greater
+/// If the two are not equal, the one that has the **largest** end address is considered greater.
+///
+/// ## Overlapping Check
+/// Use `overlap_with` instead of `==` to check if two ranges overlap.
+pub struct AddrRange<A: Addr> {
+    start: A,
+    end: A,
+}
+
+impl<A: Addr> Eq for AddrRange<A> {}
+impl<A: Addr> PartialOrd for AddrRange<A> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl<A: Addr> PartialEq for AddrRange<A> {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+impl<A: Addr> Ord for AddrRange<A> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        if self.start == other.start {
+            return Ordering::Equal;
+        }
+
+        if self.end == other.end {
+            if self.start == self.end {
+                return Ordering::Greater;
+            }
+            if other.start == other.end {
+                return Ordering::Less;
+            }
+            return Ordering::Equal;
+        }
+
+        if self.start < other.start {
+            if other.end < self.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Less;
+            }
+        }
+
+        if other.start < self.start {
+            if self.end < other.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Greater;
+            }
+        }
+
+        unreachable!()
+    }
+}
+
+impl<A: Addr> From<A> for AddrRange<A> {
+    fn from(addr: A) -> Self {
+        Self {
+            start: addr,
+            end: addr,
+        }
+    }
+}
+
+impl<A: Addr> AddrRange<A> {
+    /// Creates a new `AddrRange` with the given start and end addresses.
+    ///
+    /// # Panics
+    /// Panics if the start address is greater than the end address.
+    ///
+    /// # Hint
+    /// Use `AddrRange::from(addr).grow(size)` to create a range of size `size`
+    /// starting from `addr`.
+    pub fn new(start: A, end: A) -> Self {
+        assert!(start <= end);
+        Self { start, end }
+    }
+
+    pub const fn start(&self) -> A {
+        self.start
+    }
+
+    pub const fn end(&self) -> A {
+        self.end
+    }
+
+    pub fn len(&self) -> usize {
+        self.end - self.start
+    }
+
+    pub fn shrink(&self, size: usize) -> Self {
+        assert!(size <= self.len());
+        Self::new(self.start, self.end - size)
+    }
+
+    pub fn grow(&self, count: usize) -> Self {
+        Self::new(self.start, self.end + count)
+    }
+
+    pub fn into_bounds(&self) -> impl RangeBounds<Self> {
+        if self.len() == 0 {
+            Self::from(self.start())..=Self::from(self.start())
+        } else {
+            Self::from(self.start())..=Self::from(self.end() - 1)
+        }
+    }
+
+    pub fn overlap_with(&self, other: &Self) -> bool {
+        self.start < other.end && self.end > other.start
+    }
+
+    pub fn split_at_checked(&self, at: A) -> (Option<Self>, Option<Self>) {
+        if self.end <= at {
+            (Some(*self), None)
+        } else if at <= self.start {
+            (None, Some(*self))
+        } else {
+            (
+                Some(Self::new(self.start, at)),
+                Some(Self::new(at, self.end)),
+            )
+        }
+    }
+
+    pub fn split_at(&self, at: A) -> (Self, Self) {
+        let (left, right) = self.split_at_checked(at);
+        (
+            left.expect("`at` is too small"),
+            right.expect("`at` is too large"),
+        )
+    }
+
+    pub fn mask_with_checked(&self, mask: &Self) -> Option<(Option<Self>, Self, Option<Self>)> {
+        if mask.len() == 0 || !self.overlap_with(mask) {
+            return None;
+        }
+
+        let left;
+        let mut mid;
+        let right;
+
+        if self.start < mask.start && mask.start < self.end {
+            let (l, r) = self.split_at(mask.start);
+            left = Some(l);
+            mid = r;
+        } else {
+            left = None;
+            mid = *self;
+        }
+
+        if mask.end < self.end {
+            let (l, r) = mid.split_at(mask.end);
+            mid = l;
+            right = Some(r);
+        } else {
+            right = None;
+        }
+
+        Some((left, mid, right))
+    }
+}
+
+impl<A: Addr + fmt::Debug> fmt::Debug for AddrRange<A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "[{:?}, {:?})", self.start, self.end)
+    }
+}
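
A sketch (not in the commit) of the comparison and splitting semantics documented above, using the `VRange` alias:

use eonix_mm::address::{VAddr, VRange};

fn main() {
    let outer = VRange::new(VAddr::from(0x1000), VAddr::from(0x4000));
    let inner = VRange::new(VAddr::from(0x2000), VAddr::from(0x3000));

    // A range containing another compares as equal; use `overlap_with`
    // for a real intersection test.
    assert!(outer == inner);
    assert!(outer.overlap_with(&inner));

    // Splitting at an interior address yields the two halves.
    let (left, right) = outer.split_at(VAddr::from(0x2000));
    assert_eq!(left.len(), 0x1000);
    assert_eq!(right.len(), 0x2000);
}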

+ 4 - 0
crates/eonix_mm/src/address/error.rs

@@ -0,0 +1,4 @@
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AddressError {
+    InvalidAddress,
+}

+ 65 - 0
crates/eonix_mm/src/address/paddr.rs

@@ -0,0 +1,65 @@
+use super::addr::Addr;
+use crate::paging::{PAGE_SIZE_BITS, PFN};
+use core::{
+    fmt,
+    ops::{Add, Sub},
+};
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+pub struct PAddr(usize);
+
+impl From<usize> for PAddr {
+    fn from(v: usize) -> Self {
+        Self(v)
+    }
+}
+
+impl Sub for PAddr {
+    type Output = usize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+impl Sub<usize> for PAddr {
+    type Output = Self;
+
+    fn sub(self, rhs: usize) -> Self::Output {
+        PAddr(self.0 - rhs)
+    }
+}
+
+impl Add<usize> for PAddr {
+    type Output = Self;
+
+    fn add(self, rhs: usize) -> Self::Output {
+        PAddr(self.0 + rhs)
+    }
+}
+
+impl fmt::Debug for PAddr {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "PAddr({:#x})", self.0)
+    }
+}
+
+impl Addr for PAddr {
+    fn addr(self) -> usize {
+        let Self(addr) = self;
+        addr
+    }
+}
+
+impl From<PFN> for PAddr {
+    fn from(value: PFN) -> Self {
+        Self(usize::from(value) << PAGE_SIZE_BITS)
+    }
+}
+
+impl PAddr {
+    pub const fn from_val(val: usize) -> Self {
+        Self(val)
+    }
+}

+ 60 - 0
crates/eonix_mm/src/address/vaddr.rs

@@ -0,0 +1,60 @@
+use super::addr::Addr;
+use core::{
+    fmt,
+    ops::{Add, Sub},
+};
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+pub struct VAddr(usize);
+
+impl From<usize> for VAddr {
+    fn from(v: usize) -> Self {
+        Self::from(v)
+    }
+}
+
+impl VAddr {
+    pub const NULL: Self = Self(0);
+
+    pub const fn from(v: usize) -> Self {
+        Self(v)
+    }
+}
+
+impl Sub for VAddr {
+    type Output = usize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+impl Sub<usize> for VAddr {
+    type Output = Self;
+
+    fn sub(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 - rhs)
+    }
+}
+
+impl Add<usize> for VAddr {
+    type Output = Self;
+
+    fn add(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 + rhs)
+    }
+}
+
+impl fmt::Debug for VAddr {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "VAddr({:#x})", self.0)
+    }
+}
+
+impl Addr for VAddr {
+    fn addr(self) -> usize {
+        let Self(addr) = self;
+        addr
+    }
+}

+ 5 - 0
crates/eonix_mm/src/lib.rs

@@ -0,0 +1,5 @@
+#![no_std]
+
+pub mod address;
+pub mod page_table;
+pub mod paging;

+ 9 - 0
crates/eonix_mm/src/page_table.rs

@@ -0,0 +1,9 @@
+mod page_table;
+mod paging_mode;
+mod pte;
+mod pte_iterator;
+
+pub use page_table::{PageTable, RawPageTable};
+pub use paging_mode::{PageTableLevel, PagingMode};
+pub use pte::{PageAttribute, PTE};
+pub use pte_iterator::PageTableIterator;

+ 132 - 0
crates/eonix_mm/src/page_table/page_table.rs

@@ -0,0 +1,132 @@
+use super::{
+    paging_mode::PageTableLevel,
+    pte_iterator::{KernelIterator, UserIterator},
+    PageAttribute, PagingMode, PTE,
+};
+use crate::{
+    address::{PAddr, VRange},
+    page_table::PageTableIterator,
+    paging::{Page, PageAccess, PageAlloc, PageBlock},
+};
+use core::{marker::PhantomData, ptr::NonNull};
+
+pub trait RawPageTable<'a>: 'a {
+    type Entry: PTE + 'a;
+
+    /// Return the entry at the given index.
+    fn index(&self, index: u16) -> &'a Self::Entry;
+
+    /// Return a mutable reference to the entry at the given index.
+    fn index_mut(&mut self, index: u16) -> &'a mut Self::Entry;
+
+    /// Get the page table pointed to by raw pointer `ptr`.
+    unsafe fn from_ptr(ptr: NonNull<PageBlock>) -> Self;
+}
+
+pub struct PageTable<'a, M, A, X>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+{
+    root_table_page: Page<A>,
+    phantom: PhantomData<&'a (M, X)>,
+}
+
+impl<'a, M, A, X> PageTable<'a, M, A, X>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+{
+    pub fn new<A1: PageAlloc>(kernel_root_table_page: &Page<A1>) -> Self {
+        let new_root_table_page = Page::<A>::alloc();
+        let new_table_data = X::get_ptr_for_page(&new_root_table_page);
+        let kernel_table_data = X::get_ptr_for_page(kernel_root_table_page);
+
+        unsafe {
+            // SAFETY: `new_table_data` and `kernel_table_data` are both valid pointers
+            //         to **different** page tables.
+            new_table_data.copy_from_nonoverlapping(kernel_table_data, 1);
+        }
+
+        let mut root_page_table = unsafe {
+            // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+            M::RawTable::from_ptr(new_table_data)
+        };
+
+        let level0 = M::LEVELS[0];
+        for idx in 0..level0.max_index() / 2 {
+            // We consider the first half of the page table as user space.
+            // Clear all (potential) user space mappings.
+            root_page_table.index_mut(idx).take();
+        }
+
+        Self {
+            root_table_page: new_root_table_page,
+            phantom: PhantomData,
+        }
+    }
+
+    pub fn addr(&self) -> PAddr {
+        self.root_table_page.start()
+    }
+
+    pub fn iter_user(&self, range: VRange) -> impl Iterator<Item = &mut M::Entry> {
+        let page_table_ptr = X::get_ptr_for_page(&self.root_table_page);
+        let root_page_table = unsafe {
+            // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+            M::RawTable::from_ptr(page_table_ptr)
+        };
+
+        PageTableIterator::<M, A, X, UserIterator>::new(root_page_table, range)
+    }
+
+    pub fn iter_kernel(&self, range: VRange) -> impl Iterator<Item = &mut M::Entry> {
+        let page_table_ptr = X::get_ptr_for_page(&self.root_table_page);
+        let root_page_table = unsafe {
+            // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+            M::RawTable::from_ptr(page_table_ptr)
+        };
+
+        PageTableIterator::<M, A, X, KernelIterator>::new(root_page_table, range)
+    }
+
+    fn drop_page_table_recursive(page_table: &Page<A>, levels: &[PageTableLevel]) {
+        let [level, remaining_levels @ ..] = levels else { return };
+
+        let page_table_ptr = X::get_ptr_for_page(page_table);
+        let mut page_table = unsafe {
+            // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+            M::RawTable::from_ptr(page_table_ptr)
+        };
+
+        for pte in (0..=level.max_index()).map(|i| page_table.index_mut(i)) {
+            let (pfn, attr) = pte.take();
+            if !attr.is_present() || !attr.is_user() {
+                continue;
+            }
+
+            let page_table = unsafe {
+                // SAFETY: We got the pfn from a valid page table entry, so it should be valid.
+                Page::<A>::from_raw(pfn)
+            };
+
+            Self::drop_page_table_recursive(&page_table, remaining_levels);
+        }
+    }
+}
+
+impl<'a, M, A, X> Drop for PageTable<'a, M, A, X>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+{
+    fn drop(&mut self) {
+        Self::drop_page_table_recursive(&self.root_table_page, M::LEVELS);
+    }
+}
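
A sketch (not in the commit) of how a kernel might instantiate `PageTable`. `DirectPhysAccess` and `PHYS_MAP_BASE` are assumptions for illustration only: a hypothetical `PageAccess` impl that reaches pages through a fixed direct-map window, with `DefaultPagingMode` assumed to be re-exported by the `arch` crate as in the x86_64 module above. The real kernel supplies its own accessor.

use arch::DefaultPagingMode;
use buddy_allocator::BuddyAllocator;
use core::ptr::NonNull;
use eonix_mm::page_table::PageTable;
use eonix_mm::paging::{PageAccess, PageBlock, PAGE_SIZE_BITS, PFN};

/// Hypothetical: all physical memory is assumed to be mapped at this offset.
const PHYS_MAP_BASE: usize = 0xffff_ff00_0000_0000;

struct DirectPhysAccess;

impl PageAccess for DirectPhysAccess {
    unsafe fn get_ptr_for_pfn(pfn: PFN) -> NonNull<PageBlock> {
        // Translate the PFN into a virtual address inside the direct-map window.
        let vaddr = PHYS_MAP_BASE + (usize::from(pfn) << PAGE_SIZE_BITS);
        NonNull::new(vaddr as *mut PageBlock).expect("direct-map address is never null")
    }
}

/// The concrete page table type the kernel would then work with.
type KernelPageTable<'a> = PageTable<'a, DefaultPagingMode, BuddyAllocator, DirectPhysAccess>;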

+ 38 - 0
crates/eonix_mm/src/page_table/paging_mode.rs

@@ -0,0 +1,38 @@
+use super::{RawPageTable, PTE};
+use crate::{
+    address::{Addr as _, VAddr},
+    paging::PFN,
+};
+
+pub trait PagingMode {
+    type Entry: PTE;
+    type RawTable<'a>: RawPageTable<'a, Entry = Self::Entry>;
+
+    const LEVELS: &'static [PageTableLevel];
+    const KERNEL_ROOT_TABLE_PFN: PFN;
+}
+
+#[derive(Clone, Copy, PartialOrd, PartialEq)]
+pub struct PageTableLevel(usize, usize);
+
+impl PageTableLevel {
+    pub const fn new(nth_bit: usize, len: usize) -> Self {
+        Self(nth_bit, len)
+    }
+
+    pub const fn nth_bit(self) -> usize {
+        self.0
+    }
+
+    pub const fn len(self) -> usize {
+        self.1
+    }
+
+    pub const fn max_index(self) -> u16 {
+        (1 << self.len()) - 1
+    }
+
+    pub fn index_of(self, vaddr: VAddr) -> u16 {
+        ((vaddr.addr() >> self.nth_bit()) & ((1 << self.len()) - 1)) as u16
+    }
+}
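
A sketch (not in the commit) of how `PageTableLevel::index_of` decomposes a virtual address under the 4-level x86_64 layout defined in `arch` above:

use eonix_mm::address::VAddr;
use eonix_mm::page_table::PageTableLevel;

fn main() {
    // Same levels as `PagingMode4Levels::LEVELS`: PML4, PDPT, PD, PT.
    let levels = [
        PageTableLevel::new(39, 9),
        PageTableLevel::new(30, 9),
        PageTableLevel::new(21, 9),
        PageTableLevel::new(12, 9),
    ];
    let vaddr = VAddr::from(0x0000_7fff_dead_b000);

    // Each level extracts 9 bits of the address, starting at its `nth_bit`.
    let indices = levels.map(|level| level.index_of(vaddr));
    assert_eq!(indices, [255, 511, 245, 219]);
}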

+ 52 - 0
crates/eonix_mm/src/page_table/pte.rs

@@ -0,0 +1,52 @@
+use crate::paging::PFN;
+
+pub trait PageAttribute: Copy {
+    /// Create a new instance of the attribute with all attributes set to false.
+    fn new() -> Self;
+
+    fn present(self, present: bool) -> Self;
+    fn write(self, write: bool) -> Self;
+    fn execute(self, execute: bool) -> Self;
+    fn user(self, user: bool) -> Self;
+    fn accessed(self, accessed: bool) -> Self;
+    fn dirty(self, dirty: bool) -> Self;
+    fn global(self, global: bool) -> Self;
+    fn copy_on_write(self, cow: bool) -> Self;
+    fn mapped(self, mmap: bool) -> Self;
+    fn anonymous(self, anon: bool) -> Self;
+
+    fn is_present(&self) -> bool;
+    fn is_write(&self) -> bool;
+    fn is_execute(&self) -> bool;
+    fn is_user(&self) -> bool;
+    fn is_accessed(&self) -> bool;
+    fn is_dirty(&self) -> bool;
+    fn is_global(&self) -> bool;
+    fn is_copy_on_write(&self) -> bool;
+    fn is_mapped(&self) -> bool;
+    fn is_anonymous(&self) -> bool;
+}
+
+pub trait PTE: Sized {
+    type Attr: PageAttribute;
+
+    fn set(&mut self, pfn: PFN, attr: Self::Attr);
+    fn get(&self) -> (PFN, Self::Attr);
+    fn take(&mut self) -> (PFN, Self::Attr);
+
+    fn set_pfn(&mut self, pfn: PFN) {
+        self.set(pfn, self.get_attr());
+    }
+
+    fn set_attr(&mut self, attr: Self::Attr) {
+        self.set(self.get_pfn(), attr);
+    }
+
+    fn get_pfn(&self) -> PFN {
+        self.get().0
+    }
+
+    fn get_attr(&self) -> Self::Attr {
+        self.get().1
+    }
+}

+ 177 - 0
crates/eonix_mm/src/page_table/pte_iterator.rs

@@ -0,0 +1,177 @@
+use super::{PageAttribute as _, PagingMode, RawPageTable as _, PTE};
+use crate::{
+    address::{AddrOps as _, VRange},
+    paging::{Page, PageAccess, PageAlloc, PAGE_SIZE},
+};
+use core::marker::PhantomData;
+
+pub struct KernelIterator;
+pub struct UserIterator;
+
+pub trait IteratorType<M: PagingMode> {
+    fn page_table_attributes() -> <M::Entry as PTE>::Attr;
+
+    fn get_page_table<'a, A, X>(pte: &mut M::Entry) -> M::RawTable<'a>
+    where
+        A: PageAlloc,
+        X: PageAccess,
+    {
+        let attr = pte.get_attr();
+
+        if attr.is_present() {
+            let pfn = pte.get_pfn();
+            unsafe {
+                // SAFETY: We are creating a pointer to a page referenced to in
+                //         some page table, which should be valid.
+                let page_table_ptr = X::get_ptr_for_pfn(pfn);
+                // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+                M::RawTable::from_ptr(page_table_ptr)
+            }
+        } else {
+            let page = Page::<A>::alloc();
+            let page_table_ptr = X::get_ptr_for_page(&page);
+
+            unsafe {
+                // SAFETY: `page_table_ptr` is good for writing and properly aligned.
+                page_table_ptr.write_bytes(0, 1);
+            }
+
+            pte.set(page.into_raw(), Self::page_table_attributes());
+
+            unsafe {
+                // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+                M::RawTable::from_ptr(page_table_ptr)
+            }
+        }
+    }
+}
+
+pub struct PageTableIterator<'a, M, A, X, K>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+    K: IteratorType<M>,
+{
+    remaining: usize,
+
+    indicies: [u16; 8],
+    tables: [Option<M::RawTable<'a>>; 8],
+
+    _phantom: PhantomData<&'a (A, X, K)>,
+}
+
+impl<'a, M, A, X, K> PageTableIterator<'a, M, A, X, K>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+    K: IteratorType<M>,
+{
+    fn parse_tables_starting_from(&mut self, idx_level: usize) {
+        let levels_len = M::LEVELS.len();
+
+        for (idx, &pt_idx) in self
+            .indicies
+            .iter()
+            .enumerate()
+            .take(levels_len - 1)
+            .skip(idx_level)
+        {
+            let [parent_table, child_table] = unsafe {
+                // SAFETY: `idx` and `idx + 1` must not overlap.
+                //         `idx + 1` is always less than `levels_len` since we iterate
+                //         until `levels_len - 1`.
+                self.tables.get_disjoint_unchecked_mut([idx, idx + 1])
+            };
+            let parent_table = parent_table.as_mut().expect("Parent table is None");
+            let next_pte = parent_table.index_mut(pt_idx);
+            child_table.replace(K::get_page_table::<A, X>(next_pte));
+        }
+    }
+
+    pub fn new(page_table: M::RawTable<'a>, range: VRange) -> Self {
+        let start = range.start().floor();
+        let end = range.end().ceil();
+
+        let mut me = Self {
+            remaining: (end - start) / PAGE_SIZE,
+            indicies: [0; 8],
+            tables: [const { None }; 8],
+            _phantom: PhantomData,
+        };
+
+        for (i, level) in M::LEVELS.iter().enumerate() {
+            me.indicies[i] = level.index_of(start);
+        }
+
+        me.tables[0] = Some(page_table);
+        me.parse_tables_starting_from(0);
+
+        me
+    }
+}
+
+impl<'a, M, A, X, K> Iterator for PageTableIterator<'a, M, A, X, K>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+    K: IteratorType<M>,
+{
+    type Item = &'a mut M::Entry;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.remaining == 0 {
+            return None;
+        } else {
+            self.remaining -= 1;
+        }
+
+        let len_levels = M::LEVELS.len();
+
+        let retval = self.tables[len_levels - 1]
+            .as_mut()
+            .unwrap()
+            .index_mut(self.indicies[len_levels - 1]);
+
+        let idx_level_start_updating = M::LEVELS
+            .iter()
+            .zip(self.indicies.iter_mut())
+            .enumerate()
+            .rev()
+            .skip_while(|(_, (level, idx))| **idx == level.max_index())
+            .map(|(i, _)| i)
+            .next()
+            .expect("Index out of bounds");
+
+        self.indicies[idx_level_start_updating] += 1;
+        self.indicies[idx_level_start_updating + 1..len_levels].fill(0);
+        self.parse_tables_starting_from(idx_level_start_updating);
+
+        Some(retval)
+    }
+}
+
+impl<M: PagingMode> IteratorType<M> for KernelIterator {
+    fn page_table_attributes() -> <M::Entry as PTE>::Attr {
+        <M::Entry as PTE>::Attr::new()
+            .present(true)
+            .write(true)
+            .execute(true)
+            .global(true)
+    }
+}
+
+impl<M: PagingMode> IteratorType<M> for UserIterator {
+    fn page_table_attributes() -> <M::Entry as PTE>::Attr {
+        <M::Entry as PTE>::Attr::new()
+            .present(true)
+            .write(true)
+            .execute(true)
+            .user(true)
+    }
+}

+ 9 - 0
crates/eonix_mm/src/paging.rs

@@ -0,0 +1,9 @@
+mod page;
+mod page_alloc;
+mod pfn;
+mod raw_page;
+
+pub use page::{Page, PageAccess, PageBlock, PAGE_SIZE, PAGE_SIZE_BITS};
+pub use page_alloc::PageAlloc;
+pub use pfn::PFN;
+pub use raw_page::{PageFlags, RawPage, RawPagePtr};

+ 219 - 0
crates/eonix_mm/src/paging/page.rs

@@ -0,0 +1,219 @@
+use super::{raw_page::RawPagePtr, PageAlloc, PFN};
+use crate::address::{AddrRange, PAddr};
+use core::{fmt, marker::PhantomData, mem::ManuallyDrop, ptr::NonNull, sync::atomic::Ordering};
+
+pub const PAGE_SIZE: usize = 4096;
+pub const PAGE_SIZE_BITS: u32 = PAGE_SIZE.trailing_zeros();
+
+/// A block of memory that is aligned to the page size and can be used for
+/// page-aligned allocations.
+///
+/// This is used to ensure that the memory is properly aligned to the page size.
+#[allow(dead_code)]
+#[repr(align(4096))]
+pub struct PageBlock([u8; PAGE_SIZE]);
+
+/// A trait that provides the kernel access to the page.
+pub trait PageAccess {
+    /// Returns a kernel-accessible pointer to the page referenced by the given
+    /// physical frame number.
+    ///
+    /// # Safety
+    /// This function is unsafe because calling it with a non-existent
+    /// PFN will cause undefined behavior.
+    unsafe fn get_ptr_for_pfn(pfn: PFN) -> NonNull<PageBlock>;
+
+    /// Returns a kernel-accessible pointer to the given page.
+    fn get_ptr_for_page<A: PageAlloc>(page: &Page<A>) -> NonNull<PageBlock> {
+        unsafe {
+            // SAFETY: `page.pfn()` is guaranteed to be valid.
+            Self::get_ptr_for_pfn(page.pfn())
+        }
+    }
+}
+
+/// A Page allocated in allocator `A`.
+#[derive(PartialEq, Eq, PartialOrd, Ord)]
+pub struct Page<A: PageAlloc> {
+    raw_page: RawPagePtr,
+    _phantom: PhantomData<A>,
+}
+
+unsafe impl<A: PageAlloc> Send for Page<A> {}
+unsafe impl<A: PageAlloc> Sync for Page<A> {}
+
+impl<A: PageAlloc> Page<A> {
+    /// Allocate a page of the given *order*.
+    pub fn alloc_order(order: u32) -> Self {
+        Self {
+            raw_page: A::alloc_order(order).expect("Out of memory"),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Allocate exactly one page.
+    pub fn alloc() -> Self {
+        Self {
+            raw_page: A::alloc().expect("Out of memory"),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Allocate a contiguous block of pages that can contain at least `count` pages.
+    pub fn alloc_at_least(count: usize) -> Self {
+        Self {
+            raw_page: A::alloc_at_least(count).expect("Out of memory"),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Whether we are the only owner of the page.
+    pub fn is_exclusive(&self) -> bool {
+        self.raw_page.refcount().load(Ordering::Acquire) == 1
+    }
+
+    /// Returns the *order* of the page, which is the log2 of the number of pages
+    /// contained in the page object.
+    pub fn order(&self) -> u32 {
+        self.raw_page.order()
+    }
+
+    /// Returns the total size of the page in bytes.
+    pub fn len(&self) -> usize {
+        1 << (self.order() + PAGE_SIZE_BITS)
+    }
+
+    /// Acquire the ownership of the page pointed to by `pfn`, leaving `refcount` untouched.
+    ///
+    /// # Safety
+    /// This function is unsafe because it assumes that the caller has ensured that
+    /// `pfn` points to a valid page allocated through `alloc_order()` and that the
+    /// page has not been freed or deallocated yet.
+    ///
+    /// No checks are done. Any violation of this assumption may lead to undefined behavior.
+    pub unsafe fn from_raw_unchecked(pfn: PFN) -> Self {
+        Self {
+            raw_page: RawPagePtr::from(pfn),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Acquire the ownership of the page pointed to by `pfn`, leaving `refcount` untouched.
+    ///
+    /// This function is a safe wrapper around `from_raw_unchecked()` that performs basic
+    /// checks to ensure that the page is valid and managed by the allocator.
+    ///
+    /// # Panic
+    /// This function will panic if the page is not valid or if the page is not managed by
+    /// the allocator.
+    ///
+    /// # Safety
+    /// This function is unsafe because it assumes that the caller has ensured that
+    /// `pfn` points to an existing page (A.K.A. inside the global page array) and the
+    /// page will not be freed or deallocated during the call.
+    pub unsafe fn from_raw(pfn: PFN) -> Self {
+        unsafe {
+            // SAFETY: The caller guarantees that the page is inside the global page array.
+            assert!(A::has_management_over(RawPagePtr::from(pfn)));
+
+            // SAFETY: We've checked the validity of the page, and the caller guarantees
+            //         that the page will not be freed or deallocated during the call.
+            Self::from_raw_unchecked(pfn)
+        }
+    }
+
+    /// Do some work with the page without touching the reference count with the same
+    /// restrictions as `from_raw()`.
+    ///
+    /// # Safety
+    /// Check `from_raw()` for the safety requirements.
+    pub unsafe fn with_raw<F, O>(pfn: PFN, func: F) -> O
+    where
+        F: FnOnce(&Self) -> O,
+    {
+        unsafe {
+            let me = ManuallyDrop::new(Self::from_raw(pfn));
+            func(&me)
+        }
+    }
+
+    /// Do some work with the page without touching the reference count with the same
+    /// restrictions as `from_raw_unchecked()`.
+    ///
+    /// # Safety
+    /// Check `from_raw_unchecked()` for the safety requirements.
+    pub unsafe fn with_raw_unchecked<F, O>(pfn: PFN, func: F) -> O
+    where
+        F: FnOnce(&Self) -> O,
+    {
+        unsafe {
+            let me = ManuallyDrop::new(Self::from_raw_unchecked(pfn));
+            func(&me)
+        }
+    }
+
+    /// Consumes the `Page` and returns the physical frame number without dropping
+    /// the reference count the page holds.
+    pub fn into_raw(self) -> PFN {
+        let me = ManuallyDrop::new(self);
+        me.pfn()
+    }
+
+    /// Returns the physical frame number of the page, which is aligned with the
+    /// page size and valid.
+    pub fn pfn(&self) -> PFN {
+        PFN::from(self.raw_page)
+    }
+
+    /// Returns the start physical address of the page, which is guaranteed to be
+    /// aligned to the page size and valid.
+    pub fn start(&self) -> PAddr {
+        PAddr::from(self.pfn())
+    }
+
+    /// Returns the physical address range of the page, which is guaranteed to be
+    /// aligned to the page size and valid.
+    pub fn range(&self) -> AddrRange<PAddr> {
+        AddrRange::from(self.start()).grow(self.len())
+    }
+}
+
+impl<A: PageAlloc> Clone for Page<A> {
+    fn clone(&self) -> Self {
+        // SAFETY: The memory order can be `Relaxed` here for the same reason as
+        // in the copy constructor of `std::shared_ptr`.
+        self.raw_page.refcount().fetch_add(1, Ordering::Relaxed);
+
+        Self {
+            raw_page: self.raw_page,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<A: PageAlloc> Drop for Page<A> {
+    fn drop(&mut self) {
+        match self.raw_page.refcount().fetch_sub(1, Ordering::AcqRel) {
+            0 => panic!("Refcount for an in-use page is 0"),
+            1 => unsafe {
+                // SAFETY: `self.raw_page` points to a valid page inside the global page array.
+                assert!(A::has_management_over(self.raw_page));
+
+                // SAFETY: `self.raw_page` is managed by the allocator and we're dropping the page.
+                A::dealloc(self.raw_page)
+            },
+            _ => {}
+        }
+    }
+}
+
+impl<A: PageAlloc> fmt::Debug for Page<A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "Page({:?}, order={})",
+            usize::from(PFN::from(self.raw_page)),
+            self.order()
+        )
+    }
+}
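
A sketch (not in the commit) of the ownership rules on `Page<A>`, with `BuddyAllocator` standing in for `A` and assuming the allocator has already been initialized via `create_pages`:

use buddy_allocator::BuddyAllocator;
use eonix_mm::paging::Page;

fn page_ownership_demo() {
    // Each `Page` owns one reference; cloning bumps the refcount.
    let page: Page<BuddyAllocator> = Page::alloc();
    assert!(page.is_exclusive());

    let shared = page.clone();
    assert!(!page.is_exclusive());
    drop(shared);

    // `into_raw()` leaks the reference as a PFN; `from_raw()` takes it back.
    let pfn = page.into_raw();
    let page = unsafe { Page::<BuddyAllocator>::from_raw(pfn) };

    // Dropping the last reference hands the page back to the allocator.
    drop(page);
}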

+ 31 - 0
crates/eonix_mm/src/paging/page_alloc.rs

@@ -0,0 +1,31 @@
+use super::raw_page::RawPagePtr;
+
+pub trait PageAlloc: Sized {
+    /// Allocate a page of the given *order*.
+    fn alloc_order(order: u32) -> Option<RawPagePtr>;
+
+    /// Allocate exactly one page.
+    fn alloc() -> Option<RawPagePtr> {
+        Self::alloc_order(0)
+    }
+
+    /// Allocate a contiguous block of pages that can contain at least `count` pages.
+    fn alloc_at_least(count: usize) -> Option<RawPagePtr> {
+        let order = count.next_power_of_two().trailing_zeros();
+        Self::alloc_order(order)
+    }
+
+    /// Deallocate a page.
+    ///
+    /// # Safety
+    /// This function is unsafe because it assumes that the caller has ensured that
+    /// `page` is allocated in this allocator and never used after this call.
+    unsafe fn dealloc(page_ptr: RawPagePtr);
+
+    /// Check whether the page is allocated and managed by the allocator.
+    ///
+    /// # Safety
+    /// This function is unsafe because it assumes that the caller has ensured that
+    /// `page_ptr` points to a raw page inside the global page array.
+    unsafe fn has_management_over(page_ptr: RawPagePtr) -> bool;
+}
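
The rounding performed by `alloc_at_least` above, shown with plain integers (not part of the commit):

fn order_for(count: usize) -> u32 {
    // Round `count` up to a power of two, then take log2 to get the order.
    count.next_power_of_two().trailing_zeros()
}

fn main() {
    assert_eq!(order_for(1), 0); // 1 page  -> order 0
    assert_eq!(order_for(3), 2); // 3 pages -> rounded to 4, order 2
    assert_eq!(order_for(8), 3); // 8 pages -> order 3
}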

+ 65 - 0
crates/eonix_mm/src/paging/pfn.rs

@@ -0,0 +1,65 @@
+use crate::address::{Addr as _, PAddr};
+use core::{
+    fmt,
+    ops::{Add, Sub},
+};
+
+use super::PAGE_SIZE_BITS;
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+pub struct PFN(usize);
+
+impl From<PFN> for usize {
+    fn from(v: PFN) -> Self {
+        v.0
+    }
+}
+
+impl From<usize> for PFN {
+    fn from(v: usize) -> Self {
+        Self(v)
+    }
+}
+
+impl Sub for PFN {
+    type Output = usize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+impl Sub<usize> for PFN {
+    type Output = Self;
+
+    fn sub(self, rhs: usize) -> Self::Output {
+        PFN(self.0 - rhs)
+    }
+}
+
+impl Add<usize> for PFN {
+    type Output = Self;
+
+    fn add(self, rhs: usize) -> Self::Output {
+        PFN(self.0 + rhs)
+    }
+}
+
+impl fmt::Debug for PFN {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "PFN({:#x})", self.0)
+    }
+}
+
+impl From<PAddr> for PFN {
+    fn from(paddr: PAddr) -> Self {
+        Self(paddr.addr() >> PAGE_SIZE_BITS)
+    }
+}
+
+impl PFN {
+    pub const fn from_val(pfn: usize) -> Self {
+        Self(pfn)
+    }
+}

+ 97 - 0
crates/eonix_mm/src/paging/raw_page.rs

@@ -0,0 +1,97 @@
+use super::PFN;
+use core::{
+    ptr::NonNull,
+    sync::atomic::{AtomicU32, AtomicUsize, Ordering},
+};
+use intrusive_list::Link;
+
+const PAGE_ARRAY: NonNull<RawPage> =
+    unsafe { NonNull::new_unchecked(0xffffff8040000000 as *mut _) };
+
+pub struct PageFlags(AtomicU32);
+
+pub struct RawPage {
+    /// This can be used for LRU page swap in the future.
+    ///
+    /// Now only used for free page links in the buddy system.
+    pub link: Link,
+    /// # Safety
+    /// This field is only used in buddy system and is protected by the global lock.
+    pub order: u32,
+    pub flags: PageFlags,
+    pub refcount: AtomicUsize,
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct RawPagePtr(NonNull<RawPage>);
+
+impl PageFlags {
+    pub const PRESENT: u32 = 1 << 0;
+    // pub const LOCKED: u32 = 1 << 1;
+    pub const BUDDY: u32 = 1 << 2;
+    // pub const SLAB: u32 = 1 << 3;
+    // pub const DIRTY: u32 = 1 << 4;
+    pub const FREE: u32 = 1 << 5;
+    pub const LOCAL: u32 = 1 << 6;
+
+    pub fn has(&self, flag: u32) -> bool {
+        (self.0.load(Ordering::Relaxed) & flag) == flag
+    }
+
+    pub fn set(&self, flag: u32) {
+        self.0.fetch_or(flag, Ordering::Relaxed);
+    }
+
+    pub fn clear(&self, flag: u32) {
+        self.0.fetch_and(!flag, Ordering::Relaxed);
+    }
+}
+
+impl RawPagePtr {
+    pub const fn new(ptr: NonNull<RawPage>) -> Self {
+        Self(ptr)
+    }
+
+    pub const fn as_ptr(self) -> *mut RawPage {
+        self.0.as_ptr()
+    }
+
+    pub const fn as_ref<'a>(self) -> &'a RawPage {
+        unsafe { &*self.as_ptr() }
+    }
+
+    pub const fn as_mut<'a>(self) -> &'a mut RawPage {
+        unsafe { &mut *self.as_ptr() }
+    }
+
+    pub const fn order(&self) -> u32 {
+        self.as_ref().order
+    }
+
+    pub const fn flags(&self) -> &PageFlags {
+        &self.as_ref().flags
+    }
+
+    pub const fn refcount(&self) -> &AtomicUsize {
+        &self.as_ref().refcount
+    }
+
+    pub const fn offset(&self, count: usize) -> Self {
+        let new_raw_ptr = unsafe { self.0.add(count) };
+        Self::new(new_raw_ptr)
+    }
+}
+
+impl From<RawPagePtr> for PFN {
+    fn from(value: RawPagePtr) -> Self {
+        let idx = unsafe { value.as_ptr().offset_from(PAGE_ARRAY.as_ptr()) as usize };
+        Self::from(idx)
+    }
+}
+
+impl From<PFN> for RawPagePtr {
+    fn from(pfn: PFN) -> Self {
+        let raw_page_ptr = unsafe { PAGE_ARRAY.add(usize::from(pfn)) };
+        Self::new(raw_page_ptr)
+    }
+}
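
The `PFN`/`RawPagePtr` conversions are pure pointer arithmetic against the global `PAGE_ARRAY`, so they round-trip. A sketch, assuming both types are re-exported from `eonix_mm::paging`:

```rust
use eonix_mm::paging::{RawPagePtr, PFN};

/// Look up the metadata entry for a physical frame.
fn frame_metadata(pfn: PFN) -> RawPagePtr {
    let page_ptr = RawPagePtr::from(pfn);
    // Converting back recovers the same frame number.
    debug_assert_eq!(PFN::from(page_ptr), pfn);
    page_ptr
}
```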

+ 12 - 0
crates/eonix_percpu/Cargo.toml

@@ -0,0 +1,12 @@
+[package]
+name = "eonix_percpu"
+version = "0.1.0"
+edition = "2024"
+
+[lib]
+proc-macro = true
+
+[dependencies]
+proc-macro2 = "1.0"
+quote = "1.0"
+syn = { version = "2.0", features = ["full"] }

+ 24 - 0
crates/eonix_percpu/src/arch.rs

@@ -0,0 +1,24 @@
+use proc_macro2::TokenStream;
+use quote::quote;
+use syn::{Ident, Type};
+
+/// Get a pointer to the given percpu variable for the current CPU.
+pub fn get_percpu_pointer(percpu: &Ident, ty: &Type) -> TokenStream {
+    quote! {
+        {
+            #[cfg(target_arch = "x86_64")]
+            {
+                let base: *mut #ty;
+                ::core::arch::asm!(
+                    "mov %gs:0, {address}",
+                    "add ${percpu_pointer}, {address}",
+                    percpu_pointer = sym #percpu,
+                    address = out(reg) base,
+                    options(att_syntax)
+                );
+                base
+            }
+        }
+    }
+    .into()
+}

+ 181 - 0
crates/eonix_percpu/src/lib.rs

@@ -0,0 +1,181 @@
+extern crate proc_macro;
+
+use proc_macro::TokenStream;
+use quote::{format_ident, quote};
+use syn::{parse_macro_input, ItemStatic};
+
+mod arch;
+
+#[proc_macro_attribute]
+pub fn define_percpu(attrs: TokenStream, item: TokenStream) -> TokenStream {
+    if !attrs.is_empty() {
+        panic!("`define_percpu` attribute does not take any arguments");
+    }
+
+    let item = parse_macro_input!(item as ItemStatic);
+    let vis = &item.vis;
+    let ident = &item.ident;
+    let ty = &item.ty;
+    let expr = &item.expr;
+
+    let is_bool = quote!(#ty).to_string().as_str() == "bool";
+    let is_integer =
+        ["u8", "u16", "u32", "u64", "usize"].contains(&quote!(#ty).to_string().as_str());
+
+    let is_atomic_like = is_bool || is_integer || quote!(#ty).to_string().contains("NonNull");
+
+    let inner_ident = format_ident!("_percpu_inner_{}", ident);
+    let access_ident = format_ident!("_access_{}", ident);
+
+    let integer_methods = if is_integer {
+        quote! {
+            pub fn add(&self, value: #ty) {
+                *unsafe { self.as_mut() } += value;
+            }
+
+            pub fn sub(&self, value: #ty) {
+                *unsafe { self.as_mut() } -= value;
+            }
+        }
+    } else {
+        quote! {}
+    };
+
+    let preempt_disable = if !is_atomic_like {
+        quote! { eonix_preempt::disable(); }
+    } else {
+        quote! {}
+    };
+
+    let preempt_enable = if !is_atomic_like {
+        quote! { eonix_preempt::enable(); }
+    } else {
+        quote! {}
+    };
+
+    let as_ptr = arch::get_percpu_pointer(&inner_ident, &ty);
+
+    quote! {
+        #[link_section = ".percpu"]
+        #[allow(non_upper_case_globals)]
+        static mut #inner_ident: #ty = #expr;
+        #[allow(non_camel_case_types)]
+        #vis struct #access_ident;
+        #vis static #ident: #access_ident = #access_ident;
+
+        impl #access_ident {
+            /// # Safety
+            /// This function is unsafe because it allows for mutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_ptr(&self) -> *mut #ty {
+                #as_ptr
+            }
+
+            pub fn get(&self) -> #ty {
+                #preempt_disable
+                let value = unsafe { self.as_ptr().read() };
+                #preempt_enable
+                value
+            }
+
+            pub fn set(&self, value: #ty) {
+                #preempt_disable
+                unsafe { self.as_ptr().write(value) }
+                #preempt_enable
+            }
+
+            pub fn swap(&self, mut value: #ty) -> #ty {
+                #preempt_disable
+                unsafe { self.as_ptr().swap(&mut value) }
+                #preempt_enable
+                value
+            }
+
+            /// # Safety
+            /// This function is unsafe because it allows for immutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_ref(&self) -> & #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                self.as_ptr().as_ref().unwrap()
+            }
+
+            /// # Safety
+            /// This function is unsafe because it allows for mutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_mut(&self) -> &mut #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                self.as_ptr().as_mut().unwrap()
+            }
+
+            #integer_methods
+        }
+    }
+    .into()
+}
+
+#[proc_macro_attribute]
+pub fn define_percpu_shared(attrs: TokenStream, item: TokenStream) -> TokenStream {
+    if !attrs.is_empty() {
+        panic!("`define_percpu_shared` attribute does not take any arguments");
+    }
+
+    let item = parse_macro_input!(item as ItemStatic);
+    let vis = &item.vis;
+    let ident = &item.ident;
+    let ty = &item.ty;
+    let expr = &item.expr;
+
+    let inner_ident = format_ident!("_percpu_shared_inner_{}", ident);
+    let access_ident = format_ident!("_access_shared_{}", ident);
+
+    let as_ptr = arch::get_percpu_pointer(&inner_ident, &ty);
+
+    quote! {
+        #[link_section = ".percpu"]
+        #[allow(non_upper_case_globals)]
+        static #inner_ident: #ty = #expr;
+        #[allow(non_camel_case_types)]
+        #vis struct #access_ident;
+        #vis static #ident: #access_ident = #access_ident;
+
+        impl #access_ident {
+            fn as_ptr(&self) -> *const #ty {
+                unsafe { ( #as_ptr ) }
+            }
+
+            pub fn get_ref(&self) -> & #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                unsafe { self.as_ptr().as_ref().unwrap() }
+            }
+
+            pub fn get_for_cpu(&self, cpuid: usize) -> Option<& #ty > {
+                let offset = & #inner_ident as *const _ as usize;
+                let base = ::arch::PercpuArea::get_for(cpuid);
+                base.map(|base| unsafe { base.byte_add(offset).cast().as_ref() })
+            }
+        }
+
+        impl ::core::ops::Deref for #access_ident {
+            type Target = #ty;
+
+            fn deref(&self) -> &Self::Target {
+                self.get_ref()
+            }
+        }
+
+        impl<T> ::core::convert::AsRef<T> for #access_ident
+        where
+            <Self as ::core::ops::Deref>::Target: ::core::convert::AsRef<T>,
+        {
+            fn as_ref(&self) -> &T {
+                use ::core::ops::Deref;
+
+                self.deref().as_ref()
+            }
+        }
+    }
+    .into()
+}
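
A sketch of how the attribute is meant to be used from kernel code. It assumes the `.percpu` linker section and the `%gs` base have already been set up by the architecture init code and that `eonix_preempt` is linked in.

```rust
use eonix_percpu::define_percpu;

#[define_percpu]
static TICKS: usize = 0;

fn on_timer_tick() {
    // Integer variables get `add`/`sub` on top of `get`/`set`/`swap`, and
    // accesses to such "atomic-like" types skip the preempt guard entirely.
    TICKS.add(1);
    let _seen = TICKS.get();
}
```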

+ 0 - 1
crates/eonix_runtime/Cargo.toml

@@ -8,7 +8,6 @@ arch = { path = "../../arch" }
 atomic_unique_refcell = { path = "../atomic_unique_refcell" }
 eonix_log = { path = "../eonix_log" }
 eonix_preempt = { path = "../eonix_preempt" }
-eonix_spin_irq = { path = "../eonix_spin_irq" }
 eonix_sync = { path = "../eonix_sync" }
 pointers = { path = "../pointers" }
 

+ 1 - 1
crates/eonix_runtime/src/executor/builder.rs

@@ -48,7 +48,7 @@ where
         let mut execution_context = ExecutionContext::new();
         let output_handle = OutputHandle::new();
 
-        execution_context.set_sp(stack.get_bottom() as *const _ as _);
+        execution_context.set_sp(stack.get_bottom().addr().get() as _);
 
         let executor = Box::pin(RealExecutor {
             _stack: stack,

+ 3 - 1
crates/eonix_runtime/src/executor/stack.rs

@@ -1,4 +1,6 @@
+use core::ptr::NonNull;
+
 pub trait Stack: Sized + Send {
     fn new() -> Self;
-    fn get_bottom(&self) -> &();
+    fn get_bottom(&self) -> NonNull<()>;
 }

+ 1 - 2
crates/eonix_runtime/src/scheduler.rs

@@ -14,7 +14,6 @@ use core::{
 };
 use eonix_log::println_trace;
 use eonix_preempt::assert_preempt_count_eq;
-use eonix_spin_irq::SpinIrq as _;
 use eonix_sync::{LazyLock, Spin};
 use intrusive_collections::RBTree;
 use pointers::BorrowedArc;
@@ -92,7 +91,7 @@ impl Scheduler {
             // SAFETY: Preemption is disabled.
             let context: &mut ExecutionContext = LOCAL_SCHEDULER_CONTEXT.as_mut();
             context.set_ip(local_scheduler as _);
-            context.set_sp(stack.get_bottom() as *const _ as usize);
+            context.set_sp(stack.get_bottom().addr().get() as usize);
             eonix_preempt::enable();
         }
 

+ 0 - 8
crates/eonix_spin_irq/Cargo.toml

@@ -1,8 +0,0 @@
-[package]
-name = "eonix_spin_irq"
-version = "0.1.0"
-edition = "2024"
-
-[dependencies]
-arch = { path = "../../arch" }
-eonix_sync = { path = "../eonix_sync" }

+ 1 - 0
crates/eonix_sync/Cargo.toml

@@ -4,6 +4,7 @@ version = "0.1.0"
 edition = "2024"
 
 [dependencies]
+arch = { path = "../../arch" }
 eonix_preempt = { path = "../eonix_preempt" }
 intrusive-collections = "0.9.7"
 

+ 4 - 1
crates/eonix_sync/src/lib.rs

@@ -14,7 +14,10 @@ pub use lazy_lock::LazyLock;
 pub use locked::{AsProof, AsProofMut, Locked, Proof, ProofMut};
 pub use mutex::{Mutex, MutexGuard};
 pub use rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-pub use spin::{LoopRelax, Relax, Spin, SpinGuard, SpinRelax, UnlockedSpinGuard};
+pub use spin::{
+    LoopRelax, Relax, Spin, SpinGuard, SpinIrqGuard, SpinRelax, UnlockedSpinGuard,
+    UnlockedSpinIrqGuard,
+};
 pub use wait_list::WaitList;
 
 extern crate alloc;

+ 14 - 0
crates/eonix_sync/src/spin.rs

@@ -1,14 +1,17 @@
 mod guard;
 mod relax;
+mod spin_irq;
 
 use core::{
     cell::UnsafeCell,
     marker::PhantomData,
     sync::atomic::{AtomicBool, Ordering},
 };
+use spin_irq::IrqStateGuard;
 
 pub use guard::{SpinGuard, UnlockedSpinGuard};
 pub use relax::{LoopRelax, Relax, SpinRelax};
+pub use spin_irq::{SpinIrqGuard, UnlockedSpinIrqGuard};
 
 //// A spinlock is a lock that uses busy-waiting to acquire the lock.
 /// It is useful for short critical sections where the overhead of a context switch
@@ -66,6 +69,17 @@ where
         }
     }
 
+    pub fn lock_irq(&self) -> SpinIrqGuard<'_, T, R> {
+        let irq_state = arch::disable_irqs_save();
+        let guard = self.lock();
+
+        SpinIrqGuard {
+            guard,
+            irq_state: IrqStateGuard::new(irq_state),
+            _not_send: PhantomData,
+        }
+    }
+
     pub fn get_mut(&mut self) -> &mut T {
         // SAFETY: The exclusive access to the lock is guaranteed by the borrow checker.
         unsafe { &mut *self.value.get() }
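
A sketch of the intended call pattern for data shared with an IRQ handler, assuming `SpinIrqGuard` keeps the `Deref`/`DerefMut` impls it had in the old `eonix_spin_irq` crate; the `IrqSharedCounter` type is made up for illustration.

```rust
use eonix_sync::Spin;

struct IrqSharedCounter {
    events: Spin<u64>,
}

impl IrqSharedCounter {
    fn record_event(&self) {
        // `lock_irq()` saves and disables local IRQs before spinning and
        // returns a `SpinIrqGuard`; dropping the guard unlocks and then
        // restores the saved IRQ state, so an IRQ handler touching `events`
        // cannot deadlock against this path.
        *self.events.lock_irq() += 1;
    }
}
```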

+ 6 - 34
crates/eonix_spin_irq/src/lib.rs → crates/eonix_sync/src/spin/spin_irq.rs

@@ -1,34 +1,23 @@
-#![no_std]
-
+use super::{Relax, SpinGuard, SpinRelax, UnlockedSpinGuard};
+use crate::{marker::NotSend, UnlockableGuard, UnlockedGuard};
 use core::{
     marker::PhantomData,
     mem::ManuallyDrop,
     ops::{Deref, DerefMut},
 };
-use eonix_sync::{
-    marker::NotSend, Relax, Spin, SpinGuard, SpinRelax, UnlockableGuard, UnlockedGuard,
-    UnlockedSpinGuard,
-};
-
-pub trait SpinIrq<T, R = SpinRelax>
-where
-    T: ?Sized,
-{
-    fn lock_irq(&self) -> SpinIrqGuard<'_, T, R>;
-}
 
-struct IrqStateGuard(ManuallyDrop<arch::IrqState>);
+pub(super) struct IrqStateGuard(ManuallyDrop<arch::IrqState>);
 
 pub struct SpinIrqGuard<'a, T, R = SpinRelax>
 where
     T: ?Sized,
 {
-    guard: SpinGuard<'a, T, R>,
-    irq_state: IrqStateGuard,
+    pub(super) guard: SpinGuard<'a, T, R>,
+    pub(super) irq_state: IrqStateGuard,
     /// We don't want this to be `Send` because we don't want to allow the guard to be
     /// transferred to another thread since we have disabled the preemption and saved
     /// IRQ states on the local cpu.
-    _not_send: PhantomData<NotSend>,
+    pub(super) _not_send: PhantomData<NotSend>,
 }
 
 pub struct UnlockedSpinIrqGuard<'a, T, R>
@@ -43,23 +32,6 @@ where
 //         we can access the guard from multiple threads.
 unsafe impl<T, R> Sync for SpinIrqGuard<'_, T, R> where T: ?Sized + Sync {}
 
-impl<T, R> SpinIrq<T, R> for Spin<T, R>
-where
-    T: ?Sized,
-    R: Relax,
-{
-    fn lock_irq(&self) -> SpinIrqGuard<'_, T, R> {
-        let irq_state = arch::disable_irqs_save();
-        let guard = self.lock();
-
-        SpinIrqGuard {
-            guard,
-            irq_state: IrqStateGuard::new(irq_state),
-            _not_send: PhantomData,
-        }
-    }
-}
-
 impl IrqStateGuard {
     pub const fn new(irq_state: arch::IrqState) -> Self {
         Self(ManuallyDrop::new(irq_state))

+ 7 - 4
crates/eonix_sync/src/wait_list.rs

@@ -9,6 +9,9 @@ use wait_object::{WaitObject, WaitObjectAdapter};
 pub use wait_handle::WaitHandle;
 
 pub struct WaitList {
+    /// # Lock
+    /// `WaitList`s might be used in IRQ handlers, so `lock_irq` should
+    /// be used on `waiters`.
     waiters: LazyLock<Spin<LinkedList<WaitObjectAdapter>>>,
 }
 
@@ -20,11 +23,11 @@ impl WaitList {
     }
 
     pub fn has_waiters(&self) -> bool {
-        !self.waiters.lock().is_empty()
+        !self.waiters.lock_irq().is_empty()
     }
 
     pub fn notify_one(&self) -> bool {
-        let mut waiters = self.waiters.lock();
+        let mut waiters = self.waiters.lock_irq();
         let mut waiter = waiters.front_mut();
 
         if !waiter.is_null() {
@@ -40,7 +43,7 @@ impl WaitList {
     }
 
     pub fn notify_all(&self) -> usize {
-        let mut waiters = self.waiters.lock();
+        let mut waiters = self.waiters.lock_irq();
         let mut waiter = waiters.front_mut();
         let mut count = 0;
 
@@ -83,7 +86,7 @@ impl WaitList {
     }
 
     pub(self) fn notify_waiter(&self, wait_object: &WaitObject) {
-        let mut waiters = self.waiters.lock();
+        let mut waiters = self.waiters.lock_irq();
         if !wait_object.on_list() {
             return;
         }

+ 2 - 2
crates/eonix_sync/src/wait_list/wait_handle.rs

@@ -96,7 +96,7 @@ impl<'a> WaitHandle<'a> {
 
         match *state {
             State::Init => {
-                let mut waiters = wait_list.waiters.lock();
+                let mut waiters = wait_list.waiters.lock_irq();
                 waiters.push_back(wait_object_ref);
 
                 if let Some(waker) = waker.cloned() {
@@ -206,7 +206,7 @@ impl Drop for WaitHandle<'_> {
             self.wait_until_off_list();
         } else {
             // Lock the list and try again.
-            let mut waiters = self.wait_list.waiters.lock();
+            let mut waiters = self.wait_list.waiters.lock_irq();
 
             if wait_object.on_list() {
                 let mut cursor = unsafe {

+ 7 - 3
crates/eonix_sync/src/wait_list/wait_object.rs

@@ -17,6 +17,10 @@ intrusive_adapter!(
 
 pub struct WaitObject {
     woken_up: AtomicBool,
+    /// The `waker` field is kept separate from its lock to save space: we want
+    /// the object to fit into a cacheline, and since `woken_up` takes only 1
+    /// byte, the remaining 7 bytes can accommodate the extra byte required for
+    /// a spinlock.
     waker_lock: Spin<()>,
     waker: UnsafeCell<Option<Waker>>,
     wait_list: AtomicPtr<WaitList>,
@@ -40,7 +44,7 @@ impl WaitObject {
     }
 
     pub fn save_waker(&self, waker: Waker) {
-        let _lock = self.waker_lock.lock();
+        let _lock = self.waker_lock.lock_irq();
         unsafe {
             // SAFETY: We're holding the waker lock.
             let old_waker = (*self.waker.get()).replace(waker);
@@ -53,7 +57,7 @@ impl WaitObject {
     /// # Returns
     /// Whether the waker was saved.
     pub fn save_waker_if_not_woken_up(&self, waker: &Waker) -> bool {
-        let _lock = self.waker_lock.lock();
+        let _lock = self.waker_lock.lock_irq();
         if self.woken_up() {
             return false;
         }
@@ -68,7 +72,7 @@ impl WaitObject {
     }
 
     pub fn take_waker(&self) -> Option<Waker> {
-        let _lock = self.waker_lock.lock();
+        let _lock = self.waker_lock.lock_irq();
         unsafe {
             // SAFETY: We're holding the waker lock.
             self.waker.get().as_mut().unwrap().take()

+ 6 - 0
crates/intrusive_list/Cargo.toml

@@ -0,0 +1,6 @@
+[package]
+name = "intrusive_list"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]

+ 59 - 0
crates/intrusive_list/src/lib.rs

@@ -0,0 +1,59 @@
+#![no_std]
+
+use core::ptr::NonNull;
+
+pub struct Link {
+    prev: Option<NonNull<Link>>,
+    next: Option<NonNull<Link>>,
+}
+
+impl Link {
+    pub const fn new() -> Self {
+        Self {
+            prev: None,
+            next: None,
+        }
+    }
+
+    pub fn insert(&mut self, node: &mut Self) {
+        unsafe {
+            let insert_node = NonNull::new(&raw mut *node);
+            if let Some(next) = self.next {
+                (*next.as_ptr()).prev = insert_node;
+            }
+            node.next = self.next;
+            node.prev = NonNull::new(&raw mut *self);
+            self.next = insert_node;
+        }
+    }
+
+    pub fn remove(&mut self) {
+        if let Some(next) = self.next {
+            unsafe { (*next.as_ptr()).prev = self.prev };
+        }
+
+        if let Some(prev) = self.prev {
+            unsafe { (*prev.as_ptr()).next = self.next };
+        }
+
+        self.prev = None;
+        self.next = None;
+    }
+
+    pub fn next(&self) -> Option<&Self> {
+        self.next.map(|node| unsafe { &*node.as_ptr() })
+    }
+
+    pub fn next_mut(&mut self) -> Option<&mut Self> {
+        self.next.map(|node| unsafe { &mut *node.as_ptr() })
+    }
+}
+
+#[macro_export]
+macro_rules! container_of {
+    ($ptr:expr, $type:ty, $($f:tt)*) => {{
+        let ptr = $ptr as *const _ as *const u8;
+        let offset: usize = ::core::mem::offset_of!($type, $($f)*);
+        ::core::ptr::NonNull::new_unchecked(ptr.sub(offset) as *mut $type)
+    }}
+}
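
A sketch of how `Link` and `container_of!` are meant to be combined, the same pattern the buddy allocator's free lists rely on; the `Frame` struct here is hypothetical.

```rust
use core::ptr::NonNull;
use intrusive_list::{container_of, Link};

struct Frame {
    order: u32,
    link: Link,
}

/// Recover the owning `Frame` from a reference to its embedded `Link`.
///
/// # Safety
/// `link` must be the `link` field of a live `Frame`.
unsafe fn frame_of(link: &Link) -> NonNull<Frame> {
    unsafe { container_of!(link, Frame, link) }
}
```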

+ 3 - 5
src/driver/ahci/command.rs

@@ -1,8 +1,6 @@
-use crate::prelude::*;
-
-use crate::kernel::mem::paging::Page;
-
 use super::bindings::EINVAL;
+use crate::kernel::mem::paging::Page;
+use crate::prelude::*;
 
 pub trait Command {
     fn pages(&self) -> &[Page];
@@ -22,7 +20,7 @@ pub struct IdentifyCommand {
 impl IdentifyCommand {
     pub fn new() -> Self {
         Self {
-            page: Page::alloc_one(),
+            page: Page::alloc(),
         }
     }
 }

+ 48 - 0
src/driver/ahci/command_table.rs

@@ -0,0 +1,48 @@
+use super::{command::Command, PRDTEntry, FISH2D};
+use crate::kernel::mem::{AsMemoryBlock as _, Page};
+use eonix_mm::address::PAddr;
+
+pub struct CommandTable<'a> {
+    page: Page,
+    command_fis: &'a mut FISH2D,
+
+    prdt: &'a mut [PRDTEntry; 248],
+    prdt_entries: Option<u16>,
+}
+
+impl CommandTable<'_> {
+    pub fn new() -> Self {
+        let page = Page::alloc();
+        let memory = page.as_memblk();
+
+        let (lhs, prdt) = memory.split_at(0x80);
+
+        let (command_fis, _) = lhs.split_at(size_of::<FISH2D>());
+        let command_fis = unsafe { command_fis.as_ptr().as_mut() };
+        let prdt = unsafe { prdt.as_ptr().as_mut() };
+
+        Self {
+            page,
+            command_fis,
+            prdt,
+            prdt_entries: None,
+        }
+    }
+
+    pub fn setup(&mut self, cmd: &impl Command) {
+        self.command_fis.setup(cmd.cmd(), cmd.lba(), cmd.count());
+        self.prdt_entries = Some(cmd.pages().len() as u16);
+
+        for (idx, page) in cmd.pages().iter().enumerate() {
+            self.prdt[idx].setup(page);
+        }
+    }
+
+    pub fn prdt_len(&self) -> u16 {
+        self.prdt_entries.unwrap()
+    }
+
+    pub fn base(&self) -> PAddr {
+        self.page.start()
+    }
+}

+ 11 - 8
src/driver/ahci/control.rs

@@ -1,6 +1,7 @@
-use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
-
 use super::{BitsIterator, GHC_IE};
+use crate::{kernel::mem::PhysAccess as _, sync::fence::memory_barrier};
+use core::ptr::NonNull;
+use eonix_mm::address::PAddr;
 
 /// An `AdapterControl` is an HBA device Global Host Control block
 ///
@@ -34,7 +35,7 @@ const CONTROL_IS: usize = 2;
 const CONTROL_PI: usize = 3;
 
 pub struct AdapterControl {
-    inner: *mut u32,
+    control_data: NonNull<u32>,
 }
 
 /// # Safety
@@ -42,25 +43,26 @@ pub struct AdapterControl {
 unsafe impl Send for AdapterControl {}
 
 impl AdapterControl {
-    pub fn new(addr: usize) -> Self {
+    pub fn new(addr: PAddr) -> Self {
         Self {
-            inner: NoCachePP::new(addr).as_ptr(),
+            control_data: unsafe { addr.as_ptr() },
         }
     }
 }
 
 impl AdapterControl {
     fn read(&self, off: usize) -> u32 {
-        unsafe { self.inner.offset(off as isize).read_volatile() }
+        unsafe { self.control_data.offset(off as isize).read_volatile() }
     }
 
     fn write(&self, off: usize, value: u32) {
-        unsafe { self.inner.offset(off as isize).write_volatile(value) }
+        unsafe { self.control_data.offset(off as isize).write_volatile(value) }
     }
 
     pub fn enable_interrupts(&self) {
         let ghc = self.read(CONTROL_GHC);
         self.write(CONTROL_GHC, ghc | GHC_IE);
+        memory_barrier();
     }
 
     pub fn implemented_ports(&self) -> BitsIterator {
@@ -72,6 +74,7 @@ impl AdapterControl {
     }
 
     pub fn clear_interrupt(&self, no: u32) {
-        self.write(CONTROL_IS, 1 << no)
+        self.write(CONTROL_IS, 1 << no);
+        memory_barrier();
     }
 }

+ 4 - 1
src/driver/ahci/defs.rs

@@ -1,6 +1,8 @@
 #![allow(dead_code)]
 
 use crate::kernel::mem::paging::Page;
+use eonix_mm::address::Addr as _;
+
 pub const VENDOR_INTEL: u16 = 0x8086;
 pub const DEVICE_AHCI: u16 = 0x2922;
 
@@ -51,6 +53,7 @@ pub const PORT_IS_ERROR: u32 =
 /// `clear_busy_upon_ok` and `bytes_transferred` are volatile
 ///
 #[repr(C)]
+#[derive(Clone, Copy)]
 pub struct CommandHeader {
     // [0:4]: Command FIS length
     // [5]: ATAPI
@@ -237,7 +240,7 @@ pub struct PRDTEntry {
 
 impl PRDTEntry {
     pub fn setup(&mut self, page: &Page) {
-        self.base = page.as_phys() as u64;
+        self.base = page.start().addr() as u64;
         self._reserved1 = 0;
 
         // The last bit MUST be set to 1 according to the AHCI spec

+ 63 - 65
src/driver/ahci/mod.rs

@@ -13,14 +13,21 @@ use bindings::{
     EIO,
 };
 use control::AdapterControl;
+use core::ptr::NonNull;
 use defs::*;
-use eonix_spin_irq::SpinIrq as _;
+use eonix_mm::address::{AddrOps as _, PAddr};
 use port::AdapterPort;
 
+pub(self) use register::Register;
+
 mod command;
+mod command_table;
 mod control;
 mod defs;
 mod port;
+mod register;
+pub(self) mod slot;
+mod stats;
 
 pub struct BitsIterator {
     data: u32,
@@ -53,70 +60,23 @@ impl Iterator for BitsIterator {
     }
 }
 
-fn vread<T: Sized + Copy>(refval: *const T) -> T {
-    unsafe { refval.read_volatile() }
-}
-
-fn vwrite<T: Sized + Copy>(refval: *mut T, val: T) {
-    unsafe { refval.write_volatile(val) }
-}
-
-#[allow(dead_code)]
-struct Device {
-    control_base: usize,
+struct Device<'a> {
+    control_base: PAddr,
     control: AdapterControl,
     // TODO: impl Drop to free pci device
-    pcidev: *mut pci_device,
+    pcidev: NonNull<pci_device>,
     /// # Lock
     /// Might be accessed from irq handler, use with `lock_irq()`
-    ports: Spin<[Option<Arc<AdapterPort>>; 32]>,
+    ports: Spin<[Option<Arc<AdapterPort<'a>>>; 32]>,
 }
 
 /// # Safety
 /// `pcidev` is never accessed from Rust code
 /// TODO!!!: place *mut pci_device in a safe wrapper
-unsafe impl Send for Device {}
-unsafe impl Sync for Device {}
-
-impl Device {
-    fn probe_ports(&self) -> KResult<()> {
-        for nport in self.control.implemented_ports() {
-            let port = Arc::new(AdapterPort::new(self.control_base, nport));
-            if !port.status_ok() {
-                continue;
-            }
-
-            self.ports.lock_irq()[nport as usize] = Some(port.clone());
-            if let Err(e) = (|| -> KResult<()> {
-                port.init()?;
-
-                {
-                    let port = port.clone();
-                    let name = format!("ahci-p{}-stats", port.nport);
-                    procfs::populate_root(name.into_bytes().into(), move |buffer| {
-                        writeln!(&mut buffer.get_writer(), "{:?}", &*port.stats.lock())
-                            .map_err(|_| EIO)
-                    })?;
-                }
-
-                let port = BlockDevice::register_disk(
-                    make_device(8, nport * 16),
-                    2147483647, // TODO: get size from device
-                    port,
-                )?;
-
-                port.partprobe()?;
-
-                Ok(())
-            })() {
-                self.ports.lock_irq()[nport as usize] = None;
-                println_warn!("probe port {nport} failed with {e}");
-            }
-        }
-
-        Ok(())
-    }
+unsafe impl Send for Device<'_> {}
+unsafe impl Sync for Device<'_> {}
 
+impl Device<'_> {
     fn handle_interrupt(&self) {
         // Safety
         // `self.ports` is accessed inside irq handler
@@ -128,7 +88,7 @@ impl Device {
             }
 
             let port = ports[nport as usize].as_ref().unwrap();
-            let status = vread(port.interrupt_status());
+            let status = port.interrupt_status().read_once();
 
             if status & PORT_IS_ERROR != 0 {
                 println_warn!("port {nport} SATA error");
@@ -136,7 +96,7 @@ impl Device {
             }
 
             debug_assert!(status & PORT_IS_DHRS != 0);
-            vwrite(port.interrupt_status(), PORT_IS_DHRS);
+            port.interrupt_status().write_once(PORT_IS_DHRS);
 
             self.control.clear_interrupt(nport);
 
@@ -145,19 +105,20 @@ impl Device {
     }
 }
 
-impl Device {
-    pub fn new(pcidev: *mut pci_device) -> KResult<Arc<Self>> {
-        let base = unsafe { *(*pcidev).header_type0() }.bars[PCI_REG_ABAR];
-        let irqno = unsafe { *(*pcidev).header_type0() }.interrupt_line;
+impl Device<'static> {
+    pub fn new(pcidev: NonNull<pci_device>) -> KResult<Arc<Self>> {
+        let base =
+            PAddr::from(unsafe { *pcidev.as_ref().header_type0() }.bars[PCI_REG_ABAR] as usize);
+        let irqno = unsafe { *pcidev.as_ref().header_type0() }.interrupt_line;
 
         // use MMIO
-        if base & 0xf != 0 {
+        if !base.is_aligned_to(16) {
             return Err(EIO);
         }
 
         let device = Arc::new(Device {
-            control_base: base as usize,
-            control: AdapterControl::new(base as usize),
+            control_base: base,
+            control: AdapterControl::new(base),
             pcidev,
             ports: Spin::new([const { None }; 32]),
         });
@@ -171,10 +132,47 @@ impl Device {
 
         Ok(device)
     }
+
+    fn probe_ports(&self) -> KResult<()> {
+        for nport in self.control.implemented_ports() {
+            let port = Arc::new(AdapterPort::new(self.control_base, nport));
+            if !port.status_ok() {
+                continue;
+            }
+
+            self.ports.lock_irq()[nport as usize] = Some(port.clone());
+            if let Err(e) = (|| -> KResult<()> {
+                port.init()?;
+
+                {
+                    let port = port.clone();
+                    let name = format!("ahci-p{}-stats", port.nport);
+                    procfs::populate_root(name.into_bytes().into(), move |buffer| {
+                        port.print_stats(&mut buffer.get_writer())
+                    })?;
+                }
+
+                let port = BlockDevice::register_disk(
+                    make_device(8, nport * 16),
+                    2147483647, // TODO: get size from device
+                    port,
+                )?;
+
+                port.partprobe()?;
+
+                Ok(())
+            })() {
+                self.ports.lock_irq()[nport as usize] = None;
+                println_warn!("probe port {nport} failed with {e}");
+            }
+        }
+
+        Ok(())
+    }
 }
 
 unsafe extern "C" fn probe_device(pcidev: *mut pci_device) -> i32 {
-    match Device::new(pcidev) {
+    match Device::new(NonNull::new(pcidev).expect("NULL `pci_device` pointer")) {
         Ok(device) => {
             // TODO!!!: save device to pci_device
             Box::leak(Box::new(device));

+ 96 - 209
src/driver/ahci/port.rs

@@ -1,35 +1,21 @@
-use core::pin::pin;
-
 use super::command::{Command, IdentifyCommand, ReadLBACommand};
+use super::slot::CommandSlot;
+use super::stats::AdapterPortStats;
 use super::{
-    vread, vwrite, CommandHeader, PRDTEntry, FISH2D, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE,
-    PORT_CMD_ST, PORT_IE_DEFAULT,
+    CommandHeader, Register, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE, PORT_CMD_ST, PORT_IE_DEFAULT,
 };
+use crate::driver::ahci::command_table::CommandTable;
 use crate::kernel::block::{BlockDeviceRequest, BlockRequestQueue};
 use crate::kernel::mem::paging::Page;
-use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
+use crate::kernel::mem::AsMemoryBlock as _;
 use crate::prelude::*;
 use alloc::collections::vec_deque::VecDeque;
 use bindings::{EINVAL, EIO};
+use core::pin::pin;
+use eonix_mm::address::{Addr as _, PAddr};
 use eonix_runtime::task::Task;
-use eonix_spin_irq::SpinIrq as _;
 use eonix_sync::WaitList;
 
-fn spinwait_clear(refval: *const u32, mask: u32) -> KResult<()> {
-    const SPINWAIT_MAX: usize = 1000;
-
-    let mut spins = 0;
-    while vread(refval) & mask != 0 {
-        if spins == SPINWAIT_MAX {
-            return Err(EIO);
-        }
-
-        spins += 1;
-    }
-
-    Ok(())
-}
-
 /// An `AdapterPort` is an HBA device in AHCI mode.
 ///
 /// # Access
@@ -66,63 +52,6 @@ pub struct AdapterPortData {
     vendor: [u32; 4],
 }
 
-#[allow(dead_code)]
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-enum SlotState {
-    Idle,
-    Working,
-    Finished,
-    Error,
-}
-
-struct CommandSlotInner {
-    state: SlotState,
-    /// # Usage
-    /// `cmdheader` might be used in irq handler. So in order to wait for
-    /// commands to finish, we should use `lock_irq` on `cmdheader`
-    cmdheader: *mut CommandHeader,
-}
-
-/// # Safety
-/// This is safe because the `cmdheader` is not shared between threads
-unsafe impl Send for CommandSlotInner {}
-
-impl CommandSlotInner {
-    pub fn setup(&mut self, cmdtable_base: u64, prdtlen: u16, write: bool) {
-        let cmdheader = unsafe { self.cmdheader.as_mut().unwrap() };
-        cmdheader.first = 0x05; // FIS type
-
-        if write {
-            cmdheader.first |= 0x40;
-        }
-
-        cmdheader.second = 0x00;
-
-        cmdheader.prdt_length = prdtlen;
-        cmdheader.bytes_transferred = 0;
-        cmdheader.command_table_base = cmdtable_base;
-
-        cmdheader._reserved = [0; 4];
-    }
-}
-
-struct CommandSlot {
-    inner: Spin<CommandSlotInner>,
-    wait_list: WaitList,
-}
-
-impl CommandSlot {
-    fn new(cmdheader: *mut CommandHeader) -> Self {
-        Self {
-            inner: Spin::new(CommandSlotInner {
-                state: SlotState::Idle,
-                cmdheader,
-            }),
-            wait_list: WaitList::new(),
-        }
-    }
-}
-
 struct FreeList {
     free: VecDeque<u32>,
     working: VecDeque<u32>,
@@ -137,85 +66,83 @@ impl FreeList {
     }
 }
 
-#[derive(Default, Debug)]
-pub struct AdapterPortStats {
-    /// Number of commands sent
-    cmd_sent: u64,
-
-    /// Number of transmission errors
-    cmd_error: u64,
-
-    /// Number of interrupts fired
-    int_fired: u64,
-}
-
-pub struct AdapterPort {
+pub struct AdapterPort<'a> {
     pub nport: u32,
-    regs: *mut (),
-    page: Page,
-    slots: [CommandSlot; 32],
+    regs_base: PAddr,
+
+    slots: [CommandSlot<'a>; 32],
     free_list: Spin<FreeList>,
     free_list_wait: WaitList,
 
-    /// Statistics for this port
-    pub stats: Spin<AdapterPortStats>,
-}
+    /// Holds the command list and the received FIS area.
+    /// **DO NOT USE IT DIRECTLY**
+    _page: Page,
+
+    cmdlist_base: PAddr,
+    fis_base: PAddr,
 
-/// # Safety
-/// This is safe because the `AdapterPort` can be accessed by only one thread at the same time
-unsafe impl Send for AdapterPort {}
-unsafe impl Sync for AdapterPort {}
+    stats: AdapterPortStats,
+}
 
-impl AdapterPort {
-    pub fn new(base: usize, nport: u32) -> Self {
-        let page = Page::alloc_one();
-        let cmdheaders_start = page.as_cached().as_ptr::<CommandHeader>();
+impl<'a> AdapterPort<'a> {
+    pub fn new(base: PAddr, nport: u32) -> Self {
+        let page = Page::alloc();
+        let cmdlist_base = page.start();
+        let cmdlist_size = 32 * size_of::<CommandHeader>();
+        let fis_base = cmdlist_base + cmdlist_size;
+
+        let (mut cmdheaders, _) = page.as_memblk().split_at(cmdlist_size);
+        let slots = core::array::from_fn(move |_| {
+            let (cmdheader, next) = cmdheaders.split_at(size_of::<CommandHeader>());
+            cmdheaders = next;
+            CommandSlot::new(unsafe { cmdheader.as_ptr().as_mut() })
+        });
 
         Self {
             nport,
-            regs: NoCachePP::new(base + 0x100 + 0x80 * nport as usize).as_ptr(),
-            slots: core::array::from_fn(|index| {
-                CommandSlot::new(unsafe { cmdheaders_start.offset(index as isize) })
-            }),
+            regs_base: base + 0x100 + 0x80 * nport as usize,
+            slots,
             free_list: Spin::new(FreeList::new()),
             free_list_wait: WaitList::new(),
-            page,
-            stats: Spin::default(),
+            _page: page,
+            stats: AdapterPortStats::new(),
+            cmdlist_base,
+            fis_base,
         }
     }
 }
 
-impl AdapterPort {
-    fn command_list_base(&self) -> *mut u64 {
-        unsafe { self.regs.byte_offset(0x00).cast() }
+impl AdapterPort<'_> {
+    fn command_list_base(&self) -> Register<u64> {
+        Register::new(self.regs_base + 0x00)
     }
 
-    fn fis_base(&self) -> *mut u64 {
-        unsafe { self.regs.byte_offset(0x08).cast() }
+    fn fis_base(&self) -> Register<u64> {
+        Register::new(self.regs_base + 0x08)
     }
 
-    fn sata_status(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x28).cast() }
+    fn sata_status(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x28)
     }
 
-    fn command_status(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x18).cast() }
+    fn command_status(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x18)
     }
 
-    fn command_issue(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x38).cast() }
+    fn command_issue(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x38)
     }
 
-    pub fn interrupt_status(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x10).cast() }
+    pub fn interrupt_status(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x10)
     }
 
-    pub fn interrupt_enable(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x14).cast() }
+    fn interrupt_enable(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x14)
     }
 
     pub fn status_ok(&self) -> bool {
-        vread(self.sata_status()) & 0xf == 0x3
+        self.sata_status().read_once() & 0xf == 0x3
     }
 
     fn get_free_slot(&self) -> u32 {
@@ -234,16 +161,16 @@ impl AdapterPort {
     }
 
     fn save_working(&self, slot: u32) {
-        self.free_list.lock().working.push_back(slot);
+        self.free_list.lock_irq().working.push_back(slot);
     }
 
     fn release_free_slot(&self, slot: u32) {
-        self.free_list.lock().free.push_back(slot);
+        self.free_list.lock_irq().free.push_back(slot);
         self.free_list_wait.notify_one();
     }
 
     pub fn handle_interrupt(&self) {
-        let ci = vread(self.command_issue());
+        let ci = self.command_issue().read_once();
 
         // no need to use `lock_irq()` inside interrupt handler
         let mut free_list = self.free_list.lock();
@@ -253,104 +180,55 @@ impl AdapterPort {
                 return true;
             }
 
-            let slot = &self.slots[n as usize];
-
-            // TODO: check error
-            let mut slot_inner = slot.inner.lock();
-            debug_assert_eq!(slot_inner.state, SlotState::Working);
-            slot_inner.state = SlotState::Finished;
-            slot.wait_list.notify_all();
-            self.stats.lock().int_fired += 1;
+            self.slots[n as usize].handle_irq();
+            self.stats.inc_int_fired();
 
             false
         });
     }
 
     fn stop_command(&self) -> KResult<()> {
-        vwrite(
-            self.command_status(),
-            vread(self.command_status()) & !(PORT_CMD_ST | PORT_CMD_FRE),
-        );
-
-        spinwait_clear(self.command_status(), PORT_CMD_CR | PORT_CMD_FR)
+        let status_reg = self.command_status();
+        let status = status_reg.read();
+        status_reg.write_once(status & !(PORT_CMD_ST | PORT_CMD_FRE));
+        status_reg.spinwait_clear(PORT_CMD_CR | PORT_CMD_FR)
     }
 
     fn start_command(&self) -> KResult<()> {
-        spinwait_clear(self.command_status(), PORT_CMD_CR)?;
+        let status_reg = self.command_status();
+        status_reg.spinwait_clear(PORT_CMD_CR)?;
 
-        let cmd_status = vread(self.command_status());
-        vwrite(
-            self.command_status(),
-            cmd_status | PORT_CMD_ST | PORT_CMD_FRE,
-        );
+        let status = status_reg.read();
+        status_reg.write_once(status | PORT_CMD_ST | PORT_CMD_FRE);
 
         Ok(())
     }
 
     fn send_command(&self, cmd: &impl Command) -> KResult<()> {
-        let pages = cmd.pages();
-        let cmdtable_page = Page::alloc_one();
+        let mut cmdtable = CommandTable::new();
+        cmdtable.setup(cmd);
 
-        let command_fis: &mut FISH2D = cmdtable_page.as_cached().as_mut();
-        command_fis.setup(cmd.cmd(), cmd.lba(), cmd.count());
+        let slot_index = self.get_free_slot();
+        let slot = &self.slots[slot_index as usize];
 
-        let prdt: &mut [PRDTEntry; 248] = cmdtable_page.as_cached().offset(0x80).as_mut();
+        slot.prepare_command(&cmdtable, cmd.write());
+        self.save_working(slot_index);
 
-        for (idx, page) in pages.iter().enumerate() {
-            prdt[idx].setup(page);
-        }
-
-        let slot_index = self.get_free_slot() as usize;
-        let slot_object = &self.slots[slot_index];
-
-        let mut slot = slot_object.inner.lock_irq();
-
-        slot.setup(
-            cmdtable_page.as_phys() as u64,
-            pages.len() as u16,
-            cmd.write(),
-        );
-        slot.state = SlotState::Working;
+        let cmdissue_reg = self.command_issue();
 
         // should we clear received fis here?
-        debug_assert!(vread(self.command_issue()) & (1 << slot_index) == 0);
-        vwrite(self.command_issue(), 1 << slot_index);
-
-        if spinwait_clear(self.command_issue(), 1 << slot_index).is_err() {
-            let mut saved = false;
-            while slot.state == SlotState::Working {
-                if !saved {
-                    saved = true;
-                    self.save_working(slot_index as u32);
-                }
-                let mut wait = pin!(slot_object.wait_list.prepare_to_wait());
-                wait.as_mut().add_to_wait_list();
-                drop(slot);
-                Task::block_on(wait);
-                slot = slot_object.inner.lock_irq();
-            }
-        } else {
-            // TODO: check error
-            slot.state = SlotState::Finished;
-        }
+        debug_assert!(cmdissue_reg.read_once() & (1 << slot_index) == 0);
+        cmdissue_reg.write_once(1 << slot_index);
 
-        let state = slot.state;
-        slot.state = SlotState::Idle;
+        self.stats.inc_cmd_sent();
 
-        debug_assert_ne!(state, SlotState::Working);
-        self.release_free_slot(slot_index as u32);
+        if let Err(_) = Task::block_on(slot.wait_finish()) {
+            self.stats.inc_cmd_error();
+            return Err(EIO);
+        };
 
-        match state {
-            SlotState::Finished => {
-                self.stats.lock().cmd_sent += 1;
-                Ok(())
-            }
-            SlotState::Error => {
-                self.stats.lock().cmd_error += 1;
-                Err(EIO)
-            }
-            _ => panic!("Invalid slot state"),
-        }
+        self.release_free_slot(slot_index);
+        Ok(())
     }
 
     fn identify(&self) -> KResult<()> {
@@ -365,10 +243,11 @@ impl AdapterPort {
     pub fn init(&self) -> KResult<()> {
         self.stop_command()?;
 
-        vwrite(self.interrupt_enable(), PORT_IE_DEFAULT);
+        self.command_list_base()
+            .write(self.cmdlist_base.addr() as u64);
+        self.fis_base().write(self.fis_base.addr() as u64);
 
-        vwrite(self.command_list_base(), self.page.as_phys() as u64);
-        vwrite(self.fis_base(), self.page.as_phys() as u64 + 0x400);
+        self.interrupt_enable().write_once(PORT_IE_DEFAULT);
 
         self.start_command()?;
 
@@ -380,9 +259,17 @@ impl AdapterPort {
             Ok(_) => Ok(()),
         }
     }
+
+    pub fn print_stats(&self, writer: &mut impl Write) -> KResult<()> {
+        writeln!(writer, "cmd_sent: {}", self.stats.get_cmd_sent()).map_err(|_| EIO)?;
+        writeln!(writer, "cmd_error: {}", self.stats.get_cmd_error()).map_err(|_| EIO)?;
+        writeln!(writer, "int_fired: {}", self.stats.get_int_fired()).map_err(|_| EIO)?;
+
+        Ok(())
+    }
 }
 
-impl BlockRequestQueue for AdapterPort {
+impl BlockRequestQueue for AdapterPort<'_> {
     fn max_request_pages(&self) -> u64 {
         1024
     }

+ 58 - 0
src/driver/ahci/register.rs

@@ -0,0 +1,58 @@
+use crate::{
+    kernel::{constants::EIO, mem::PhysAccess as _},
+    sync::fence::memory_barrier,
+    KResult,
+};
+use core::ptr::NonNull;
+use eonix_mm::address::PAddr;
+
+pub struct Register<T: Copy> {
+    addr: NonNull<T>,
+}
+
+unsafe impl<T: Copy> Send for Register<T> {}
+unsafe impl<T: Copy> Sync for Register<T> {}
+
+impl<T: Copy> Register<T> {
+    pub fn new(addr: PAddr) -> Self {
+        Self {
+            addr: unsafe { addr.as_ptr() },
+        }
+    }
+
+    pub fn read(&self) -> T {
+        unsafe { self.addr.as_ptr().read_volatile() }
+    }
+
+    pub fn write(&self, value: T) {
+        unsafe { self.addr.as_ptr().write_volatile(value) }
+    }
+
+    pub fn read_once(&self) -> T {
+        let val = unsafe { self.addr.as_ptr().read_volatile() };
+        memory_barrier();
+        val
+    }
+
+    pub fn write_once(&self, value: T) {
+        unsafe { self.addr.as_ptr().write_volatile(value) };
+        memory_barrier();
+    }
+}
+
+impl Register<u32> {
+    pub fn spinwait_clear(&self, mask: u32) -> KResult<()> {
+        const SPINWAIT_MAX: usize = 1000;
+
+        for _ in 0..SPINWAIT_MAX {
+            if self.read() & mask == 0 {
+                memory_barrier();
+                return Ok(());
+            }
+        }
+
+        memory_barrier();
+        Err(EIO)
+    }
+}
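
A sketch of how `Register` replaces the old `vread`/`vwrite`/`spinwait_clear` free functions. The offset and bit below are made up, and `Register`/`KResult` are assumed to be in scope inside the `ahci` module.

```rust
use eonix_mm::address::PAddr;

/// Poll a hypothetical MMIO status word until an assumed BUSY bit clears.
fn wait_not_busy(mmio_base: PAddr) -> KResult<()> {
    const BUSY: u32 = 1 << 0;
    let status = Register::<u32>::new(mmio_base + 0x18);
    // Bounded spin with a trailing memory barrier, as implemented above.
    status.spinwait_clear(BUSY)
}
```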

+ 94 - 0
src/driver/ahci/slot.rs

@@ -0,0 +1,94 @@
+use super::{command_table::CommandTable, CommandHeader};
+use crate::KResult;
+use core::pin::pin;
+use eonix_mm::address::Addr as _;
+use eonix_sync::{Spin, WaitList};
+
+pub struct CommandSlot<'a> {
+    /// # Usage
+    /// `inner.cmdheader` might be accessed from the IRQ handler, so `lock_irq`
+    /// must be used on `inner` when waiting for commands to finish.
+    inner: Spin<CommandSlotInner<'a>>,
+    wait_list: WaitList,
+}
+
+struct CommandSlotInner<'a> {
+    state: SlotState,
+    cmdheader: &'a mut CommandHeader,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum SlotState {
+    Idle,
+    Working,
+    Finished,
+    Error,
+}
+
+impl<'a> CommandSlot<'a> {
+    pub fn new(cmdheader: &'a mut CommandHeader) -> Self {
+        Self {
+            inner: Spin::new(CommandSlotInner {
+                state: SlotState::Idle,
+                cmdheader,
+            }),
+            wait_list: WaitList::new(),
+        }
+    }
+
+    pub fn handle_irq(&self) {
+        let mut inner = self.inner.lock();
+        debug_assert_eq!(inner.state, SlotState::Working);
+
+        // TODO: Check errors.
+        inner.state = SlotState::Finished;
+        inner.cmdheader.bytes_transferred = 0;
+        inner.cmdheader.prdt_length = 0;
+
+        self.wait_list.notify_all();
+    }
+
+    pub fn prepare_command(&self, cmdtable: &CommandTable, write: bool) {
+        let mut inner = self.inner.lock_irq();
+        let cmdheader = &mut inner.cmdheader;
+
+        cmdheader.first = 0x05; // FIS type
+
+        if write {
+            cmdheader.first |= 0x40;
+        }
+
+        cmdheader.second = 0x00;
+
+        cmdheader.prdt_length = cmdtable.prdt_len();
+        cmdheader.bytes_transferred = 0;
+        cmdheader.command_table_base = cmdtable.base().addr() as u64;
+
+        cmdheader._reserved = [0; 4];
+
+        inner.state = SlotState::Working;
+    }
+
+    pub async fn wait_finish(&self) -> KResult<()> {
+        let mut inner = loop {
+            let inner = self.inner.lock_irq();
+            if inner.state != SlotState::Working {
+                break inner;
+            }
+
+            let mut wait = pin!(self.wait_list.prepare_to_wait());
+            wait.as_mut().add_to_wait_list();
+
+            if inner.state != SlotState::Working {
+                break inner;
+            }
+
+            drop(inner);
+            wait.await;
+        };
+
+        inner.state = SlotState::Idle;
+
+        Ok(())
+    }
+}
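
For reference, the submission side pairs `prepare_command` with `wait_finish`; a condensed sketch of that flow, mirroring `send_command` in port.rs with the command-issue register write elided.

```rust
// Assumed to live inside the `ahci` module, with a free slot already reserved.
async fn issue_and_wait(slot: &CommandSlot<'_>, table: &CommandTable<'_>) -> KResult<()> {
    slot.prepare_command(table, /* write = */ false);
    // ... set the slot's bit in the port's command-issue register here ...
    // The IRQ handler calls `handle_irq()` on completion, which wakes us up.
    slot.wait_finish().await
}
```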

+ 46 - 0
src/driver/ahci/stats.rs

@@ -0,0 +1,46 @@
+use core::sync::atomic::{AtomicUsize, Ordering};
+
+pub struct AdapterPortStats {
+    /// Number of commands sent
+    cmd_sent: AtomicUsize,
+
+    /// Number of transmission errors
+    cmd_error: AtomicUsize,
+
+    /// Number of interrupts fired
+    int_fired: AtomicUsize,
+}
+
+impl AdapterPortStats {
+    pub const fn new() -> Self {
+        Self {
+            cmd_sent: AtomicUsize::new(0),
+            cmd_error: AtomicUsize::new(0),
+            int_fired: AtomicUsize::new(0),
+        }
+    }
+
+    pub fn inc_int_fired(&self) {
+        self.int_fired.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn inc_cmd_sent(&self) {
+        self.cmd_sent.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn inc_cmd_error(&self) {
+        self.cmd_error.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn get_int_fired(&self) -> usize {
+        self.int_fired.load(Ordering::Relaxed)
+    }
+
+    pub fn get_cmd_sent(&self) -> usize {
+        self.cmd_sent.load(Ordering::Relaxed)
+    }
+
+    pub fn get_cmd_error(&self) -> usize {
+        self.cmd_error.load(Ordering::Relaxed)
+    }
+}

+ 434 - 434
src/driver/e1000e.rs

@@ -1,439 +1,439 @@
-use crate::prelude::*;
-
-use crate::bindings::root::kernel::hw::pci;
-use crate::kernel::interrupt::register_irq_handler;
-use crate::kernel::mem::{paging, phys};
-use crate::net::netdev;
-use alloc::boxed::Box;
-use alloc::vec::Vec;
-use bindings::EFAULT;
-use paging::Page;
-use phys::{NoCachePP, PhysPtr};
-
-use crate::bindings::root::{EAGAIN, EINVAL, EIO};
-
-mod defs;
-
-#[repr(C)]
-struct RxDescriptor {
-    buffer: u64,
-    length: u16,
-    checksum: u16,
-    status: u8,
-    errors: u8,
-    vlan: u16,
-}
-
-#[repr(C)]
-struct TxDescriptor {
-    buffer: u64,
-    length: u16,
-    cso: u8, // Checksum offset
-    cmd: u8,
-    status: u8,
-    css: u8, // Checksum start
-    vlan: u16,
-}
-
-const RX_DESC_SIZE: usize = 32;
-const TX_DESC_SIZE: usize = 32;
-
-struct E1000eDev {
-    mac: netdev::Mac,
-    status: netdev::LinkStatus,
-    speed: netdev::LinkSpeed,
-    id: u32,
-
-    base: NoCachePP,
-    rt_desc_page: Page,
-    rx_head: Option<u32>,
-    rx_tail: Option<u32>,
-    tx_tail: Option<u32>,
-
-    rx_buffers: Option<Box<Vec<Page>>>,
-    tx_buffers: Option<Box<Vec<Page>>>,
-}
-
-fn test(val: u32, bit: u32) -> bool {
-    (val & bit) == bit
-}
-
-struct PrintableBytes<'a>(&'a [u8]);
-
-impl core::fmt::Debug for PrintableBytes<'_> {
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        write!(f, "PrintableBytes {{")?;
-        for chunk in self.0.chunks(16) {
-            for &byte in chunk {
-                write!(f, "{byte} ")?;
-            }
-            write!(f, "\n")?;
-        }
-        write!(f, "}}")?;
-
-        Ok(())
-    }
-}
-
-impl netdev::Netdev for E1000eDev {
-    fn mac(&self) -> netdev::Mac {
-        self.mac
-    }
-
-    fn link_status(&self) -> netdev::LinkStatus {
-        self.status
-    }
-
-    fn link_speed(&self) -> netdev::LinkSpeed {
-        self.speed
-    }
-
-    fn id(&self) -> u32 {
-        self.id
-    }
-
-    fn up(&mut self) -> Result<(), u32> {
-        let ctrl = self.read(defs::REG_CTRL);
-        let status = self.read(defs::REG_STAT);
-
-        // check link up
-        if !test(ctrl, defs::CTRL_SLU) || !test(status, defs::STAT_LU) {
-            return Err(EIO);
-        }
-
-        // auto negotiation of speed
-        match status & defs::STAT_SPEED_MASK {
-            defs::STAT_SPEED_10M => self.speed = netdev::LinkSpeed::Speed10M,
-            defs::STAT_SPEED_100M => self.speed = netdev::LinkSpeed::Speed100M,
-            defs::STAT_SPEED_1000M => self.speed = netdev::LinkSpeed::Speed1000M,
-            _ => return Err(EINVAL),
-        }
-
-        // clear multicast table
-        for i in (0..128).step_by(4) {
-            self.write(defs::REG_MTA + i, 0);
-        }
-
-        self.clear_stats()?;
-
-        // setup interrupt handler
-        let device = netdev::get_netdev(self.id).unwrap();
-        let handler = move || {
-            eonix_runtime::task::Task::block_on(device.lock())
-                .fire()
-                .unwrap();
-        };
-
-        register_irq_handler(0xb, handler)?;
-
-        // enable interrupts
-        self.write(defs::REG_IMS, defs::ICR_NORMAL | defs::ICR_UP);
-
-        // read to clear any pending interrupts
-        self.read(defs::REG_ICR);
-
-        self.setup_rx()?;
-        self.setup_tx()?;
-
-        self.status = netdev::LinkStatus::Up;
-
-        Ok(())
-    }
-
-    fn fire(&mut self) -> Result<(), u32> {
-        let cause = self.read(defs::REG_ICR);
-        if !test(cause, defs::ICR_INT) {
-            return Ok(());
-        }
-
-        loop {
-            let tail = self.rx_tail.ok_or(EIO)?;
-            let next_tail = (tail + 1) % RX_DESC_SIZE as u32;
-
-            if next_tail == self.read(defs::REG_RDH) {
-                break;
-            }
-
-            let ref mut desc = self.rx_desc_table()[next_tail as usize];
-            if !test(desc.status as u32, defs::RXD_STAT_DD as u32) {
-                Err(EIO)?;
-            }
-
-            desc.status = 0;
-            let len = desc.length as usize;
-
-            let buffers = self.rx_buffers.as_mut().ok_or(EIO)?;
-            let data = &buffers[next_tail as usize].as_slice()[..len];
-
-            println_debug!("e1000e: received {len} bytes, {:?}", PrintableBytes(data));
-            self.rx_tail = Some(next_tail);
-        }
-
-        Ok(())
-    }
-
-    fn send(&mut self, buf: &[u8]) -> Result<(), u32> {
-        let tail = self.tx_tail.ok_or(EIO)?;
-        let head = self.read(defs::REG_TDH);
-        let next_tail = (tail + 1) % TX_DESC_SIZE as u32;
-
-        if next_tail == head {
-            return Err(EAGAIN);
-        }
-
-        let ref mut desc = self.tx_desc_table()[tail as usize];
-        if !test(desc.status as u32, defs::TXD_STAT_DD as u32) {
-            return Err(EIO);
-        }
-
-        let buffer_page = Page::alloc_one();
-        if buf.len() > buffer_page.len() {
-            return Err(EFAULT);
-        }
-        buffer_page.as_mut_slice()[..buf.len()].copy_from_slice(buf);
-
-        desc.buffer = buffer_page.as_phys() as u64;
-        desc.length = buf.len() as u16;
-        desc.cmd = defs::TXD_CMD_EOP | defs::TXD_CMD_IFCS | defs::TXD_CMD_RS;
-        desc.status = 0;
-
-        self.tx_tail = Some(next_tail);
-        self.write(defs::REG_TDT, next_tail);
-
-        // TODO: check if the packets are sent and update self.tx_head state
-
-        Ok(())
-    }
-}
-
-impl E1000eDev {
-    fn setup_rx(&mut self) -> Result<(), u32> {
-        if !self.rx_head.is_none() || !self.rx_tail.is_none() {
-            return Err(EINVAL);
-        }
-
-        let addr = self.rt_desc_page.as_phys();
-
-        self.write(defs::REG_RDBAL, addr as u32);
-        self.write(defs::REG_RDBAH, (addr >> 32) as u32);
-
-        self.write(
-            defs::REG_RDLEN,
-            (RX_DESC_SIZE * size_of::<RxDescriptor>()) as u32,
-        );
-
-        self.write(defs::REG_RDH, 0);
-        self.write(defs::REG_RDT, RX_DESC_SIZE as u32 - 1);
-
-        self.rx_head = Some(0);
-        self.rx_tail = Some(RX_DESC_SIZE as u32 - 1);
-
-        self.write(
-            defs::REG_RCTL,
-            defs::RCTL_EN
-                | defs::RCTL_MPE
-                | defs::RCTL_LPE
-                | defs::RCTL_LBM_NO
-                | defs::RCTL_DTYP_LEGACY
-                | defs::RCTL_BAM
-                | defs::RCTL_BSIZE_8192
-                | defs::RCTL_SECRC,
-        );
-
-        Ok(())
-    }
-
-    fn setup_tx(&mut self) -> Result<(), u32> {
-        if !self.tx_tail.is_none() {
-            return Err(EINVAL);
-        }
-
-        let addr = self.rt_desc_page.as_phys() + 0x200;
-
-        self.write(defs::REG_TDBAL, addr as u32);
-        self.write(defs::REG_TDBAH, (addr >> 32) as u32);
-
-        self.write(
-            defs::REG_TDLEN,
-            (TX_DESC_SIZE * size_of::<TxDescriptor>()) as u32,
-        );
-
-        self.write(defs::REG_TDH, 0);
-        self.write(defs::REG_TDT, 0);
-
-        self.tx_tail = Some(0);
-
-        self.write(
-            defs::REG_TCTL,
-            defs::TCTL_EN
-                | defs::TCTL_PSP
-                | (15 << defs::TCTL_CT_SHIFT)
-                | (64 << defs::TCTL_COLD_SHIFT)
-                | defs::TCTL_RTLC,
-        );
-
-        Ok(())
-    }
-
-    fn reset(&self) -> Result<(), u32> {
-        // disable interrupts so we won't mess things up
-        self.write(defs::REG_IMC, 0xffffffff);
-
-        let ctrl = self.read(defs::REG_CTRL);
-        self.write(defs::REG_CTRL, ctrl | defs::CTRL_GIOD);
-
-        while self.read(defs::REG_STAT) & defs::STAT_GIOE != 0 {
-            // wait for link up
-        }
-
-        let ctrl = self.read(defs::REG_CTRL);
-        self.write(defs::REG_CTRL, ctrl | defs::CTRL_RST);
-
-        while self.read(defs::REG_CTRL) & defs::CTRL_RST != 0 {
-            // wait for reset
-        }
-
-        // disable interrupts again
-        self.write(defs::REG_IMC, 0xffffffff);
-
-        Ok(())
-    }
-
-    fn clear_stats(&self) -> Result<(), u32> {
-        self.write(defs::REG_COLC, 0);
-        self.write(defs::REG_GPRC, 0);
-        self.write(defs::REG_MPRC, 0);
-        self.write(defs::REG_GPTC, 0);
-        self.write(defs::REG_GORCL, 0);
-        self.write(defs::REG_GORCH, 0);
-        self.write(defs::REG_GOTCL, 0);
-        self.write(defs::REG_GOTCH, 0);
-        Ok(())
-    }
-
-    pub fn new(base: NoCachePP) -> Result<Self, u32> {
-        let page = Page::alloc_one();
-
-        page.zero();
-
-        let mut dev = Self {
-            mac: [0; 6],
-            status: netdev::LinkStatus::Down,
-            speed: netdev::LinkSpeed::SpeedUnknown,
-            id: netdev::alloc_id(),
-            base,
-            rt_desc_page: page,
-            rx_head: None,
-            rx_tail: None,
-            tx_tail: None,
-            rx_buffers: None,
-            tx_buffers: None,
-        };
-
-        dev.reset()?;
-
-        dev.mac = unsafe { dev.base.offset(0x5400).as_ptr::<[u8; 6]>().read() };
-        dev.tx_buffers = Some(Box::new(Vec::with_capacity(TX_DESC_SIZE)));
-
-        let mut rx_buffers = Box::new(Vec::with_capacity(RX_DESC_SIZE));
-
-        for index in 0..RX_DESC_SIZE {
-            let page = Page::alloc_many(2);
-
-            let ref mut desc = dev.rx_desc_table()[index];
-            desc.buffer = page.as_phys() as u64;
-            desc.status = 0;
-
-            rx_buffers.push(page);
-        }
-
-        for index in 0..TX_DESC_SIZE {
-            let ref mut desc = dev.tx_desc_table()[index];
-            desc.status = defs::TXD_STAT_DD;
-        }
-
-        dev.rx_buffers = Some(rx_buffers);
-
-        Ok(dev)
-    }
-
-    fn read(&self, offset: u32) -> u32 {
-        unsafe {
-            self.base
-                .offset(offset as isize)
-                .as_ptr::<u32>()
-                .read_volatile()
-        }
-    }
-
-    fn write(&self, offset: u32, value: u32) {
-        unsafe {
-            self.base
-                .offset(offset as isize)
-                .as_ptr::<u32>()
-                .write_volatile(value)
-        }
-    }
-
-    fn rx_desc_table<'lt>(&'lt self) -> &'lt mut [RxDescriptor; RX_DESC_SIZE] {
-        self.rt_desc_page.as_cached().as_mut()
-    }
-
-    fn tx_desc_table<'lt>(&'lt self) -> &'lt mut [TxDescriptor; TX_DESC_SIZE] {
-        self.rt_desc_page.as_cached().offset(0x200).as_mut()
-    }
-}
-
-impl Drop for E1000eDev {
-    fn drop(&mut self) {
-        assert_eq!(self.status, netdev::LinkStatus::Down);
-
-        if let Some(_) = self.rx_buffers.take() {}
-
-        // TODO: we should wait until all packets are sent
-        if let Some(_) = self.tx_buffers.take() {}
-
-        let _ = self.rt_desc_page;
-    }
-}
-
-impl pci::pci_device {
-    fn header0(&self) -> &pci::device_header_type0 {
-        unsafe { self.header_type0().as_ref() }.unwrap()
-    }
-}
-
-fn do_probe_device(dev: &mut pci::pci_device) -> Result<(), u32> {
-    let bar0 = dev.header0().bars[0];
-
-    if bar0 & 0xf != 0 {
-        return Err(EINVAL);
-    }
-
-    unsafe { dev.enableBusMastering() };
-
-    let base = NoCachePP::new((bar0 & !0xf) as usize);
-    let e1000e = E1000eDev::new(base)?;
-
-    netdev::register_netdev(e1000e)?;
-
-    Ok(())
-}
-
-unsafe extern "C" fn probe_device(dev: *mut pci::pci_device) -> i32 {
-    let dev = dev.as_mut().unwrap();
-    match do_probe_device(dev) {
-        Ok(_) => 0,
-        Err(e) => -(e as i32),
-    }
-}
+// use crate::prelude::*;
+//
+// use crate::bindings::root::kernel::hw::pci;
+// use crate::kernel::interrupt::register_irq_handler;
+// use crate::kernel::mem::{paging, phys};
+// use crate::net::netdev;
+// use alloc::boxed::Box;
+// use alloc::vec::Vec;
+// use bindings::EFAULT;
+// use paging::Page;
+// use phys::{NoCachePP, PhysPtr};
+//
+// use crate::bindings::root::{EAGAIN, EINVAL, EIO};
+//
+// mod defs;
+//
+// #[repr(C)]
+// struct RxDescriptor {
+//     buffer: u64,
+//     length: u16,
+//     checksum: u16,
+//     status: u8,
+//     errors: u8,
+//     vlan: u16,
+// }
+//
+// #[repr(C)]
+// struct TxDescriptor {
+//     buffer: u64,
+//     length: u16,
+//     cso: u8, // Checksum offset
+//     cmd: u8,
+//     status: u8,
+//     css: u8, // Checksum start
+//     vlan: u16,
+// }
+//
+// const RX_DESC_SIZE: usize = 32;
+// const TX_DESC_SIZE: usize = 32;
+//
+// struct E1000eDev {
+//     mac: netdev::Mac,
+//     status: netdev::LinkStatus,
+//     speed: netdev::LinkSpeed,
+//     id: u32,
+//
+//     base: NoCachePP,
+//     rt_desc_page: Page,
+//     rx_head: Option<u32>,
+//     rx_tail: Option<u32>,
+//     tx_tail: Option<u32>,
+//
+//     rx_buffers: Option<Box<Vec<Page>>>,
+//     tx_buffers: Option<Box<Vec<Page>>>,
+// }
+//
+// fn test(val: u32, bit: u32) -> bool {
+//     (val & bit) == bit
+// }
+//
+// struct PrintableBytes<'a>(&'a [u8]);
+//
+// impl core::fmt::Debug for PrintableBytes<'_> {
+//     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+//         write!(f, "PrintableBytes {{")?;
+//         for chunk in self.0.chunks(16) {
+//             for &byte in chunk {
+//                 write!(f, "{byte} ")?;
+//             }
+//             write!(f, "\n")?;
+//         }
+//         write!(f, "}}")?;
+//
+//         Ok(())
+//     }
+// }
+//
+// impl netdev::Netdev for E1000eDev {
+//     fn mac(&self) -> netdev::Mac {
+//         self.mac
+//     }
+//
+//     fn link_status(&self) -> netdev::LinkStatus {
+//         self.status
+//     }
+//
+//     fn link_speed(&self) -> netdev::LinkSpeed {
+//         self.speed
+//     }
+//
+//     fn id(&self) -> u32 {
+//         self.id
+//     }
+//
+//     fn up(&mut self) -> Result<(), u32> {
+//         let ctrl = self.read(defs::REG_CTRL);
+//         let status = self.read(defs::REG_STAT);
+//
+//         // check link up
+//         if !test(ctrl, defs::CTRL_SLU) || !test(status, defs::STAT_LU) {
+//             return Err(EIO);
+//         }
+//
+//         // auto negotiation of speed
+//         match status & defs::STAT_SPEED_MASK {
+//             defs::STAT_SPEED_10M => self.speed = netdev::LinkSpeed::Speed10M,
+//             defs::STAT_SPEED_100M => self.speed = netdev::LinkSpeed::Speed100M,
+//             defs::STAT_SPEED_1000M => self.speed = netdev::LinkSpeed::Speed1000M,
+//             _ => return Err(EINVAL),
+//         }
+//
+//         // clear multicast table
+//         for i in (0..128).step_by(4) {
+//             self.write(defs::REG_MTA + i, 0);
+//         }
+//
+//         self.clear_stats()?;
+//
+//         // setup interrupt handler
+//         let device = netdev::get_netdev(self.id).unwrap();
+//         let handler = move || {
+//             eonix_runtime::task::Task::block_on(device.lock())
+//                 .fire()
+//                 .unwrap();
+//         };
+//
+//         register_irq_handler(0xb, handler)?;
+//
+//         // enable interrupts
+//         self.write(defs::REG_IMS, defs::ICR_NORMAL | defs::ICR_UP);
+//
+//         // read to clear any pending interrupts
+//         self.read(defs::REG_ICR);
+//
+//         self.setup_rx()?;
+//         self.setup_tx()?;
+//
+//         self.status = netdev::LinkStatus::Up;
+//
+//         Ok(())
+//     }
+//
+//     fn fire(&mut self) -> Result<(), u32> {
+//         let cause = self.read(defs::REG_ICR);
+//         if !test(cause, defs::ICR_INT) {
+//             return Ok(());
+//         }
+//
+//         loop {
+//             let tail = self.rx_tail.ok_or(EIO)?;
+//             let next_tail = (tail + 1) % RX_DESC_SIZE as u32;
+//
+//             if next_tail == self.read(defs::REG_RDH) {
+//                 break;
+//             }
+//
+//             let ref mut desc = self.rx_desc_table()[next_tail as usize];
+//             if !test(desc.status as u32, defs::RXD_STAT_DD as u32) {
+//                 Err(EIO)?;
+//             }
+//
+//             desc.status = 0;
+//             let len = desc.length as usize;
+//
+//             let buffers = self.rx_buffers.as_mut().ok_or(EIO)?;
+//             let data = &buffers[next_tail as usize].as_slice()[..len];
+//
+//             println_debug!("e1000e: received {len} bytes, {:?}", PrintableBytes(data));
+//             self.rx_tail = Some(next_tail);
+//         }
+//
+//         Ok(())
+//     }
+//
+//     fn send(&mut self, buf: &[u8]) -> Result<(), u32> {
+//         let tail = self.tx_tail.ok_or(EIO)?;
+//         let head = self.read(defs::REG_TDH);
+//         let next_tail = (tail + 1) % TX_DESC_SIZE as u32;
+//
+//         if next_tail == head {
+//             return Err(EAGAIN);
+//         }
+//
+//         let ref mut desc = self.tx_desc_table()[tail as usize];
+//         if !test(desc.status as u32, defs::TXD_STAT_DD as u32) {
+//             return Err(EIO);
+//         }
+//
+//         let buffer_page = Page::alloc_one();
+//         if buf.len() > buffer_page.len() {
+//             return Err(EFAULT);
+//         }
+//         buffer_page.as_mut_slice()[..buf.len()].copy_from_slice(buf);
+//
+//         desc.buffer = buffer_page.as_phys() as u64;
+//         desc.length = buf.len() as u16;
+//         desc.cmd = defs::TXD_CMD_EOP | defs::TXD_CMD_IFCS | defs::TXD_CMD_RS;
+//         desc.status = 0;
+//
+//         self.tx_tail = Some(next_tail);
+//         self.write(defs::REG_TDT, next_tail);
+//
+//         // TODO: check if the packets are sent and update self.tx_head state
+//
+//         Ok(())
+//     }
+// }
+//
+// impl E1000eDev {
+//     fn setup_rx(&mut self) -> Result<(), u32> {
+//         if !self.rx_head.is_none() || !self.rx_tail.is_none() {
+//             return Err(EINVAL);
+//         }
+//
+//         let addr = self.rt_desc_page.as_phys();
+//
+//         self.write(defs::REG_RDBAL, addr as u32);
+//         self.write(defs::REG_RDBAH, (addr >> 32) as u32);
+//
+//         self.write(
+//             defs::REG_RDLEN,
+//             (RX_DESC_SIZE * size_of::<RxDescriptor>()) as u32,
+//         );
+//
+//         self.write(defs::REG_RDH, 0);
+//         self.write(defs::REG_RDT, RX_DESC_SIZE as u32 - 1);
+//
+//         self.rx_head = Some(0);
+//         self.rx_tail = Some(RX_DESC_SIZE as u32 - 1);
+//
+//         self.write(
+//             defs::REG_RCTL,
+//             defs::RCTL_EN
+//                 | defs::RCTL_MPE
+//                 | defs::RCTL_LPE
+//                 | defs::RCTL_LBM_NO
+//                 | defs::RCTL_DTYP_LEGACY
+//                 | defs::RCTL_BAM
+//                 | defs::RCTL_BSIZE_8192
+//                 | defs::RCTL_SECRC,
+//         );
+//
+//         Ok(())
+//     }
+//
+//     fn setup_tx(&mut self) -> Result<(), u32> {
+//         if !self.tx_tail.is_none() {
+//             return Err(EINVAL);
+//         }
+//
+//         let addr = self.rt_desc_page.as_phys() + 0x200;
+//
+//         self.write(defs::REG_TDBAL, addr as u32);
+//         self.write(defs::REG_TDBAH, (addr >> 32) as u32);
+//
+//         self.write(
+//             defs::REG_TDLEN,
+//             (TX_DESC_SIZE * size_of::<TxDescriptor>()) as u32,
+//         );
+//
+//         self.write(defs::REG_TDH, 0);
+//         self.write(defs::REG_TDT, 0);
+//
+//         self.tx_tail = Some(0);
+//
+//         self.write(
+//             defs::REG_TCTL,
+//             defs::TCTL_EN
+//                 | defs::TCTL_PSP
+//                 | (15 << defs::TCTL_CT_SHIFT)
+//                 | (64 << defs::TCTL_COLD_SHIFT)
+//                 | defs::TCTL_RTLC,
+//         );
+//
+//         Ok(())
+//     }
+//
+//     fn reset(&self) -> Result<(), u32> {
+//         // disable interrupts so we won't mess things up
+//         self.write(defs::REG_IMC, 0xffffffff);
+//
+//         let ctrl = self.read(defs::REG_CTRL);
+//         self.write(defs::REG_CTRL, ctrl | defs::CTRL_GIOD);
+//
+//         while self.read(defs::REG_STAT) & defs::STAT_GIOE != 0 {
+//             // wait for link up
+//         }
+//
+//         let ctrl = self.read(defs::REG_CTRL);
+//         self.write(defs::REG_CTRL, ctrl | defs::CTRL_RST);
+//
+//         while self.read(defs::REG_CTRL) & defs::CTRL_RST != 0 {
+//             // wait for reset
+//         }
+//
+//         // disable interrupts again
+//         self.write(defs::REG_IMC, 0xffffffff);
+//
+//         Ok(())
+//     }
+//
+//     fn clear_stats(&self) -> Result<(), u32> {
+//         self.write(defs::REG_COLC, 0);
+//         self.write(defs::REG_GPRC, 0);
+//         self.write(defs::REG_MPRC, 0);
+//         self.write(defs::REG_GPTC, 0);
+//         self.write(defs::REG_GORCL, 0);
+//         self.write(defs::REG_GORCH, 0);
+//         self.write(defs::REG_GOTCL, 0);
+//         self.write(defs::REG_GOTCH, 0);
+//         Ok(())
+//     }
+//
+//     pub fn new(base: NoCachePP) -> Result<Self, u32> {
+//         let page = Page::alloc_one();
+//
+//         page.zero();
+//
+//         let mut dev = Self {
+//             mac: [0; 6],
+//             status: netdev::LinkStatus::Down,
+//             speed: netdev::LinkSpeed::SpeedUnknown,
+//             id: netdev::alloc_id(),
+//             base,
+//             rt_desc_page: page,
+//             rx_head: None,
+//             rx_tail: None,
+//             tx_tail: None,
+//             rx_buffers: None,
+//             tx_buffers: None,
+//         };
+//
+//         dev.reset()?;
+//
+//         dev.mac = unsafe { dev.base.offset(0x5400).as_ptr::<[u8; 6]>().read() };
+//         dev.tx_buffers = Some(Box::new(Vec::with_capacity(TX_DESC_SIZE)));
+//
+//         let mut rx_buffers = Box::new(Vec::with_capacity(RX_DESC_SIZE));
+//
+//         for index in 0..RX_DESC_SIZE {
+//             let page = Page::alloc_many(2);
+//
+//             let ref mut desc = dev.rx_desc_table()[index];
+//             desc.buffer = page.as_phys() as u64;
+//             desc.status = 0;
+//
+//             rx_buffers.push(page);
+//         }
+//
+//         for index in 0..TX_DESC_SIZE {
+//             let ref mut desc = dev.tx_desc_table()[index];
+//             desc.status = defs::TXD_STAT_DD;
+//         }
+//
+//         dev.rx_buffers = Some(rx_buffers);
+//
+//         Ok(dev)
+//     }
+//
+//     fn read(&self, offset: u32) -> u32 {
+//         unsafe {
+//             self.base
+//                 .offset(offset as isize)
+//                 .as_ptr::<u32>()
+//                 .read_volatile()
+//         }
+//     }
+//
+//     fn write(&self, offset: u32, value: u32) {
+//         unsafe {
+//             self.base
+//                 .offset(offset as isize)
+//                 .as_ptr::<u32>()
+//                 .write_volatile(value)
+//         }
+//     }
+//
+//     fn rx_desc_table<'lt>(&'lt self) -> &'lt mut [RxDescriptor; RX_DESC_SIZE] {
+//         self.rt_desc_page.as_cached().as_mut()
+//     }
+//
+//     fn tx_desc_table<'lt>(&'lt self) -> &'lt mut [TxDescriptor; TX_DESC_SIZE] {
+//         self.rt_desc_page.as_cached().offset(0x200).as_mut()
+//     }
+// }
+//
+// impl Drop for E1000eDev {
+//     fn drop(&mut self) {
+//         assert_eq!(self.status, netdev::LinkStatus::Down);
+//
+//         if let Some(_) = self.rx_buffers.take() {}
+//
+//         // TODO: we should wait until all packets are sent
+//         if let Some(_) = self.tx_buffers.take() {}
+//
+//         let _ = self.rt_desc_page;
+//     }
+// }
+//
+// impl pci::pci_device {
+//     fn header0(&self) -> &pci::device_header_type0 {
+//         unsafe { self.header_type0().as_ref() }.unwrap()
+//     }
+// }
+//
+// fn do_probe_device(dev: &mut pci::pci_device) -> Result<(), u32> {
+//     let bar0 = dev.header0().bars[0];
+//
+//     if bar0 & 0xf != 0 {
+//         return Err(EINVAL);
+//     }
+//
+//     unsafe { dev.enableBusMastering() };
+//
+//     let base = NoCachePP::new((bar0 & !0xf) as usize);
+//     let e1000e = E1000eDev::new(base)?;
+//
+//     netdev::register_netdev(e1000e)?;
+//
+//     Ok(())
+// }
+//
+// unsafe extern "C" fn probe_device(dev: *mut pci::pci_device) -> i32 {
+//     let dev = dev.as_mut().unwrap();
+//     match do_probe_device(dev) {
+//         Ok(_) => 0,
+//         Err(e) => -(e as i32),
+//     }
+// }
 
 pub fn register_e1000e_driver() {
-    let dev_ids = [0x100e, 0x10d3, 0x10ea, 0x153a];
+    // let dev_ids = [0x100e, 0x10d3, 0x10ea, 0x153a];
 
-    for id in dev_ids.into_iter() {
-        let ret = unsafe { pci::register_driver_r(0x8086, id, Some(probe_device)) };
+    // for id in dev_ids.into_iter() {
+    //     let ret = unsafe { pci::register_driver_r(0x8086, id, Some(probe_device)) };
 
-        assert_eq!(ret, 0);
-    }
+    //     assert_eq!(ret, 0);
+    // }
 }
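
The (now commented-out) send path above advances the transmit tail with modular arithmetic and treats `next_tail == head` as a full ring. A minimal stand-alone sketch of that index logic, for illustration only (the function name is hypothetical, not part of the driver):

```rust
/// Descriptor-ring index logic as used by the disabled e1000e send path:
/// the ring is full when advancing the tail would collide with the head.
fn try_advance_tail(tail: u32, head: u32, ring_size: u32) -> Option<u32> {
    let next_tail = (tail + 1) % ring_size;
    if next_tail == head {
        None // ring full; the driver returns EAGAIN here
    } else {
        Some(next_tail)
    }
}

fn main() {
    assert_eq!(try_advance_tail(0, 2, 32), Some(1));
    assert_eq!(try_advance_tail(1, 2, 32), None); // would collide with head
    assert_eq!(try_advance_tail(31, 0, 32), None); // wrap-around hits the head
}
```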

+ 0 - 1
src/driver/serial.rs

@@ -10,7 +10,6 @@ use alloc::{collections::vec_deque::VecDeque, format, sync::Arc};
 use bitflags::bitflags;
 use core::pin::pin;
 use eonix_runtime::{run::FutureRun, scheduler::Scheduler};
-use eonix_spin_irq::SpinIrq as _;
 use eonix_sync::WaitList;
 
 bitflags! {

+ 11 - 11
src/elf.rs

@@ -1,15 +1,15 @@
-use alloc::{ffi::CString, sync::Arc};
-use bitflags::bitflags;
-
 use crate::{
     io::{ByteBuffer, UninitBuffer},
     kernel::{
         constants::ENOEXEC,
-        mem::{FileMapping, MMList, Mapping, Permission, VAddr},
+        mem::{FileMapping, MMList, Mapping, Permission},
         vfs::dentry::Dentry,
     },
     prelude::*,
 };
+use alloc::{ffi::CString, sync::Arc};
+use bitflags::bitflags;
+use eonix_mm::address::{Addr as _, AddrOps as _, VAddr};
 
 #[repr(u8)]
 #[allow(dead_code)]
@@ -244,13 +244,13 @@ impl ParsedElf32 {
     pub fn load(self, args: Vec<CString>, envs: Vec<CString>) -> KResult<(VAddr, VAddr, MMList)> {
         let mm_list = MMList::new();
 
-        let mut data_segment_end = VAddr(0);
+        let mut data_segment_end = VAddr::NULL;
         for phent in self
             .phents
             .into_iter()
             .filter(|ent| ent.ph_type == Elf32PhType::Load)
         {
-            let vaddr_start = VAddr(phent.vaddr as usize);
+            let vaddr_start = VAddr::from(phent.vaddr as usize);
             let vmem_vaddr_end = vaddr_start + phent.mem_size as usize;
             let load_vaddr_end = vaddr_start + phent.file_size as usize;
 
@@ -296,8 +296,8 @@ impl ParsedElf32 {
 
         // Map stack area
         mm_list.mmap_fixed(
-            VAddr(0xc0000000 - 0x800000), // Stack bottom is at 0xc0000000
-            0x800000,                     // 8MB stack size
+            VAddr::from(0xc0000000 - 0x800000), // Stack bottom is at 0xc0000000
+            0x800000,                           // 8MB stack size
             Mapping::Anonymous,
             Permission {
                 write: true,
@@ -319,7 +319,7 @@ impl ParsedElf32 {
         longs.push(0); // AT_NULL
 
         sp = sp - longs.len() * size_of::<u32>();
-        sp = VAddr::from(usize::from(sp) & !0xf); // Align to 16 bytes
+        sp = sp.floor_to(16);
 
         mm_list.access_mut(sp, longs.len() * size_of::<u32>(), |offset, data| {
             data.copy_from_slice(unsafe {
@@ -330,7 +330,7 @@ impl ParsedElf32 {
             })
         })?;
 
-        Ok((VAddr(self.entry as usize), sp, mm_list))
+        Ok((VAddr::from(self.entry as usize), sp, mm_list))
     }
 }
 
@@ -342,7 +342,7 @@ fn push_strings(mm_list: &MMList, sp: &mut VAddr, strings: Vec<CString>) -> KRes
         mm_list.access_mut(*sp, len, |offset, data| {
             data.copy_from_slice(&string.as_bytes_with_nul()[offset..offset + data.len()])
         })?;
-        addrs.push(usize::from(*sp) as u32);
+        addrs.push(sp.addr() as u32);
     }
 
     Ok(addrs)

+ 10 - 4
src/fs/fat32/file.rs

@@ -1,6 +1,8 @@
-use crate::{kernel::mem::Page, KResult};
-
 use super::{ClusterIterator, FatFs};
+use crate::{
+    kernel::mem::{AsMemoryBlock as _, Page},
+    KResult,
+};
 
 pub trait ClusterReadIterator<'data>: Iterator<Item = KResult<&'data [u8]>> + 'data {}
 impl<'a, I> ClusterReadIterator<'a> for I where I: Iterator<Item = KResult<&'a [u8]>> + 'a {}
@@ -22,11 +24,15 @@ impl<'data, 'fat: 'data> ClusterRead<'data> for ClusterIterator<'fat> {
         let skip_clusters = offset / cluster_size;
         let mut inner_offset = offset % cluster_size;
 
-        let buffer_page = Page::alloc_one();
+        // TODO: Use block cache.
+        let buffer_page = Page::alloc();
 
         self.skip(skip_clusters).map(move |cluster| {
             vfs.read_cluster(cluster, &buffer_page)?;
-            let data = &buffer_page.as_slice()[inner_offset..];
+            let data = unsafe {
+                // SAFETY: The buffer page is owned by this iterator alone, so no one else could be writing to it.
+                &buffer_page.as_memblk().as_bytes()[inner_offset..]
+            };
             inner_offset = 0;
             Ok(data)
         })
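
The iterator above splits the requested byte offset into whole clusters to skip and a remaining offset within the first cluster. A tiny illustration of that arithmetic (names are hypothetical):

```rust
// Offset split used by the cluster read path: whole clusters are skipped,
// and the remainder becomes the offset inside the first cluster read.
fn split_offset(offset: usize, cluster_size: usize) -> (usize, usize) {
    (offset / cluster_size, offset % cluster_size)
}

fn main() {
    let (skip_clusters, inner_offset) = split_offset(9000, 4096);
    assert_eq!((skip_clusters, inner_offset), (2, 808));
    // After the first cluster is yielded, `inner_offset` is reset to 0 so
    // later clusters are read from their beginning.
}
```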

+ 9 - 6
src/fs/procfs.rs

@@ -2,7 +2,7 @@ use crate::{
     io::Buffer,
     kernel::{
         constants::{S_IFDIR, S_IFREG},
-        mem::paging::{Page, PageBuffer},
+        mem::paging::PageBuffer,
         vfs::{
             dentry::Dentry,
             inode::{define_struct_inode, AtomicIno, Ino, Inode, InodeData},
@@ -100,10 +100,13 @@ impl Inode for FileInode {
             return Err(EACCES);
         }
 
-        let mut page_buffer = PageBuffer::new(Page::alloc_one());
-        let nread = self.file.read(&mut page_buffer)?;
+        let mut page_buffer = PageBuffer::new();
+        self.file.read(&mut page_buffer)?;
 
-        let data = split_len_offset(page_buffer.as_slice(), nread, offset);
+        let data = page_buffer
+            .data()
+            .split_at_checked(offset)
+            .map(|(_, data)| data);
 
         match data {
             None => Ok(0),
@@ -269,7 +272,7 @@ impl ProcFsFile for DumpMountsFile {
     fn read(&self, buffer: &mut PageBuffer) -> KResult<usize> {
         dump_mounts(&mut buffer.get_writer());
 
-        Ok(buffer.len())
+        Ok(buffer.data().len())
     }
 }
 
@@ -300,7 +303,7 @@ where
     }
 
     fn read(&self, buffer: &mut PageBuffer) -> KResult<usize> {
-        self.read_fn.as_ref().ok_or(EACCES)?(buffer).map(|_| buffer.len())
+        self.read_fn.as_ref().ok_or(EACCES)?(buffer).map(|_| buffer.data().len())
     }
 }
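
The procfs read path now slices the generated contents at the requested offset with `split_at_checked`, turning an out-of-range offset into a zero-length read. A small demonstration of that behavior on a plain byte slice:

```rust
// Offset handling equivalent to the procfs change above: `split_at_checked`
// yields `None` for an out-of-range offset, which maps to a zero-byte read.
fn read_at(data: &[u8], offset: usize) -> &[u8] {
    data.split_at_checked(offset)
        .map(|(_, rest)| rest)
        .unwrap_or(&[])
}

fn main() {
    let contents = b"cpu0 1234\n";
    assert_eq!(read_at(contents, 0), contents);
    assert_eq!(read_at(contents, 5), b"1234\n");
    assert_eq!(read_at(contents, 64), b"");
}
```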
 

+ 40 - 4
src/io.rs

@@ -1,8 +1,6 @@
-use bindings::EFAULT;
-
 use crate::prelude::*;
-
-use core::mem::MaybeUninit;
+use bindings::EFAULT;
+use core::{cmp, mem::MaybeUninit};
 
 #[must_use]
 pub enum FillResult {
@@ -187,3 +185,41 @@ impl Buffer for ByteBuffer<'_> {
         self.cur
     }
 }
+
+/// Iterator that yields chunks of at most `chunk_len` items, starting at
+/// `start` and continuing until `total_len` items have been covered.
+///
+/// The iterator returns a tuple of (start, len) for each chunk.
+pub struct Chunks {
+    start: usize,
+    end: usize,
+    cur: usize,
+    chunk_len: usize,
+}
+
+impl Chunks {
+    pub const fn new(start: usize, total_len: usize, chunk_len: usize) -> Self {
+        Self {
+            start,
+            end: start + total_len,
+            cur: start,
+            chunk_len,
+        }
+    }
+}
+
+impl Iterator for Chunks {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cur >= self.end {
+            return None;
+        }
+
+        let start = self.cur;
+        let len = cmp::min(self.chunk_len, self.end - start);
+
+        self.cur += self.chunk_len;
+        Some((start, len))
+    }
+}
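
A usage sketch for the `Chunks` iterator introduced above (assuming it is in scope); the final chunk is shortened to cover exactly the remaining bytes:

```rust
fn main() {
    // Walk a 10000-byte region starting at offset 512 in 4096-byte pieces.
    let chunks: Vec<(usize, usize)> = Chunks::new(512, 10000, 4096).collect();
    assert_eq!(chunks, vec![(512, 4096), (4608, 4096), (8704, 1808)]);
}
```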

+ 10 - 5
src/kernel/block.rs

@@ -1,4 +1,8 @@
-use super::{constants::ENOENT, mem::paging::Page, vfs::DevId};
+use super::{
+    constants::ENOENT,
+    mem::{paging::Page, AsMemoryBlock as _},
+    vfs::DevId,
+};
 use crate::{
     io::{Buffer, FillResult, UninitBuffer},
     prelude::*,
@@ -218,14 +222,14 @@ impl BlockDevice {
                 count if count <= 8 => {
                     nread = count;
 
-                    let _page = Page::alloc_one();
+                    let _page = Page::alloc();
                     page = Some(_page);
                     pages = core::slice::from_ref(page.as_ref().unwrap());
                 }
                 count if count <= 16 => {
                     nread = count;
 
-                    let _pages = Page::alloc_many(1);
+                    let _pages = Page::alloc_order(1);
                     page = Some(_pages);
                     pages = core::slice::from_ref(page.as_ref().unwrap());
                 }
@@ -235,7 +239,7 @@ impl BlockDevice {
                     let npages = (nread + 15) / 16;
                     let mut _page_vec = Vec::with_capacity(npages as usize);
                     for _ in 0..npages {
-                        _page_vec.push(Page::alloc_many(1));
+                        _page_vec.push(Page::alloc_order(1));
                     }
                     page_vec = Some(_page_vec);
                     pages = page_vec.as_ref().unwrap().as_slice();
@@ -251,7 +255,8 @@ impl BlockDevice {
             self.read_raw(req)?;
 
             for page in pages.iter() {
-                let data = &page.as_slice()[first_sector_offset as usize..];
+                // SAFETY: We are the only owner of the page so no one could be mutating it.
+                let data = unsafe { &page.as_memblk().as_bytes()[first_sector_offset as usize..] };
                 first_sector_offset = 0;
 
                 match buffer.fill(data)? {

+ 8 - 5
src/kernel/cpu.rs

@@ -1,6 +1,8 @@
-use super::mem::{paging::Page, phys::PhysPtr as _};
+use super::mem::AsMemoryBlock;
 use arch::{PercpuArea, CPU};
-use core::{alloc::Layout, mem::ManuallyDrop, pin::Pin, ptr::NonNull};
+use buddy_allocator::BuddyAllocator;
+use core::{alloc::Layout, pin::Pin, ptr::NonNull};
+use eonix_mm::paging::Page;
 use eonix_sync::LazyLock;
 
 #[arch::define_percpu]
@@ -16,10 +18,11 @@ pub unsafe fn local_cpu() -> Pin<&'static mut CPU> {
 pub fn percpu_allocate(layout: Layout) -> NonNull<u8> {
     // TODO: Use page size defined in `arch`.
     let page_count = layout.size().div_ceil(arch::PAGE_SIZE);
-    let page = ManuallyDrop::new(Page::early_alloc_ceil(page_count));
-    let pointer = page.as_cached().as_ptr();
+    let page = Page::<BuddyAllocator>::alloc_at_least(page_count);
+    let page_data = page.as_memblk().as_byte_ptr();
+    core::mem::forget(page);
 
-    NonNull::new(pointer).expect("Allocated page pfn should be non-null.")
+    page_data
 }
 
 pub fn init_localcpu() {
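
`percpu_allocate` above rounds the layout up to whole pages, then `mem::forget`s the `Page` so the backing storage is never freed. An illustrative user-space analogue using `Box::leak` in place of the page allocator (the constant and names are assumptions, not kernel API):

```rust
use std::alloc::Layout;

const PAGE_SIZE: usize = 4096; // assumed 4 KiB pages, as elsewhere in this diff

// Allocate a page-rounded block and leak it so it lives for the program's
// whole lifetime, mirroring the alloc-then-forget pattern above.
fn leak_block(layout: Layout) -> &'static mut [u8] {
    let pages = layout.size().div_ceil(PAGE_SIZE);
    Box::leak(vec![0u8; pages * PAGE_SIZE].into_boxed_slice())
}

fn main() {
    let block = leak_block(Layout::new::<[u64; 1000]>());
    assert_eq!(block.len(), 8192); // 8000 bytes rounded up to two pages
}
```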

+ 0 - 1
src/kernel/interrupt.rs

@@ -8,7 +8,6 @@ use crate::{driver::Port8, prelude::*};
 use alloc::sync::Arc;
 use arch::{ExtendedContext, InterruptContext};
 use eonix_runtime::task::Task;
-use eonix_spin_irq::SpinIrq as _;
 
 const PIC1_COMMAND: Port8 = Port8::new(0x20);
 const PIC1_DATA: Port8 = Port8::new(0x21);

+ 2 - 5
src/kernel/mem.rs

@@ -1,15 +1,12 @@
 pub mod paging;
-pub mod phys;
 
+mod access;
 mod address;
 mod mm_area;
 mod mm_list;
 mod page_alloc;
-mod page_table;
 
-#[allow(unused_imports)]
-pub use address::{PAddr, VAddr, VRange, PFN, VPN};
+pub use access::{AsMemoryBlock, MemoryBlock, PhysAccess};
 pub(self) use mm_area::MMArea;
 pub use mm_list::{handle_page_fault, FileMapping, MMList, Mapping, Permission};
-pub(self) use page_table::{PageTable, PTE};
 pub use paging::{Page, PageBuffer};

+ 158 - 0
src/kernel/mem/access.rs

@@ -0,0 +1,158 @@
+use core::{num::NonZero, ptr::NonNull};
+use eonix_mm::address::{Addr as _, PAddr};
+use eonix_mm::paging::{PageAccess, PageBlock, PFN};
+
+const PHYS_OFFSET: usize = 0xffff_ff00_0000_0000;
+
+/// A block of memory starting at a non-zero address and having a specific length.
+///
+/// This struct is used to represent a memory block that can be accessed
+/// in the kernel space.
+pub struct MemoryBlock {
+    addr: NonZero<usize>,
+    len: usize,
+}
+
+pub struct KernelPageAccess;
+
+pub trait AsMemoryBlock {
+    /// Translate the physical page that the page object points to into a
+    /// kernel-accessible pointer. Use it with care.
+    fn as_memblk(&self) -> MemoryBlock;
+}
+
+pub trait PhysAccess {
+    /// Translate the data that this address points to into a
+    /// kernel-accessible pointer. Use it with care.
+    ///
+    /// # Panic
+    /// If the address is not properly aligned.
+    ///
+    /// # Safety
+    /// The caller must ensure that the data is of type `T`.
+    /// Otherwise, it may lead to undefined behavior.
+    unsafe fn as_ptr<T>(&self) -> NonNull<T>;
+}
+
+impl MemoryBlock {
+    /// Create a new `MemoryBlock` with the given address and length.
+    ///
+    /// # Safety
+    /// The caller must ensure that the address is valid.
+    /// Otherwise, it may lead to undefined behavior.
+    pub unsafe fn new(addr: NonZero<usize>, len: usize) -> Self {
+        Self { addr, len }
+    }
+
+    /// Get the start address of the memory block.
+    #[allow(dead_code)]
+    pub fn addr(&self) -> NonZero<usize> {
+        self.addr
+    }
+
+    /// Get the length of the memory block.
+    #[allow(dead_code)]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Split the memory block into two parts at the given offset.
+    pub fn split_at(&self, at: usize) -> (Self, Self) {
+        if at > self.len {
+            panic!("Out of bounds");
+        }
+
+        let rhs_start = self.addr.checked_add(at).expect("Overflow");
+
+        let lhs = unsafe { Self::new(self.addr, at) };
+        let rhs = unsafe { Self::new(rhs_start, self.len - at) };
+
+        (lhs, rhs)
+    }
+
+    /// Provide a pointer to the data.
+    ///
+    /// # Safety
+    /// Using the returned pointer is undefined behavior if the address is not
+    /// properly aligned or the size is not equal to the size of `T`.
+    pub unsafe fn as_ptr_unchecked<T>(&self) -> NonNull<T> {
+        // SAFETY: `self.addr` is a non-zero value.
+        NonNull::new_unchecked(self.addr.get() as *mut T)
+    }
+
+    /// Provide a pointer to the data.
+    ///
+    /// # Panic
+    /// Panic if the address is not properly aligned.
+    pub fn as_ptr<T>(&self) -> NonNull<T> {
+        let alignment = align_of::<T>();
+
+        if self.addr.get() % alignment != 0 {
+            panic!("Alignment error");
+        }
+
+        unsafe {
+            // SAFETY: We've checked that `self.addr` is properly aligned.
+            self.as_ptr_unchecked()
+        }
+    }
+
+    /// Provide a pointer to the bytes.
+    pub fn as_byte_ptr(&self) -> NonNull<u8> {
+        unsafe {
+            // SAFETY: No alignment check is needed for bytes.
+            self.as_ptr_unchecked()
+        }
+    }
+
+    /// Provide immutable access to the data it points to.
+    ///
+    /// # Safety
+    /// This function is unsafe because it returns an immutable reference with
+    /// an arbitrary, caller-chosen lifetime.
+    ///
+    /// The caller must ensure that the data has no other mutable aliases while
+    /// the reference is in use. Otherwise, it may lead to undefined behavior.
+    pub unsafe fn as_bytes<'a>(&self) -> &'a [u8] {
+        core::slice::from_raw_parts(self.as_ptr_unchecked().as_ptr(), self.len)
+    }
+
+    /// Provide mutable access to the data it points to.
+    ///
+    /// # Safety
+    /// This function is unsafe because it returns a mutable reference with an
+    /// arbitrary, caller-chosen lifetime.
+    ///
+    /// The caller must ensure that the data has no other immutable or mutable
+    /// aliases while the reference is in use.
+    /// Otherwise, it may lead to undefined behavior.
+    pub unsafe fn as_bytes_mut<'a>(&mut self) -> &'a mut [u8] {
+        core::slice::from_raw_parts_mut(self.as_ptr_unchecked().as_ptr(), self.len)
+    }
+}
+
+impl PhysAccess for PAddr {
+    unsafe fn as_ptr<T>(&self) -> NonNull<T> {
+        let alignment: usize = align_of::<T>();
+        assert!(self.addr() % alignment == 0, "Alignment error");
+
+        unsafe {
+            // SAFETY: We can assume that we'll never have `self.addr()` equals
+            //         to `-PHYS_OFFSET`. Otherwise, the kernel might be broken.
+            NonNull::new_unchecked((PHYS_OFFSET + self.addr()) as *mut T)
+        }
+    }
+}
+
+impl PageAccess for KernelPageAccess {
+    unsafe fn get_ptr_for_pfn(pfn: PFN) -> NonNull<PageBlock> {
+        unsafe {
+            // SAFETY: The physical address of a page must be aligned to the page size.
+            PAddr::from(pfn).as_ptr()
+        }
+    }
+}
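
`PhysAccess` above turns a physical address into a kernel pointer by adding it to the base of the direct physical mapping. The arithmetic, with the `PHYS_OFFSET` constant taken from the hunk above (stand-alone sketch, not the kernel types):

```rust
// Base of the kernel's linear mapping of physical memory (from access.rs).
const PHYS_OFFSET: usize = 0xffff_ff00_0000_0000;

fn phys_to_virt(paddr: usize) -> usize {
    PHYS_OFFSET + paddr
}

fn main() {
    // Physical address 0x1000 becomes a kernel-accessible pointer
    // at PHYS_OFFSET + 0x1000.
    assert_eq!(phys_to_virt(0x1000), 0xffff_ff00_0000_1000);
}
```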

+ 17 - 398
src/kernel/mem/address.rs

@@ -1,411 +1,30 @@
-use arch::PAGE_SIZE;
-use core::{
-    cmp::Ordering,
-    fmt::{self, Debug, Formatter},
-    ops::{Add, RangeBounds, Sub},
-};
+use eonix_mm::address::{VAddr, VRange};
 
-#[repr(transparent)]
-#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct PAddr(pub usize);
+const USER_SPACE_MEMORY_TOP: VAddr = VAddr::from(0x8000_0000_0000);
+const KERNEL_SPACE_MEMORY_BOTTOM: VAddr = VAddr::from(0xffff_8000_0000_0000);
 
-#[repr(transparent)]
-#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct VAddr(pub usize);
-
-#[repr(transparent)]
-#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct PFN(pub usize);
-
-#[repr(transparent)]
-#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct VPN(pub usize);
-
-const PAGE_SIZE_BITS: usize = 12;
-const USER_SPACE_MEMORY_TOP: VAddr = VAddr(0x8000_0000_0000);
-
-impl From<PAddr> for usize {
-    fn from(v: PAddr) -> Self {
-        v.0
-    }
-}
-
-impl From<PFN> for usize {
-    fn from(v: PFN) -> Self {
-        v.0
-    }
-}
-
-impl From<VAddr> for usize {
-    fn from(v: VAddr) -> Self {
-        v.0
-    }
-}
-
-impl From<VPN> for usize {
-    fn from(v: VPN) -> Self {
-        v.0
-    }
-}
-
-impl From<usize> for PAddr {
-    fn from(v: usize) -> Self {
-        Self(v)
-    }
-}
-
-impl From<usize> for PFN {
-    fn from(v: usize) -> Self {
-        Self(v)
-    }
-}
-
-impl From<usize> for VAddr {
-    fn from(v: usize) -> Self {
-        Self(v)
-    }
-}
-
-impl From<usize> for VPN {
-    fn from(v: usize) -> Self {
-        Self(v)
-    }
-}
-
-impl From<VPN> for VAddr {
-    fn from(v: VPN) -> Self {
-        Self(v.0 << PAGE_SIZE_BITS)
-    }
-}
-
-impl From<VAddr> for VPN {
-    fn from(v: VAddr) -> Self {
-        assert_eq!(v.page_offset(), 0);
-        v.floor_vpn()
-    }
+pub trait VAddrExt {
+    fn is_user(&self) -> bool;
 }
 
-impl From<PAddr> for PFN {
-    fn from(v: PAddr) -> Self {
-        assert_eq!(v.page_offset(), 0);
-        v.floor_pfn()
-    }
-}
-
-impl From<PFN> for PAddr {
-    fn from(v: PFN) -> Self {
-        Self(v.0 << PAGE_SIZE_BITS)
-    }
-}
-
-impl PAddr {
-    pub fn floor_pfn(&self) -> PFN {
-        PFN(self.0 / PAGE_SIZE)
-    }
-
-    pub fn ceil_pfn(&self) -> PFN {
-        PFN((self.0 + PAGE_SIZE - 1) / PAGE_SIZE)
-    }
-
-    pub fn page_offset(&self) -> usize {
-        self.0 & (PAGE_SIZE - 1)
-    }
-
-    pub fn is_aligned(&self) -> bool {
-        self.page_offset() == 0
-    }
-}
-
-impl PFN {
-    pub fn buddy_pfn(&self, order: u32) -> PFN {
-        PFN::from(self.0 ^ (1 << order))
-    }
-
-    pub fn combined_pfn(&self, buddy_pfn: PFN) -> PFN {
-        PFN::from(self.0 & buddy_pfn.0)
-    }
-}
-
-impl VAddr {
-    pub const NULL: Self = Self(0);
-
-    pub const fn floor_vpn(&self) -> VPN {
-        VPN(self.0 / PAGE_SIZE)
-    }
-
-    pub const fn ceil_vpn(&self) -> VPN {
-        VPN((self.0 - 1 + PAGE_SIZE) / PAGE_SIZE)
-    }
-
-    pub const fn page_offset(self) -> usize {
-        let Self(addr) = self;
-        addr & (PAGE_SIZE - 1)
-    }
-
-    pub const fn is_aligned(&self) -> bool {
-        self.page_offset() == 0
-    }
-
-    pub const fn is_user(self) -> bool {
-        const USER_SPACE_MEMORY_TOP_ADDR: usize = const { USER_SPACE_MEMORY_TOP.0 };
-
-        match self {
-            Self(0) => false,
-            Self(..USER_SPACE_MEMORY_TOP_ADDR) => true,
-            _ => false,
-        }
-    }
-
-    pub const fn floor(self) -> Self {
-        self.floor_to(PAGE_SIZE)
-    }
-
-    pub const fn ceil(self) -> Self {
-        self.ceil_to(PAGE_SIZE)
-    }
-
-    /// Aligns the address to the nearest lower multiple of `size`.
-    pub const fn floor_to(self, size: usize) -> Self {
-        let Self(addr) = self;
-        Self(addr / size * size)
-    }
-
-    /// Aligns the address to the nearest lower multiple of `size`.
-    pub const fn ceil_to(self, size: usize) -> Self {
-        let Self(addr) = self;
-        Self(addr.div_ceil(size) * size)
-    }
-}
-
-impl Sub for VAddr {
-    type Output = usize;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.0 - rhs.0
-    }
-}
-
-impl Sub<usize> for VAddr {
-    type Output = Self;
-
-    fn sub(self, rhs: usize) -> Self::Output {
-        VAddr(self.0 - rhs)
-    }
-}
-
-impl Add<usize> for VAddr {
-    type Output = Self;
-
-    fn add(self, rhs: usize) -> Self::Output {
-        VAddr(self.0 + rhs)
-    }
-}
-
-impl Sub for PAddr {
-    type Output = usize;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.0 - rhs.0
-    }
-}
-
-impl Sub<usize> for PAddr {
-    type Output = Self;
-
-    fn sub(self, rhs: usize) -> Self::Output {
-        PAddr(self.0 - rhs)
-    }
-}
-
-impl Add<usize> for PAddr {
-    type Output = Self;
-
-    fn add(self, rhs: usize) -> Self::Output {
-        PAddr(self.0 + rhs)
-    }
-}
-
-impl Debug for VAddr {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        write!(f, "VAddr{:#x}", self.0)
-    }
-}
-
-impl Debug for PAddr {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        write!(f, "PAddr{:#x}", self.0)
-    }
-}
-
-impl Add<usize> for PFN {
-    type Output = Self;
-
-    fn add(self, rhs: usize) -> Self::Output {
-        PFN(self.0 + rhs)
-    }
-}
-
-impl Sub for PFN {
-    type Output = usize;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.0 - rhs.0
-    }
-}
-
-impl Sub<usize> for PFN {
-    type Output = Self;
-
-    fn sub(self, rhs: usize) -> Self::Output {
-        PFN(self.0 - rhs)
-    }
-}
-
-impl Add<usize> for VPN {
-    type Output = Self;
-
-    fn add(self, rhs: usize) -> Self::Output {
-        VPN(self.0 + rhs)
-    }
-}
-
-impl Sub for VPN {
-    type Output = usize;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.0 - rhs.0
-    }
-}
-
-impl Sub<usize> for VPN {
-    type Output = Self;
-
-    fn sub(self, rhs: usize) -> Self::Output {
-        VPN(self.0 - rhs)
-    }
-}
-
-impl Debug for VPN {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        write!(f, "VPN{:#x}", self.0)
-    }
-}
-
-impl Debug for PFN {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        write!(f, "PFN{:#x}", self.0)
-    }
-}
-
-#[derive(Clone, Copy)]
-pub struct VRange {
-    start: VAddr,
-    end: VAddr,
-}
-
-impl Debug for VRange {
-    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        write!(f, "[{:?}, {:?})", self.start, self.end)
-    }
-}
-
-impl Eq for VRange {}
-impl PartialOrd for VRange {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl PartialEq for VRange {
-    fn eq(&self, other: &Self) -> bool {
-        self.cmp(other) == Ordering::Equal
-    }
-}
-
-/// Any two ranges that have one of them containing the other are considered equal.
-impl Ord for VRange {
-    fn cmp(&self, other: &Self) -> Ordering {
-        if self.start == other.start {
-            return Ordering::Equal;
-        }
-
-        if self.end == other.end {
-            if self.start == self.end {
-                return Ordering::Greater;
-            }
-            if other.start == other.end {
-                return Ordering::Less;
-            }
-            return Ordering::Equal;
-        }
-
-        if self.start < other.start {
-            if other.end < self.end {
-                return Ordering::Equal;
-            } else {
-                return Ordering::Less;
-            }
-        }
-
-        if other.start < self.start {
-            if self.end < other.end {
-                return Ordering::Equal;
-            } else {
-                return Ordering::Greater;
-            }
-        }
-
-        unreachable!()
-    }
+pub trait VRangeExt {
+    #[allow(dead_code)]
+    fn is_kernel(&self) -> bool;
+    fn is_user(&self) -> bool;
 }
 
-impl From<VAddr> for VRange {
-    fn from(addr: VAddr) -> Self {
-        VRange::new(addr, addr)
+impl VAddrExt for VAddr {
+    fn is_user(&self) -> bool {
+        (..USER_SPACE_MEMORY_TOP).contains(&self)
     }
 }
 
-impl VRange {
-    pub fn new(start: VAddr, end: VAddr) -> Self {
-        assert!(start <= end);
-        VRange { start, end }
-    }
-
-    #[allow(dead_code)]
-    pub fn is_overlapped(&self, other: &Self) -> bool {
-        self == other
-    }
-
-    pub fn is_user(&self) -> bool {
-        self.start < USER_SPACE_MEMORY_TOP && self.end <= USER_SPACE_MEMORY_TOP
-    }
-
-    pub fn start(&self) -> VAddr {
-        self.start
-    }
-
-    pub fn end(&self) -> VAddr {
-        self.end
-    }
-
-    pub fn len(&self) -> usize {
-        self.end.0 - self.start.0
-    }
-
-    pub fn shrink(&self, count: usize) -> Self {
-        assert!(count <= self.len());
-        VRange::new(self.start, self.end - count)
-    }
-
-    pub fn grow(&self, count: usize) -> Self {
-        VRange::new(self.start, self.end + count)
+impl VRangeExt for VRange {
+    fn is_user(&self) -> bool {
+        !(self.end() > USER_SPACE_MEMORY_TOP || self.start() >= USER_SPACE_MEMORY_TOP)
     }
 
-    pub fn into_range(self) -> impl RangeBounds<Self> {
-        if self.len() == 0 {
-            VRange::from(self.start())..=VRange::from(self.start())
-        } else {
-            VRange::from(self.start())..=VRange::from(self.end() - 1)
-        }
+    fn is_kernel(&self) -> bool {
+        self.start() >= KERNEL_SPACE_MEMORY_BOTTOM
     }
 }
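
The two constants above encode the user/kernel split of the canonical x86_64 address space: user addresses lie below `0x8000_0000_0000`, kernel addresses start at `0xffff_8000_0000_0000`. A stand-alone sketch of the checks on raw `usize` addresses (illustrative only):

```rust
const USER_SPACE_MEMORY_TOP: usize = 0x8000_0000_0000;
const KERNEL_SPACE_MEMORY_BOTTOM: usize = 0xffff_8000_0000_0000;

fn is_user(addr: usize) -> bool {
    addr < USER_SPACE_MEMORY_TOP
}

fn is_kernel(addr: usize) -> bool {
    addr >= KERNEL_SPACE_MEMORY_BOTTOM
}

fn main() {
    assert!(is_user(0x1234_0000));
    assert!(!is_user(USER_SPACE_MEMORY_TOP));
    assert!(is_kernel(0xffff_ff00_0000_1000)); // inside the physical direct map
    assert!(!is_kernel(0x1234_0000));
}
```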

+ 60 - 56
src/kernel/mem/mm_area.rs

@@ -1,12 +1,10 @@
-use crate::prelude::*;
-
-use bindings::PA_MMAP;
-
+use super::paging::AllocZeroed as _;
+use super::{AsMemoryBlock, Mapping, Page, Permission};
+use crate::io::ByteBuffer;
+use crate::KResult;
 use core::{borrow::Borrow, cell::UnsafeCell, cmp::Ordering};
-
-use crate::bindings::root::{PA_A, PA_ANON, PA_COW, PA_P, PA_RW};
-
-use super::{Mapping, Page, PageBuffer, Permission, VAddr, VRange, PTE};
+use eonix_mm::address::{AddrOps as _, VAddr, VRange};
+use eonix_mm::page_table::{PageAttribute, PTE};
 
 #[derive(Debug)]
 pub struct MMArea {
@@ -44,11 +42,6 @@ impl MMArea {
         *self.range_borrow()
     }
 
-    #[allow(dead_code)]
-    pub fn len(&self) -> usize {
-        self.range_borrow().len()
-    }
-
     /// # Safety
     /// This function should be called only when we can guarantee that the range
     /// won't overlap with any other range in some scope.
@@ -58,7 +51,7 @@ impl MMArea {
     }
 
     pub fn split(mut self, at: VAddr) -> (Option<Self>, Option<Self>) {
-        assert_eq!(at.floor(), at);
+        assert!(at.is_page_aligned());
 
         match self.range_borrow().cmp(&VRange::from(at)) {
             Ordering::Less => (Some(self), None),
@@ -86,83 +79,94 @@ impl MMArea {
 
     /// # Return
     /// Whether the whole handling process is done.
-    pub fn handle_cow(&self, pte: &mut PTE) -> bool {
-        let mut attributes = pte.attributes();
-        let mut pfn = pte.pfn();
+    pub fn handle_cow(&self, pte: &mut impl PTE) -> bool {
+        let mut page_attr = pte.get_attr();
+        let pfn = pte.get_pfn();
 
-        attributes &= !PA_COW as usize;
-        if self.permission.write {
-            attributes |= PA_RW as usize;
-        } else {
-            attributes &= !PA_RW as usize;
-        }
+        page_attr = page_attr.copy_on_write(false);
+        page_attr = page_attr.write(self.permission.write);
 
-        let page = unsafe { Page::take_pfn(pfn, 0) };
-        if unsafe { page.load_refcount() } == 1 {
+        let page = unsafe { Page::from_raw(pfn) };
+        if page.is_exclusive() {
             // SAFETY: This is actually safe. If we read `1` here while holding the `MMList`
             // lock, there can be neither other processes sharing the page nor other
             // threads making the page COW at the same time.
-            pte.set_attributes(attributes);
+            pte.set_attr(page_attr);
             core::mem::forget(page);
             return true;
         }
 
-        let new_page = Page::alloc_one();
-        if attributes & PA_ANON as usize != 0 {
-            new_page.zero();
+        let new_page;
+        if page_attr.is_anonymous() {
+            new_page = Page::zeroed();
         } else {
-            new_page.as_mut_slice().copy_from_slice(page.as_slice());
+            new_page = Page::alloc();
+
+            unsafe {
+                // SAFETY: `page` is CoW, which means that others won't write to it.
+                let old_page_data = page.as_memblk().as_bytes();
+
+                // SAFETY: `new_page` is exclusive owned by us.
+                let new_page_data = new_page.as_memblk().as_bytes_mut();
+
+                new_page_data.copy_from_slice(old_page_data);
+            };
         }
 
-        attributes &= !(PA_A | PA_ANON) as usize;
+        page_attr = page_attr.accessed(false);
+        page_attr = page_attr.anonymous(false);
 
-        pfn = new_page.into_pfn();
-        pte.set(pfn, attributes);
+        pte.set(new_page.into_raw(), page_attr);
 
         false
     }
 
     /// # Arguments
     /// * `offset`: The offset from the start of the mapping, aligned to 4KB boundary.
-    pub fn handle_mmap(&self, pte: &mut PTE, offset: usize) -> KResult<()> {
+    pub fn handle_mmap(&self, pte: &mut impl PTE, offset: usize) -> KResult<()> {
         // TODO: Implement shared mapping
-        let mut attributes = pte.attributes();
-        let pfn = pte.pfn();
-
-        attributes |= PA_P as usize;
+        let mut page_attr = pte.get_attr();
+        let pfn = pte.get_pfn();
 
         match &self.mapping {
             Mapping::File(mapping) if offset < mapping.length => {
-                // SAFETY: Since we are here, the `pfn` must refer to a valid buddy page.
-                let page = unsafe { Page::from_pfn(pfn, 0) };
-                let nread = mapping
-                    .file
-                    .read(&mut PageBuffer::new(page.clone()), mapping.offset + offset)?;
-
-                if nread < page.len() {
-                    page.as_mut_slice()[nread..].fill(0);
-                }
+                let page = unsafe {
+                    // SAFETY: Since we are here, the `pfn` must refer to a valid buddy page.
+                    Page::with_raw(pfn, |page| page.clone())
+                };
 
-                if mapping.length - offset < 0x1000 {
-                    let length_to_end = mapping.length - offset;
-                    page.as_mut_slice()[length_to_end..].fill(0);
-                }
+                let page_data = unsafe {
+                    // SAFETY: `page` is marked as mapped, so others trying to read or write to
+                    //         it will be blocked and enter the page fault handler, where they will
+                    //         be blocked by the mutex held by us.
+                    page.as_memblk().as_bytes_mut()
+                };
+
+                let cnt_to_read = (mapping.length - offset).min(0x1000);
+                let cnt_read = mapping.file.read(
+                    &mut ByteBuffer::new(&mut page_data[..cnt_to_read]),
+                    mapping.offset + offset,
+                )?;
+
+                page_data[cnt_read..].fill(0);
             }
             Mapping::File(_) => panic!("Offset out of range"),
             _ => panic!("Anonymous mapping should not be PA_MMAP"),
         }
 
-        attributes &= !PA_MMAP as usize;
-        pte.set_attributes(attributes);
+        page_attr = page_attr.present(true).mapped(false);
+        pte.set_attr(page_attr);
         Ok(())
     }
 
-    pub fn handle(&self, pte: &mut PTE, offset: usize) -> KResult<()> {
-        if pte.is_cow() {
+    pub fn handle(&self, pte: &mut impl PTE, offset: usize) -> KResult<()> {
+        let page_attr = pte.get_attr();
+
+        if page_attr.is_copy_on_write() {
             self.handle_cow(pte);
         }
 
-        if pte.is_mmap() {
+        if page_attr.is_mapped() {
             self.handle_mmap(pte, offset)?;
         }
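
`handle_cow` above writes in place when the faulting page is exclusively owned and copies into a fresh page otherwise. An illustrative user-space analogue where `Arc::get_mut` stands in for the page refcount check (not the kernel's actual types):

```rust
use std::sync::Arc;

// Copy-on-write decision modeled on `handle_cow`: exclusive owners mutate in
// place; shared buffers are cloned into a private copy before the write.
fn write_byte(buf: &mut Arc<Vec<u8>>, index: usize, value: u8) {
    if let Some(exclusive) = Arc::get_mut(buf) {
        // Sole owner: no copy needed (the kernel just clears the CoW bit
        // and restores write permission here).
        exclusive[index] = value;
    } else {
        // Shared: copy the contents, then write to the private copy.
        let mut private = (**buf).clone();
        private[index] = value;
        *buf = Arc::new(private);
    }
}

fn main() {
    let mut a = Arc::new(vec![0u8; 4]);
    let b = Arc::clone(&a);
    write_byte(&mut a, 0, 42); // still shared with `b`, so a copy is made
    assert_eq!((a[0], b[0]), (42, 0));
    write_byte(&mut a, 1, 7); // now exclusive, written in place
    assert_eq!(a[1], 7);
}
```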
 

+ 318 - 145
src/kernel/mem/mm_list.rs

@@ -1,27 +1,36 @@
+mod mapping;
 mod page_fault;
 
-use super::{MMArea, Page, PageTable, VAddr, VRange};
-use crate::kernel::vfs::dentry::Dentry;
+use super::access::KernelPageAccess;
+use super::address::{VAddrExt as _, VRangeExt as _};
+use super::page_alloc::GlobalPageAlloc;
+use super::paging::{AllocZeroed as _, PageUnmanaged};
+use super::{AsMemoryBlock, MMArea, Page};
 use crate::{prelude::*, sync::ArcSwap};
-use alloc::{collections::btree_set::BTreeSet, sync::Arc};
-use bindings::{EEXIST, EFAULT, EINVAL, ENOMEM, KERNEL_PML4};
-use core::{
-    ops::Sub as _,
-    sync::atomic::{AtomicUsize, Ordering},
+use alloc::collections::btree_set::BTreeSet;
+use arch::DefaultPagingMode;
+use bindings::{EEXIST, EFAULT, EINVAL, ENOMEM};
+use core::fmt;
+use core::sync::atomic::{AtomicUsize, Ordering};
+use eonix_mm::address::{Addr as _, PAddr};
+use eonix_mm::page_table::PagingMode;
+use eonix_mm::paging::PFN;
+use eonix_mm::{
+    address::{AddrOps as _, VAddr, VRange},
+    page_table::{PageAttribute, PageTable, PTE},
+    paging::PAGE_SIZE,
 };
 use eonix_runtime::task::Task;
-use eonix_sync::Mutex;
+use eonix_sync::{LazyLock, Mutex};
 
+pub use mapping::{FileMapping, Mapping};
 pub use page_fault::handle_page_fault;
 
-#[derive(Debug, Clone)]
-pub struct FileMapping {
-    pub file: Arc<Dentry>,
-    /// Offset in the file, aligned to 4KB boundary.
-    pub offset: usize,
-    /// Length of the mapping. Exceeding part will be zeroed.
-    pub length: usize,
-}
+static EMPTY_PAGE: LazyLock<Page> = LazyLock::new(|| Page::zeroed());
+static KERNEL_ROOT_TABLE_PAGE: LazyLock<PageUnmanaged> = LazyLock::new(|| unsafe {
+    // SAFETY: The kernel page table is always valid.
+    PageUnmanaged::from_raw_unchecked(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN)
+});
 
 #[derive(Debug, Clone, Copy)]
 pub struct Permission {
@@ -29,51 +38,21 @@ pub struct Permission {
     pub execute: bool,
 }
 
-#[derive(Debug, Clone)]
-pub enum Mapping {
-    Anonymous,
-    File(FileMapping),
-}
-
-#[derive(Debug)]
-struct MMListInner {
+struct MMListInner<'a> {
     areas: BTreeSet<MMArea>,
-    page_table: PageTable,
+    page_table: PageTable<'a, DefaultPagingMode, GlobalPageAlloc, KernelPageAccess>,
     break_start: Option<VRange>,
     break_pos: Option<VAddr>,
 }
 
-#[derive(Debug)]
 pub struct MMList {
-    inner: ArcSwap<Mutex<MMListInner>>,
+    inner: ArcSwap<Mutex<MMListInner<'static>>>,
+    user_count: AtomicUsize,
     /// Only used in kernel space to switch page tables on context switch.
     root_page_table: AtomicUsize,
 }
 
-impl FileMapping {
-    pub fn new(file: Arc<Dentry>, offset: usize, length: usize) -> Self {
-        assert_eq!(offset & 0xfff, 0);
-        Self {
-            file,
-            offset,
-            length,
-        }
-    }
-
-    pub fn offset(&self, offset: usize) -> Self {
-        if self.length <= offset {
-            Self::new(self.file.clone(), self.offset + self.length, 0)
-        } else {
-            Self::new(
-                self.file.clone(),
-                self.offset + offset,
-                self.length - offset,
-            )
-        }
-    }
-}
-
-impl MMListInner {
+impl MMListInner<'_> {
     fn overlapping_addr(&self, addr: VAddr) -> Option<&MMArea> {
         self.areas.get(&VRange::from(addr))
     }
@@ -83,20 +62,27 @@ impl MMListInner {
     }
 
     fn overlapping_range(&self, range: VRange) -> impl DoubleEndedIterator<Item = &MMArea> + '_ {
-        self.areas.range(range.into_range())
+        self.areas.range(range.into_bounds())
     }
 
     fn check_overlapping_range(&self, range: VRange) -> bool {
         range.is_user() && self.overlapping_range(range).next().is_none()
     }
 
-    fn find_available(&self, hint: VAddr, len: usize) -> Option<VAddr> {
-        let mut range = if hint == VAddr::NULL {
-            VRange::new(VAddr(0x1234000), VAddr(0x1234000 + len).ceil())
+    fn random_start(&self) -> VAddr {
+        VAddr::from(0x1234000)
+    }
+
+    fn find_available(&self, mut hint: VAddr, len: usize) -> Option<VAddr> {
+        let len = len.div_ceil(PAGE_SIZE) * PAGE_SIZE;
+
+        if hint == VAddr::NULL {
+            hint = self.random_start();
         } else {
-            VRange::new(hint.floor(), (hint + len).ceil())
-        };
-        let len = range.len();
+            hint = hint.floor();
+        }
+
+        let mut range = VRange::from(hint).grow(len);
 
         loop {
             if !range.is_user() {
@@ -106,63 +92,85 @@ impl MMListInner {
             match self.overlapping_range(range).next_back() {
                 None => return Some(range.start()),
                 Some(area) => {
-                    range = VRange::new(area.range().end().ceil(), area.range().end().ceil() + len);
+                    range = VRange::from(area.range().end().ceil()).grow(len);
                 }
             }
         }
     }
 
-    fn unmap(&mut self, start: VAddr, len: usize) -> KResult<()> {
+    fn unmap(&mut self, start: VAddr, len: usize) -> KResult<Vec<Page>> {
         assert_eq!(start.floor(), start);
         let end = (start + len).ceil();
-        let range = VRange::new(start, end);
-        if !range.is_user() {
+        let range_to_unmap = VRange::new(start, end);
+        if !range_to_unmap.is_user() {
             return Err(EINVAL);
         }
 
-        let check_range = VRange::from(range.start())..VRange::from(range.end());
-        let mut front_remaining = None;
-        let mut back_remaining = None;
+        let mut left_remaining = None;
+        let mut right_remaining = None;
+
+        let mut pages_to_free = Vec::new();
+
+        // TODO: Write back dirty pages.
 
         self.areas.retain(|area| {
-            if !check_range.contains(&area.range()) {
+            let Some((left, mid, right)) = area.range().mask_with_checked(&range_to_unmap) else {
                 return true;
+            };
+
+            for pte in self.page_table.iter_user(mid) {
+                let (pfn, _) = pte.take();
+                pages_to_free.push(unsafe {
+                    // SAFETY: We got the pfn from a valid page table entry, so it should be valid.
+                    Page::from_raw(pfn)
+                });
             }
-            if area.range() == range.start().into() {
-                let (left, right) = area.clone().split(range.start());
-                self.page_table.unmap(&right.unwrap());
-
-                if let Some(left) = left {
-                    assert!(
-                        front_remaining.replace(left).is_none(),
-                        "There should be only one `front`."
-                    );
+
+            match (left, right) {
+                (None, None) => {}
+                (Some(left), None) => {
+                    assert!(left_remaining.is_none());
+                    let (Some(left), _) = area.clone().split(left.end()) else {
+                        unreachable!("`left.end()` is within the area");
+                    };
+
+                    left_remaining = Some(left);
+                }
+                (None, Some(right)) => {
+                    assert!(right_remaining.is_none());
+                    let (_, Some(right)) = area.clone().split(right.start()) else {
+                        unreachable!("`right.start()` is within the area");
+                    };
+
+                    right_remaining = Some(right);
+                }
+                (Some(left), Some(right)) => {
+                    assert!(left_remaining.is_none());
+                    assert!(right_remaining.is_none());
+                    let (Some(left), Some(mid)) = area.clone().split(left.end()) else {
+                        unreachable!("`left.end()` is within the area");
+                    };
+
+                    let (_, Some(right)) = mid.split(right.start()) else {
+                        unreachable!("`right.start()` is within the area");
+                    };
+
+                    left_remaining = Some(left);
+                    right_remaining = Some(right);
                 }
-            } else if area.range() == range.end().into() {
-                let (left, right) = area.clone().split(range.end());
-                self.page_table.unmap(&left.unwrap());
-
-                assert!(
-                    back_remaining
-                        .replace(right.expect("`right` should be valid"))
-                        .is_none(),
-                    "There should be only one `back`."
-                );
-            } else {
-                self.page_table.unmap(area);
             }
 
             false
         });
 
-        if let Some(front) = front_remaining {
+        if let Some(front) = left_remaining {
             self.areas.insert(front);
         }
-        if let Some(back) = back_remaining {
+        if let Some(back) = right_remaining {
             self.areas.insert(back);
         }
 
-        Ok(())
+        Ok(pages_to_free)
     }
 
     fn mmap(
@@ -192,10 +200,36 @@ impl MMListInner {
 }
 
 impl MMList {
+    async fn flush_user_tlbs(&self) {
+        match self.user_count.load(Ordering::Relaxed) {
+            0 => {
+                // If there are currently no users, we don't need to do anything.
+            }
+            1 => {
+                if PAddr::from(arch::get_root_page_table_pfn()).addr()
+                    == self.root_page_table.load(Ordering::Relaxed)
+                {
+                    // If there is only one user and we are the one using the page
+                    // table, flushing the TLB on the local CPU is enough.
+                    arch::flush_tlb_all();
+                } else {
+                    // Send a TLB flush request to the core that is using the page table.
+                    todo!();
+                }
+            }
+            _ => {
+                // If there is more than one user, broadcast the TLB flush
+                // to all cores.
+                todo!()
+            }
+        }
+    }
+
     pub fn new() -> Self {
-        let page_table = PageTable::new();
+        let page_table = PageTable::new(&KERNEL_ROOT_TABLE_PAGE);
         Self {
-            root_page_table: AtomicUsize::from(page_table.root_page_table()),
+            root_page_table: AtomicUsize::from(page_table.addr().addr()),
+            user_count: AtomicUsize::new(0),
             inner: ArcSwap::new(Mutex::new(MMListInner {
                 areas: BTreeSet::new(),
                 page_table,
@@ -205,13 +239,14 @@ impl MMList {
         }
     }
 
-    pub fn new_cloned(&self) -> Self {
+    pub async fn new_cloned(&self) -> Self {
         let inner = self.inner.borrow();
-        let inner = Task::block_on(inner.lock());
+        let inner = inner.lock().await;
 
-        let page_table = PageTable::new();
+        let page_table = PageTable::new(&KERNEL_ROOT_TABLE_PAGE);
         let list = Self {
-            root_page_table: AtomicUsize::from(page_table.root_page_table()),
+            root_page_table: AtomicUsize::from(page_table.addr().addr()),
+            user_count: AtomicUsize::new(0),
             inner: ArcSwap::new(Mutex::new(MMListInner {
                 areas: inner.areas.clone(),
                 page_table,
@@ -222,68 +257,111 @@ impl MMList {
 
         {
             let list_inner = list.inner.borrow();
-            let list_inner = Task::block_on(list_inner.lock());
+            let list_inner = list_inner.lock().await;
 
             for area in list_inner.areas.iter() {
-                let new_iter = list_inner.page_table.iter_user(area.range()).unwrap();
-                let old_iter = inner.page_table.iter_user(area.range()).unwrap();
-
-                for (new, old) in new_iter.zip(old_iter) {
-                    new.setup_cow(old);
-                }
+                list_inner
+                    .page_table
+                    .set_copy_on_write(&inner.page_table, area.range());
             }
         }
 
-        // We set some pages as COW, so we need to invalidate TLB.
-        inner.page_table.lazy_invalidate_tlb_all();
+        // We've set some pages as CoW, so we need to invalidate all our users' TLB.
+        self.flush_user_tlbs().await;
 
         list
     }
 
-    pub fn switch_page_table(&self) {
+    pub fn activate(&self) {
+        self.user_count.fetch_add(1, Ordering::Acquire);
+
         let root_page_table = self.root_page_table.load(Ordering::Relaxed);
         assert_ne!(root_page_table, 0);
-        arch::set_root_page_table(root_page_table);
+        arch::set_root_page_table_pfn(PFN::from(PAddr::from(root_page_table)));
     }
 
-    pub fn replace(&self, new: Self) {
-        // Switch to kernel page table in case we are using the page table to be swapped and released.
-        let mut switched = false;
-        if arch::get_root_page_table() == self.root_page_table.load(Ordering::Relaxed) {
-            arch::set_root_page_table(KERNEL_PML4 as usize);
-            switched = true;
-        }
+    pub fn deactivate(&self) {
+        arch::set_root_page_table_pfn(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN);
 
-        unsafe {
-            // SAFETY: Even if we're using the page table, we've switched to kernel page table.
-            // So it's safe to release the old memory list.
-            self.release();
-        }
+        let old_user_count = self.user_count.fetch_sub(1, Ordering::Release);
+        assert_ne!(old_user_count, 0);
+    }
 
-        // SAFETY: `self.inner` should be `None` after releasing.
-        self.inner.swap(Some(new.inner.borrow().clone()));
-        self.root_page_table.store(
-            new.root_page_table.load(Ordering::Relaxed),
-            Ordering::Relaxed,
-        );
+    /// Deactivate `self` and activate `to`, changing the root page table only once.
+    /// This avoids the overhead of switching page tables twice.
+    #[allow(dead_code)]
+    pub fn switch(&self, to: &Self) {
+        to.user_count.fetch_add(1, Ordering::Acquire);
 
-        if switched {
-            self.switch_page_table();
-        }
+        let root_page_table = to.root_page_table.load(Ordering::Relaxed);
+        assert_ne!(root_page_table, 0);
+        arch::set_root_page_table_pfn(PFN::from(PAddr::from(root_page_table)));
+
+        let old_user_count = self.user_count.fetch_sub(1, Ordering::Release);
+        assert_ne!(old_user_count, 0);
     }
 
+    /// Replace the current memory mapping list and its page table with a new one.
+    ///
     /// # Safety
-    /// This function is unsafe because the caller should make sure that the `inner` is not currently used.
-    pub unsafe fn release(&self) {
-        // TODO: Check whether we should wake someone up if they've been put to sleep when calling `vfork`.
-        self.inner.swap(None);
+    /// This function should be called only when we are sure that the `MMList` is not
+    /// being used by any other thread.
+    pub unsafe fn replace(&self, new: Option<Self>) {
+        eonix_preempt::disable();
+
+        assert_eq!(
+            self.user_count.load(Ordering::Relaxed),
+            1,
+            "We should be the only user"
+        );
+
+        assert_eq!(
+            new.as_ref()
+                .map(|new_mm| new_mm.user_count.load(Ordering::Relaxed))
+                .unwrap_or(0),
+            0,
+            "`new` must not be used by anyone"
+        );
+
+        let old_root_page_table = self.root_page_table.load(Ordering::Relaxed);
+        let current_root_page_table = arch::get_root_page_table_pfn();
+        assert_eq!(
+            PAddr::from(current_root_page_table).addr(),
+            old_root_page_table,
+            "We should be the only user"
+        );
+
+        let new_root_page_table = match &new {
+            Some(new_mm) => new_mm.root_page_table.load(Ordering::Relaxed),
+            None => PAddr::from(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN).addr(),
+        };
+
+        arch::set_root_page_table_pfn(PFN::from(PAddr::from(new_root_page_table)));
+
         self.root_page_table
-            .swap(KERNEL_PML4 as _, Ordering::Relaxed);
+            .store(new_root_page_table, Ordering::Relaxed);
+
+        // TODO: Check whether we should wake someone up if they've been put
+        //       to sleep when calling `vfork`.
+        self.inner
+            .swap(new.map(|new_mm| new_mm.inner.swap(None)).flatten());
+
+        eonix_preempt::enable();
     }
 
     /// No need to do invalidation manually, `PageTable` already does it.
-    pub fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
-        Task::block_on(self.inner.borrow().lock()).unmap(start, len)
+    pub async fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
+        let pages_to_free = self.inner.borrow().lock().await.unmap(start, len)?;
+
+        // We need to ensure that the pages are no longer accessed.
+        // Anyone still holding these pages in their TLB could read from or write
+        // to them, so flush the TLBs of all our users first.
+        self.flush_user_tlbs().await;
+
+        // Then free the pages.
+        drop(pages_to_free);
+
+        Ok(())
     }
 
     pub fn mmap_hint(
@@ -359,17 +437,19 @@ impl MMList {
             .get(&break_start)
             .expect("Program break area should be valid");
 
-        let len: usize = pos.sub(current_break);
+        let len = pos - current_break;
+        let range_to_grow = VRange::from(program_break.range().end()).grow(len);
+
+        program_break.grow(len);
+
         inner.page_table.set_anonymous(
-            VRange::from(program_break.range().end()).grow(len),
+            range_to_grow,
             Permission {
                 write: true,
                 execute: false,
             },
         );
 
-        program_break.grow(len);
-
         inner.break_pos = Some(pos);
         pos
     }
@@ -415,7 +495,7 @@ impl MMList {
 
             for (idx, pte) in inner
                 .page_table
-                .iter_user(VRange::new(current, access_end))?
+                .iter_user(VRange::new(current, access_end))
                 .enumerate()
             {
                 let page_start = current.floor() + idx * 0x1000;
@@ -438,11 +518,15 @@ impl MMList {
                 }
 
                 unsafe {
-                    let page = Page::from_pfn(pte.pfn(), 0);
-                    func(
-                        offset + idx * 0x1000,
-                        &mut page.as_mut_slice()[start_offset..end_offset],
-                    );
+                    // SAFETY: We are sure that the page is valid and we have the right to access it.
+                    Page::with_raw(pte.get_pfn(), |page| {
+                        // SAFETY: The caller guarantees that no one else is using the page.
+                        let page_data = page.as_memblk().as_bytes_mut();
+                        func(
+                            offset + idx * 0x1000,
+                            &mut page_data[start_offset..end_offset],
+                        );
+                    });
                 }
             }
 
@@ -454,3 +538,92 @@ impl MMList {
         Ok(())
     }
 }
+
+impl fmt::Debug for MMList {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("MMList").finish()
+    }
+}
+
+trait PageTableExt {
+    fn set_anonymous(&self, range: VRange, permission: Permission);
+    fn set_mmapped(&self, range: VRange, permission: Permission);
+    fn set_copy_on_write(&self, from: &Self, range: VRange);
+}
+
+impl PageTableExt for PageTable<'_, DefaultPagingMode, GlobalPageAlloc, KernelPageAccess> {
+    fn set_anonymous(&self, range: VRange, permission: Permission) {
+        for pte in self.iter_user(range) {
+            pte.set_anonymous(permission.execute);
+        }
+    }
+
+    fn set_mmapped(&self, range: VRange, permission: Permission) {
+        for pte in self.iter_user(range) {
+            pte.set_mapped(permission.execute);
+        }
+    }
+
+    fn set_copy_on_write(&self, from: &Self, range: VRange) {
+        let to_iter = self.iter_user(range);
+        let from_iter = from.iter_user(range);
+
+        for (to, from) in to_iter.zip(from_iter) {
+            to.set_copy_on_write(from);
+        }
+    }
+}
+
+trait PTEExt {
+    fn set_anonymous(&mut self, execute: bool);
+    fn set_mapped(&mut self, execute: bool);
+    fn set_copy_on_write(&mut self, from: &mut Self);
+}
+
+impl<T> PTEExt for T
+where
+    T: PTE,
+{
+    fn set_anonymous(&mut self, execute: bool) {
+        // The writable flag is set during page fault handling, while the executable
+        // flag is preserved across page faults, so we set the executable flag now.
+        let attr = <Self as PTE>::Attr::new()
+            .present(true)
+            .user(true)
+            .copy_on_write(true)
+            .anonymous(true)
+            .execute(execute);
+
+        self.set(EMPTY_PAGE.clone().into_raw(), attr);
+    }
+
+    fn set_mapped(&mut self, execute: bool) {
+        // The writable flag is set during page fault handling, while the executable
+        // flag is preserved across page faults, so we set the executable flag now.
+        let attr = <Self as PTE>::Attr::new()
+            .user(true)
+            .copy_on_write(true)
+            .anonymous(true)
+            .mapped(true)
+            .execute(execute);
+
+        self.set(EMPTY_PAGE.clone().into_raw(), attr);
+    }
+
+    fn set_copy_on_write(&mut self, from: &mut Self) {
+        let mut from_attr = from.get_attr();
+        if !from_attr.is_present() {
+            return;
+        }
+
+        from_attr = from_attr.write(false).copy_on_write(true);
+
+        let pfn = unsafe {
+            // SAFETY: We get the pfn from a valid page table entry, so it should be valid as well.
+            Page::with_raw(from.get_pfn(), |page| page.clone().into_raw())
+        };
+
+        self.set(pfn, from_attr.accessed(false));
+        from.set_attr(from_attr);
+    }
+}
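
For reference, the splitting that the new `unmap` relies on can be illustrated in isolation. In the sketch below, `mask_with` is a hypothetical stand-in for `VRange::mask_with_checked` (not the actual API), operating on plain `Range<usize>` values: it splits an area into a kept left part, the unmapped middle, and a kept right part.

```rust
use core::ops::Range;

/// Illustrative stand-in for `VRange::mask_with_checked`: intersect `area` with
/// `unmap`, returning the kept left part, the removed middle, and the kept
/// right part, or `None` if the two ranges do not overlap at all.
fn mask_with(
    area: &Range<usize>,
    unmap: &Range<usize>,
) -> Option<(Option<Range<usize>>, Range<usize>, Option<Range<usize>>)> {
    let mid_start = area.start.max(unmap.start);
    let mid_end = area.end.min(unmap.end);
    if mid_start >= mid_end {
        return None; // No overlap: the area is kept as a whole.
    }

    let left = (area.start < mid_start).then(|| area.start..mid_start);
    let right = (mid_end < area.end).then(|| mid_end..area.end);
    Some((left, mid_start..mid_end, right))
}

fn main() {
    // Unmapping [0x3000, 0x5000) out of an area [0x2000, 0x7000) keeps
    // [0x2000, 0x3000) on the left and [0x5000, 0x7000) on the right.
    let (left, mid, right) = mask_with(&(0x2000..0x7000), &(0x3000..0x5000)).unwrap();
    assert_eq!(left, Some(0x2000..0x3000));
    assert_eq!(mid, 0x3000..0x5000);
    assert_eq!(right, Some(0x5000..0x7000));
}
```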

+ 39 - 0
src/kernel/mem/mm_list/mapping.rs

@@ -0,0 +1,39 @@
+use crate::kernel::vfs::dentry::Dentry;
+use alloc::sync::Arc;
+
+#[derive(Debug, Clone)]
+pub struct FileMapping {
+    pub file: Arc<Dentry>,
+    /// Offset into the file, aligned to a 4 KiB boundary.
+    pub offset: usize,
+    /// Length of the mapping. The part of the mapped region beyond this length is zero-filled.
+    pub length: usize,
+}
+#[derive(Debug, Clone)]
+pub enum Mapping {
+    Anonymous,
+    File(FileMapping),
+}
+
+impl FileMapping {
+    pub fn new(file: Arc<Dentry>, offset: usize, length: usize) -> Self {
+        assert_eq!(offset & 0xfff, 0);
+        Self {
+            file,
+            offset,
+            length,
+        }
+    }
+
+    pub fn offset(&self, offset: usize) -> Self {
+        if self.length <= offset {
+            Self::new(self.file.clone(), self.offset + self.length, 0)
+        } else {
+            Self::new(
+                self.file.clone(),
+                self.offset + offset,
+                self.length - offset,
+            )
+        }
+    }
+}
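
The `offset` rule above can be exercised on its own. The sketch below uses a simplified struct without the `file: Arc<Dentry>` field but keeps the same offset/length arithmetic; it is illustrative only.

```rust
/// Simplified stand-in for `FileMapping` (the `file` field is omitted) that
/// keeps the same `offset` semantics as the code above.
#[derive(Debug, Clone, PartialEq)]
struct FileMappingSketch {
    offset: usize,
    length: usize,
}

impl FileMappingSketch {
    fn offset(&self, offset: usize) -> Self {
        if self.length <= offset {
            // Offsetting past the mapped length degenerates to an empty mapping
            // placed right after the original one.
            Self { offset: self.offset + self.length, length: 0 }
        } else {
            Self { offset: self.offset + offset, length: self.length - offset }
        }
    }
}

fn main() {
    let mapping = FileMappingSketch { offset: 0x1000, length: 0x3000 };

    // Skipping one page into the mapping shrinks the remaining length.
    assert_eq!(
        mapping.offset(0x1000),
        FileMappingSketch { offset: 0x2000, length: 0x2000 }
    );

    // Skipping past the mapped length yields a zero-length mapping.
    assert_eq!(
        mapping.offset(0x4000),
        FileMappingSketch { offset: 0x4000, length: 0 }
    );
}
```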

+ 9 - 8
src/kernel/mem/mm_list/page_fault.rs

@@ -1,9 +1,11 @@
 use super::{MMList, VAddr};
-use crate::kernel::mem::{Mapping, VRange};
+use crate::kernel::mem::Mapping;
 use crate::kernel::task::{ProcessList, Signal, Thread};
 use crate::prelude::*;
 use arch::InterruptContext;
 use bitflags::bitflags;
+use eonix_mm::address::{AddrOps as _, VRange};
+use eonix_mm::paging::PAGE_SIZE;
 use eonix_runtime::task::Task;
 
 bitflags! {
@@ -61,8 +63,7 @@ impl MMList {
 
         let pte = inner
             .page_table
-            .iter_user(VRange::new(addr.floor(), addr.floor() + 0x1000))
-            .unwrap()
+            .iter_user(VRange::from(addr.floor()).grow(PAGE_SIZE))
             .next()
             .expect("If we can find the mapped area, we should be able to find the PTE");
 
@@ -109,14 +110,14 @@ fn try_page_fault_fix(int_stack: &mut InterruptContext, addr: VAddr) {
 
 fn kernel_page_fault_die(vaddr: VAddr, ip: usize) -> ! {
     panic!(
-        "Invalid kernel mode memory access to {:#8x} while executing the instruction at {:#8x}",
-        vaddr.0, ip
+        "Invalid kernel mode memory access to {:?} while executing the instruction at {:#8x}",
+        vaddr, ip
     )
 }
 
 pub fn handle_page_fault(int_stack: &mut InterruptContext) {
     let error = PageFaultError::from_bits_truncate(int_stack.error_code);
-    let vaddr = VAddr(arch::get_page_fault_address());
+    let vaddr = arch::get_page_fault_address();
 
     let result = Thread::current()
         .process
@@ -125,8 +126,8 @@ pub fn handle_page_fault(int_stack: &mut InterruptContext) {
 
     if let Err(signal) = result {
         println_debug!(
-            "Page fault on {:#x} in user space at {:#x}",
-            vaddr.0,
+            "Page fault on {:?} in user space at {:#x}",
+            vaddr,
             int_stack.rip
         );
         ProcessList::kill_current(signal)
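The handler above decodes `int_stack.error_code` with `PageFaultError::from_bits_truncate`; the flag definitions themselves live outside this hunk. The sketch below decodes the architectural x86 page-fault error-code bits, which the kernel's `PageFaultError` presumably mirrors.

```rust
use bitflags::bitflags;

bitflags! {
    /// The architectural x86 page-fault error-code bits (illustrative sketch,
    /// not the kernel's actual `PageFaultError` definition).
    struct PfError: u64 {
        const PRESENT     = 1 << 0; // Fault on a present page (protection violation).
        const WRITE       = 1 << 1; // The faulting access was a write.
        const USER        = 1 << 2; // The fault occurred in user mode.
        const INSTRUCTION = 1 << 4; // The fault was caused by an instruction fetch.
    }
}

fn main() {
    // A user-mode write to a present page, e.g. a copy-on-write fault.
    let error = PfError::from_bits_truncate(0b111);
    assert!(error.contains(PfError::PRESENT | PfError::WRITE | PfError::USER));
    assert!(!error.contains(PfError::INSTRUCTION));
}
```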

+ 92 - 403
src/kernel/mem/page_alloc.rs

@@ -1,483 +1,172 @@
-use super::address::{PAddr, PFN};
-use crate::intrusive_list::Link;
-use crate::{container_of, prelude::*};
-use bitflags::bitflags;
-use core::sync::atomic::Ordering;
-use core::{ptr::NonNull, sync::atomic::AtomicU32};
-
-const MAX_PAGE_ORDER: u32 = 10;
-const PAGE_ALLOC_COSTLY_ORDER: u32 = 3;
+use super::{paging::AllocZeroed as _, Page};
+use buddy_allocator::{BuddyAllocator, FreeArea as BuddyFreeArea};
+use core::{ptr::NonNull, sync::atomic::Ordering};
+use eonix_mm::{
+    address::{AddrOps as _, PAddr},
+    paging::{PageAlloc, PageFlags, RawPagePtr, PFN},
+};
+
+const COSTLY_ORDER: u32 = 3;
 const BATCH_SIZE: u32 = 64;
-const PAGE_ARRAY: *mut Page = 0xffffff8040000000 as *mut Page;
-
-pub(super) type PagePtr = Ptr<Page>;
-
-#[repr(transparent)]
-pub struct Ptr<T>(Option<NonNull<T>>);
-
-impl<T> Clone for Ptr<T> {
-    fn clone(&self) -> Self {
-        Self(self.0)
-    }
-}
-
-impl<T> Copy for Ptr<T> {}
-
-impl<T> Ptr<T> {
-    pub const fn new(ptr: Option<NonNull<T>>) -> Self {
-        Self(ptr)
-    }
-
-    pub fn from_raw(ptr: *mut T) -> Self {
-        Self::new(NonNull::new(ptr))
-    }
-
-    pub fn null() -> Self {
-        Self::new(None)
-    }
-
-    pub fn is_none(&self) -> bool {
-        self.0.is_none()
-    }
-
-    pub fn is_some(&self) -> bool {
-        self.0.is_some()
-    }
-
-    pub fn as_ptr(&self) -> *mut T {
-        self.0.unwrap().as_ptr()
-    }
-
-    pub fn as_ref<'a>(&self) -> &'a T {
-        unsafe { &*self.as_ptr() }
-    }
-
-    pub fn as_mut<'a>(&self) -> &'a mut T {
-        unsafe { &mut *self.as_ptr() }
-    }
-}
-
-impl PagePtr {
-    pub unsafe fn increase_refcount(&self) -> u32 {
-        self.as_mut().increase_refcount()
-    }
-
-    pub unsafe fn decrease_refcount(&self) -> u32 {
-        self.as_mut().decrease_refcount()
-    }
-
-    pub unsafe fn load_refcount(&self) -> u32 {
-        self.as_ref().refcount.load(Ordering::Acquire)
-    }
-
-    fn get_order(&self) -> u32 {
-        self.as_ref().order
-    }
 
-    pub fn is_valid(&self, order: u32) -> bool {
-        self.is_some() && self.get_order() == order
-    }
-
-    fn offset(&self, count: usize) -> Self {
-        match self.0 {
-            Some(non_null_ptr) => {
-                let new_raw_ptr = unsafe { non_null_ptr.as_ptr().add(count) };
-                Self::from_raw(new_raw_ptr)
-            }
-            None => Self::null(),
-        }
-    }
-}
-
-impl Into<PFN> for PagePtr {
-    fn into(self) -> PFN {
-        unsafe { PFN::from(self.as_ptr().offset_from(PAGE_ARRAY) as usize) }
-    }
-}
-
-impl From<PFN> for PagePtr {
-    fn from(pfn: PFN) -> Self {
-        unsafe { Self::from_raw(PAGE_ARRAY.add(pfn.0)) }
-    }
-}
-
-bitflags! {
-    // TODO: Use atomic
-    struct PageFlags: usize {
-        const PRESENT = 1 << 0;
-        const LOCKED  = 1 << 1;
-        const BUDDY   = 1 << 2;
-        const SLAB    = 1 << 3;
-        const DIRTY   = 1 << 4;
-        const FREE    = 1 << 5;
-        const LOCAL   = 1 << 6;
-    }
-}
-
-pub(super) struct Page {
-    // Now only used for free page links in the buddy system.
-    // Can be used for LRU page swap in the future.
-    link: Link,
-    flags: PageFlags, // TODO: This should be atomic.
-    /// # Safety
-    /// This field is only used in buddy system, which is protected by the global lock.
-    order: u32,
-    refcount: AtomicU32,
-}
-
-struct FreeArea {
-    free_list: Link,
-    count: usize,
-}
+#[arch::define_percpu]
+static PERCPU_PAGE_ALLOC: PerCpuPageAlloc = PerCpuPageAlloc::new();
 
-/// Safety: `Zone` is `Send` because the `PAGE_ARRAY` is shared between cores.
-unsafe impl Send for Zone {}
-// /// Safety: TODO
-// unsafe impl Sync for Zone {}
+pub struct NoAlloc;
 
-struct Zone {
-    free_areas: [FreeArea; MAX_PAGE_ORDER as usize + 1],
-}
+pub struct GlobalPageAlloc;
 
-struct PerCpuPages {
+struct PerCpuPageAlloc {
     batch: u32,
-    _high: u32, // TODO: use in future
-    free_areas: [FreeArea; PAGE_ALLOC_COSTLY_ORDER as usize + 1],
+    // TODO: might be used in the future.
+    // high: u32,
+    free_areas: [BuddyFreeArea; COSTLY_ORDER as usize + 1],
 }
 
-impl PerCpuPages {
+impl PerCpuPageAlloc {
     const fn new() -> Self {
         Self {
             batch: BATCH_SIZE,
-            _high: 0,
-            free_areas: [const { FreeArea::new() }; PAGE_ALLOC_COSTLY_ORDER as usize + 1],
+            // high: 0,
+            free_areas: [const { BuddyFreeArea::new() }; COSTLY_ORDER as usize + 1],
         }
     }
 
-    fn get_free_pages(&mut self, order: u32) -> PagePtr {
-        assert!(order <= PAGE_ALLOC_COSTLY_ORDER);
+    fn do_alloc_order(&mut self, order: u32) -> Option<RawPagePtr> {
+        assert!(order <= COSTLY_ORDER);
+        let free_area = &mut self.free_areas[order as usize];
 
-        loop {
-            let pages_ptr = self.free_areas[order as usize].get_free_pages();
-            if pages_ptr.is_some() {
-                return pages_ptr;
-            }
+        let mut page_ptr = free_area.get_free_pages();
 
+        if page_ptr.is_none() {
             let batch = self.batch >> order;
-            ZONE.lock()
-                .get_bulk_free_pages(&mut self.free_areas[order as usize], order, batch);
-        }
-    }
-
-    fn free_pages(&mut self, pages_ptr: PagePtr, order: u32) {
-        assert!(order <= PAGE_ALLOC_COSTLY_ORDER);
-        assert_eq!(unsafe { pages_ptr.load_refcount() }, 0);
-        assert_eq!(pages_ptr.get_order(), order);
-
-        self.free_areas[order as usize].add_pages(pages_ptr);
-    }
-}
-
-impl Page {
-    fn set_flags(&mut self, flags: PageFlags) {
-        self.flags.insert(flags);
-    }
-
-    fn remove_flags(&mut self, flags: PageFlags) {
-        self.flags.remove(flags);
-    }
-
-    fn set_order(&mut self, order: u32) {
-        self.order = order;
-    }
-
-    unsafe fn increase_refcount(&mut self) -> u32 {
-        self.refcount.fetch_add(1, Ordering::Relaxed)
-    }
-
-    unsafe fn decrease_refcount(&mut self) -> u32 {
-        self.refcount.fetch_sub(1, Ordering::AcqRel)
-    }
-
-    pub fn is_buddy(&self) -> bool {
-        self.flags.contains(PageFlags::BUDDY)
-    }
-
-    #[allow(dead_code)]
-    pub fn is_slab(&self) -> bool {
-        self.flags.contains(PageFlags::SLAB)
-    }
-
-    pub fn is_present(&self) -> bool {
-        self.flags.contains(PageFlags::PRESENT)
-    }
-
-    pub fn is_free(&self) -> bool {
-        self.flags.contains(PageFlags::FREE)
-    }
-
-    pub fn is_local(&self) -> bool {
-        self.flags.contains(PageFlags::LOCAL)
-    }
-}
+            for _ in 0..batch {
+                if let Some(pages_ptr) = BuddyAllocator::alloc_order(order) {
+                    pages_ptr.flags().set(PageFlags::LOCAL);
+                    free_area.add_pages(pages_ptr);
+                } else {
+                    break;
+                }
+            }
 
-impl FreeArea {
-    const fn new() -> Self {
-        Self {
-            free_list: Link::new(),
-            count: 0,
+            page_ptr = free_area.get_free_pages();
         }
-    }
-
-    fn get_free_pages(&mut self) -> PagePtr {
-        if let Some(pages_link) = self.free_list.next_mut() {
-            assert_ne!(self.count, 0);
-
-            let pages_ptr = unsafe { container_of!(pages_link, Page, link) };
-            let pages_ptr = Ptr::from_raw(pages_ptr);
-
-            self.count -= 1;
-            pages_link.remove();
 
-            pages_ptr
-        } else {
-            PagePtr::null()
-        }
+        page_ptr.inspect(|page_ptr| page_ptr.flags().clear(PageFlags::FREE))
     }
 
-    fn add_pages(&mut self, pages_ptr: PagePtr) {
-        self.count += 1;
-        pages_ptr.as_mut().set_flags(PageFlags::FREE);
-        self.free_list.insert(&mut pages_ptr.as_mut().link)
-    }
+    fn free_pages(&mut self, pages_ptr: RawPagePtr, order: u32) {
+        assert_eq!(pages_ptr.order(), order);
+        assert_eq!(pages_ptr.refcount().load(Ordering::Relaxed), 0);
 
-    fn del_pages(&mut self, pages_ptr: PagePtr) {
-        assert!(self.count >= 1 && pages_ptr.as_ref().is_free());
-        self.count -= 1;
-        pages_ptr.as_mut().remove_flags(PageFlags::FREE);
-        pages_ptr.as_mut().link.remove();
+        // TODO: Temporary workaround here.
+        pages_ptr.refcount().store(1, Ordering::Relaxed);
+        self.free_areas[order as usize].add_pages(pages_ptr);
     }
 }
 
-impl Zone {
-    const fn new() -> Self {
-        Self {
-            free_areas: [const { FreeArea::new() }; MAX_PAGE_ORDER as usize + 1],
+impl PageAlloc for GlobalPageAlloc {
+    fn alloc_order(order: u32) -> Option<RawPagePtr> {
+        if order > COSTLY_ORDER {
+            BuddyAllocator::alloc_order(order)
+        } else {
+            PerCpuPageAlloc::alloc_order(order)
         }
     }
 
-    /// Only used for per-cpu pages
-    fn get_bulk_free_pages(&mut self, free_area: &mut FreeArea, order: u32, count: u32) -> u32 {
-        for i in 0..count {
-            let pages_ptr = self.get_free_pages(order);
-            if pages_ptr.is_none() {
-                return i;
-            }
-
-            pages_ptr.as_mut().set_flags(PageFlags::LOCAL);
-            free_area.add_pages(pages_ptr);
+    unsafe fn dealloc(page_ptr: RawPagePtr) {
+        if page_ptr.order() > COSTLY_ORDER {
+            BuddyAllocator::dealloc(page_ptr);
+        } else {
+            PerCpuPageAlloc::dealloc(page_ptr);
         }
-        count
     }
 
-    fn get_free_pages(&mut self, order: u32) -> PagePtr {
-        for current_order in order..=MAX_PAGE_ORDER {
-            let pages_ptr = self.free_areas[current_order as usize].get_free_pages();
-            if pages_ptr.is_none() {
-                continue;
-            }
-
-            pages_ptr.as_mut().set_order(order);
-
-            if current_order > order {
-                self.expand(pages_ptr, current_order, order);
-            }
-            assert!(pages_ptr.as_ref().is_present() && pages_ptr.as_ref().is_free());
-            return pages_ptr;
+    unsafe fn has_management_over(page_ptr: RawPagePtr) -> bool {
+        if page_ptr.order() > COSTLY_ORDER {
+            BuddyAllocator::has_management_over(page_ptr)
+        } else {
+            PerCpuPageAlloc::has_management_over(page_ptr)
         }
-        PagePtr::new(None)
     }
+}
 
-    fn expand(&mut self, pages_ptr: PagePtr, order: u32, target_order: u32) {
-        assert!(pages_ptr.is_some());
-        let mut offset = 1 << order;
-
-        for order in (target_order..order).rev() {
-            offset >>= 1;
-            let split_pages_ptr = pages_ptr.offset(offset);
-            split_pages_ptr.as_mut().set_order(order);
-            split_pages_ptr.as_mut().set_flags(PageFlags::BUDDY);
-            self.free_areas[order as usize].add_pages(split_pages_ptr);
-        }
+impl PageAlloc for NoAlloc {
+    fn alloc_order(_order: u32) -> Option<RawPagePtr> {
+        panic!("NoAlloc cannot allocate pages");
     }
 
-    fn free_pages(&mut self, mut pages_ptr: PagePtr, order: u32) {
-        assert_eq!(unsafe { pages_ptr.load_refcount() }, 0);
-        assert_eq!(pages_ptr.get_order(), order);
-
-        let mut pfn: PFN = pages_ptr.into();
-        let mut current_order = order;
-
-        while current_order < MAX_PAGE_ORDER {
-            let buddy_pfn = pfn.buddy_pfn(current_order);
-            let buddy_pages_ptr = PagePtr::from(buddy_pfn);
-
-            if !self.buddy_check(buddy_pages_ptr, current_order) {
-                break;
-            }
-
-            pages_ptr.as_mut().remove_flags(PageFlags::BUDDY);
-            buddy_pages_ptr.as_mut().remove_flags(PageFlags::BUDDY);
-            self.free_areas[current_order as usize].del_pages(buddy_pages_ptr);
-            pages_ptr = PagePtr::from(pfn.combined_pfn(buddy_pfn));
-            pages_ptr.as_mut().set_flags(PageFlags::BUDDY);
-            pfn = pfn.combined_pfn(buddy_pfn);
-            current_order += 1;
-        }
-
-        pages_ptr.as_mut().set_order(current_order);
-        self.free_areas[current_order as usize].add_pages(pages_ptr);
+    unsafe fn dealloc(_: RawPagePtr) {
+        panic!("NoAlloc cannot deallocate pages");
     }
 
-    /// This function checks whether a page is free && is the buddy
-    /// we can coalesce a page and its buddy if
-    /// - the buddy is valid(present) &&
-    /// - the buddy is right now in free_areas &&
-    /// - a page and its buddy have the same order &&
-    /// - a page and its buddy are in the same zone.    // check when smp
-    fn buddy_check(&self, pages_ptr: PagePtr, order: u32) -> bool {
-        if !pages_ptr.as_ref().is_present() {
-            return false;
-        }
-        if !(pages_ptr.as_ref().is_free()) {
-            return false;
-        }
-        if pages_ptr.as_ref().is_local() {
-            return false;
-        }
-        if pages_ptr.as_ref().order != order {
-            return false;
-        }
-
-        assert_eq!(unsafe { pages_ptr.load_refcount() }, 0);
+    unsafe fn has_management_over(_: RawPagePtr) -> bool {
         true
     }
-
-    /// Only used on buddy initialization
-    fn create_pages(&mut self, start: usize, end: usize) {
-        let mut start_pfn = PAddr::from(start).ceil_pfn();
-        let end_pfn = PAddr::from(end).floor_pfn();
-
-        while start_pfn < end_pfn {
-            let mut order = usize::from(start_pfn).trailing_zeros().min(MAX_PAGE_ORDER);
-
-            while start_pfn + order as usize > end_pfn {
-                order -= 1;
-            }
-            let page_ptr: PagePtr = start_pfn.into();
-            page_ptr.as_mut().set_flags(PageFlags::BUDDY);
-            self.free_areas[order as usize].add_pages(page_ptr);
-            start_pfn = start_pfn + (1 << order) as usize;
-        }
-    }
 }
 
-#[arch::define_percpu]
-static PER_CPU_PAGES: PerCpuPages = PerCpuPages::new();
-
-static ZONE: Spin<Zone> = Spin::new(Zone::new());
-
-fn __alloc_pages(order: u32) -> PagePtr {
-    let pages_ptr;
-
-    if order <= PAGE_ALLOC_COSTLY_ORDER {
+impl PageAlloc for PerCpuPageAlloc {
+    fn alloc_order(order: u32) -> Option<RawPagePtr> {
+        let page_ptr;
         unsafe {
             eonix_preempt::disable();
-            pages_ptr = PER_CPU_PAGES.as_mut().get_free_pages(order);
+            page_ptr = PERCPU_PAGE_ALLOC.as_mut().do_alloc_order(order);
             eonix_preempt::enable();
         }
-    } else {
-        pages_ptr = ZONE.lock().get_free_pages(order);
-    }
 
-    unsafe {
-        pages_ptr.as_mut().increase_refcount();
+        page_ptr
     }
-    pages_ptr.as_mut().remove_flags(PageFlags::FREE);
-    pages_ptr
-}
 
-fn __free_pages(pages_ptr: PagePtr, order: u32) {
-    if order <= PAGE_ALLOC_COSTLY_ORDER {
+    unsafe fn dealloc(page_ptr: RawPagePtr) {
+        let order = page_ptr.order();
+
         unsafe {
             eonix_preempt::disable();
-            PER_CPU_PAGES.as_mut().free_pages(pages_ptr, order);
+            PERCPU_PAGE_ALLOC.as_mut().free_pages(page_ptr, order);
             eonix_preempt::enable();
         }
-    } else {
-        ZONE.lock().free_pages(pages_ptr, order);
     }
-}
 
-pub(super) fn alloc_page() -> PagePtr {
-    __alloc_pages(0)
-}
-
-pub(super) fn alloc_pages(order: u32) -> PagePtr {
-    __alloc_pages(order)
-}
-
-pub(super) fn early_alloc_pages(order: u32) -> PagePtr {
-    let pages_ptr = ZONE.lock().get_free_pages(order);
-    unsafe {
-        pages_ptr.as_mut().increase_refcount();
+    unsafe fn has_management_over(page_ptr: RawPagePtr) -> bool {
+        BuddyAllocator::has_management_over(page_ptr) && page_ptr.flags().has(PageFlags::LOCAL)
     }
-    pages_ptr.as_mut().remove_flags(PageFlags::FREE);
-    pages_ptr
-}
-
-pub(super) fn free_pages(page_ptr: PagePtr, order: u32) {
-    __free_pages(page_ptr, order)
 }
 
 #[no_mangle]
 pub extern "C" fn mark_present(start: usize, end: usize) {
-    let mut start_pfn = PAddr::from(start).ceil_pfn();
-    let end_pfn = PAddr::from(end).floor_pfn();
+    let mut start_pfn = PFN::from(PAddr::from(start).ceil());
+    let end_pfn = PFN::from(PAddr::from(end).floor());
+
     while start_pfn < end_pfn {
-        PagePtr::from(start_pfn)
-            .as_mut()
-            .set_flags(PageFlags::PRESENT);
+        RawPagePtr::from(start_pfn).flags().set(PageFlags::PRESENT);
         start_pfn = start_pfn + 1;
     }
 }
 
 #[no_mangle]
-pub extern "C" fn create_pages(start: usize, end: usize) {
-    ZONE.lock().create_pages(start, end);
+pub extern "C" fn create_pages(start: PAddr, end: PAddr) {
+    BuddyAllocator::create_pages(start, end);
 }
 
 #[no_mangle]
-pub extern "C" fn page_to_pfn(page: *const Page) -> usize {
-    unsafe { page.offset_from(PAGE_ARRAY) as usize }
+pub extern "C" fn page_to_pfn(page: *const ()) -> PFN {
+    let page_ptr = RawPagePtr::new(NonNull::new(page as *mut _).unwrap());
+    PFN::from(page_ptr)
 }
 
 #[no_mangle]
-pub extern "C" fn c_alloc_page() -> *const Page {
-    alloc_page().as_ptr() as *const Page
+pub extern "C" fn c_alloc_page() -> *const () {
+    GlobalPageAlloc::alloc().expect("Out of memory").as_ptr() as *const _
 }
 
 #[no_mangle]
-pub extern "C" fn c_alloc_pages(order: u32) -> *const Page {
-    alloc_pages(order).as_ptr() as *const Page
+pub extern "C" fn c_alloc_pages(order: u32) -> *const () {
+    GlobalPageAlloc::alloc_order(order)
+        .expect("Out of memory")
+        .as_ptr() as *const _
 }
 
 #[no_mangle]
-pub extern "C" fn c_alloc_page_table() -> usize {
-    let pfn: PFN = alloc_page().into();
-    let paddr: usize = usize::from(pfn) << 12;
-    unsafe {
-        core::ptr::write_bytes(paddr as *mut u8, 0, 4096);
-    }
-    paddr
+pub extern "C" fn c_alloc_page_table() -> PAddr {
+    PAddr::from(Page::zeroed().into_raw())
 }
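
The buddy coalescing that the removed `Zone::free_pages` performed (and that the new `buddy_allocator` crate is expected to take over) rests on simple frame-number arithmetic. The sketch below spells out that arithmetic on plain `usize` values; it is illustrative and not the crate's actual API.

```rust
/// Conventional buddy-system index arithmetic on raw page frame numbers.
fn buddy_pfn(pfn: usize, order: u32) -> usize {
    // The buddy of a block differs only in the bit that selects which half
    // of the next-larger block it occupies.
    pfn ^ (1 << order)
}

fn combined_pfn(pfn: usize, buddy: usize) -> usize {
    // Merging a block with its buddy clears that bit, giving the frame
    // number of the coalesced, higher-order block.
    pfn & buddy
}

fn main() {
    // Two order-3 blocks at frames 0x108 and 0x100 are buddies...
    assert_eq!(buddy_pfn(0x108, 3), 0x100);
    // ...and coalesce into a single order-4 block starting at frame 0x100.
    assert_eq!(combined_pfn(0x108, 0x100), 0x100);
}
```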

+ 0 - 316
src/kernel/mem/page_table.rs

@@ -1,316 +0,0 @@
-use super::{
-    paging::Page,
-    phys::{CachedPP, PhysPtr as _},
-    VAddr, VRange,
-};
-use super::{MMArea, Permission};
-use crate::bindings::root::{EINVAL, KERNEL_PML4};
-use crate::prelude::*;
-use eonix_sync::LazyLock;
-
-const PA_P: usize = 0x001;
-const PA_RW: usize = 0x002;
-const PA_US: usize = 0x004;
-#[allow(dead_code)]
-const PA_PWT: usize = 0x008;
-#[allow(dead_code)]
-const PA_PCD: usize = 0x010;
-const PA_A: usize = 0x020;
-const PA_D: usize = 0x040;
-#[allow(dead_code)]
-const PA_PS: usize = 0x080;
-const PA_G: usize = 0x100;
-const PA_COW: usize = 0x200;
-const PA_MMAP: usize = 0x400;
-const PA_ANON: usize = 0x800;
-const PA_NXE: usize = 0x8000_0000_0000_0000;
-const PA_MASK: usize = 0xfff0_0000_0000_0fff;
-
-#[repr(transparent)]
-#[derive(Debug, Clone, Copy)]
-pub struct PTE(usize);
-
-#[derive(Debug)]
-pub struct PageTable {
-    page: Page,
-}
-
-#[allow(dead_code)]
-pub struct PTEIterator<'lt, const KERNEL: bool> {
-    count: usize,
-    i4: u16,
-    i3: u16,
-    i2: u16,
-    i1: u16,
-    p4: CachedPP,
-    p3: CachedPP,
-    p2: CachedPP,
-    p1: CachedPP,
-
-    start: VAddr,
-    end: VAddr,
-    _phantom: core::marker::PhantomData<&'lt ()>,
-}
-
-static EMPTY_PAGE: LazyLock<Page> = LazyLock::new(|| {
-    let page = Page::alloc_one();
-    page.zero();
-    page
-});
-
-impl PTE {
-    pub fn is_user(&self) -> bool {
-        self.0 & PA_US != 0
-    }
-
-    pub fn is_present(&self) -> bool {
-        self.0 & PA_P != 0
-    }
-
-    pub fn is_cow(&self) -> bool {
-        self.0 & PA_COW != 0
-    }
-
-    pub fn is_mmap(&self) -> bool {
-        self.0 & PA_MMAP != 0
-    }
-
-    pub fn pfn(&self) -> usize {
-        self.0 & !PA_MASK
-    }
-
-    pub fn attributes(&self) -> usize {
-        self.0 & PA_MASK
-    }
-
-    pub fn set(&mut self, pfn: usize, attributes: usize) {
-        self.0 = pfn | attributes;
-    }
-
-    #[allow(dead_code)]
-    pub fn set_pfn(&mut self, pfn: usize) {
-        self.set(pfn, self.attributes())
-    }
-
-    pub fn set_attributes(&mut self, attributes: usize) {
-        self.set(self.pfn(), attributes)
-    }
-
-    fn parse_page_table(&mut self, kernel: bool) -> CachedPP {
-        let attributes = if kernel {
-            PA_P | PA_RW | PA_G
-        } else {
-            PA_P | PA_RW | PA_US
-        };
-
-        if self.is_present() {
-            CachedPP::new(self.pfn())
-        } else {
-            let page = Page::alloc_one();
-            let pp = page.as_cached();
-            page.zero();
-
-            self.set(page.into_pfn(), attributes);
-            pp
-        }
-    }
-
-    pub fn setup_cow(&mut self, from: &mut Self) {
-        self.set(
-            unsafe { Page::from_pfn(from.pfn(), 0) }.into_pfn(),
-            (from.attributes() & !(PA_RW | PA_A | PA_D)) | PA_COW,
-        );
-
-        from.set_attributes((from.attributes() & !PA_RW) | PA_COW);
-    }
-
-    pub fn clear(&mut self) {
-        self.set(0, 0)
-    }
-
-    /// Take the ownership of the page from the PTE, clear the PTE and return the page.
-    pub fn take(&mut self) -> Page {
-        // SAFETY: Acquire the ownership of the page from the page table and then
-        // clear the PTE so no one could be able to access the page from here later on.
-        let page = unsafe { Page::take_pfn(self.pfn(), 0) };
-        self.clear();
-        page
-    }
-}
-
-impl<'lt, const KERNEL: bool> PTEIterator<'lt, KERNEL> {
-    fn new(pt: &'lt Page, start: VAddr, end: VAddr) -> KResult<Self> {
-        if start > end {
-            return Err(EINVAL);
-        }
-
-        let p4 = pt.as_cached();
-        let p3 = p4.as_mut_slice::<PTE>(512)[Self::index(4, start)].parse_page_table(KERNEL);
-        let p2 = p3.as_mut_slice::<PTE>(512)[Self::index(3, start)].parse_page_table(KERNEL);
-        let p1 = p2.as_mut_slice::<PTE>(512)[Self::index(2, start)].parse_page_table(KERNEL);
-
-        Ok(Self {
-            count: (end.0 - start.0) >> 12,
-            i4: Self::index(4, start) as u16,
-            i3: Self::index(3, start) as u16,
-            i2: Self::index(2, start) as u16,
-            i1: Self::index(1, start) as u16,
-            p4,
-            p3,
-            p2,
-            p1,
-            start,
-            end,
-            _phantom: core::marker::PhantomData,
-        })
-    }
-
-    fn offset(level: u32) -> usize {
-        12 + (level as usize - 1) * 9
-    }
-
-    fn index(level: u32, vaddr: VAddr) -> usize {
-        (vaddr.0 >> Self::offset(level)) & 0x1ff
-    }
-}
-
-impl<'lt, const KERNEL: bool> Iterator for PTEIterator<'lt, KERNEL> {
-    type Item = &'lt mut PTE;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.count != 0 {
-            self.count -= 1;
-        } else {
-            return None;
-        }
-
-        let retval = &mut self.p1.as_mut_slice::<PTE>(512)[self.i1 as usize];
-        self.i1 = (self.i1 + 1) % 512;
-        if self.i1 == 0 {
-            self.i2 = (self.i2 + 1) % 512;
-            if self.i2 == 0 {
-                self.i3 = (self.i3 + 1) % 512;
-                if self.i3 == 0 {
-                    self.i4 = (self.i4 + 1) % 512;
-                    if self.i4 == 0 {
-                        panic!("PTEIterator: out of range");
-                    }
-                }
-                self.p3 =
-                    self.p4.as_mut_slice::<PTE>(512)[self.i4 as usize].parse_page_table(KERNEL);
-            }
-            self.p2 = self.p3.as_mut_slice::<PTE>(512)[self.i3 as usize].parse_page_table(KERNEL);
-        }
-        self.p1 = self.p2.as_mut_slice::<PTE>(512)[self.i2 as usize].parse_page_table(KERNEL);
-        Some(retval)
-    }
-}
-
-impl PageTable {
-    pub fn new() -> Self {
-        let page = Page::alloc_one();
-        page.zero();
-
-        // TODO: copy only the kernel space mappings.
-        let kernel_space_page_table = CachedPP::new(KERNEL_PML4 as usize);
-
-        page.as_cached().as_mut_slice::<u64>(512)[256..]
-            .copy_from_slice(&kernel_space_page_table.as_mut_slice(512)[256..]);
-
-        Self { page }
-    }
-
-    pub fn root_page_table(&self) -> usize {
-        self.page.as_phys()
-    }
-
-    pub fn iter_user(&self, range: VRange) -> KResult<PTEIterator<'_, false>> {
-        PTEIterator::new(&self.page, range.start().floor(), range.end().ceil())
-    }
-
-    #[allow(dead_code)]
-    pub fn iter_kernel(&self, range: VRange) -> KResult<PTEIterator<'_, true>> {
-        PTEIterator::new(&self.page, range.start().floor(), range.end().ceil())
-    }
-
-    pub fn unmap(&self, area: &MMArea) {
-        let range = area.range();
-        let use_invlpg = range.len() / 4096 < 4;
-        let iter = self.iter_user(range).unwrap();
-
-        if self.page.as_phys() != arch::get_root_page_table() {
-            for pte in iter {
-                pte.take();
-            }
-            return;
-        }
-
-        if use_invlpg {
-            for (offset_pages, pte) in iter.enumerate() {
-                pte.take();
-
-                let pfn = range.start().floor().0 + offset_pages * 4096;
-                arch::flush_tlb(pfn);
-            }
-        } else {
-            for pte in iter {
-                pte.take();
-            }
-            arch::flush_tlb_all();
-        }
-    }
-
-    pub fn lazy_invalidate_tlb_all(&self) {
-        if self.page.as_phys() == arch::get_root_page_table() {
-            arch::flush_tlb_all();
-        }
-    }
-
-    pub fn set_mmapped(&self, range: VRange, permission: Permission) {
-        // PA_RW is set during page fault handling.
-        // PA_NXE is preserved across page faults, so we set PA_NXE now.
-        let attributes = if permission.execute {
-            PA_US | PA_COW | PA_ANON | PA_MMAP
-        } else {
-            PA_US | PA_COW | PA_ANON | PA_MMAP | PA_NXE
-        };
-
-        for pte in self.iter_user(range).unwrap() {
-            pte.set(EMPTY_PAGE.clone().into_pfn(), attributes);
-        }
-    }
-
-    pub fn set_anonymous(&self, range: VRange, permission: Permission) {
-        // PA_RW is set during page fault handling.
-        // PA_NXE is preserved across page faults, so we set PA_NXE now.
-        let attributes = if permission.execute {
-            PA_P | PA_US | PA_COW | PA_ANON
-        } else {
-            PA_P | PA_US | PA_COW | PA_ANON | PA_NXE
-        };
-
-        for pte in self.iter_user(range).unwrap() {
-            pte.set(EMPTY_PAGE.clone().into_pfn(), attributes);
-        }
-    }
-}
-
-fn drop_page_table_recursive(pt: &Page, level: usize) {
-    for pte in pt
-        .as_cached()
-        .as_mut_slice::<PTE>(512)
-        .iter_mut()
-        .filter(|pte| pte.is_present() && pte.is_user())
-    {
-        let page = pte.take();
-        if level > 1 {
-            drop_page_table_recursive(&page, level - 1);
-        }
-    }
-}
-
-impl Drop for PageTable {
-    fn drop(&mut self) {
-        drop_page_table_recursive(&self.page, 4);
-    }
-}

+ 52 - 163
src/kernel/mem/paging.rs

@@ -1,180 +1,57 @@
-use super::address::PFN;
-use super::page_alloc::{alloc_page, alloc_pages, early_alloc_pages, free_pages, PagePtr};
-use super::phys::PhysPtr;
+use super::{
+    access::AsMemoryBlock,
+    page_alloc::{GlobalPageAlloc, NoAlloc},
+    MemoryBlock, PhysAccess,
+};
 use crate::io::{Buffer, FillResult};
-use crate::kernel::mem::phys;
-use core::fmt;
+use eonix_mm::paging::{Page as GenericPage, PageAlloc};
 
-pub struct Page {
-    page_ptr: PagePtr,
-    order: u32,
-}
-
-#[allow(dead_code)]
-impl Page {
-    pub fn alloc_one() -> Self {
-        let page_ptr = alloc_page();
-        Self { page_ptr, order: 0 }
-    }
-
-    pub fn alloc_many(order: u32) -> Self {
-        let page_ptr = alloc_pages(order);
-        Self { page_ptr, order }
-    }
-
-    /// Allocate a contiguous block of pages that can contain at least `count` pages.
-    pub fn alloc_ceil(count: usize) -> Self {
-        assert_ne!(count, 0);
-        let order = count.next_power_of_two().trailing_zeros();
-        Self::alloc_many(order)
-    }
-
-    pub fn early_alloc_ceil(count: usize) -> Self {
-        assert_ne!(count, 0);
-        let order = count.next_power_of_two().trailing_zeros();
-        let page_ptr = early_alloc_pages(order);
-        Self { page_ptr, order }
-    }
-
-    /// Get `Page` from `pfn`, acquiring the ownership of the page. `refcount` is not increased.
-    ///
-    /// # Safety
-    /// Caller must ensure that the pfn is no longer referenced by any other code.
-    pub unsafe fn take_pfn(pfn: usize, order: u32) -> Self {
-        let page_ptr: PagePtr = PFN::from(pfn >> 12).into();
-
-        // Only buddy pages can be used here.
-        // Also, check if the order is correct.
-        assert!(page_ptr.as_ref().is_buddy() && page_ptr.is_valid(order));
-
-        Self { page_ptr, order }
-    }
-
-    /// Get `Page` from `pfn` and increase the reference count.
-    ///
-    /// # Safety
-    /// Caller must ensure that `pfn` refers to a valid physical frame number with `refcount` > 0.
-    pub unsafe fn from_pfn(pfn: usize, order: u32) -> Self {
-        // SAFETY: `pfn` is a valid physical frame number with refcount > 0.
-        Self::increase_refcount(pfn);
-
-        // SAFETY: `pfn` has an increased refcount.
-        unsafe { Self::take_pfn(pfn, order) }
-    }
-
-    /// Consumes the `Page` and returns the physical frame number without dropping the reference
-    /// count the page holds.
-    pub fn into_pfn(self) -> usize {
-        let pfn: PFN = self.page_ptr.into();
-        core::mem::forget(self);
-        usize::from(pfn) << 12
-    }
-
-    pub fn len(&self) -> usize {
-        1 << (self.order + 12)
-    }
-
-    pub fn as_phys(&self) -> usize {
-        let pfn: PFN = self.page_ptr.into();
-        usize::from(pfn) << 12
-    }
-
-    pub fn as_cached(&self) -> phys::CachedPP {
-        phys::CachedPP::new(self.as_phys())
-    }
-
-    pub fn as_nocache(&self) -> phys::NoCachePP {
-        phys::NoCachePP::new(self.as_phys())
-    }
-
-    pub fn as_slice<'r, 'lt>(&'r self) -> &'lt [u8] {
-        self.as_cached().as_slice(self.len())
-    }
-
-    pub fn as_mut_slice<'r, 'lt>(&'r self) -> &'lt mut [u8] {
-        self.as_cached().as_mut_slice(self.len())
-    }
-
-    pub fn zero(&self) {
-        self.as_mut_slice().fill(0);
-    }
-
-    /// # Safety
-    /// Caller must ensure that the page is properly freed.
-    pub unsafe fn increase_refcount(pfn: usize) {
-        let page_ptr: PagePtr = PFN::from(pfn >> 12).into();
-        page_ptr.increase_refcount();
-    }
+pub type PageUnmanaged = GenericPage<NoAlloc>;
+pub type Page = GenericPage<GlobalPageAlloc>;
 
-    pub unsafe fn load_refcount(&self) -> usize {
-        self.page_ptr.load_refcount() as usize
-    }
+/// A buffer that wraps a page and provides a `Buffer` interface.
+pub struct PageBuffer {
+    page: Page,
+    offset: usize,
 }
 
-impl Clone for Page {
-    fn clone(&self) -> Self {
-        unsafe { self.page_ptr.increase_refcount() };
-
-        Self {
-            page_ptr: self.page_ptr,
-            order: self.order,
-        }
-    }
+pub trait AllocZeroed {
+    fn zeroed() -> Self;
 }
 
-impl Drop for Page {
-    fn drop(&mut self) {
-        match unsafe { self.page_ptr.decrease_refcount() } {
-            0 => panic!("In-use page refcount is 0"),
-            1 => free_pages(self.page_ptr, self.order),
-            _ => {}
+impl<A: PageAlloc> AsMemoryBlock for GenericPage<A> {
+    fn as_memblk(&self) -> MemoryBlock {
+        unsafe {
+            // SAFETY: `self.start()` points to valid memory of length `self.len()`.
+            MemoryBlock::new(self.start().as_ptr::<()>().addr(), self.len())
         }
     }
 }
 
-impl PartialEq for Page {
-    fn eq(&self, other: &Self) -> bool {
-        // assert!(self.page_ptr != other.page_ptr || self.order == other.order);
-
-        self.page_ptr.as_ptr() == other.page_ptr.as_ptr()
-    }
-}
-
-unsafe impl Sync for Page {}
-unsafe impl Send for Page {}
-
-impl fmt::Debug for Page {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let pfn = self.as_phys();
-        write!(f, "Page({:#x}, order={})", pfn, self.order)
-    }
-}
-
-pub struct PageBuffer {
-    page: Page,
-    offset: usize,
-}
-
-#[allow(dead_code)]
 impl PageBuffer {
-    pub fn new(page: Page) -> Self {
-        Self { page, offset: 0 }
-    }
-
-    pub fn len(&self) -> usize {
-        self.offset
+    pub fn new() -> Self {
+        Self {
+            page: Page::alloc(),
+            offset: 0,
+        }
     }
 
-    pub fn remaining(&self) -> usize {
-        self.page.len() - self.offset
+    pub fn all(&self) -> &[u8] {
+        unsafe {
+            // SAFETY: The page is exclusively owned by us.
+            self.page.as_memblk().as_bytes()
+        }
     }
 
-    pub fn as_slice(&self) -> &[u8] {
-        self.page.as_slice()
+    pub fn data(&self) -> &[u8] {
+        &self.all()[..self.offset]
     }
 
-    fn available_as_slice(&self) -> &mut [u8] {
-        &mut self.page.as_mut_slice()[self.offset..]
+    pub fn available_mut(&mut self) -> &mut [u8] {
+        unsafe {
+            // SAFETY: The page is exclusively owned by us.
+            &mut self.page.as_memblk().as_bytes_mut()[self.offset..]
+        }
     }
 }
 
@@ -184,16 +61,17 @@ impl Buffer for PageBuffer {
     }
 
     fn wrote(&self) -> usize {
-        self.len()
+        self.offset
     }
 
     fn fill(&mut self, data: &[u8]) -> crate::KResult<crate::io::FillResult> {
-        if self.remaining() == 0 {
+        let available = self.available_mut();
+        if available.len() == 0 {
             return Ok(FillResult::Full);
         }
 
-        let len = core::cmp::min(data.len(), self.remaining());
-        self.available_as_slice()[..len].copy_from_slice(&data[..len]);
+        let len = core::cmp::min(data.len(), available.len());
+        available[..len].copy_from_slice(&data[..len]);
         self.offset += len;
 
         if len < data.len() {
@@ -203,3 +81,14 @@ impl Buffer for PageBuffer {
         }
     }
 }
+
+impl AllocZeroed for Page {
+    fn zeroed() -> Self {
+        let page = Self::alloc();
+        unsafe {
+            // SAFETY: The page is exclusively owned by us.
+            page.as_memblk().as_bytes_mut().fill(0);
+        }
+        page
+    }
+}
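
The `fill` implementation above keeps the earlier partial-fill semantics while reading the free space through `as_memblk`. The standalone sketch below reproduces those semantics with a fixed array in place of a `Page`; `FillOutcome` is a hypothetical analogue of the kernel's `FillResult`.

```rust
/// Hypothetical analogue of `FillResult`, used only for this sketch.
#[derive(Debug, PartialEq)]
enum FillOutcome {
    Done,
    Partial(usize),
    Full,
}

/// A tiny fixed-size buffer mirroring the way `PageBuffer::fill` advances
/// `offset` and reports how much of the input actually fit.
struct FixedBuffer {
    data: [u8; 8],
    offset: usize,
}

impl FixedBuffer {
    fn fill(&mut self, src: &[u8]) -> FillOutcome {
        let available = self.data.len() - self.offset;
        if available == 0 {
            return FillOutcome::Full;
        }

        let len = src.len().min(available);
        self.data[self.offset..self.offset + len].copy_from_slice(&src[..len]);
        self.offset += len;

        if len < src.len() {
            FillOutcome::Partial(len)
        } else {
            FillOutcome::Done
        }
    }
}

fn main() {
    let mut buf = FixedBuffer { data: [0; 8], offset: 0 };
    assert_eq!(buf.fill(b"hello"), FillOutcome::Done);
    // Only three bytes fit now, so the fill is partial.
    assert_eq!(buf.fill(b"world"), FillOutcome::Partial(3));
    assert_eq!(buf.fill(b"!"), FillOutcome::Full);
}
```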

+ 0 - 80
src/kernel/mem/phys.rs

@@ -1,80 +0,0 @@
-use core::fmt;
-
-pub trait PhysPtr {
-    fn as_ptr<T>(&self) -> *mut T;
-
-    #[allow(dead_code)]
-    fn as_ref<'lifetime, T>(&self) -> &'lifetime T {
-        unsafe { &*(self.as_ptr()) }
-    }
-
-    fn as_mut<'lifetime, T>(&self) -> &'lifetime mut T {
-        unsafe { &mut *(self.as_ptr()) }
-    }
-
-    fn as_slice<'lifetime, T>(&self, len: usize) -> &'lifetime [T] {
-        unsafe { core::slice::from_raw_parts(self.as_ptr(), len) }
-    }
-
-    fn as_mut_slice<'lifetime, T>(&self, len: usize) -> &'lifetime mut [T] {
-        unsafe { core::slice::from_raw_parts_mut(self.as_ptr(), len) }
-    }
-}
-
-#[derive(Clone, Copy, PartialEq, Eq)]
-pub struct CachedPP {
-    addr: usize,
-}
-
-#[derive(Clone, Copy, PartialEq, Eq)]
-pub struct NoCachePP {
-    addr: usize,
-}
-
-impl CachedPP {
-    pub const fn new(addr: usize) -> Self {
-        Self { addr }
-    }
-
-    pub const fn offset(&self, offset: usize) -> Self {
-        Self {
-            addr: self.addr + offset,
-        }
-    }
-}
-
-impl PhysPtr for CachedPP {
-    fn as_ptr<T>(&self) -> *mut T {
-        (self.addr + 0xffffff0000000000) as *mut T
-    }
-}
-
-impl fmt::Debug for CachedPP {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "CachedPP({:#x})", self.addr)
-    }
-}
-
-impl NoCachePP {
-    pub fn new(addr: usize) -> Self {
-        Self { addr }
-    }
-
-    pub fn offset(&self, offset: isize) -> Self {
-        Self {
-            addr: self.addr + offset as usize,
-        }
-    }
-}
-
-impl PhysPtr for NoCachePP {
-    fn as_ptr<T>(&self) -> *mut T {
-        (self.addr + 0xffffff4000000000) as *mut T
-    }
-}
-
-impl fmt::Debug for NoCachePP {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "NoCachePP({:#x})", self.addr)
-    }
-}

+ 7 - 8
src/kernel/smp.rs

@@ -1,20 +1,19 @@
 use super::cpu::init_localcpu;
 use crate::{
-    kernel::{
-        cpu::local_cpu,
-        mem::{paging::Page, phys::PhysPtr as _},
-        task::KernelStack,
-    },
+    kernel::{cpu::local_cpu, mem::paging::Page, task::KernelStack},
     println_debug,
 };
 use arch::define_smp_bootstrap;
+use eonix_mm::address::Addr as _;
 use eonix_runtime::scheduler::Scheduler;
 
 define_smp_bootstrap!(4, ap_entry, {
-    let page = Page::alloc_many(9);
-    let stack_bottom = page.as_cached().as_ptr::<()>() as usize + page.len();
+    let page = Page::alloc_order(9);
+    let stack_bottom = page.range().end();
     core::mem::forget(page);
-    stack_bottom
+
+    // The physical address is used because the APs are still in their initial state.
+    stack_bottom.addr() as u64
 });
 
 unsafe extern "C" fn ap_entry() -> ! {

+ 14 - 18
src/kernel/syscall/mm.rs

@@ -1,15 +1,15 @@
-use bindings::{EINVAL, ENOMEM};
-
+use super::{define_syscall32, register_syscall, MapArgument, MapArgumentImpl};
 use crate::{
     kernel::{
         constants::{UserMmapFlags, UserMmapProtocol},
-        mem::{Mapping, Permission, VAddr},
+        mem::{Mapping, Permission},
         task::Thread,
     },
     prelude::*,
 };
-
-use super::{define_syscall32, register_syscall, MapArgument, MapArgumentImpl};
+use bindings::{EINVAL, ENOMEM};
+use eonix_mm::address::{Addr as _, AddrOps as _, VAddr};
+use eonix_runtime::task::Task;
 
 /// Check whether we are doing an implemented function.
 /// If `condition` is false, return `Err(err)`.
@@ -29,8 +29,8 @@ fn do_mmap_pgoff(
     fd: u32,
     pgoffset: usize,
 ) -> KResult<usize> {
-    let addr = VAddr(addr);
-    if addr.floor() != addr || len == 0 {
+    let addr = VAddr::from(addr);
+    if !addr.is_page_aligned() || len == 0 {
         return Err(EINVAL);
     }
 
@@ -45,7 +45,7 @@ fn do_mmap_pgoff(
 
     // PROT_NONE, we do unmapping.
     if prot.is_empty() {
-        mm_list.unmap(addr, len).map(|_| 0)?;
+        Task::block_on(mm_list.unmap(addr, len)).map(|_| 0)?;
         return Ok(0);
     }
     // Otherwise, do mmapping.
@@ -74,26 +74,22 @@ fn do_mmap_pgoff(
         )
     };
 
-    addr.map(|addr| addr.0)
+    addr.map(|addr| addr.addr())
 }
 
 fn do_munmap(addr: usize, len: usize) -> KResult<usize> {
-    let addr = VAddr(addr);
-    if addr.floor() != addr || len == 0 {
+    let addr = VAddr::from(addr);
+    if !addr.is_page_aligned() || len == 0 {
         return Err(EINVAL);
     }
 
     let len = (len + 0xfff) & !0xfff;
-    Thread::current()
-        .process
-        .mm_list
-        .unmap(addr, len)
-        .map(|_| 0)
+    Task::block_on(Thread::current().process.mm_list.unmap(addr, len)).map(|_| 0)
 }
 
 fn do_brk(addr: usize) -> KResult<usize> {
-    let vaddr = if addr == 0 { None } else { Some(VAddr(addr)) };
-    Ok(Thread::current().process.mm_list.set_break(vaddr).0)
+    let vaddr = if addr == 0 { None } else { Some(VAddr::from(addr)) };
+    Ok(Thread::current().process.mm_list.set_break(vaddr).addr())
 }
 
 impl MapArgument<'_, UserMmapProtocol> for MapArgumentImpl {
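
Both `do_mmap_pgoff` and `do_munmap` rely on 4 KiB page alignment: addresses must already be page aligned and lengths are rounded up with `(len + 0xfff) & !0xfff`. The helpers below spell out that arithmetic in isolation.

```rust
const PAGE_SIZE: usize = 0x1000;

/// True if `addr` sits on a 4 KiB page boundary.
fn is_page_aligned(addr: usize) -> bool {
    addr & (PAGE_SIZE - 1) == 0
}

/// Round `len` up to the next page boundary; equivalent to the
/// `(len + 0xfff) & !0xfff` expression in `do_munmap`.
fn page_align_up(len: usize) -> usize {
    (len + PAGE_SIZE - 1) & !(PAGE_SIZE - 1)
}

fn main() {
    assert!(is_page_aligned(0x7f00_0000));
    assert!(!is_page_aligned(0x7f00_0001));

    assert_eq!(page_align_up(1), PAGE_SIZE);
    assert_eq!(page_align_up(PAGE_SIZE), PAGE_SIZE);
    assert_eq!(page_align_up(PAGE_SIZE + 1), 2 * PAGE_SIZE);
}
```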

+ 11 - 8
src/kernel/syscall/procops.rs

@@ -5,7 +5,7 @@ use crate::io::Buffer;
 use crate::kernel::constants::{
     ENOSYS, PR_GET_NAME, PR_SET_NAME, RLIMIT_STACK, SIG_BLOCK, SIG_SETMASK, SIG_UNBLOCK,
 };
-use crate::kernel::mem::{Page, PageBuffer, VAddr};
+use crate::kernel::mem::PageBuffer;
 use crate::kernel::task::{
     KernelStack, ProcessBuilder, ProcessList, Signal, SignalAction, SignalMask, Thread,
     ThreadBuilder, ThreadRunnable, UserDescriptor, WaitObject, WaitType,
@@ -22,6 +22,7 @@ use alloc::ffi::CString;
 use arch::{ExtendedContext, InterruptContext};
 use bindings::{EINVAL, ENOENT, ENOTDIR, ERANGE, ESRCH};
 use bitflags::bitflags;
+use eonix_mm::address::{Addr as _, VAddr};
 use eonix_runtime::scheduler::Scheduler;
 use eonix_runtime::task::Task;
 use eonix_sync::AsProof as _;
@@ -39,11 +40,10 @@ fn do_umask(mask: u32) -> KResult<u32> {
 fn do_getcwd(buffer: *mut u8, bufsize: usize) -> KResult<usize> {
     let context = FsContext::get_current();
     let mut user_buffer = UserBuffer::new(buffer, bufsize)?;
+    let mut buffer = PageBuffer::new();
 
-    let page = Page::alloc_one();
-    let mut buffer = PageBuffer::new(page.clone());
     context.cwd.lock().get_path(&context, &mut buffer)?;
-    user_buffer.fill(page.as_slice())?.ok_or(ERANGE)?;
+    user_buffer.fill(buffer.data())?.ok_or(ERANGE)?;
 
     Ok(buffer.wrote())
 }
@@ -99,7 +99,10 @@ fn do_execve(exec: &[u8], argv: Vec<CString>, envp: Vec<CString>) -> KResult<(VA
     let elf = ParsedElf32::parse(dentry.clone())?;
     let result = elf.load(argv, envp);
     if let Ok((ip, sp, mm_list)) = result {
-        Thread::current().process.mm_list.replace(mm_list);
+        unsafe {
+            // SAFETY: We are doing execve, so all other threads have already been terminated.
+            Thread::current().process.mm_list.replace(Some(mm_list));
+        }
         Thread::current().files.on_exec();
         Thread::current().signal_list.clear_non_ignore();
         Thread::current().set_name(dentry.name().clone());
@@ -149,8 +152,8 @@ fn sys_execve(int_stack: &mut InterruptContext, _: &mut ExtendedContext) -> usiz
 
         let (ip, sp) = do_execve(exec.as_cstr().to_bytes(), argv_vec, envp_vec)?;
 
-        int_stack.rip = ip.0 as u64;
-        int_stack.rsp = sp.0 as u64;
+        int_stack.rip = ip.addr() as u64;
+        int_stack.rsp = sp.addr() as u64;
         Ok(())
     })() {
         Ok(_) => 0,
@@ -569,7 +572,7 @@ fn sys_fork(int_stack: &mut InterruptContext, _: &mut ExtendedContext) -> usize
 
     let thread_builder = ThreadBuilder::new().fork_from(&current);
     let (new_thread, new_process) = ProcessBuilder::new()
-        .mm_list(current_process.mm_list.new_cloned())
+        .mm_list(Task::block_on(current_process.mm_list.new_cloned()))
         .parent(current_process)
         .pgroup(current_pgroup)
         .session(current_session)
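`do_getcwd` above now lets `PageBuffer` own its backing storage instead of allocating a `Page` at the call site and handing it in. A standalone sketch of that buffer pattern under the assumption of a fixed 4 KiB backing store; the type and method names are hypothetical, not the kernel's `PageBuffer` API:

    struct FixedPageBuffer {
        data: [u8; 4096],
        wrote: usize,
    }

    impl FixedPageBuffer {
        fn new() -> Self {
            Self { data: [0; 4096], wrote: 0 }
        }

        /// Append as many bytes as fit and report how many were taken.
        fn write(&mut self, bytes: &[u8]) -> usize {
            let n = bytes.len().min(self.data.len() - self.wrote);
            self.data[self.wrote..self.wrote + n].copy_from_slice(&bytes[..n]);
            self.wrote += n;
            n
        }

        /// The written prefix, which the caller then copies out to user space.
        fn data(&self) -> &[u8] {
            &self.data[..self.wrote]
        }

        fn wrote(&self) -> usize {
            self.wrote
        }
    }

    fn main() {
        let mut buf = FixedPageBuffer::new();
        buf.write(b"/home/user");
        assert_eq!(buf.data(), b"/home/user");
        assert_eq!(buf.wrote(), 10);
    }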

+ 11 - 7
src/kernel/task/kernel_stack.rs

@@ -1,10 +1,11 @@
-use crate::kernel::mem::{paging::Page, phys::PhysPtr};
+use crate::kernel::mem::{paging::Page, PhysAccess as _};
+use core::{num::NonZero, ptr::NonNull};
 use eonix_runtime::executor::Stack;
 
 #[derive(Debug)]
 pub struct KernelStack {
     _pages: Page,
-    bottom: usize,
+    bottom: NonZero<usize>,
 }
 
 impl KernelStack {
@@ -13,8 +14,11 @@ impl KernelStack {
     const KERNEL_STACK_ORDER: u32 = 7;
 
     pub fn new() -> Self {
-        let pages = Page::alloc_many(Self::KERNEL_STACK_ORDER);
-        let bottom = pages.as_cached().offset(pages.len()).as_ptr::<u8>() as usize;
+        let pages = Page::alloc_order(Self::KERNEL_STACK_ORDER);
+        let bottom = unsafe {
+            // SAFETY: The paddr is from a page, which should be valid.
+            pages.range().end().as_ptr::<u8>().addr()
+        };
 
         Self {
             _pages: pages,
@@ -28,8 +32,8 @@ impl Stack for KernelStack {
         Self::new()
     }
 
-    fn get_bottom(&self) -> &() {
-        // SAFETY: We hold the ownership of a valid stack.
-        unsafe { &*(self.bottom as *const ()) }
+    fn get_bottom(&self) -> NonNull<()> {
+        // SAFETY: The stack is allocated and `bottom` is non-zero.
+        unsafe { NonNull::new_unchecked(self.bottom.get() as *mut _) }
     }
 }
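`KernelStack::new` above takes the stack bottom to be the one-past-the-end address of an order-7 page allocation and keeps it as a non-null, non-zero value. A freestanding sketch of the same arithmetic, with a `Vec` standing in for the kernel's page allocator:

    use core::ptr::NonNull;

    /// The bottom of a downward-growing stack is its highest address,
    /// i.e. one past the end of the backing allocation.
    fn stack_bottom(backing: &mut [u8]) -> NonNull<()> {
        let end = unsafe { backing.as_mut_ptr().add(backing.len()) };
        NonNull::new(end.cast()).expect("a live allocation is never at address zero")
    }

    fn main() {
        const ORDER: u32 = 7; // mirrors KERNEL_STACK_ORDER
        let mut backing = vec![0u8; 4096usize << ORDER]; // 128 pages of 4 KiB
        let bottom = stack_bottom(&mut backing);
        assert_eq!(
            bottom.as_ptr() as usize,
            backing.as_ptr() as usize + backing.len()
        );
    }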

+ 3 - 9
src/kernel/task/process_list.rs

@@ -1,10 +1,9 @@
 use super::{Process, ProcessGroup, Session, Signal, Thread, WaitObject, WaitType};
-use crate::{prelude::*, rcu::rcu_sync};
+use crate::rcu::rcu_sync;
 use alloc::{
     collections::btree_map::BTreeMap,
     sync::{Arc, Weak},
 };
-use bindings::KERNEL_PML4;
 use eonix_runtime::task::Task;
 use eonix_sync::{AsProof as _, AsProofMut as _, RwLock};
 
@@ -142,17 +141,12 @@ impl ProcessList {
             }
         }
 
-        eonix_preempt::disable();
-
         // Release the MMList as well as the page table.
-        // Before we release the page table, we need to switch to the kernel page table.
-        arch::set_root_page_table(KERNEL_PML4 as usize);
         unsafe {
-            process.mm_list.release();
+            // SAFETY: We are exiting the process, so nothing else can still be using it.
+            process.mm_list.replace(None);
         }
 
-        eonix_preempt::enable();
-
         // Make children orphans (adopted by init)
         {
             let init = self.init_process();

+ 10 - 10
src/kernel/task/signal/signal_action.rs

@@ -3,7 +3,6 @@ use crate::{
     io::BufferFill as _,
     kernel::{
         constants::{EFAULT, EINVAL, ENOSYS},
-        mem::VAddr,
         user::UserBuffer,
     },
     SIGNAL_NOW,
@@ -11,6 +10,7 @@ use crate::{
 use alloc::collections::btree_map::BTreeMap;
 use arch::{ExtendedContext, InterruptContext};
 use core::num::NonZero;
+use eonix_mm::address::{Addr as _, AddrOps as _, VAddr};
 use posix_types::signal::{SigAction, TryFromSigAction};
 
 #[derive(Debug, Clone, Copy)]
@@ -93,9 +93,9 @@ impl SignalAction {
                 // TODO!!!: Determine the size of the return address
                 let sp = VAddr::from(int_stack.rsp as usize - 128 - CONTEXT_SIZE).floor_to(16)
                     - size_of::<u32>();
-                let restorer_address = usize::from(restorer) as u32;
+                let restorer_address = restorer.addr() as u32;
                 let mut stack =
-                    UserBuffer::new(usize::from(sp) as *mut u8, CONTEXT_SIZE + size_of::<u32>())?;
+                    UserBuffer::new(sp.addr() as *mut u8, CONTEXT_SIZE + size_of::<u32>())?;
 
                 stack.copy(&restorer_address)?.ok_or(EFAULT)?; // Restorer address
                 stack.copy(&u32::from(signal))?.ok_or(EFAULT)?; // `signum`
@@ -103,8 +103,8 @@ impl SignalAction {
                 stack.copy(ext_ctx)?.ok_or(EFAULT)?; // MMX registers
                 stack.copy(int_stack)?.ok_or(EFAULT)?; // Interrupt stack
 
-                int_stack.rip = usize::from(handler) as u64;
-                int_stack.rsp = usize::from(sp) as u64;
+                int_stack.rip = handler.addr() as u64;
+                int_stack.rsp = sp.addr() as u64;
                 Ok(())
             }
         }
@@ -138,7 +138,7 @@ impl TryFromSigAction for SignalAction {
 
     fn new() -> Self {
         Self::SimpleHandler {
-            handler: VAddr(0),
+            handler: VAddr::NULL,
             restorer: None,
             mask: SignalMask::empty(),
         }
@@ -150,7 +150,7 @@ impl TryFromSigAction for SignalAction {
 
     fn handler(mut self, handler_addr: usize) -> Result<Self, Self::Error> {
         if let Self::SimpleHandler { handler, .. } = &mut self {
-            *handler = VAddr(handler_addr);
+            *handler = VAddr::from(handler_addr);
             Ok(self)
         } else {
             unreachable!()
@@ -159,7 +159,7 @@ impl TryFromSigAction for SignalAction {
 
     fn restorer(mut self, restorer_addr: usize) -> Result<Self, Self::Error> {
         if let Self::SimpleHandler { restorer, .. } = &mut self {
-            *restorer = NonZero::new(restorer_addr).map(|x| VAddr(x.get()));
+            *restorer = NonZero::new(restorer_addr).map(|x| VAddr::from(x.get()));
             Ok(self)
         } else {
             unreachable!()
@@ -187,11 +187,11 @@ impl From<SignalAction> for SigAction {
                 mask,
             } => {
                 let action = SigAction::new()
-                    .handler(usize::from(handler))
+                    .handler(handler.addr())
                     .mask(u64::from(mask));
 
                 if let Some(restorer) = restorer {
-                    action.restorer(usize::from(restorer))
+                    action.restorer(restorer.addr())
                 } else {
                     action
                 }
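The signal-frame setup above computes the user stack pointer by skipping the 128-byte red zone, reserving room for the saved context, flooring to 16-byte alignment, and then subtracting 4 bytes for the restorer return-address slot. A self-contained sketch of that arithmetic; `CONTEXT_SIZE` below is a placeholder value, not the kernel's real constant:

    const RED_ZONE: usize = 128;
    const CONTEXT_SIZE: usize = 64; // placeholder; the kernel computes the real size

    fn floor_to(addr: usize, align: usize) -> usize {
        // `align` must be a power of two.
        addr & !(align - 1)
    }

    fn signal_frame_sp(rsp: usize) -> usize {
        floor_to(rsp - RED_ZONE - CONTEXT_SIZE, 16) - core::mem::size_of::<u32>()
    }

    fn main() {
        let sp = signal_frame_sp(0x7fff_f000);
        // The saved context sitting just above the 4-byte slot stays 16-byte aligned.
        assert_eq!((sp + 4) % 16, 0);
    }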

+ 5 - 8
src/kernel/task/thread.rs

@@ -5,7 +5,6 @@ use super::{
 use crate::{
     kernel::{
         cpu::local_cpu,
-        mem::VAddr,
         user::dataflow::CheckedUserPointer,
         vfs::{filearray::FileArray, FsContext},
     },
@@ -13,7 +12,6 @@ use crate::{
 };
 use alloc::sync::Arc;
 use arch::{InterruptContext, UserTLS, _arch_fork_return};
-use bindings::KERNEL_PML4;
 use core::{
     arch::asm,
     pin::Pin,
@@ -21,6 +19,7 @@ use core::{
     sync::atomic::{AtomicUsize, Ordering},
     task::Waker,
 };
+use eonix_mm::address::{Addr as _, VAddr};
 use eonix_runtime::{
     context::ExecutionContext,
     run::{Contexted, Run, RunState},
@@ -298,11 +297,9 @@ impl Thread {
 
 impl ThreadRunnable {
     pub fn new(thread: Arc<Thread>, entry: VAddr, stack_pointer: VAddr) -> Self {
-        let (VAddr(entry), VAddr(stack_pointer)) = (entry, stack_pointer);
-
         let mut interrupt_context = InterruptContext::default();
-        interrupt_context.set_return_address(entry as _, true);
-        interrupt_context.set_stack_pointer(stack_pointer as _, true);
+        interrupt_context.set_return_address(entry.addr() as _, true);
+        interrupt_context.set_stack_pointer(stack_pointer.addr() as _, true);
         interrupt_context.set_interrupt_enabled(true);
 
         Self {
@@ -347,7 +344,7 @@ impl Contexted for ThreadRunnable {
             CURRENT_THREAD.swap(Some(current_thread));
         }
 
-        thread.process.mm_list.switch_page_table();
+        thread.process.mm_list.activate();
 
         unsafe {
             // SAFETY: Preemption is disabled.
@@ -356,7 +353,7 @@ impl Contexted for ThreadRunnable {
     }
 
     fn restore_running_context(&self) {
-        arch::set_root_page_table(KERNEL_PML4 as usize);
+        self.thread.process.mm_list.deactivate();
     }
 }
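Replacing the explicit `arch::set_root_page_table(KERNEL_PML4)` call with paired `mm_list.activate()` / `mm_list.deactivate()` keeps the address-space switch behind the MMList abstraction. A hedged sketch of that pairing expressed as a scope guard; the `AddressSpace` type and its method bodies are illustrative only, not the kernel's implementation:

    struct AddressSpace;

    impl AddressSpace {
        fn activate(&self) {
            // In the kernel this would load the process's root page table.
        }

        fn deactivate(&self) {
            // In the kernel this would switch back to the kernel page table.
        }
    }

    /// Tie deactivation to scope exit so the two calls can never become unbalanced.
    struct ActiveGuard<'a>(&'a AddressSpace);

    impl<'a> ActiveGuard<'a> {
        fn enter(space: &'a AddressSpace) -> Self {
            space.activate();
            Self(space)
        }
    }

    impl Drop for ActiveGuard<'_> {
        fn drop(&mut self) {
            self.0.deactivate();
        }
    }

    fn main() {
        let space = AddressSpace;
        let _active = ActiveGuard::enter(&space);
        // ... run the thread while its address space is active ...
    } // `deactivate()` runs here when the guard is dropped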
 

+ 14 - 21
src/kernel/vfs/file.rs

@@ -4,10 +4,10 @@ use super::{
     s_isblk, s_isdir, s_isreg,
 };
 use crate::{
-    io::{Buffer, BufferFill, ByteBuffer},
+    io::{Buffer, BufferFill, ByteBuffer, Chunks},
     kernel::{
         constants::{TCGETS, TCSETS, TIOCGPGRP, TIOCGWINSZ, TIOCSPGRP},
-        mem::paging::Page,
+        mem::{paging::Page, AsMemoryBlock as _},
         task::{Signal, Thread},
         terminal::{Terminal, TerminalIORequest},
         user::{UserPointer, UserPointerMut},
@@ -498,38 +498,31 @@ impl File {
     }
 
     pub async fn sendfile(&self, dest_file: &Self, count: usize) -> KResult<usize> {
-        let buffer_page = Page::alloc_one();
+        let buffer_page = Page::alloc();
+        // SAFETY: We are the only owner of the page.
+        let buffer = unsafe { buffer_page.as_memblk().as_bytes_mut() };
 
         match self {
             File::Inode(file) if s_isblk(file.mode) || s_isreg(file.mode) => (),
             _ => return Err(EINVAL),
         }
 
-        // TODO!!!: zero copy implementation with mmap
-        let mut tot = 0usize;
-        while tot < count {
+        for (cur, len) in Chunks::new(0, count, buffer.len()) {
             if Thread::current().signal_list.has_pending_signal() {
-                if tot == 0 {
-                    return Err(EINTR);
-                } else {
-                    return Ok(tot);
-                }
+                return if cur == 0 { Err(EINTR) } else { Ok(cur) };
             }
-
-            let batch_size = usize::min(count - tot, buffer_page.len());
-            let slice = &mut buffer_page.as_mut_slice()[..batch_size];
-            let mut buffer = ByteBuffer::new(slice);
-
-            let nwrote = self.read(&mut buffer).await?;
-
-            if nwrote == 0 {
+            let nread = self.read(&mut ByteBuffer::new(&mut buffer[..len])).await?;
+            if nread == 0 {
                 break;
             }
 
-            tot += dest_file.write(&slice[..nwrote]).await?;
+            let nwrote = dest_file.write(&buffer[..nread]).await?;
+            if nwrote != nread {
+                return Ok(cur + nwrote);
+            }
         }
 
-        Ok(tot)
+        Ok(count)
     }
 
     pub fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
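`sendfile` above now drives the copy loop with precomputed `(offset, length)` chunks instead of maintaining a running total by hand. A minimal sketch of the chunking it relies on, written as a plain free function rather than the kernel's `Chunks` type:

    /// Yield (offset, length) pairs covering `total` bytes in `chunk`-sized steps.
    fn chunks(start: usize, total: usize, chunk: usize) -> impl Iterator<Item = (usize, usize)> {
        (start..start + total)
            .step_by(chunk)
            .map(move |cur| (cur, chunk.min(start + total - cur)))
    }

    fn main() {
        let parts: Vec<_> = chunks(0, 10_000, 4096).collect();
        assert_eq!(parts, vec![(0, 4096), (4096, 4096), (8192, 1808)]);
    }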

+ 6 - 4
src/lib.rs

@@ -28,6 +28,7 @@ mod sync;
 use alloc::{ffi::CString, sync::Arc};
 use core::alloc::{GlobalAlloc, Layout};
 use elf::ParsedElf32;
+use eonix_mm::{address::PAddr, paging::PFN};
 use eonix_runtime::{run::FutureRun, scheduler::Scheduler, task::Task};
 use kernel::{
     cpu::init_localcpu,
@@ -95,7 +96,7 @@ extern "C" {
 }
 
 #[no_mangle]
-pub extern "C" fn rust_kinit(early_kstack_pfn: usize) -> ! {
+pub extern "C" fn rust_kinit(early_kstack_paddr: PAddr) -> ! {
     // We don't call global constructors.
     // Rust doesn't need that, and we're not going to use global variables in C++.
     init_localcpu();
@@ -114,7 +115,8 @@ pub extern "C" fn rust_kinit(early_kstack_pfn: usize) -> ! {
     // So call `init_vfs` first, then `init_multitasking`.
     Scheduler::init_local_scheduler::<KernelStack>();
 
-    Scheduler::get().spawn::<KernelStack, _>(FutureRun::new(init_process(early_kstack_pfn)));
+    Scheduler::get()
+        .spawn::<KernelStack, _>(FutureRun::new(init_process(PFN::from(early_kstack_paddr))));
 
     unsafe {
         // SAFETY: `preempt::count()` == 1.
@@ -122,8 +124,8 @@ pub extern "C" fn rust_kinit(early_kstack_pfn: usize) -> ! {
     }
 }
 
-async fn init_process(early_kstack_pfn: usize) {
-    unsafe { Page::take_pfn(early_kstack_pfn, 9) };
+async fn init_process(early_kstack_pfn: PFN) {
+    unsafe { Page::from_raw(early_kstack_pfn) };
 
     kernel::syscall::register_syscalls();
     CharDevice::init().unwrap();
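`rust_kinit` now receives the early stack as a `PAddr` and converts it to a page frame number before handing the backing pages back via `Page::from_raw`. The relationship assumed behind that `PFN::from(PAddr)` conversion is a shift by the page-size order; a tiny sketch with illustrative stand-in types:

    const PAGE_SHIFT: usize = 12; // 4 KiB pages

    #[derive(Debug, PartialEq)]
    struct Pfn(usize);

    fn pfn_from_paddr(paddr: usize) -> Pfn {
        Pfn(paddr >> PAGE_SHIFT)
    }

    fn main() {
        assert_eq!(pfn_from_paddr(0x7c00_0000), Pfn(0x7c000));
    }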

+ 1 - 0
src/sync.rs

@@ -1,5 +1,6 @@
 mod arcswap;
 mod condvar;
+pub mod fence;
 
 pub use arcswap::ArcSwap;
 pub use eonix_sync::Spin;

+ 34 - 0
src/sync/fence.rs

@@ -0,0 +1,34 @@
+use core::sync::atomic::{compiler_fence, Ordering};
+
+/// A strong memory barrier that prevents reordering of memory operations.
+pub fn memory_barrier() {
+    // A compiler fence to keep the compiler from reordering accesses across the barrier.
+    compiler_fence(Ordering::SeqCst);
+
+    arch::memory_barrier();
+
+    // A compiler fence to keep the compiler from reordering accesses across the barrier.
+    compiler_fence(Ordering::SeqCst);
+}
+
+/// A read memory barrier that prevents reordering of read operations.
+pub fn read_memory_barrier() {
+    // A compiler fence to keep the compiler from reordering reads across the barrier.
+    compiler_fence(Ordering::SeqCst);
+
+    arch::read_memory_barrier();
+
+    // A compiler fence to keep the compiler from reordering reads across the barrier.
+    compiler_fence(Ordering::SeqCst);
+}
+
+/// A write memory barrier that prevents reordering of write operations.
+pub fn write_memory_barrier() {
+    // A compiler fence to keep the compiler from reordering writes across the barrier.
+    compiler_fence(Ordering::SeqCst);
+
+    arch::write_memory_barrier();
+
+    // A compiler fence to keep the compiler from reordering writes across the barrier.
+    compiler_fence(Ordering::SeqCst);
+}
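A hedged usage sketch of where these barriers matter: publishing a DMA descriptor before ringing a device doorbell. The `memory_barrier` stand-in below only issues the compiler fence so the example stays freestanding; the real function additionally executes the architecture's hardware barrier:

    use core::sync::atomic::{compiler_fence, Ordering};

    fn memory_barrier() {
        // Stand-in: compiler fence only; the kernel version also calls arch::memory_barrier().
        compiler_fence(Ordering::SeqCst);
    }

    fn submit_descriptor(desc: &mut [u32; 4], doorbell: &mut u32) {
        desc[0] = 0xdead_beef; // fill in the descriptor first
        // Ensure the descriptor writes are ordered before the doorbell write.
        memory_barrier();
        *doorbell = 1;
    }

    fn main() {
        let mut desc = [0u32; 4];
        let mut doorbell = 0u32;
        submit_descriptor(&mut desc, &mut doorbell);
        assert_eq!(doorbell, 1);
    }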