
refactor(mm): new abstraction of page table, etc.

greatbridf 10 months ago
parent
commit
dc97e018fa
87 files changed with 3,921 additions and 2,507 deletions
  1. +24 -11  Cargo.lock
  2. +6 -2  Cargo.toml
  3. +2 -1  arch/Cargo.toml
  4. +34 -0  arch/src/x86_64/fence.rs
  5. +218 -0  arch/src/x86_64/mm.rs
  6. +12 -8  arch/src/x86_64/mod.rs
  7. +9 -0  crates/buddy_allocator/Cargo.toml
  8. +47 -0  crates/buddy_allocator/src/free_area.rs
  9. +73 -0  crates/buddy_allocator/src/lib.rs
  10. +122 -0  crates/buddy_allocator/src/zone.rs
  11. +0 -1  crates/eonix_log/Cargo.toml
  12. +0 -1  crates/eonix_log/src/lib.rs
  13. +7 -0  crates/eonix_mm/Cargo.toml
  14. +14 -0  crates/eonix_mm/src/address.rs
  15. +64 -0  crates/eonix_mm/src/address/addr.rs
  16. +190 -0  crates/eonix_mm/src/address/addr_range.rs
  17. +4 -0  crates/eonix_mm/src/address/error.rs
  18. +65 -0  crates/eonix_mm/src/address/paddr.rs
  19. +60 -0  crates/eonix_mm/src/address/vaddr.rs
  20. +5 -0  crates/eonix_mm/src/lib.rs
  21. +9 -0  crates/eonix_mm/src/page_table.rs
  22. +132 -0  crates/eonix_mm/src/page_table/page_table.rs
  23. +38 -0  crates/eonix_mm/src/page_table/paging_mode.rs
  24. +52 -0  crates/eonix_mm/src/page_table/pte.rs
  25. +177 -0  crates/eonix_mm/src/page_table/pte_iterator.rs
  26. +9 -0  crates/eonix_mm/src/paging.rs
  27. +219 -0  crates/eonix_mm/src/paging/page.rs
  28. +31 -0  crates/eonix_mm/src/paging/page_alloc.rs
  29. +65 -0  crates/eonix_mm/src/paging/pfn.rs
  30. +97 -0  crates/eonix_mm/src/paging/raw_page.rs
  31. +12 -0  crates/eonix_percpu/Cargo.toml
  32. +24 -0  crates/eonix_percpu/src/arch.rs
  33. +181 -0  crates/eonix_percpu/src/lib.rs
  34. +0 -1  crates/eonix_runtime/Cargo.toml
  35. +1 -1  crates/eonix_runtime/src/executor/builder.rs
  36. +3 -1  crates/eonix_runtime/src/executor/stack.rs
  37. +1 -2  crates/eonix_runtime/src/scheduler.rs
  38. +0 -8  crates/eonix_spin_irq/Cargo.toml
  39. +1 -0  crates/eonix_sync/Cargo.toml
  40. +4 -1  crates/eonix_sync/src/lib.rs
  41. +14 -0  crates/eonix_sync/src/spin.rs
  42. +6 -34  crates/eonix_sync/src/spin/spin_irq.rs
  43. +7 -4  crates/eonix_sync/src/wait_list.rs
  44. +2 -2  crates/eonix_sync/src/wait_list/wait_handle.rs
  45. +7 -3  crates/eonix_sync/src/wait_list/wait_object.rs
  46. +6 -0  crates/intrusive_list/Cargo.toml
  47. +59 -0  crates/intrusive_list/src/lib.rs
  48. +3 -5  src/driver/ahci/command.rs
  49. +48 -0  src/driver/ahci/command_table.rs
  50. +11 -8  src/driver/ahci/control.rs
  51. +4 -1  src/driver/ahci/defs.rs
  52. +63 -65  src/driver/ahci/mod.rs
  53. +96 -209  src/driver/ahci/port.rs
  54. +58 -0  src/driver/ahci/register.rs
  55. +94 -0  src/driver/ahci/slot.rs
  56. +46 -0  src/driver/ahci/stats.rs
  57. +434 -434  src/driver/e1000e.rs
  58. +0 -1  src/driver/serial.rs
  59. +11 -11  src/elf.rs
  60. +10 -4  src/fs/fat32/file.rs
  61. +9 -6  src/fs/procfs.rs
  62. +40 -4  src/io.rs
  63. +10 -5  src/kernel/block.rs
  64. +8 -5  src/kernel/cpu.rs
  65. +0 -1  src/kernel/interrupt.rs
  66. +2 -5  src/kernel/mem.rs
  67. +158 -0  src/kernel/mem/access.rs
  68. +17 -398  src/kernel/mem/address.rs
  69. +60 -56  src/kernel/mem/mm_area.rs
  70. +318 -145  src/kernel/mem/mm_list.rs
  71. +39 -0  src/kernel/mem/mm_list/mapping.rs
  72. +9 -8  src/kernel/mem/mm_list/page_fault.rs
  73. +92 -403  src/kernel/mem/page_alloc.rs
  74. +0 -316  src/kernel/mem/page_table.rs
  75. +52 -163  src/kernel/mem/paging.rs
  76. +0 -80  src/kernel/mem/phys.rs
  77. +7 -8  src/kernel/smp.rs
  78. +14 -18  src/kernel/syscall/mm.rs
  79. +11 -8  src/kernel/syscall/procops.rs
  80. +11 -7  src/kernel/task/kernel_stack.rs
  81. +3 -9  src/kernel/task/process_list.rs
  82. +10 -10  src/kernel/task/signal/signal_action.rs
  83. +5 -8  src/kernel/task/thread.rs
  84. +14 -21  src/kernel/vfs/file.rs
  85. +6 -4  src/lib.rs
  86. +1 -0  src/sync.rs
  87. +34 -0  src/sync/fence.rs

+ 24 - 11
Cargo.lock

@@ -16,6 +16,7 @@ name = "arch"
 version = "0.1.0"
 dependencies = [
  "cfg-if",
+ "eonix_mm",
  "percpu-macros",
 ]
 
@@ -55,6 +56,15 @@ version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
 
+[[package]]
+name = "buddy_allocator"
+version = "0.1.0"
+dependencies = [
+ "eonix_mm",
+ "eonix_sync",
+ "intrusive_list",
+]
+
 [[package]]
 name = "cexpr"
 version = "0.6.0"
@@ -91,10 +101,16 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 name = "eonix_log"
 version = "0.1.0"
 dependencies = [
- "eonix_spin_irq",
  "eonix_sync",
 ]
 
+[[package]]
+name = "eonix_mm"
+version = "0.1.0"
+dependencies = [
+ "intrusive_list",
+]
+
 [[package]]
 name = "eonix_preempt"
 version = "0.1.0"
@@ -110,24 +126,16 @@ dependencies = [
  "atomic_unique_refcell",
  "eonix_log",
  "eonix_preempt",
- "eonix_spin_irq",
  "eonix_sync",
  "intrusive-collections",
  "pointers",
 ]
 
-[[package]]
-name = "eonix_spin_irq"
-version = "0.1.0"
-dependencies = [
- "arch",
- "eonix_sync",
-]
-
 [[package]]
 name = "eonix_sync"
 version = "0.1.0"
 dependencies = [
+ "arch",
  "eonix_preempt",
  "intrusive-collections",
 ]
@@ -140,10 +148,11 @@ dependencies = [
  "atomic_unique_refcell",
  "bindgen",
  "bitflags",
+ "buddy_allocator",
  "eonix_log",
+ "eonix_mm",
  "eonix_preempt",
  "eonix_runtime",
- "eonix_spin_irq",
  "eonix_sync",
  "intrusive-collections",
  "itertools",
@@ -166,6 +175,10 @@ dependencies = [
  "memoffset",
 ]
 
+[[package]]
+name = "intrusive_list"
+version = "0.1.0"
+
 [[package]]
 name = "itertools"
 version = "0.13.0"

+ 6 - 2
Cargo.toml

@@ -11,9 +11,10 @@ arch = { path = "./arch" }
 atomic_unique_refcell = { path = "./crates/atomic_unique_refcell", features = [
     "no_std",
 ] }
+buddy_allocator = { path = "./crates/buddy_allocator" }
+eonix_mm = { path = "./crates/eonix_mm" }
 eonix_preempt = { path = "./crates/eonix_preempt" }
 eonix_runtime = { path = "./crates/eonix_runtime" }
-eonix_spin_irq = { path = "./crates/eonix_spin_irq" }
 eonix_sync = { path = "./crates/eonix_sync" }
 eonix_log = { path = "./crates/eonix_log" }
 pointers = { path = "./crates/pointers" }
@@ -47,7 +48,10 @@ opt-level = 2
 opt-level = 0
 
 [profile.dev.package.eonix_sync]
-opt-level = 0
+opt-level = 2
+
+[profile.dev.package.intrusive_list]
+opt-level = 2
 
 [profile.dev.package."*"]
 opt-level = "s"

+ 2 - 1
arch/Cargo.toml

@@ -4,5 +4,6 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-percpu-macros = { path="./percpu-macros" }
+eonix_mm = { path = "../crates/eonix_mm" }
+percpu-macros = { path = "./percpu-macros" }
 cfg-if = "1.0"

+ 34 - 0
arch/src/x86_64/fence.rs

@@ -0,0 +1,34 @@
+use core::arch::asm;
+
+#[doc(hidden)]
+/// Issues a full memory barrier.
+///
+/// Note that this acts as a low-level operation **ONLY** and should be used with caution.
+/// **NO COMPILER BARRIERS** are emitted by this function.
+pub fn memory_barrier() {
+    unsafe {
+        asm!("mfence", options(nostack, nomem, preserves_flags));
+    }
+}
+
+#[doc(hidden)]
+/// Issues a read memory barrier.
+///
+/// Note that this acts as a low-level operation **ONLY** and should be used with caution.
+/// **NO COMPILER BARRIERS** are emitted by this function.
+pub fn read_memory_barrier() {
+    unsafe {
+        asm!("lfence", options(nostack, nomem, preserves_flags));
+    }
+}
+
+#[doc(hidden)]
+/// Issues a write memory barrier.
+///
+/// Note that this acts as a low-level operation **ONLY** and should be used with caution.
+/// **NO COMPILER BARRIERS** are emitted by this function.
+pub fn write_memory_barrier() {
+    unsafe {
+        asm!("sfence", options(nostack, nomem, preserves_flags));
+    }
+}
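
Since these fences are hardware-only by design, a caller that also needs to stop compiler reordering has to pair them with an explicit compiler fence. A minimal sketch, assuming the functions above are in scope:

    use core::sync::atomic::{compiler_fence, Ordering};

    /// A full barrier at both the compiler and the hardware level.
    fn full_memory_barrier() {
        compiler_fence(Ordering::SeqCst); // stop compiler reordering
        memory_barrier();                 // `mfence`: stop CPU reordering
        compiler_fence(Ordering::SeqCst);
    }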

+ 218 - 0
arch/src/x86_64/mm.rs

@@ -1 +1,219 @@
+use core::{marker::PhantomData, ptr::NonNull};
+use eonix_mm::{
+    address::{Addr as _, PAddr},
+    page_table::{PageAttribute, PageTableLevel, PagingMode, RawPageTable, PTE},
+    paging::{PageBlock, PFN},
+};
+
 pub const PAGE_SIZE: usize = 0x1000;
+
+const KERNEL_PML4_PFN: PFN = PFN::from_val(0x2000 >> 12);
+
+const PA_P: u64 = 0x001;
+const PA_RW: u64 = 0x002;
+const PA_US: u64 = 0x004;
+#[allow(dead_code)]
+const PA_PWT: u64 = 0x008;
+#[allow(dead_code)]
+const PA_PCD: u64 = 0x010;
+const PA_A: u64 = 0x020;
+const PA_D: u64 = 0x040;
+#[allow(dead_code)]
+const PA_PS: u64 = 0x080;
+const PA_G: u64 = 0x100;
+const PA_COW: u64 = 0x200;
+const PA_MMAP: u64 = 0x400;
+const PA_ANON: u64 = 0x800;
+const PA_NXE: u64 = 0x8000_0000_0000_0000;
+const PA_MASK: u64 = 0xfff0_0000_0000_0fff;
+
+#[repr(transparent)]
+pub struct PTE64(u64);
+
+#[derive(Clone, Copy)]
+pub struct PageAttribute64(u64);
+
+pub struct RawPageTable4Levels<'a>(NonNull<PTE64>, PhantomData<&'a ()>);
+
+pub struct PagingMode4Levels;
+
+impl PTE for PTE64 {
+    type Attr = PageAttribute64;
+
+    fn set(&mut self, pfn: PFN, attr: Self::Attr) {
+        let paddr = PAddr::from(pfn).addr();
+
+        self.0 = (paddr as u64 & !PA_MASK) | (attr.0 & PA_MASK);
+    }
+
+    fn get(&self) -> (PFN, Self::Attr) {
+        (
+            PFN::from(PAddr::from((self.0 & !PA_MASK) as usize)),
+            PageAttribute64(self.0 & PA_MASK),
+        )
+    }
+
+    fn take(&mut self) -> (PFN, Self::Attr) {
+        let pfn_attr = self.get();
+        self.0 = 0;
+        pfn_attr
+    }
+}
+
+impl PagingMode for PagingMode4Levels {
+    type Entry = PTE64;
+    type RawTable<'a> = RawPageTable4Levels<'a>;
+
+    const LEVELS: &'static [PageTableLevel] = &[
+        PageTableLevel::new(39, 9),
+        PageTableLevel::new(30, 9),
+        PageTableLevel::new(21, 9),
+        PageTableLevel::new(12, 9),
+    ];
+
+    const KERNEL_ROOT_TABLE_PFN: PFN = KERNEL_PML4_PFN;
+}
+
+impl<'a> RawPageTable<'a> for RawPageTable4Levels<'a> {
+    type Entry = PTE64;
+
+    fn index(&self, index: u16) -> &'a Self::Entry {
+        unsafe { &self.0.cast::<[PTE64; 512]>().as_ref()[index as usize] }
+    }
+
+    fn index_mut(&mut self, index: u16) -> &'a mut Self::Entry {
+        unsafe { &mut self.0.cast::<[PTE64; 512]>().as_mut()[index as usize] }
+    }
+
+    unsafe fn from_ptr(ptr: NonNull<PageBlock>) -> Self {
+        Self(ptr.cast(), PhantomData)
+    }
+}
+
+impl PageAttribute for PageAttribute64 {
+    fn new() -> Self {
+        Self(PA_NXE)
+    }
+
+    fn present(self, present: bool) -> Self {
+        if present {
+            Self(self.0 | PA_P)
+        } else {
+            Self(self.0 & !PA_P)
+        }
+    }
+
+    fn write(self, write: bool) -> Self {
+        if write {
+            Self(self.0 | PA_RW)
+        } else {
+            Self(self.0 & !PA_RW)
+        }
+    }
+
+    fn execute(self, execute: bool) -> Self {
+        if execute {
+            Self(self.0 & !PA_NXE)
+        } else {
+            Self(self.0 | PA_NXE)
+        }
+    }
+
+    fn user(self, user: bool) -> Self {
+        if user {
+            Self(self.0 | PA_US)
+        } else {
+            Self(self.0 & !PA_US)
+        }
+    }
+
+    fn accessed(self, accessed: bool) -> Self {
+        if accessed {
+            Self(self.0 | PA_A)
+        } else {
+            Self(self.0 & !PA_A)
+        }
+    }
+
+    fn dirty(self, dirty: bool) -> Self {
+        if dirty {
+            Self(self.0 | PA_D)
+        } else {
+            Self(self.0 & !PA_D)
+        }
+    }
+
+    fn global(self, global: bool) -> Self {
+        if global {
+            Self(self.0 | PA_G)
+        } else {
+            Self(self.0 & !PA_G)
+        }
+    }
+
+    fn copy_on_write(self, cow: bool) -> Self {
+        if cow {
+            Self(self.0 | PA_COW)
+        } else {
+            Self(self.0 & !PA_COW)
+        }
+    }
+
+    fn mapped(self, mmap: bool) -> Self {
+        if mmap {
+            Self(self.0 | PA_MMAP)
+        } else {
+            Self(self.0 & !PA_MMAP)
+        }
+    }
+
+    fn anonymous(self, anon: bool) -> Self {
+        if anon {
+            Self(self.0 | PA_ANON)
+        } else {
+            Self(self.0 & !PA_ANON)
+        }
+    }
+
+    fn is_present(&self) -> bool {
+        self.0 & PA_P != 0
+    }
+
+    fn is_write(&self) -> bool {
+        self.0 & PA_RW != 0
+    }
+
+    fn is_execute(&self) -> bool {
+        self.0 & PA_NXE == 0
+    }
+
+    fn is_user(&self) -> bool {
+        self.0 & PA_US != 0
+    }
+
+    fn is_accessed(&self) -> bool {
+        self.0 & PA_A != 0
+    }
+
+    fn is_dirty(&self) -> bool {
+        self.0 & PA_D != 0
+    }
+
+    fn is_global(&self) -> bool {
+        self.0 & PA_G != 0
+    }
+
+    fn is_copy_on_write(&self) -> bool {
+        self.0 & PA_COW != 0
+    }
+
+    fn is_mapped(&self) -> bool {
+        self.0 & PA_MMAP != 0
+    }
+
+    fn is_anonymous(&self) -> bool {
+        self.0 & PA_ANON != 0
+    }
+}
+
+pub type DefaultPagingMode = PagingMode4Levels;
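
A round trip through `PTE64` shows how `PA_MASK` splits an entry into address bits and attribute bits. A sketch, written as if it lived inside this module (where `PTE64`'s private field and the trait imports above are available):

    fn pte64_round_trip() {
        let mut pte = PTE64(0);
        let attr = PageAttribute64::new().present(true).write(true).user(true);
        pte.set(PFN::from_val(0x1234), attr);

        let (pfn, attr) = pte.get();
        assert_eq!(usize::from(pfn), 0x1234);
        assert!(attr.is_present() && attr.is_write() && attr.is_user());
        assert!(!attr.is_execute()); // `new()` starts with the NX bit set
    }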

+ 12 - 8
arch/src/x86_64/mod.rs

@@ -1,4 +1,5 @@
 mod context;
+mod fence;
 mod gdt;
 mod init;
 mod interrupt;
@@ -7,18 +8,21 @@ mod mm;
 mod percpu;
 mod user;
 
+use core::arch::asm;
+use eonix_mm::address::{Addr as _, PAddr, VAddr};
+use eonix_mm::paging::PFN;
+
 pub use self::context::*;
 pub use self::gdt::*;
 pub use self::init::*;
 pub use self::interrupt::*;
 pub use self::io::*;
 pub use self::user::*;
+pub use fence::*;
 pub use mm::*;
 pub use percpu::*;
 pub use percpu_macros::{define_percpu, define_percpu_shared};
 
-use core::arch::asm;
-
 #[inline(always)]
 pub fn flush_tlb(vaddr: usize) {
     unsafe {
@@ -43,7 +47,7 @@ pub fn flush_tlb_all() {
 }
 
 #[inline(always)]
-pub fn get_root_page_table() -> usize {
+pub fn get_root_page_table_pfn() -> PFN {
     let cr3: usize;
     unsafe {
         asm!(
@@ -52,22 +56,22 @@ pub fn get_root_page_table() -> usize {
             options(att_syntax)
         );
     }
-    cr3
+    PFN::from(PAddr::from(cr3))
 }
 
 #[inline(always)]
-pub fn set_root_page_table(pfn: usize) {
+pub fn set_root_page_table_pfn(pfn: PFN) {
     unsafe {
         asm!(
             "mov {0}, %cr3",
-            in(reg) pfn,
+            in(reg) PAddr::from(pfn).addr(),
             options(att_syntax)
         );
     }
 }
 
 #[inline(always)]
-pub fn get_page_fault_address() -> usize {
+pub fn get_page_fault_address() -> VAddr {
     let cr2: usize;
     unsafe {
         asm!(
@@ -76,7 +80,7 @@ pub fn get_page_fault_address() -> usize {
             options(att_syntax)
         );
     }
-    cr2
+    VAddr::from(cr2)
 }
 
 #[inline(always)]

+ 9 - 0
crates/buddy_allocator/Cargo.toml

@@ -0,0 +1,9 @@
+[package]
+name = "buddy_allocator"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+eonix_mm = { path = "../eonix_mm" }
+eonix_sync = { path = "../eonix_sync" }
+intrusive_list = { path = "../intrusive_list" }

+ 47 - 0
crates/buddy_allocator/src/free_area.rs

@@ -0,0 +1,47 @@
+use core::marker::{Send, Sync};
+use eonix_mm::paging::{PageFlags, RawPage, RawPagePtr};
+use intrusive_list::{container_of, Link};
+
+pub struct FreeArea {
+    free_list: Link,
+    count: usize,
+}
+
+unsafe impl Send for FreeArea {}
+unsafe impl Sync for FreeArea {}
+
+impl FreeArea {
+    pub const fn new() -> Self {
+        Self {
+            free_list: Link::new(),
+            count: 0,
+        }
+    }
+
+    pub fn get_free_pages(&mut self) -> Option<RawPagePtr> {
+        self.free_list.next_mut().map(|pages_link| {
+            assert_ne!(self.count, 0);
+
+            let pages_ptr = unsafe { container_of!(pages_link, RawPage, link) };
+            let pages_ptr = RawPagePtr::new(pages_ptr);
+
+            self.count -= 1;
+            pages_link.remove();
+
+            pages_ptr
+        })
+    }
+
+    pub fn add_pages(&mut self, pages_ptr: RawPagePtr) {
+        self.count += 1;
+        pages_ptr.as_mut().flags.set(PageFlags::FREE);
+        self.free_list.insert(&mut pages_ptr.as_mut().link)
+    }
+
+    pub fn del_pages(&mut self, pages_ptr: RawPagePtr) {
+        assert!(self.count >= 1 && pages_ptr.as_ref().flags.has(PageFlags::FREE));
+        self.count -= 1;
+        pages_ptr.as_mut().flags.clear(PageFlags::FREE);
+        pages_ptr.as_mut().link.remove();
+    }
+}

+ 73 - 0
crates/buddy_allocator/src/lib.rs

@@ -0,0 +1,73 @@
+#![no_std]
+
+mod free_area;
+mod zone;
+
+use core::sync::atomic::Ordering;
+use eonix_mm::{
+    address::PAddr,
+    paging::{PageAlloc, PageFlags, RawPagePtr, PFN},
+};
+use eonix_sync::Spin;
+use zone::Zone;
+
+pub use free_area::FreeArea;
+
+const MAX_ORDER: u32 = 10;
+const ZONE_AREAS: usize = const { MAX_ORDER as usize + 1 };
+
+static BUDDY_ALLOCATOR: BuddyAllocator = BuddyAllocator::new();
+
+pub struct BuddyAllocator {
+    zone: Spin<Zone<ZONE_AREAS>>,
+}
+
+impl BuddyAllocator {
+    const fn new() -> Self {
+        Self {
+            zone: Spin::new(Zone::new()),
+        }
+    }
+
+    pub fn create_pages(start: PAddr, end: PAddr) {
+        BUDDY_ALLOCATOR.zone.lock().create_pages(start, end);
+    }
+}
+
+impl PageAlloc for BuddyAllocator {
+    fn alloc_order(order: u32) -> Option<RawPagePtr> {
+        let pages_ptr = BUDDY_ALLOCATOR.zone.lock().get_free_pages(order);
+
+        if let Some(pages_ptr) = pages_ptr {
+            // SAFETY: The memory order here can be Relaxed for the same reason as
+            // in the copy constructor of `std::shared_ptr`.
+            pages_ptr.refcount().fetch_add(1, Ordering::Relaxed);
+            pages_ptr.flags().clear(PageFlags::FREE);
+        }
+
+        pages_ptr
+    }
+
+    unsafe fn dealloc(page_ptr: RawPagePtr) {
+        BUDDY_ALLOCATOR.zone.lock().free_pages(page_ptr);
+    }
+
+    unsafe fn has_management_over(page_ptr: RawPagePtr) -> bool {
+        !page_ptr.flags().has(PageFlags::FREE) && page_ptr.flags().has(PageFlags::BUDDY)
+    }
+}
+
+pub(self) trait BuddyPFNOps {
+    fn buddy_pfn(self, order: u32) -> PFN;
+    fn combined_pfn(self, buddy_pfn: PFN) -> PFN;
+}
+
+impl BuddyPFNOps for PFN {
+    fn buddy_pfn(self, order: u32) -> PFN {
+        PFN::from(usize::from(self) ^ (1 << order))
+    }
+
+    fn combined_pfn(self, buddy_pfn: PFN) -> PFN {
+        PFN::from(usize::from(self) & usize::from(buddy_pfn))
+    }
+}
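
The buddy arithmetic is the usual XOR/AND trick. A worked example, written as it would appear inside this crate (the `BuddyPFNOps` trait is private to it):

    fn buddy_arithmetic() {
        // At order 2 (blocks of 4 pages), PFN 12 (0b1100) pairs with PFN 8 (0b1000).
        let pfn = PFN::from(12);
        let buddy = pfn.buddy_pfn(2); // 12 ^ (1 << 2) == 8
        assert_eq!(usize::from(buddy), 8);
        // Combining the two yields the base of the merged order-3 block.
        assert_eq!(usize::from(pfn.combined_pfn(buddy)), 8); // 12 & 8 == 8
    }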

+ 122 - 0
crates/buddy_allocator/src/zone.rs

@@ -0,0 +1,122 @@
+use crate::BuddyPFNOps as _;
+
+use super::free_area::FreeArea;
+use core::sync::atomic::Ordering;
+use eonix_mm::{
+    address::{AddrOps as _, PAddr},
+    paging::{PageFlags, RawPagePtr, PFN},
+};
+
+pub(super) struct Zone<const AREAS: usize> {
+    free_areas: [FreeArea; AREAS],
+}
+
+impl<const AREAS: usize> Zone<AREAS> {
+    pub const fn new() -> Self {
+        Self {
+            free_areas: [const { FreeArea::new() }; AREAS],
+        }
+    }
+
+    pub fn get_free_pages(&mut self, order: u32) -> Option<RawPagePtr> {
+        for current_order in order..AREAS as u32 {
+            let pages_ptr = self.free_areas[current_order as usize].get_free_pages();
+            let Some(pages_ptr) = pages_ptr else { continue };
+
+            pages_ptr.as_mut().order = order;
+
+            if current_order > order {
+                self.expand(pages_ptr, current_order, order);
+            }
+            assert!(pages_ptr.flags().has(PageFlags::PRESENT | PageFlags::FREE));
+
+            return Some(pages_ptr);
+        }
+        None
+    }
+
+    fn expand(&mut self, pages_ptr: RawPagePtr, order: u32, target_order: u32) {
+        let mut offset = 1 << order;
+
+        for order in (target_order..order).rev() {
+            offset >>= 1;
+            let split_pages_ptr = pages_ptr.offset(offset);
+            split_pages_ptr.as_mut().order = order;
+            split_pages_ptr.flags().set(PageFlags::BUDDY);
+            self.free_areas[order as usize].add_pages(split_pages_ptr);
+        }
+    }
+
+    pub fn free_pages(&mut self, mut pages_ptr: RawPagePtr) {
+        assert_eq!(pages_ptr.refcount().load(Ordering::Relaxed), 0);
+
+        let mut pfn = PFN::from(pages_ptr);
+        let mut current_order = pages_ptr.order();
+
+        while current_order < (AREAS - 1) as u32 {
+            let buddy_pfn = pfn.buddy_pfn(current_order);
+            let buddy_pages_ptr = RawPagePtr::from(buddy_pfn);
+
+            if !self.buddy_check(buddy_pages_ptr, current_order) {
+                break;
+            }
+
+            pages_ptr.flags().clear(PageFlags::BUDDY);
+            buddy_pages_ptr.flags().clear(PageFlags::BUDDY);
+            self.free_areas[current_order as usize].del_pages(buddy_pages_ptr);
+
+            pages_ptr = RawPagePtr::from(pfn.combined_pfn(buddy_pfn));
+            pfn = pfn.combined_pfn(buddy_pfn);
+
+            pages_ptr.flags().set(PageFlags::BUDDY);
+            current_order += 1;
+        }
+
+        pages_ptr.as_mut().order = current_order;
+        self.free_areas[current_order as usize].add_pages(pages_ptr);
+    }
+
+    /// Checks whether a page is free and is a buddy.
+    ///
+    /// We can coalesce a page and its buddy if:
+    /// - the buddy is valid (present),
+    /// - the buddy is currently in the free areas,
+    /// - the page and its buddy have the same order, and
+    /// - the page and its buddy are in the same zone (to be checked once SMP arrives).
+    fn buddy_check(&self, pages_ptr: RawPagePtr, order: u32) -> bool {
+        if !pages_ptr.flags().has(PageFlags::PRESENT) {
+            return false;
+        }
+        if !pages_ptr.flags().has(PageFlags::FREE) {
+            return false;
+        }
+        if pages_ptr.flags().has(PageFlags::LOCAL) {
+            return false;
+        }
+        if pages_ptr.as_ref().order != order {
+            return false;
+        }
+
+        assert_eq!(pages_ptr.refcount().load(Ordering::Relaxed), 0);
+        true
+    }
+
+    /// Only used on buddy initialization
+    pub fn create_pages(&mut self, start: PAddr, end: PAddr) {
+        let mut start_pfn = PFN::from(start.ceil());
+        let end_pfn = PFN::from(end.floor());
+
+        while start_pfn < end_pfn {
+            let mut order = usize::from(start_pfn)
+                .trailing_zeros()
+                .min((AREAS - 1) as u32);
+
+            while start_pfn + (1usize << order) > end_pfn {
+                order -= 1;
+            }
+            let page_ptr: RawPagePtr = start_pfn.into();
+            page_ptr.flags().set(PageFlags::BUDDY);
+            self.free_areas[order as usize].add_pages(page_ptr);
+            start_pfn = start_pfn + (1 << order) as usize;
+        }
+    }
+}
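
For instance, a physical range whose page frames span [12, 20) would come out of `create_pages` as two order-2 blocks (a hand-traced sketch of the loop above, not program output):

    // start_pfn = 12: trailing_zeros(12) = 2 -> try order 2; [12, 16) fits -> freed.
    // start_pfn = 16: trailing_zeros(16) = 4 -> try order 4; [16, 32) and [16, 24)
    //                 overrun end_pfn, so shrink to order 2; [16, 20) fits -> freed.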

+ 0 - 1
crates/eonix_log/Cargo.toml

@@ -4,5 +4,4 @@ version = "0.1.0"
 edition = "2024"
 
 [dependencies]
-eonix_spin_irq = { path = "../eonix_spin_irq" }
 eonix_sync = { path = "../eonix_sync" }

+ 0 - 1
crates/eonix_log/src/lib.rs

@@ -3,7 +3,6 @@
 use core::fmt::{self, Write};
 
 use alloc::sync::Arc;
-use eonix_spin_irq::SpinIrq as _;
 use eonix_sync::Spin;
 
 extern crate alloc;

+ 7 - 0
crates/eonix_mm/Cargo.toml

@@ -0,0 +1,7 @@
+[package]
+name = "eonix_mm"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+intrusive_list = { path = "../intrusive_list" }

+ 14 - 0
crates/eonix_mm/src/address.rs

@@ -0,0 +1,14 @@
+mod addr;
+mod addr_range;
+mod error;
+mod paddr;
+mod vaddr;
+
+pub use addr::{Addr, AddrOps};
+pub use addr_range::AddrRange;
+pub use error::AddressError;
+pub use paddr::PAddr;
+pub use vaddr::VAddr;
+
+pub type PRange = AddrRange<PAddr>;
+pub type VRange = AddrRange<VAddr>;

+ 64 - 0
crates/eonix_mm/src/address/addr.rs

@@ -0,0 +1,64 @@
+use crate::paging::PAGE_SIZE;
+use core::ops::{Add, Sub};
+
+pub trait Addr:
+    Sized
+    + Copy
+    + Clone
+    + Ord
+    + PartialOrd
+    + Eq
+    + PartialEq
+    + Sub<Output = usize>
+    + Sub<usize, Output = Self>
+    + Add<usize, Output = Self>
+    + From<usize>
+{
+    fn addr(self) -> usize;
+}
+
+pub trait AddrOps: Sized {
+    fn offset_in(self, size: usize) -> usize;
+
+    fn is_aligned_to(self, size: usize) -> bool;
+
+    /// Aligns the address to the nearest lower multiple of `size`.
+    fn floor_to(self, size: usize) -> Self;
+
+    /// Aligns the address to the nearest higher multiple of `size`.
+    fn ceil_to(self, size: usize) -> Self;
+
+    fn page_offset(self) -> usize {
+        self.offset_in(PAGE_SIZE)
+    }
+
+    fn is_page_aligned(self) -> bool {
+        self.is_aligned_to(PAGE_SIZE)
+    }
+
+    fn floor(self) -> Self {
+        self.floor_to(PAGE_SIZE)
+    }
+
+    fn ceil(self) -> Self {
+        self.ceil_to(PAGE_SIZE)
+    }
+}
+
+impl<A: Addr> AddrOps for A {
+    fn offset_in(self, size: usize) -> usize {
+        self.addr() % size
+    }
+
+    fn is_aligned_to(self, size: usize) -> bool {
+        self.offset_in(size) == 0
+    }
+
+    fn floor_to(self, size: usize) -> Self {
+        Self::from(self.addr() / size * size)
+    }
+
+    fn ceil_to(self, size: usize) -> Self {
+        Self::from(self.addr().div_ceil(size) * size)
+    }
+}
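
With the default 4 KiB `PAGE_SIZE`, the blanket impl gives every address type the usual alignment helpers:

    use eonix_mm::address::{AddrOps as _, PAddr};

    fn alignment_helpers() {
        let addr = PAddr::from(0x1234);
        assert_eq!(addr.floor(), PAddr::from(0x1000)); // round down to a page boundary
        assert_eq!(addr.ceil(), PAddr::from(0x2000));  // round up to a page boundary
        assert_eq!(addr.page_offset(), 0x234);
        assert!(!addr.is_page_aligned());
    }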

+ 190 - 0
crates/eonix_mm/src/address/addr_range.rs

@@ -0,0 +1,190 @@
+use super::addr::Addr;
+use core::{cmp::Ordering, fmt, ops::RangeBounds};
+
+#[derive(Clone, Copy)]
+/// A range of addresses.
+///
+/// The range is defined by two addresses, `start` and `end` and is inclusive
+/// on the start and exclusive on the end.
+///
+/// # Relations
+///
+/// ## Comparison
+///
+/// ### Equal
+/// Any two ranges that have one of them **containing** the other are considered equal.
+///
+/// ### Less
+/// If the two are not equal, the one that has the **smallest** start address is considered less.
+///
+/// ### Greater
+/// If the two are not equal, the one that has the **largest** end address is considered greater.
+///
+/// ## Overlapping Check
+/// Use `overlap_with` instead of `==` to check if two ranges overlap.
+pub struct AddrRange<A: Addr> {
+    start: A,
+    end: A,
+}
+
+impl<A: Addr> Eq for AddrRange<A> {}
+impl<A: Addr> PartialOrd for AddrRange<A> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl<A: Addr> PartialEq for AddrRange<A> {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+impl<A: Addr> Ord for AddrRange<A> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        if self.start == other.start {
+            return Ordering::Equal;
+        }
+
+        if self.end == other.end {
+            if self.start == self.end {
+                return Ordering::Greater;
+            }
+            if other.start == other.end {
+                return Ordering::Less;
+            }
+            return Ordering::Equal;
+        }
+
+        if self.start < other.start {
+            if other.end < self.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Less;
+            }
+        }
+
+        if other.start < self.start {
+            if self.end < other.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Greater;
+            }
+        }
+
+        unreachable!()
+    }
+}
+
+impl<A: Addr> From<A> for AddrRange<A> {
+    fn from(addr: A) -> Self {
+        Self {
+            start: addr,
+            end: addr,
+        }
+    }
+}
+
+impl<A: Addr> AddrRange<A> {
+    /// Creates a new `AddrRange` with the given start and end addresses.
+    ///
+    /// # Panics
+    /// Panics if the start address is greater than the end address.
+    ///
+    /// # Hint
+    /// Use `AddrRange::from(addr).grow(size)` to create a range of size `size`
+    /// starting from `addr`.
+    pub fn new(start: A, end: A) -> Self {
+        assert!(start <= end);
+        Self { start, end }
+    }
+
+    pub const fn start(&self) -> A {
+        self.start
+    }
+
+    pub const fn end(&self) -> A {
+        self.end
+    }
+
+    pub fn len(&self) -> usize {
+        self.end - self.start
+    }
+
+    pub fn shrink(&self, size: usize) -> Self {
+        assert!(size <= self.len());
+        Self::new(self.start, self.end - size)
+    }
+
+    pub fn grow(&self, count: usize) -> Self {
+        Self::new(self.start, self.end + count)
+    }
+
+    pub fn into_bounds(&self) -> impl RangeBounds<Self> {
+        if self.len() == 0 {
+            Self::from(self.start())..=Self::from(self.start())
+        } else {
+            Self::from(self.start())..=Self::from(self.end() - 1)
+        }
+    }
+
+    pub fn overlap_with(&self, other: &Self) -> bool {
+        self.start < other.end && self.end > other.start
+    }
+
+    pub fn split_at_checked(&self, at: A) -> (Option<Self>, Option<Self>) {
+        if self.end <= at {
+            (Some(*self), None)
+        } else if at <= self.start {
+            (None, Some(*self))
+        } else {
+            (
+                Some(Self::new(self.start, at)),
+                Some(Self::new(at, self.end)),
+            )
+        }
+    }
+
+    pub fn split_at(&self, at: A) -> (Self, Self) {
+        let (left, right) = self.split_at_checked(at);
+        (
+            left.expect("`at` is too large"),
+            right.expect("`at` is too small"),
+        )
+    }
+
+    pub fn mask_with_checked(&self, mask: &Self) -> Option<(Option<Self>, Self, Option<Self>)> {
+        if mask.len() == 0 || !self.overlap_with(mask) {
+            return None;
+        }
+
+        let left;
+        let mut mid;
+        let right;
+
+        if self.start < mask.start && mask.start < self.end {
+            let (l, r) = self.split_at(mask.start);
+            left = Some(l);
+            mid = r;
+        } else {
+            left = None;
+            mid = *self;
+        }
+
+        if mask.end < self.end {
+            let (l, r) = mid.split_at(mask.end);
+            mid = l;
+            right = Some(r);
+        } else {
+            right = None;
+        }
+
+        Some((left, mid, right))
+    }
+}
+
+impl<A: Addr + fmt::Debug> fmt::Debug for AddrRange<A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "[{:?}, {:?})", self.start, self.end)
+    }
+}
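
The containment-as-equality ordering lets a collection keyed by ranges be probed with any sub-range, while `overlap_with` stays the explicit intersection test. A small illustration:

    use eonix_mm::address::{VAddr, VRange};

    fn range_relations() {
        let big = VRange::new(VAddr::from(0x1000), VAddr::from(0x5000));
        let inner = VRange::new(VAddr::from(0x2000), VAddr::from(0x3000));
        assert_eq!(big, inner); // `big` contains `inner`, so they compare equal
        assert!(big.overlap_with(&inner));

        let disjoint = VRange::new(VAddr::from(0x6000), VAddr::from(0x7000));
        assert!(big < disjoint); // not equal; the smaller start address is Less
        assert!(!big.overlap_with(&disjoint));
    }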

+ 4 - 0
crates/eonix_mm/src/address/error.rs

@@ -0,0 +1,4 @@
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AddressError {
+    InvalidAddress,
+}

+ 65 - 0
crates/eonix_mm/src/address/paddr.rs

@@ -0,0 +1,65 @@
+use super::addr::Addr;
+use crate::paging::{PAGE_SIZE_BITS, PFN};
+use core::{
+    fmt,
+    ops::{Add, Sub},
+};
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+pub struct PAddr(usize);
+
+impl From<usize> for PAddr {
+    fn from(v: usize) -> Self {
+        Self(v)
+    }
+}
+
+impl Sub for PAddr {
+    type Output = usize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+impl Sub<usize> for PAddr {
+    type Output = Self;
+
+    fn sub(self, rhs: usize) -> Self::Output {
+        PAddr(self.0 - rhs)
+    }
+}
+
+impl Add<usize> for PAddr {
+    type Output = Self;
+
+    fn add(self, rhs: usize) -> Self::Output {
+        PAddr(self.0 + rhs)
+    }
+}
+
+impl fmt::Debug for PAddr {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "PAddr({:#x})", self.0)
+    }
+}
+
+impl Addr for PAddr {
+    fn addr(self) -> usize {
+        let Self(addr) = self;
+        addr
+    }
+}
+
+impl From<PFN> for PAddr {
+    fn from(value: PFN) -> Self {
+        Self(usize::from(value) << PAGE_SIZE_BITS)
+    }
+}
+
+impl PAddr {
+    pub const fn from_val(val: usize) -> Self {
+        Self(val)
+    }
+}

+ 60 - 0
crates/eonix_mm/src/address/vaddr.rs

@@ -0,0 +1,60 @@
+use super::addr::Addr;
+use core::{
+    fmt,
+    ops::{Add, Sub},
+};
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+pub struct VAddr(usize);
+
+impl From<usize> for VAddr {
+    fn from(v: usize) -> Self {
+        Self(v)
+    }
+}
+
+impl VAddr {
+    pub const NULL: Self = Self(0);
+
+    pub const fn from(v: usize) -> Self {
+        Self(v)
+    }
+}
+
+impl Sub for VAddr {
+    type Output = usize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+impl Sub<usize> for VAddr {
+    type Output = Self;
+
+    fn sub(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 - rhs)
+    }
+}
+
+impl Add<usize> for VAddr {
+    type Output = Self;
+
+    fn add(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 + rhs)
+    }
+}
+
+impl fmt::Debug for VAddr {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "VAddr({:#x})", self.0)
+    }
+}
+
+impl Addr for VAddr {
+    fn addr(self) -> usize {
+        let Self(addr) = self;
+        addr
+    }
+}

+ 5 - 0
crates/eonix_mm/src/lib.rs

@@ -0,0 +1,5 @@
+#![no_std]
+
+pub mod address;
+pub mod page_table;
+pub mod paging;

+ 9 - 0
crates/eonix_mm/src/page_table.rs

@@ -0,0 +1,9 @@
+mod page_table;
+mod paging_mode;
+mod pte;
+mod pte_iterator;
+
+pub use page_table::{PageTable, RawPageTable};
+pub use paging_mode::{PageTableLevel, PagingMode};
+pub use pte::{PageAttribute, PTE};
+pub use pte_iterator::PageTableIterator;

+ 132 - 0
crates/eonix_mm/src/page_table/page_table.rs

@@ -0,0 +1,132 @@
+use super::{
+    paging_mode::PageTableLevel,
+    pte_iterator::{KernelIterator, UserIterator},
+    PageAttribute, PagingMode, PTE,
+};
+use crate::{
+    address::{PAddr, VRange},
+    page_table::PageTableIterator,
+    paging::{Page, PageAccess, PageAlloc, PageBlock},
+};
+use core::{marker::PhantomData, ptr::NonNull};
+
+pub trait RawPageTable<'a>: 'a {
+    type Entry: PTE + 'a;
+
+    /// Return the entry at the given index.
+    fn index(&self, index: u16) -> &'a Self::Entry;
+
+    /// Return a mutable reference to the entry at the given index.
+    fn index_mut(&mut self, index: u16) -> &'a mut Self::Entry;
+
+    /// Get the page table pointed to by raw pointer `ptr`.
+    unsafe fn from_ptr(ptr: NonNull<PageBlock>) -> Self;
+}
+
+pub struct PageTable<'a, M, A, X>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+{
+    root_table_page: Page<A>,
+    phantom: PhantomData<&'a (M, X)>,
+}
+
+impl<'a, M, A, X> PageTable<'a, M, A, X>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+{
+    pub fn new<A1: PageAlloc>(kernel_root_table_page: &Page<A1>) -> Self {
+        let new_root_table_page = Page::<A>::alloc();
+        let new_table_data = X::get_ptr_for_page(&new_root_table_page);
+        let kernel_table_data = X::get_ptr_for_page(kernel_root_table_page);
+
+        unsafe {
+            // SAFETY: `new_table_data` and `kernel_table_data` are both valid pointers
+            //         to **different** page tables.
+            new_table_data.copy_from_nonoverlapping(kernel_table_data, 1);
+        }
+
+        let mut root_page_table = unsafe {
+            // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+            M::RawTable::from_ptr(new_table_data)
+        };
+
+        let level0 = M::LEVELS[0];
+        for idx in 0..level0.max_index() / 2 {
+            // We consider the first half of the page table as user space.
+            // Clear all (potential) user space mappings.
+            root_page_table.index_mut(idx).take();
+        }
+
+        Self {
+            root_table_page: new_root_table_page,
+            phantom: PhantomData,
+        }
+    }
+
+    pub fn addr(&self) -> PAddr {
+        self.root_table_page.start()
+    }
+
+    pub fn iter_user(&self, range: VRange) -> impl Iterator<Item = &mut M::Entry> {
+        let page_table_ptr = X::get_ptr_for_page(&self.root_table_page);
+        let root_page_table = unsafe {
+            // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+            M::RawTable::from_ptr(page_table_ptr)
+        };
+
+        PageTableIterator::<M, A, X, UserIterator>::new(root_page_table, range)
+    }
+
+    pub fn iter_kernel(&self, range: VRange) -> impl Iterator<Item = &mut M::Entry> {
+        let page_table_ptr = X::get_ptr_for_page(&self.root_table_page);
+        let root_page_table = unsafe {
+            // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+            M::RawTable::from_ptr(page_table_ptr)
+        };
+
+        PageTableIterator::<M, A, X, KernelIterator>::new(root_page_table, range)
+    }
+
+    fn drop_page_table_recursive(page_table: &Page<A>, levels: &[PageTableLevel]) {
+        let [level, remaining_levels @ ..] = levels else { return };
+
+        let page_table_ptr = X::get_ptr_for_page(page_table);
+        let mut page_table = unsafe {
+            // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+            M::RawTable::from_ptr(page_table_ptr)
+        };
+
+        for pte in (0..=level.max_index()).map(|i| page_table.index_mut(i)) {
+            let (pfn, attr) = pte.take();
+            if !attr.is_present() || !attr.is_user() {
+                continue;
+            }
+
+            let page_table = unsafe {
+                // SAFETY: We got the pfn from a valid page table entry, so it should be valid.
+                Page::<A>::from_raw(pfn)
+            };
+
+            Self::drop_page_table_recursive(&page_table, remaining_levels);
+        }
+    }
+}
+
+impl<'a, M, A, X> Drop for PageTable<'a, M, A, X>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+{
+    fn drop(&mut self) {
+        Self::drop_page_table_recursive(&self.root_table_page, M::LEVELS);
+    }
+}
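
A hedged usage sketch of the iterator API: mapping fresh pages over a user range. `Alloc` and `Access` stand in for kernel-provided `PageAlloc`/`PageAccess` implementations, and the function name is illustrative, not part of the crate:

    use eonix_mm::{
        address::VRange,
        page_table::{PageAttribute as _, PageTable, PagingMode, PTE},
        paging::{Page, PageAccess, PageAlloc},
    };

    fn map_fresh<'a, M, Alloc, Access>(pt: &'a PageTable<'a, M, Alloc, Access>, range: VRange)
    where
        M: PagingMode,
        M::Entry: 'a,
        Alloc: PageAlloc,
        Access: PageAccess,
    {
        for pte in pt.iter_user(range) {
            let page = Page::<Alloc>::alloc();
            let attr = <M::Entry as PTE>::Attr::new().present(true).write(true).user(true);
            pte.set(page.into_raw(), attr); // the entry now owns the page's reference
        }
    }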

+ 38 - 0
crates/eonix_mm/src/page_table/paging_mode.rs

@@ -0,0 +1,38 @@
+use super::{RawPageTable, PTE};
+use crate::{
+    address::{Addr as _, VAddr},
+    paging::PFN,
+};
+
+pub trait PagingMode {
+    type Entry: PTE;
+    type RawTable<'a>: RawPageTable<'a, Entry = Self::Entry>;
+
+    const LEVELS: &'static [PageTableLevel];
+    const KERNEL_ROOT_TABLE_PFN: PFN;
+}
+
+#[derive(Clone, Copy, PartialOrd, PartialEq)]
+pub struct PageTableLevel(usize, usize);
+
+impl PageTableLevel {
+    pub const fn new(nth_bit: usize, len: usize) -> Self {
+        Self(nth_bit, len)
+    }
+
+    pub const fn nth_bit(self) -> usize {
+        self.0
+    }
+
+    pub const fn len(self) -> usize {
+        self.1
+    }
+
+    pub const fn max_index(self) -> u16 {
+        (1 << self.len()) - 1
+    }
+
+    pub fn index_of(self, vaddr: VAddr) -> u16 {
+        ((vaddr.addr() >> self.nth_bit()) & ((1 << self.len()) - 1)) as u16
+    }
+}
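
A worked example of `index_of`, using the x86-64 levels that `arch` defines (bits 39/30/21/12, 9 bits each):

    use eonix_mm::{address::VAddr, page_table::PageTableLevel};

    fn level_indexing() {
        let pml4 = PageTableLevel::new(39, 9);
        let pt = PageTableLevel::new(12, 9);
        let vaddr = VAddr::from(0xffff_ff80_4000_1000);

        assert_eq!(pml4.index_of(vaddr), 511); // bits 39..48 are all ones here
        assert_eq!(pt.index_of(vaddr), 1);     // bits 12..21
        assert_eq!(pt.max_index(), 511);       // 9 bits -> indices 0..=511
    }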

+ 52 - 0
crates/eonix_mm/src/page_table/pte.rs

@@ -0,0 +1,52 @@
+use crate::paging::PFN;
+
+pub trait PageAttribute: Copy {
+    /// Create a new instance of the attribute with all attributes set to false.
+    fn new() -> Self;
+
+    fn present(self, present: bool) -> Self;
+    fn write(self, write: bool) -> Self;
+    fn execute(self, execute: bool) -> Self;
+    fn user(self, user: bool) -> Self;
+    fn accessed(self, accessed: bool) -> Self;
+    fn dirty(self, dirty: bool) -> Self;
+    fn global(self, global: bool) -> Self;
+    fn copy_on_write(self, cow: bool) -> Self;
+    fn mapped(self, mmap: bool) -> Self;
+    fn anonymous(self, anon: bool) -> Self;
+
+    fn is_present(&self) -> bool;
+    fn is_write(&self) -> bool;
+    fn is_execute(&self) -> bool;
+    fn is_user(&self) -> bool;
+    fn is_accessed(&self) -> bool;
+    fn is_dirty(&self) -> bool;
+    fn is_global(&self) -> bool;
+    fn is_copy_on_write(&self) -> bool;
+    fn is_mapped(&self) -> bool;
+    fn is_anonymous(&self) -> bool;
+}
+
+pub trait PTE: Sized {
+    type Attr: PageAttribute;
+
+    fn set(&mut self, pfn: PFN, attr: Self::Attr);
+    fn get(&self) -> (PFN, Self::Attr);
+    fn take(&mut self) -> (PFN, Self::Attr);
+
+    fn set_pfn(&mut self, pfn: PFN) {
+        self.set(pfn, self.get_attr());
+    }
+
+    fn set_attr(&mut self, attr: Self::Attr) {
+        self.set(self.get_pfn(), attr);
+    }
+
+    fn get_pfn(&self) -> PFN {
+        self.get().0
+    }
+
+    fn get_attr(&self) -> Self::Attr {
+        self.get().1
+    }
+}

+ 177 - 0
crates/eonix_mm/src/page_table/pte_iterator.rs

@@ -0,0 +1,177 @@
+use super::{PageAttribute as _, PagingMode, RawPageTable as _, PTE};
+use crate::{
+    address::{AddrOps as _, VRange},
+    paging::{Page, PageAccess, PageAlloc, PAGE_SIZE},
+};
+use core::marker::PhantomData;
+
+pub struct KernelIterator;
+pub struct UserIterator;
+
+pub trait IteratorType<M: PagingMode> {
+    fn page_table_attributes() -> <M::Entry as PTE>::Attr;
+
+    fn get_page_table<'a, A, X>(pte: &mut M::Entry) -> M::RawTable<'a>
+    where
+        A: PageAlloc,
+        X: PageAccess,
+    {
+        let attr = pte.get_attr();
+
+        if attr.is_present() {
+            let pfn = pte.get_pfn();
+            unsafe {
+                // SAFETY: We are creating a pointer to a page referenced to in
+                //         some page table, which should be valid.
+                let page_table_ptr = X::get_ptr_for_pfn(pfn);
+                // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+                M::RawTable::from_ptr(page_table_ptr)
+            }
+        } else {
+            let page = Page::<A>::alloc();
+            let page_table_ptr = X::get_ptr_for_page(&page);
+
+            unsafe {
+                // SAFETY: `page_table_ptr` is good for writing and properly aligned.
+                page_table_ptr.write_bytes(0, 1);
+            }
+
+            pte.set(page.into_raw(), Self::page_table_attributes());
+
+            unsafe {
+                // SAFETY: `page_table_ptr` is a valid pointer to a page table.
+                M::RawTable::from_ptr(page_table_ptr)
+            }
+        }
+    }
+}
+
+pub struct PageTableIterator<'a, M, A, X, K>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+    K: IteratorType<M>,
+{
+    remaining: usize,
+
+    indicies: [u16; 8],
+    tables: [Option<M::RawTable<'a>>; 8],
+
+    _phantom: PhantomData<&'a (A, X, K)>,
+}
+
+impl<'a, M, A, X, K> PageTableIterator<'a, M, A, X, K>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+    K: IteratorType<M>,
+{
+    fn parse_tables_starting_from(&mut self, idx_level: usize) {
+        let levels_len = M::LEVELS.len();
+
+        for (idx, &pt_idx) in self
+            .indicies
+            .iter()
+            .enumerate()
+            .take(levels_len - 1)
+            .skip(idx_level)
+        {
+            let [parent_table, child_table] = unsafe {
+                // SAFETY: `idx` and `idx + 1` must not overlap.
+                //         `idx + 1` is always less than `levels_len` since we iterate
+                //         until `levels_len - 1`.
+                self.tables.get_disjoint_unchecked_mut([idx, idx + 1])
+            };
+            let parent_table = parent_table.as_mut().expect("Parent table is None");
+            let next_pte = parent_table.index_mut(pt_idx);
+            child_table.replace(K::get_page_table::<A, X>(next_pte));
+        }
+    }
+
+    pub fn new(page_table: M::RawTable<'a>, range: VRange) -> Self {
+        let start = range.start().floor();
+        let end = range.end().ceil();
+
+        let mut me = Self {
+            remaining: (end - start) / PAGE_SIZE,
+            indicies: [0; 8],
+            tables: [const { None }; 8],
+            _phantom: PhantomData,
+        };
+
+        for (i, level) in M::LEVELS.iter().enumerate() {
+            me.indicies[i] = level.index_of(start);
+        }
+
+        me.tables[0] = Some(page_table);
+        me.parse_tables_starting_from(0);
+
+        me
+    }
+}
+
+impl<'a, M, A, X, K> Iterator for PageTableIterator<'a, M, A, X, K>
+where
+    M: PagingMode,
+    M::Entry: 'a,
+    A: PageAlloc,
+    X: PageAccess,
+    K: IteratorType<M>,
+{
+    type Item = &'a mut M::Entry;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.remaining == 0 {
+            return None;
+        } else {
+            self.remaining -= 1;
+        }
+
+        let len_levels = M::LEVELS.len();
+
+        let retval = self.tables[len_levels - 1]
+            .as_mut()
+            .unwrap()
+            .index_mut(self.indicies[len_levels - 1]);
+
+        let idx_level_start_updating = M::LEVELS
+            .iter()
+            .zip(self.indicies.iter_mut())
+            .enumerate()
+            .rev()
+            .skip_while(|(_, (level, idx))| **idx == level.max_index())
+            .map(|(i, _)| i)
+            .next()
+            .expect("Index out of bounds");
+
+        self.indicies[idx_level_start_updating] += 1;
+        self.indicies[idx_level_start_updating + 1..len_levels].fill(0);
+        self.parse_tables_starting_from(idx_level_start_updating);
+
+        Some(retval)
+    }
+}
+
+impl<M: PagingMode> IteratorType<M> for KernelIterator {
+    fn page_table_attributes() -> <M::Entry as PTE>::Attr {
+        <M::Entry as PTE>::Attr::new()
+            .present(true)
+            .write(true)
+            .execute(true)
+            .global(true)
+    }
+}
+
+impl<M: PagingMode> IteratorType<M> for UserIterator {
+    fn page_table_attributes() -> <M::Entry as PTE>::Attr {
+        <M::Entry as PTE>::Attr::new()
+            .present(true)
+            .write(true)
+            .execute(true)
+            .user(true)
+    }
+}

+ 9 - 0
crates/eonix_mm/src/paging.rs

@@ -0,0 +1,9 @@
+mod page;
+mod page_alloc;
+mod pfn;
+mod raw_page;
+
+pub use page::{Page, PageAccess, PageBlock, PAGE_SIZE, PAGE_SIZE_BITS};
+pub use page_alloc::PageAlloc;
+pub use pfn::PFN;
+pub use raw_page::{PageFlags, RawPage, RawPagePtr};

+ 219 - 0
crates/eonix_mm/src/paging/page.rs

@@ -0,0 +1,219 @@
+use super::{raw_page::RawPagePtr, PageAlloc, PFN};
+use crate::address::{AddrRange, PAddr};
+use core::{fmt, marker::PhantomData, mem::ManuallyDrop, ptr::NonNull, sync::atomic::Ordering};
+
+pub const PAGE_SIZE: usize = 4096;
+pub const PAGE_SIZE_BITS: u32 = PAGE_SIZE.trailing_zeros();
+
+/// A block of memory that is aligned to the page size and can be used for
+/// page-aligned allocations.
+///
+/// This is used to ensure that the memory is properly aligned to the page size.
+#[allow(dead_code)]
+#[repr(align(4096))]
+pub struct PageBlock([u8; PAGE_SIZE]);
+
+/// A trait that provides the kernel access to the page.
+pub trait PageAccess {
+    /// Returns a kernel-accessible pointer to the page referenced by the given
+    /// physical frame number.
+    ///
+    /// # Safety
+    /// This function is unsafe because calling this function on some non-existing
+    /// pfn will cause undefined behavior.
+    unsafe fn get_ptr_for_pfn(pfn: PFN) -> NonNull<PageBlock>;
+
+    /// Returns a kernel-accessible pointer to the given page.
+    fn get_ptr_for_page<A: PageAlloc>(page: &Page<A>) -> NonNull<PageBlock> {
+        unsafe {
+            // SAFETY: `page.pfn()` is guaranteed to be valid.
+            Self::get_ptr_for_pfn(page.pfn())
+        }
+    }
+}
+
+/// A Page allocated in allocator `A`.
+#[derive(PartialEq, Eq, PartialOrd, Ord)]
+pub struct Page<A: PageAlloc> {
+    raw_page: RawPagePtr,
+    _phantom: PhantomData<A>,
+}
+
+unsafe impl<A: PageAlloc> Send for Page<A> {}
+unsafe impl<A: PageAlloc> Sync for Page<A> {}
+
+impl<A: PageAlloc> Page<A> {
+    /// Allocate a page of the given *order*.
+    pub fn alloc_order(order: u32) -> Self {
+        Self {
+            raw_page: A::alloc_order(order).expect("Out of memory"),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Allocate exactly one page.
+    pub fn alloc() -> Self {
+        Self {
+            raw_page: A::alloc().expect("Out of memory"),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Allocate a contiguous block of pages that can contain at least `count` pages.
+    pub fn alloc_at_least(count: usize) -> Self {
+        Self {
+            raw_page: A::alloc_at_least(count).expect("Out of memory"),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Whether we are the only owner of the page.
+    pub fn is_exclusive(&self) -> bool {
+        self.raw_page.refcount().load(Ordering::Acquire) == 1
+    }
+
+    /// Returns the *order* of the page, which is the log2 of the number of pages
+    /// contained in the page object.
+    pub fn order(&self) -> u32 {
+        self.raw_page.order()
+    }
+
+    /// Returns the total size of the page in bytes.
+    pub fn len(&self) -> usize {
+        1 << (self.order() + PAGE_SIZE_BITS)
+    }
+
+    /// Acquire the ownership of the page pointed to by `pfn`, leaving `refcount` untouched.
+    ///
+    /// # Safety
+    /// This function is unsafe because it assumes that the caller has ensured that
+    /// `pfn` points to a valid page allocated through `alloc_order()` and that the
+    /// page have not been freed or deallocated yet.
+    ///
+    /// No checks are done. Any violation of this assumption may lead to undefined behavior.
+    pub unsafe fn from_raw_unchecked(pfn: PFN) -> Self {
+        Self {
+            raw_page: RawPagePtr::from(pfn),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Acquire the ownership of the page pointed to by `pfn`, leaving `refcount` untouched.
+    ///
+    /// This function is a safe wrapper around `from_raw_unchecked()` that performs **some**
+    /// checks to ensure that the page is valid and managed by the allocator.
+    ///
+    /// # Panics
+    /// This function will panic if the page is not valid or if the page is not managed by
+    /// the allocator.
+    ///
+    /// # Safety
+    /// This function is unsafe because it assumes that the caller has ensured that
+    /// `pfn` points to an existing page (i.e. inside the global page array) and the
+    /// page will not be freed or deallocated during the call.
+    pub unsafe fn from_raw(pfn: PFN) -> Self {
+        unsafe {
+            // SAFETY: The caller guarantees that the page is inside the global page array.
+            assert!(A::has_management_over(RawPagePtr::from(pfn)));
+
+            // SAFETY: We've checked the validity of the page, and the caller guarantees
+            //         that the page will not be freed or deallocated during the call.
+            Self::from_raw_unchecked(pfn)
+        }
+    }
+
+    /// Do some work with the page without touching the reference count with the same
+    /// restrictions as `from_raw()`.
+    ///
+    /// # Safety
+    /// Check `from_raw()` for the safety requirements.
+    pub unsafe fn with_raw<F, O>(pfn: PFN, func: F) -> O
+    where
+        F: FnOnce(&Self) -> O,
+    {
+        unsafe {
+            let me = ManuallyDrop::new(Self::from_raw(pfn));
+            func(&me)
+        }
+    }
+
+    /// Do some work with the page without touching the reference count with the same
+    /// restrictions as `from_raw_unchecked()`.
+    ///
+    /// # Safety
+    /// Check `from_raw_unchecked()` for the safety requirements.
+    pub unsafe fn with_raw_unchecked<F, O>(pfn: PFN, func: F) -> O
+    where
+        F: FnOnce(&Self) -> O,
+    {
+        unsafe {
+            let me = ManuallyDrop::new(Self::from_raw_unchecked(pfn));
+            func(&me)
+        }
+    }
+
+    /// Consumes the `Page` and returns the physical frame number without dropping
+    /// the reference count the page holds.
+    pub fn into_raw(self) -> PFN {
+        let me = ManuallyDrop::new(self);
+        me.pfn()
+    }
+
+    /// Returns the physical frame number of the page, which is aligned with the
+    /// page size and valid.
+    pub fn pfn(&self) -> PFN {
+        PFN::from(self.raw_page)
+    }
+
+    /// Returns the start physical address of the page, which is guaranteed to be
+    /// aligned to the page size and valid.
+    pub fn start(&self) -> PAddr {
+        PAddr::from(self.pfn())
+    }
+
+    /// Returns the physical address range of the page, which is guaranteed to be
+    /// aligned to the page size and valid.
+    pub fn range(&self) -> AddrRange<PAddr> {
+        AddrRange::from(self.start()).grow(self.len())
+    }
+}
+
+impl<A: PageAlloc> Clone for Page<A> {
+    fn clone(&self) -> Self {
+        // SAFETY: The memory order here can be Relaxed for the same reason as
+        // in the copy constructor of `std::shared_ptr`.
+        self.raw_page.refcount().fetch_add(1, Ordering::Relaxed);
+
+        Self {
+            raw_page: self.raw_page,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<A: PageAlloc> Drop for Page<A> {
+    fn drop(&mut self) {
+        match self.raw_page.refcount().fetch_sub(1, Ordering::AcqRel) {
+            0 => panic!("Refcount for an in-use page is 0"),
+            1 => unsafe {
+                // SAFETY: `self.raw_page` points to a valid page inside the global page array.
+                assert!(A::has_management_over(self.raw_page));
+
+                // SAFETY: `self.raw_page` is managed by the allocator and we're dropping the page.
+                A::dealloc(self.raw_page)
+            },
+            _ => {}
+        }
+    }
+}
+
+impl<A: PageAlloc> fmt::Debug for Page<A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "Page({:?}, order={})",
+            usize::from(PFN::from(self.raw_page)),
+            self.order()
+        )
+    }
+}
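
A sketch of the reference-counting protocol around `into_raw`/`from_raw`, for some allocator `A` (any kernel type implementing `PageAlloc`; the function is illustrative):

    use eonix_mm::paging::{Page, PageAlloc, PFN};

    fn stash_and_restore<A: PageAlloc>() {
        let page = Page::<A>::alloc(); // refcount == 1
        assert!(page.is_exclusive());

        // Keep one extra reference alive across the raw round trip.
        let pfn: PFN = page.clone().into_raw(); // refcount == 2
        let restored = unsafe {
            // SAFETY: `pfn` came from `into_raw` above and still holds a reference.
            Page::<A>::from_raw(pfn)
        };

        drop(restored); // refcount back to 1
        drop(page);     // refcount hits 0 and the pages go back to `A`
    }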

+ 31 - 0
crates/eonix_mm/src/paging/page_alloc.rs

@@ -0,0 +1,31 @@
+use super::raw_page::RawPagePtr;
+
+pub trait PageAlloc: Sized {
+    /// Allocate a page of the given *order*.
+    fn alloc_order(order: u32) -> Option<RawPagePtr>;
+
+    /// Allocate exactly one page.
+    fn alloc() -> Option<RawPagePtr> {
+        Self::alloc_order(0)
+    }
+
+    /// Allocate a contiguous block of pages that can contain at least `count` pages.
+    fn alloc_at_least(count: usize) -> Option<RawPagePtr> {
+        let order = count.next_power_of_two().trailing_zeros();
+        Self::alloc_order(order)
+    }
+
+    /// Deallocate a page.
+    ///
+    /// # Safety
+    /// This function is unsafe because it assumes that the caller has ensured that
+    /// `page` is allocated in this allocator and never used after this call.
+    unsafe fn dealloc(page_ptr: RawPagePtr);
+
+    /// Check whether the page is allocated and managed by the allocator.
+    ///
+    /// # Safety
+    /// This function is unsafe because it assumes that the caller has ensured that
+    /// `page_ptr` points to a raw page inside the global page array.
+    unsafe fn has_management_over(page_ptr: RawPagePtr) -> bool;
+}
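
The default `alloc_at_least` rounds the request up to a power of two and converts it to an order; a quick check of the arithmetic:

    fn order_arithmetic() {
        // 5 pages -> next power of two is 8 -> order 3 (8 contiguous pages).
        assert_eq!(5usize.next_power_of_two().trailing_zeros(), 3);
        // Exactly one page stays at order 0.
        assert_eq!(1usize.next_power_of_two().trailing_zeros(), 0);
    }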

+ 65 - 0
crates/eonix_mm/src/paging/pfn.rs

@@ -0,0 +1,65 @@
+use crate::address::{Addr as _, PAddr};
+use core::{
+    fmt,
+    ops::{Add, Sub},
+};
+
+use super::PAGE_SIZE_BITS;
+
+#[repr(transparent)]
+#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+pub struct PFN(usize);
+
+impl From<PFN> for usize {
+    fn from(v: PFN) -> Self {
+        v.0
+    }
+}
+
+impl From<usize> for PFN {
+    fn from(v: usize) -> Self {
+        Self(v)
+    }
+}
+
+impl Sub for PFN {
+    type Output = usize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+impl Sub<usize> for PFN {
+    type Output = Self;
+
+    fn sub(self, rhs: usize) -> Self::Output {
+        PFN(self.0 - rhs)
+    }
+}
+
+impl Add<usize> for PFN {
+    type Output = Self;
+
+    fn add(self, rhs: usize) -> Self::Output {
+        PFN(self.0 + rhs)
+    }
+}
+
+impl fmt::Debug for PFN {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "PFN({:#x})", self.0)
+    }
+}
+
+impl From<PAddr> for PFN {
+    fn from(paddr: PAddr) -> Self {
+        Self(paddr.addr() >> PAGE_SIZE_BITS)
+    }
+}
+
+impl PFN {
+    pub const fn from_val(pfn: usize) -> Self {
+        Self(pfn)
+    }
+}

+ 97 - 0
crates/eonix_mm/src/paging/raw_page.rs

@@ -0,0 +1,97 @@
+use super::PFN;
+use core::{
+    ptr::NonNull,
+    sync::atomic::{AtomicU32, AtomicUsize, Ordering},
+};
+use intrusive_list::Link;
+
+const PAGE_ARRAY: NonNull<RawPage> =
+    unsafe { NonNull::new_unchecked(0xffffff8040000000 as *mut _) };
+
+pub struct PageFlags(AtomicU32);
+
+pub struct RawPage {
+    /// This can be used for LRU page swap in the future.
+    ///
+    /// Now only used for free page links in the buddy system.
+    pub link: Link,
+    /// # Safety
+    /// This field is only used in buddy system and is protected by the global lock.
+    pub order: u32,
+    pub flags: PageFlags,
+    pub refcount: AtomicUsize,
+}
+
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct RawPagePtr(NonNull<RawPage>);
+
+impl PageFlags {
+    pub const PRESENT: u32 = 1 << 0;
+    // pub const LOCKED: u32 = 1 << 1;
+    pub const BUDDY: u32 = 1 << 2;
+    // pub const SLAB: u32 = 1 << 3;
+    // pub const DIRTY: u32 = 1 << 4;
+    pub const FREE: u32 = 1 << 5;
+    pub const LOCAL: u32 = 1 << 6;
+
+    pub fn has(&self, flag: u32) -> bool {
+        (self.0.load(Ordering::Relaxed) & flag) == flag
+    }
+
+    pub fn set(&self, flag: u32) {
+        self.0.fetch_or(flag, Ordering::Relaxed);
+    }
+
+    pub fn clear(&self, flag: u32) {
+        self.0.fetch_and(!flag, Ordering::Relaxed);
+    }
+}
+
+impl RawPagePtr {
+    pub const fn new(ptr: NonNull<RawPage>) -> Self {
+        Self(ptr)
+    }
+
+    pub const fn as_ptr(self) -> *mut RawPage {
+        self.0.as_ptr()
+    }
+
+    pub const fn as_ref<'a>(self) -> &'a RawPage {
+        unsafe { &*self.as_ptr() }
+    }
+
+    pub const fn as_mut<'a>(self) -> &'a mut RawPage {
+        unsafe { &mut *self.as_ptr() }
+    }
+
+    pub const fn order(&self) -> u32 {
+        self.as_ref().order
+    }
+
+    pub const fn flags(&self) -> &PageFlags {
+        &self.as_ref().flags
+    }
+
+    pub const fn refcount(&self) -> &AtomicUsize {
+        &self.as_ref().refcount
+    }
+
+    pub const fn offset(&self, count: usize) -> Self {
+        let new_raw_ptr = unsafe { self.0.add(count) };
+        Self::new(new_raw_ptr)
+    }
+}
+
+impl From<RawPagePtr> for PFN {
+    fn from(value: RawPagePtr) -> Self {
+        let idx = unsafe { value.as_ptr().offset_from(PAGE_ARRAY.as_ptr()) as usize };
+        Self::from(idx)
+    }
+}
+
+impl From<PFN> for RawPagePtr {
+    fn from(pfn: PFN) -> Self {
+        let raw_page_ptr = unsafe { PAGE_ARRAY.add(usize::from(pfn)) };
+        Self::new(raw_page_ptr)
+    }
+}

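The two `From` impls encode the invariant that the descriptor index in
`PAGE_ARRAY` equals the page frame number, so both conversions are plain
pointer arithmetic on the array. A sketch of the round trip with integer
addresses (the 32-byte descriptor size is hypothetical):

```rust
fn descriptor_addr(array_base: usize, pfn: usize, desc_size: usize) -> usize {
    array_base + pfn * desc_size // PFN -> RawPagePtr
}

fn pfn_of_descriptor(array_base: usize, desc_addr: usize, desc_size: usize) -> usize {
    (desc_addr - array_base) / desc_size // RawPagePtr -> PFN
}

fn main() {
    let base = 0xffff_ff80_4000_0000_usize;
    let addr = descriptor_addr(base, 1234, 32);
    assert_eq!(pfn_of_descriptor(base, addr, 32), 1234);
}
```
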
+ 12 - 0
crates/eonix_percpu/Cargo.toml

@@ -0,0 +1,12 @@
+[package]
+name = "eonix_percpu"
+version = "0.1.0"
+edition = "2024"
+
+[lib]
+proc-macro = true
+
+[dependencies]
+proc-macro2 = "1.0"
+quote = "1.0"
+syn = { version = "2.0", features = ["full"] }

+ 24 - 0
crates/eonix_percpu/src/arch.rs

@@ -0,0 +1,24 @@
+use proc_macro2::TokenStream;
+use quote::quote;
+use syn::{Ident, Type};
+
+/// Get a pointer to the percpu variable `percpu` for the current CPU.
+///
+/// The per-CPU base is read from `gs:0` and the variable's offset within the
+/// `.percpu` section is added to it.
+pub fn get_percpu_pointer(percpu: &Ident, ty: &Type) -> TokenStream {
+    quote! {
+        {
+            #[cfg(target_arch = "x86_64")]
+            {
+                let base: *mut #ty;
+                ::core::arch::asm!(
+                    "mov %gs:0, {address}",
+                    "add ${percpu_pointer}, {address}",
+                    percpu_pointer = sym #percpu,
+                    address = out(reg) base,
+                    options(att_syntax)
+                );
+                base
+            }
+        }
+    }
+    .into()
+}

+ 181 - 0
crates/eonix_percpu/src/lib.rs

@@ -0,0 +1,181 @@
+extern crate proc_macro;
+
+use proc_macro::TokenStream;
+use quote::{format_ident, quote};
+use syn::{parse_macro_input, ItemStatic};
+
+mod arch;
+
+#[proc_macro_attribute]
+pub fn define_percpu(attrs: TokenStream, item: TokenStream) -> TokenStream {
+    if !attrs.is_empty() {
+        panic!("`define_percpu` attribute does not take any arguments");
+    }
+
+    let item = parse_macro_input!(item as ItemStatic);
+    let vis = &item.vis;
+    let ident = &item.ident;
+    let ty = &item.ty;
+    let expr = &item.expr;
+
+    let is_bool = quote!(#ty).to_string().as_str() == "bool";
+    let is_integer =
+        ["u8", "u16", "u32", "u64", "usize"].contains(&quote!(#ty).to_string().as_str());
+
+    let is_atomic_like = is_bool || is_integer || quote!(#ty).to_string().contains("NonNull");
+
+    let inner_ident = format_ident!("_percpu_inner_{}", ident);
+    let access_ident = format_ident!("_access_{}", ident);
+
+    let integer_methods = if is_integer {
+        quote! {
+            pub fn add(&self, value: #ty) {
+                *unsafe { self.as_mut() } += value;
+            }
+
+            pub fn sub(&self, value: #ty) {
+                *unsafe { self.as_mut() } -= value;
+            }
+        }
+    } else {
+        quote! {}
+    };
+
+    let preempt_disable = if !is_atomic_like {
+        quote! { eonix_preempt::disable(); }
+    } else {
+        quote! {}
+    };
+
+    let preempt_enable = if !is_atomic_like {
+        quote! { eonix_preempt::enable(); }
+    } else {
+        quote! {}
+    };
+
+    let as_ptr = arch::get_percpu_pointer(&inner_ident, &ty);
+
+    quote! {
+        #[link_section = ".percpu"]
+        #[allow(non_upper_case_globals)]
+        static mut #inner_ident: #ty = #expr;
+        #[allow(non_camel_case_types)]
+        #vis struct #access_ident;
+        #vis static #ident: #access_ident = #access_ident;
+
+        impl #access_ident {
+            /// # Safety
+            /// This function is unsafe because it allows for mutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_ptr(&self) -> *mut #ty {
+                #as_ptr
+            }
+
+            pub fn get(&self) -> #ty {
+                #preempt_disable
+                let value = unsafe { self.as_ptr().read() };
+                #preempt_enable
+                value
+            }
+
+            pub fn set(&self, value: #ty) {
+                #preempt_disable
+                unsafe { self.as_ptr().write(value) }
+                #preempt_enable
+            }
+
+            pub fn swap(&self, mut value: #ty) -> #ty {
+                #preempt_disable
+                unsafe { self.as_ptr().swap(&mut value) }
+                #preempt_enable
+                value
+            }
+
+            /// # Safety
+            /// This function is unsafe because it allows for immutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_ref(&self) -> & #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                self.as_ptr().as_ref().unwrap()
+            }
+
+            /// # Safety
+            /// This function is unsafe because it allows for mutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_mut(&self) -> &mut #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                self.as_ptr().as_mut().unwrap()
+            }
+
+            #integer_methods
+        }
+    }
+    .into()
+}
+
+#[proc_macro_attribute]
+pub fn define_percpu_shared(attrs: TokenStream, item: TokenStream) -> TokenStream {
+    if !attrs.is_empty() {
+        panic!("`define_percpu_shared` attribute does not take any arguments");
+    }
+
+    let item = parse_macro_input!(item as ItemStatic);
+    let vis = &item.vis;
+    let ident = &item.ident;
+    let ty = &item.ty;
+    let expr = &item.expr;
+
+    let inner_ident = format_ident!("_percpu_shared_inner_{}", ident);
+    let access_ident = format_ident!("_access_shared_{}", ident);
+
+    let as_ptr = arch::get_percpu_pointer(&inner_ident, &ty);
+
+    quote! {
+        #[link_section = ".percpu"]
+        #[allow(non_upper_case_globals)]
+        static #inner_ident: #ty = #expr;
+        #[allow(non_camel_case_types)]
+        #vis struct #access_ident;
+        #vis static #ident: #access_ident = #access_ident;
+
+        impl #access_ident {
+            fn as_ptr(&self) -> *const #ty {
+                unsafe { ( #as_ptr ) }
+            }
+
+            pub fn get_ref(&self) -> & #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                unsafe { self.as_ptr().as_ref().unwrap() }
+            }
+
+            pub fn get_for_cpu(&self, cpuid: usize) -> Option<& #ty > {
+                let offset = & #inner_ident as *const _ as usize;
+                let base = ::arch::PercpuArea::get_for(cpuid);
+                base.map(|base| unsafe { base.byte_add(offset).cast().as_ref() })
+            }
+        }
+
+        impl ::core::ops::Deref for #access_ident {
+            type Target = #ty;
+
+            fn deref(&self) -> &Self::Target {
+                self.get_ref()
+            }
+        }
+
+        impl<T> ::core::convert::AsRef<T> for #access_ident
+        where
+            <Self as ::core::ops::Deref>::Target: ::core::convert::AsRef<T>,
+        {
+            fn as_ref(&self) -> &T {
+                use ::core::ops::Deref;
+
+                self.deref().as_ref()
+            }
+        }
+    }
+    .into()
+}

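For context, a sketch of how the attribute is meant to be used. This only
works inside the kernel image, which provides the `.percpu` section and the
`gs`-based per-CPU area; the variable name is illustrative. Integer types such
as `u32` also get the generated `add`/`sub` helpers:

```rust
use eonix_percpu::define_percpu;

#[define_percpu]
static TICKS: u32 = 0;

fn on_timer_tick() {
    TICKS.add(1); // integer fast path, no preempt toggle needed
    if TICKS.get() % 1000 == 0 {
        TICKS.set(0);
    }
}
```
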
+ 0 - 1
crates/eonix_runtime/Cargo.toml

@@ -8,7 +8,6 @@ arch = { path = "../../arch" }
 atomic_unique_refcell = { path = "../atomic_unique_refcell" }
 eonix_log = { path = "../eonix_log" }
 eonix_preempt = { path = "../eonix_preempt" }
-eonix_spin_irq = { path = "../eonix_spin_irq" }
 eonix_sync = { path = "../eonix_sync" }
 pointers = { path = "../pointers" }
 

+ 1 - 1
crates/eonix_runtime/src/executor/builder.rs

@@ -48,7 +48,7 @@ where
         let mut execution_context = ExecutionContext::new();
         let output_handle = OutputHandle::new();
 
-        execution_context.set_sp(stack.get_bottom() as *const _ as _);
+        execution_context.set_sp(stack.get_bottom().addr().get() as _);
 
         let executor = Box::pin(RealExecutor {
             _stack: stack,

+ 3 - 1
crates/eonix_runtime/src/executor/stack.rs

@@ -1,4 +1,6 @@
+use core::ptr::NonNull;
+
 pub trait Stack: Sized + Send {
     fn new() -> Self;
-    fn get_bottom(&self) -> &();
+    fn get_bottom(&self) -> NonNull<()>;
 }

+ 1 - 2
crates/eonix_runtime/src/scheduler.rs

@@ -14,7 +14,6 @@ use core::{
 };
 use eonix_log::println_trace;
 use eonix_preempt::assert_preempt_count_eq;
-use eonix_spin_irq::SpinIrq as _;
 use eonix_sync::{LazyLock, Spin};
 use intrusive_collections::RBTree;
 use pointers::BorrowedArc;
@@ -92,7 +91,7 @@ impl Scheduler {
             // SAFETY: Preemption is disabled.
             let context: &mut ExecutionContext = LOCAL_SCHEDULER_CONTEXT.as_mut();
             context.set_ip(local_scheduler as _);
-            context.set_sp(stack.get_bottom() as *const _ as usize);
+            context.set_sp(stack.get_bottom().addr().get() as usize);
             eonix_preempt::enable();
         }
 

+ 0 - 8
crates/eonix_spin_irq/Cargo.toml

@@ -1,8 +0,0 @@
-[package]
-name = "eonix_spin_irq"
-version = "0.1.0"
-edition = "2024"
-
-[dependencies]
-arch = { path = "../../arch" }
-eonix_sync = { path = "../eonix_sync" }

+ 1 - 0
crates/eonix_sync/Cargo.toml

@@ -4,6 +4,7 @@ version = "0.1.0"
 edition = "2024"
 
 [dependencies]
+arch = { path = "../../arch" }
 eonix_preempt = { path = "../eonix_preempt" }
 intrusive-collections = "0.9.7"
 

+ 4 - 1
crates/eonix_sync/src/lib.rs

@@ -14,7 +14,10 @@ pub use lazy_lock::LazyLock;
 pub use locked::{AsProof, AsProofMut, Locked, Proof, ProofMut};
 pub use mutex::{Mutex, MutexGuard};
 pub use rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-pub use spin::{LoopRelax, Relax, Spin, SpinGuard, SpinRelax, UnlockedSpinGuard};
+pub use spin::{
+    LoopRelax, Relax, Spin, SpinGuard, SpinIrqGuard, SpinRelax, UnlockedSpinGuard,
+    UnlockedSpinIrqGuard,
+};
 pub use wait_list::WaitList;
 
 extern crate alloc;

+ 14 - 0
crates/eonix_sync/src/spin.rs

@@ -1,14 +1,17 @@
 mod guard;
 mod relax;
+mod spin_irq;
 
 use core::{
     cell::UnsafeCell,
     marker::PhantomData,
     sync::atomic::{AtomicBool, Ordering},
 };
+use spin_irq::IrqStateGuard;
 
 pub use guard::{SpinGuard, UnlockedSpinGuard};
 pub use relax::{LoopRelax, Relax, SpinRelax};
+pub use spin_irq::{SpinIrqGuard, UnlockedSpinIrqGuard};
 
/// A spinlock is a lock that uses busy-waiting to acquire the lock.
 /// It is useful for short critical sections where the overhead of a context switch
@@ -66,6 +69,17 @@ where
         }
     }
 
+    pub fn lock_irq(&self) -> SpinIrqGuard<'_, T, R> {
+        let irq_state = arch::disable_irqs_save();
+        let guard = self.lock();
+
+        SpinIrqGuard {
+            guard,
+            irq_state: IrqStateGuard::new(irq_state),
+            _not_send: PhantomData,
+        }
+    }
+
     pub fn get_mut(&mut self) -> &mut T {
         // SAFETY: The exclusive access to the lock is guaranteed by the borrow checker.
         unsafe { &mut *self.value.get() }

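With `lock_irq` now a method on `Spin` itself, callers no longer import a
separate `SpinIrq` trait. A usage sketch for data shared with an IRQ handler
(the counter is illustrative):

```rust
use eonix_sync::Spin;

// Shared between thread context and an IRQ handler on the same CPU;
// a plain `lock()` in thread context could deadlock against the handler.
fn record_event(events: &Spin<u64>) {
    let mut count = events.lock_irq();
    *count += 1;
} // guard drops here: the lock is released, then the saved IRQ state restored
```
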
+ 6 - 34
crates/eonix_spin_irq/src/lib.rs → crates/eonix_sync/src/spin/spin_irq.rs

@@ -1,34 +1,23 @@
-#![no_std]
-
+use super::{Relax, SpinGuard, SpinRelax, UnlockedSpinGuard};
+use crate::{marker::NotSend, UnlockableGuard, UnlockedGuard};
 use core::{
     marker::PhantomData,
     mem::ManuallyDrop,
     ops::{Deref, DerefMut},
 };
-use eonix_sync::{
-    marker::NotSend, Relax, Spin, SpinGuard, SpinRelax, UnlockableGuard, UnlockedGuard,
-    UnlockedSpinGuard,
-};
-
-pub trait SpinIrq<T, R = SpinRelax>
-where
-    T: ?Sized,
-{
-    fn lock_irq(&self) -> SpinIrqGuard<'_, T, R>;
-}
 
-struct IrqStateGuard(ManuallyDrop<arch::IrqState>);
+pub(super) struct IrqStateGuard(ManuallyDrop<arch::IrqState>);
 
 pub struct SpinIrqGuard<'a, T, R = SpinRelax>
 where
     T: ?Sized,
 {
-    guard: SpinGuard<'a, T, R>,
-    irq_state: IrqStateGuard,
+    pub(super) guard: SpinGuard<'a, T, R>,
+    pub(super) irq_state: IrqStateGuard,
    /// We don't want this to be `Send`: the guard must remain on the CPU that
    /// disabled preemption and saved the local IRQ state, so it must never be
    /// transferred to another thread.
-    _not_send: PhantomData<NotSend>,
+    pub(super) _not_send: PhantomData<NotSend>,
 }
 
 pub struct UnlockedSpinIrqGuard<'a, T, R>
@@ -43,23 +32,6 @@ where
 //         we can access the guard from multiple threads.
 unsafe impl<T, R> Sync for SpinIrqGuard<'_, T, R> where T: ?Sized + Sync {}
 
-impl<T, R> SpinIrq<T, R> for Spin<T, R>
-where
-    T: ?Sized,
-    R: Relax,
-{
-    fn lock_irq(&self) -> SpinIrqGuard<'_, T, R> {
-        let irq_state = arch::disable_irqs_save();
-        let guard = self.lock();
-
-        SpinIrqGuard {
-            guard,
-            irq_state: IrqStateGuard::new(irq_state),
-            _not_send: PhantomData,
-        }
-    }
-}
-
 impl IrqStateGuard {
     pub const fn new(irq_state: arch::IrqState) -> Self {
         Self(ManuallyDrop::new(irq_state))

+ 7 - 4
crates/eonix_sync/src/wait_list.rs

@@ -9,6 +9,9 @@ use wait_object::{WaitObject, WaitObjectAdapter};
 pub use wait_handle::WaitHandle;
 
 pub struct WaitList {
+    /// # Lock
+    /// `WaitList`s might be used in IRQ handlers, so `lock_irq` should
+    /// be used on `waiters`.
     waiters: LazyLock<Spin<LinkedList<WaitObjectAdapter>>>,
 }
 
@@ -20,11 +23,11 @@ impl WaitList {
     }
 
     pub fn has_waiters(&self) -> bool {
-        !self.waiters.lock().is_empty()
+        !self.waiters.lock_irq().is_empty()
     }
 
     pub fn notify_one(&self) -> bool {
-        let mut waiters = self.waiters.lock();
+        let mut waiters = self.waiters.lock_irq();
         let mut waiter = waiters.front_mut();
 
         if !waiter.is_null() {
@@ -40,7 +43,7 @@ impl WaitList {
     }
 
     pub fn notify_all(&self) -> usize {
-        let mut waiters = self.waiters.lock();
+        let mut waiters = self.waiters.lock_irq();
         let mut waiter = waiters.front_mut();
         let mut count = 0;
 
@@ -83,7 +86,7 @@ impl WaitList {
     }
 
     pub(self) fn notify_waiter(&self, wait_object: &WaitObject) {
-        let mut waiters = self.waiters.lock();
+        let mut waiters = self.waiters.lock_irq();
         if !wait_object.on_list() {
             return;
         }

+ 2 - 2
crates/eonix_sync/src/wait_list/wait_handle.rs

@@ -96,7 +96,7 @@ impl<'a> WaitHandle<'a> {
 
         match *state {
             State::Init => {
-                let mut waiters = wait_list.waiters.lock();
+                let mut waiters = wait_list.waiters.lock_irq();
                 waiters.push_back(wait_object_ref);
 
                 if let Some(waker) = waker.cloned() {
@@ -206,7 +206,7 @@ impl Drop for WaitHandle<'_> {
             self.wait_until_off_list();
         } else {
             // Lock the list and try again.
-            let mut waiters = self.wait_list.waiters.lock();
+            let mut waiters = self.wait_list.waiters.lock_irq();
 
             if wait_object.on_list() {
                 let mut cursor = unsafe {

+ 7 - 3
crates/eonix_sync/src/wait_list/wait_object.rs

@@ -17,6 +17,10 @@ intrusive_adapter!(
 
 pub struct WaitObject {
     woken_up: AtomicBool,
+    /// The `waker` field is kept separate from its lock to save space: we
+    /// want the object to fit in a single cache line, and since `woken_up`
+    /// takes only 1 byte, the remaining 7 padding bytes can accommodate the
+    /// extra byte required for a spinlock.
     waker_lock: Spin<()>,
     waker: UnsafeCell<Option<Waker>>,
     wait_list: AtomicPtr<WaitList>,
@@ -40,7 +44,7 @@ impl WaitObject {
     }
 
     pub fn save_waker(&self, waker: Waker) {
-        let _lock = self.waker_lock.lock();
+        let _lock = self.waker_lock.lock_irq();
         unsafe {
             // SAFETY: We're holding the waker lock.
             let old_waker = (*self.waker.get()).replace(waker);
@@ -53,7 +57,7 @@ impl WaitObject {
     /// # Returns
     /// Whether the waker was saved.
     pub fn save_waker_if_not_woken_up(&self, waker: &Waker) -> bool {
-        let _lock = self.waker_lock.lock();
+        let _lock = self.waker_lock.lock_irq();
         if self.woken_up() {
             return false;
         }
@@ -68,7 +72,7 @@ impl WaitObject {
     }
 
     pub fn take_waker(&self) -> Option<Waker> {
-        let _lock = self.waker_lock.lock();
+        let _lock = self.waker_lock.lock_irq();
         unsafe {
             // SAFETY: We're holding the waker lock.
             self.waker.get().as_mut().unwrap().take()

+ 6 - 0
crates/intrusive_list/Cargo.toml

@@ -0,0 +1,6 @@
+[package]
+name = "intrusive_list"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]

+ 59 - 0
crates/intrusive_list/src/lib.rs

@@ -0,0 +1,59 @@
+#![no_std]
+
+use core::ptr::NonNull;
+
+pub struct Link {
+    prev: Option<NonNull<Link>>,
+    next: Option<NonNull<Link>>,
+}
+
+impl Link {
+    pub const fn new() -> Self {
+        Self {
+            prev: None,
+            next: None,
+        }
+    }
+
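+    /// Insert `node` into the list right after `self`.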
+    pub fn insert(&mut self, node: &mut Self) {
+        unsafe {
+            let insert_node = NonNull::new(&raw mut *node);
+            if let Some(next) = self.next {
+                (*next.as_ptr()).prev = insert_node;
+            }
+            node.next = self.next;
+            node.prev = NonNull::new(&raw mut *self);
+            self.next = insert_node;
+        }
+    }
+
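+    /// Unlink `self` from its list, clearing both of its links.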
+    pub fn remove(&mut self) {
+        if let Some(next) = self.next {
+            unsafe { (*next.as_ptr()).prev = self.prev };
+        }
+
+        if let Some(prev) = self.prev {
+            unsafe { (*prev.as_ptr()).next = self.next };
+        }
+
+        self.prev = None;
+        self.next = None;
+    }
+
+    pub fn next(&self) -> Option<&Self> {
+        self.next.map(|node| unsafe { &*node.as_ptr() })
+    }
+
+    pub fn next_mut(&mut self) -> Option<&mut Self> {
+        self.next.map(|node| unsafe { &mut *node.as_ptr() })
+    }
+}
+
+#[macro_export]
+macro_rules! container_of {
+    ($ptr:expr, $type:ty, $($f:tt)*) => {{
+        let ptr = $ptr as *const _ as *const u8;
+        let offset: usize = ::core::mem::offset_of!($type, $($f)*);
+        ::core::ptr::NonNull::new_unchecked(ptr.sub(offset) as *mut $type)
+    }}
+}

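Together with `Link`, `container_of!` recovers the owning object from an
embedded link, which is how the buddy allocator walks its free lists. A
minimal sketch (the `Frame` type is illustrative; since the macro expands to
`NonNull::new_unchecked`, the call site must be inside `unsafe`):

```rust
use intrusive_list::{container_of, Link};

struct Frame {
    order: u32,
    link: Link,
}

fn frame_of(link: &Link) -> &Frame {
    // SAFETY: `link` is known to be the `link` field of a live `Frame`.
    unsafe { container_of!(link, Frame, link).as_ref() }
}
```
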
+ 3 - 5
src/driver/ahci/command.rs

@@ -1,8 +1,6 @@
-use crate::prelude::*;
-
-use crate::kernel::mem::paging::Page;
-
 use super::bindings::EINVAL;
+use crate::kernel::mem::paging::Page;
+use crate::prelude::*;
 
 pub trait Command {
     fn pages(&self) -> &[Page];
@@ -22,7 +20,7 @@ pub struct IdentifyCommand {
 impl IdentifyCommand {
     pub fn new() -> Self {
         Self {
-            page: Page::alloc_one(),
+            page: Page::alloc(),
         }
     }
 }

+ 48 - 0
src/driver/ahci/command_table.rs

@@ -0,0 +1,48 @@
+use super::{command::Command, PRDTEntry, FISH2D};
+use crate::kernel::mem::{AsMemoryBlock as _, Page};
+use eonix_mm::address::PAddr;
+
+pub struct CommandTable<'a> {
+    page: Page,
+    command_fis: &'a mut FISH2D,
+
+    prdt: &'a mut [PRDTEntry; 248],
+    prdt_entries: Option<u16>,
+}
+
+impl CommandTable<'_> {
+    pub fn new() -> Self {
+        let page = Page::alloc();
+        let memory = page.as_memblk();
+
+        let (lhs, prdt) = memory.split_at(0x80);
+
+        let (command_fis, _) = lhs.split_at(size_of::<FISH2D>());
+        let command_fis = unsafe { command_fis.as_ptr().as_mut() };
+        let prdt = unsafe { prdt.as_ptr().as_mut() };
+
+        Self {
+            page,
+            command_fis,
+            prdt,
+            prdt_entries: None,
+        }
+    }
+
+    pub fn setup(&mut self, cmd: &impl Command) {
+        self.command_fis.setup(cmd.cmd(), cmd.lba(), cmd.count());
+        self.prdt_entries = Some(cmd.pages().len() as u16);
+
+        for (idx, page) in cmd.pages().iter().enumerate() {
+            self.prdt[idx].setup(page);
+        }
+    }
+
+    pub fn prdt_len(&self) -> u16 {
+        self.prdt_entries.unwrap()
+    }
+
+    pub fn base(&self) -> PAddr {
+        self.page.start()
+    }
+}

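A quick check of the layout arithmetic: the header region before the PRDT is
0x80 bytes, and, assuming the 16-byte PRDT entry format from the AHCI spec,
the 248 entries exactly fill the rest of one 4 KiB page:

```rust
// 0x80-byte FIS/header region + 248 * 16-byte PRDT entries == 4 KiB page.
const _: () = assert!(0x80 + 248 * 16 == 4096);
```
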
+ 11 - 8
src/driver/ahci/control.rs

@@ -1,6 +1,7 @@
-use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
-
 use super::{BitsIterator, GHC_IE};
+use crate::{kernel::mem::PhysAccess as _, sync::fence::memory_barrier};
+use core::ptr::NonNull;
+use eonix_mm::address::PAddr;
 
 /// An `AdapterControl` is an HBA device Global Host Control block
 ///
@@ -34,7 +35,7 @@ const CONTROL_IS: usize = 2;
 const CONTROL_PI: usize = 3;
 
 pub struct AdapterControl {
-    inner: *mut u32,
+    control_data: NonNull<u32>,
 }
 
 /// # Safety
@@ -42,25 +43,26 @@ pub struct AdapterControl {
 unsafe impl Send for AdapterControl {}
 
 impl AdapterControl {
-    pub fn new(addr: usize) -> Self {
+    pub fn new(addr: PAddr) -> Self {
         Self {
-            inner: NoCachePP::new(addr).as_ptr(),
+            control_data: unsafe { addr.as_ptr() },
         }
     }
 }
 
 impl AdapterControl {
     fn read(&self, off: usize) -> u32 {
-        unsafe { self.inner.offset(off as isize).read_volatile() }
+        unsafe { self.control_data.offset(off as isize).read_volatile() }
     }
 
     fn write(&self, off: usize, value: u32) {
-        unsafe { self.inner.offset(off as isize).write_volatile(value) }
+        unsafe { self.control_data.offset(off as isize).write_volatile(value) }
     }
 
     pub fn enable_interrupts(&self) {
         let ghc = self.read(CONTROL_GHC);
         self.write(CONTROL_GHC, ghc | GHC_IE);
+        memory_barrier();
     }
 
     pub fn implemented_ports(&self) -> BitsIterator {
@@ -72,6 +74,7 @@ impl AdapterControl {
     }
 
     pub fn clear_interrupt(&self, no: u32) {
-        self.write(CONTROL_IS, 1 << no)
+        self.write(CONTROL_IS, 1 << no);
+        memory_barrier();
     }
 }

+ 4 - 1
src/driver/ahci/defs.rs

@@ -1,6 +1,8 @@
 #![allow(dead_code)]
 
 use crate::kernel::mem::paging::Page;
+use eonix_mm::address::Addr as _;
+
 pub const VENDOR_INTEL: u16 = 0x8086;
 pub const DEVICE_AHCI: u16 = 0x2922;
 
@@ -51,6 +53,7 @@ pub const PORT_IS_ERROR: u32 =
 /// `clear_busy_upon_ok` and `bytes_transferred` are volatile
 ///
 #[repr(C)]
+#[derive(Clone, Copy)]
 pub struct CommandHeader {
     // [0:4]: Command FIS length
     // [5]: ATAPI
@@ -237,7 +240,7 @@ pub struct PRDTEntry {
 
 impl PRDTEntry {
     pub fn setup(&mut self, page: &Page) {
-        self.base = page.as_phys() as u64;
+        self.base = page.start().addr() as u64;
         self._reserved1 = 0;
 
         // The last bit MUST be set to 1 according to the AHCI spec

+ 63 - 65
src/driver/ahci/mod.rs

@@ -13,14 +13,21 @@ use bindings::{
     EIO,
 };
 use control::AdapterControl;
+use core::ptr::NonNull;
 use defs::*;
-use eonix_spin_irq::SpinIrq as _;
+use eonix_mm::address::{AddrOps as _, PAddr};
 use port::AdapterPort;
 
+pub(self) use register::Register;
+
 mod command;
+mod command_table;
 mod control;
 mod defs;
 mod port;
+mod register;
+pub(self) mod slot;
+mod stats;
 
 pub struct BitsIterator {
     data: u32,
@@ -53,70 +60,23 @@ impl Iterator for BitsIterator {
     }
 }
 
-fn vread<T: Sized + Copy>(refval: *const T) -> T {
-    unsafe { refval.read_volatile() }
-}
-
-fn vwrite<T: Sized + Copy>(refval: *mut T, val: T) {
-    unsafe { refval.write_volatile(val) }
-}
-
-#[allow(dead_code)]
-struct Device {
-    control_base: usize,
+struct Device<'a> {
+    control_base: PAddr,
     control: AdapterControl,
     // TODO: impl Drop to free pci device
-    pcidev: *mut pci_device,
+    pcidev: NonNull<pci_device>,
     /// # Lock
     /// Might be accessed from irq handler, use with `lock_irq()`
-    ports: Spin<[Option<Arc<AdapterPort>>; 32]>,
+    ports: Spin<[Option<Arc<AdapterPort<'a>>>; 32]>,
 }
 
 /// # Safety
 /// `pcidev` is never accessed from Rust code
 /// TODO!!!: place *mut pci_device in a safe wrapper
-unsafe impl Send for Device {}
-unsafe impl Sync for Device {}
-
-impl Device {
-    fn probe_ports(&self) -> KResult<()> {
-        for nport in self.control.implemented_ports() {
-            let port = Arc::new(AdapterPort::new(self.control_base, nport));
-            if !port.status_ok() {
-                continue;
-            }
-
-            self.ports.lock_irq()[nport as usize] = Some(port.clone());
-            if let Err(e) = (|| -> KResult<()> {
-                port.init()?;
-
-                {
-                    let port = port.clone();
-                    let name = format!("ahci-p{}-stats", port.nport);
-                    procfs::populate_root(name.into_bytes().into(), move |buffer| {
-                        writeln!(&mut buffer.get_writer(), "{:?}", &*port.stats.lock())
-                            .map_err(|_| EIO)
-                    })?;
-                }
-
-                let port = BlockDevice::register_disk(
-                    make_device(8, nport * 16),
-                    2147483647, // TODO: get size from device
-                    port,
-                )?;
-
-                port.partprobe()?;
-
-                Ok(())
-            })() {
-                self.ports.lock_irq()[nport as usize] = None;
-                println_warn!("probe port {nport} failed with {e}");
-            }
-        }
-
-        Ok(())
-    }
+unsafe impl Send for Device<'_> {}
+unsafe impl Sync for Device<'_> {}
 
+impl Device<'_> {
     fn handle_interrupt(&self) {
         // Safety
         // `self.ports` is accessed inside irq handler
@@ -128,7 +88,7 @@ impl Device {
             }
 
             let port = ports[nport as usize].as_ref().unwrap();
-            let status = vread(port.interrupt_status());
+            let status = port.interrupt_status().read_once();
 
             if status & PORT_IS_ERROR != 0 {
                 println_warn!("port {nport} SATA error");
@@ -136,7 +96,7 @@ impl Device {
             }
 
             debug_assert!(status & PORT_IS_DHRS != 0);
-            vwrite(port.interrupt_status(), PORT_IS_DHRS);
+            port.interrupt_status().write_once(PORT_IS_DHRS);
 
             self.control.clear_interrupt(nport);
 
@@ -145,19 +105,20 @@ impl Device {
     }
 }
 
-impl Device {
-    pub fn new(pcidev: *mut pci_device) -> KResult<Arc<Self>> {
-        let base = unsafe { *(*pcidev).header_type0() }.bars[PCI_REG_ABAR];
-        let irqno = unsafe { *(*pcidev).header_type0() }.interrupt_line;
+impl Device<'static> {
+    pub fn new(pcidev: NonNull<pci_device>) -> KResult<Arc<Self>> {
+        let base =
+            PAddr::from(unsafe { *pcidev.as_ref().header_type0() }.bars[PCI_REG_ABAR] as usize);
+        let irqno = unsafe { *pcidev.as_ref().header_type0() }.interrupt_line;
 
         // use MMIO
-        if base & 0xf != 0 {
+        if !base.is_aligned_to(16) {
             return Err(EIO);
         }
 
         let device = Arc::new(Device {
-            control_base: base as usize,
-            control: AdapterControl::new(base as usize),
+            control_base: base,
+            control: AdapterControl::new(base),
             pcidev,
             ports: Spin::new([const { None }; 32]),
         });
@@ -171,10 +132,47 @@ impl Device {
 
         Ok(device)
     }
+
+    fn probe_ports(&self) -> KResult<()> {
+        for nport in self.control.implemented_ports() {
+            let port = Arc::new(AdapterPort::new(self.control_base, nport));
+            if !port.status_ok() {
+                continue;
+            }
+
+            self.ports.lock_irq()[nport as usize] = Some(port.clone());
+            if let Err(e) = (|| -> KResult<()> {
+                port.init()?;
+
+                {
+                    let port = port.clone();
+                    let name = format!("ahci-p{}-stats", port.nport);
+                    procfs::populate_root(name.into_bytes().into(), move |buffer| {
+                        port.print_stats(&mut buffer.get_writer())
+                    })?;
+                }
+
+                let port = BlockDevice::register_disk(
+                    make_device(8, nport * 16),
+                    2147483647, // TODO: get size from device
+                    port,
+                )?;
+
+                port.partprobe()?;
+
+                Ok(())
+            })() {
+                self.ports.lock_irq()[nport as usize] = None;
+                println_warn!("probe port {nport} failed with {e}");
+            }
+        }
+
+        Ok(())
+    }
 }
 
 unsafe extern "C" fn probe_device(pcidev: *mut pci_device) -> i32 {
-    match Device::new(pcidev) {
+    match Device::new(NonNull::new(pcidev).expect("NULL `pci_device` pointer")) {
         Ok(device) => {
             // TODO!!!: save device to pci_device
             Box::leak(Box::new(device));

+ 96 - 209
src/driver/ahci/port.rs

@@ -1,35 +1,21 @@
-use core::pin::pin;
-
 use super::command::{Command, IdentifyCommand, ReadLBACommand};
+use super::slot::CommandSlot;
+use super::stats::AdapterPortStats;
 use super::{
-    vread, vwrite, CommandHeader, PRDTEntry, FISH2D, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE,
-    PORT_CMD_ST, PORT_IE_DEFAULT,
+    CommandHeader, Register, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE, PORT_CMD_ST, PORT_IE_DEFAULT,
 };
+use crate::driver::ahci::command_table::CommandTable;
 use crate::kernel::block::{BlockDeviceRequest, BlockRequestQueue};
 use crate::kernel::mem::paging::Page;
-use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
+use crate::kernel::mem::AsMemoryBlock as _;
 use crate::prelude::*;
 use alloc::collections::vec_deque::VecDeque;
 use bindings::{EINVAL, EIO};
+use core::pin::pin;
+use eonix_mm::address::{Addr as _, PAddr};
 use eonix_runtime::task::Task;
-use eonix_spin_irq::SpinIrq as _;
 use eonix_sync::WaitList;
 
-fn spinwait_clear(refval: *const u32, mask: u32) -> KResult<()> {
-    const SPINWAIT_MAX: usize = 1000;
-
-    let mut spins = 0;
-    while vread(refval) & mask != 0 {
-        if spins == SPINWAIT_MAX {
-            return Err(EIO);
-        }
-
-        spins += 1;
-    }
-
-    Ok(())
-}
-
 /// An `AdapterPort` is an HBA device in AHCI mode.
 ///
 /// # Access
@@ -66,63 +52,6 @@ pub struct AdapterPortData {
     vendor: [u32; 4],
 }
 
-#[allow(dead_code)]
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-enum SlotState {
-    Idle,
-    Working,
-    Finished,
-    Error,
-}
-
-struct CommandSlotInner {
-    state: SlotState,
-    /// # Usage
-    /// `cmdheader` might be used in irq handler. So in order to wait for
-    /// commands to finish, we should use `lock_irq` on `cmdheader`
-    cmdheader: *mut CommandHeader,
-}
-
-/// # Safety
-/// This is safe because the `cmdheader` is not shared between threads
-unsafe impl Send for CommandSlotInner {}
-
-impl CommandSlotInner {
-    pub fn setup(&mut self, cmdtable_base: u64, prdtlen: u16, write: bool) {
-        let cmdheader = unsafe { self.cmdheader.as_mut().unwrap() };
-        cmdheader.first = 0x05; // FIS type
-
-        if write {
-            cmdheader.first |= 0x40;
-        }
-
-        cmdheader.second = 0x00;
-
-        cmdheader.prdt_length = prdtlen;
-        cmdheader.bytes_transferred = 0;
-        cmdheader.command_table_base = cmdtable_base;
-
-        cmdheader._reserved = [0; 4];
-    }
-}
-
-struct CommandSlot {
-    inner: Spin<CommandSlotInner>,
-    wait_list: WaitList,
-}
-
-impl CommandSlot {
-    fn new(cmdheader: *mut CommandHeader) -> Self {
-        Self {
-            inner: Spin::new(CommandSlotInner {
-                state: SlotState::Idle,
-                cmdheader,
-            }),
-            wait_list: WaitList::new(),
-        }
-    }
-}
-
 struct FreeList {
     free: VecDeque<u32>,
     working: VecDeque<u32>,
@@ -137,85 +66,83 @@ impl FreeList {
     }
 }
 
-#[derive(Default, Debug)]
-pub struct AdapterPortStats {
-    /// Number of commands sent
-    cmd_sent: u64,
-
-    /// Number of transmission errors
-    cmd_error: u64,
-
-    /// Number of interrupts fired
-    int_fired: u64,
-}
-
-pub struct AdapterPort {
+pub struct AdapterPort<'a> {
     pub nport: u32,
-    regs: *mut (),
-    page: Page,
-    slots: [CommandSlot; 32],
+    regs_base: PAddr,
+
+    slots: [CommandSlot<'a>; 32],
     free_list: Spin<FreeList>,
     free_list_wait: WaitList,
 
-    /// Statistics for this port
-    pub stats: Spin<AdapterPortStats>,
-}
+    /// Holds the command list.
+    /// **DO NOT USE IT DIRECTLY**
+    _page: Page,
+
+    cmdlist_base: PAddr,
+    fis_base: PAddr,
 
-/// # Safety
-/// This is safe because the `AdapterPort` can be accessed by only one thread at the same time
-unsafe impl Send for AdapterPort {}
-unsafe impl Sync for AdapterPort {}
+    stats: AdapterPortStats,
+}
 
-impl AdapterPort {
-    pub fn new(base: usize, nport: u32) -> Self {
-        let page = Page::alloc_one();
-        let cmdheaders_start = page.as_cached().as_ptr::<CommandHeader>();
+impl<'a> AdapterPort<'a> {
+    pub fn new(base: PAddr, nport: u32) -> Self {
+        let page = Page::alloc();
+        let cmdlist_base = page.start();
+        let cmdlist_size = 32 * size_of::<CommandHeader>();
+        let fis_base = cmdlist_base + cmdlist_size;
+
+        let (mut cmdheaders, _) = page.as_memblk().split_at(cmdlist_size);
+        let slots = core::array::from_fn(move |_| {
+            let (cmdheader, next) = cmdheaders.split_at(size_of::<CommandHeader>());
+            cmdheaders = next;
+            CommandSlot::new(unsafe { cmdheader.as_ptr().as_mut() })
+        });
 
         Self {
             nport,
-            regs: NoCachePP::new(base + 0x100 + 0x80 * nport as usize).as_ptr(),
-            slots: core::array::from_fn(|index| {
-                CommandSlot::new(unsafe { cmdheaders_start.offset(index as isize) })
-            }),
+            regs_base: base + 0x100 + 0x80 * nport as usize,
+            slots,
             free_list: Spin::new(FreeList::new()),
             free_list_wait: WaitList::new(),
-            page,
-            stats: Spin::default(),
+            _page: page,
+            stats: AdapterPortStats::new(),
+            cmdlist_base,
+            fis_base,
         }
     }
 }
 
-impl AdapterPort {
-    fn command_list_base(&self) -> *mut u64 {
-        unsafe { self.regs.byte_offset(0x00).cast() }
+impl AdapterPort<'_> {
+    fn command_list_base(&self) -> Register<u64> {
+        Register::new(self.regs_base + 0x00)
     }
 
-    fn fis_base(&self) -> *mut u64 {
-        unsafe { self.regs.byte_offset(0x08).cast() }
+    fn fis_base(&self) -> Register<u64> {
+        Register::new(self.regs_base + 0x08)
     }
 
-    fn sata_status(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x28).cast() }
+    fn sata_status(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x28)
     }
 
-    fn command_status(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x18).cast() }
+    fn command_status(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x18)
     }
 
-    fn command_issue(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x38).cast() }
+    fn command_issue(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x38)
     }
 
-    pub fn interrupt_status(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x10).cast() }
+    pub fn interrupt_status(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x10)
     }
 
-    pub fn interrupt_enable(&self) -> *mut u32 {
-        unsafe { self.regs.byte_offset(0x14).cast() }
+    fn interrupt_enable(&self) -> Register<u32> {
+        Register::new(self.regs_base + 0x14)
     }
 
     pub fn status_ok(&self) -> bool {
-        vread(self.sata_status()) & 0xf == 0x3
+        self.sata_status().read_once() & 0xf == 0x3
     }
 
     fn get_free_slot(&self) -> u32 {
@@ -234,16 +161,16 @@ impl AdapterPort {
     }
 
     fn save_working(&self, slot: u32) {
-        self.free_list.lock().working.push_back(slot);
+        self.free_list.lock_irq().working.push_back(slot);
     }
 
     fn release_free_slot(&self, slot: u32) {
-        self.free_list.lock().free.push_back(slot);
+        self.free_list.lock_irq().free.push_back(slot);
         self.free_list_wait.notify_one();
     }
 
     pub fn handle_interrupt(&self) {
-        let ci = vread(self.command_issue());
+        let ci = self.command_issue().read_once();
 
         // no need to use `lock_irq()` inside interrupt handler
         let mut free_list = self.free_list.lock();
@@ -253,104 +180,55 @@ impl AdapterPort {
                 return true;
             }
 
-            let slot = &self.slots[n as usize];
-
-            // TODO: check error
-            let mut slot_inner = slot.inner.lock();
-            debug_assert_eq!(slot_inner.state, SlotState::Working);
-            slot_inner.state = SlotState::Finished;
-            slot.wait_list.notify_all();
-            self.stats.lock().int_fired += 1;
+            self.slots[n as usize].handle_irq();
+            self.stats.inc_int_fired();
 
             false
         });
     }
 
     fn stop_command(&self) -> KResult<()> {
-        vwrite(
-            self.command_status(),
-            vread(self.command_status()) & !(PORT_CMD_ST | PORT_CMD_FRE),
-        );
-
-        spinwait_clear(self.command_status(), PORT_CMD_CR | PORT_CMD_FR)
+        let status_reg = self.command_status();
+        let status = status_reg.read();
+        status_reg.write_once(status & !(PORT_CMD_ST | PORT_CMD_FRE));
+        status_reg.spinwait_clear(PORT_CMD_CR | PORT_CMD_FR)
     }
 
     fn start_command(&self) -> KResult<()> {
-        spinwait_clear(self.command_status(), PORT_CMD_CR)?;
+        let status_reg = self.command_status();
+        status_reg.spinwait_clear(PORT_CMD_CR)?;
 
-        let cmd_status = vread(self.command_status());
-        vwrite(
-            self.command_status(),
-            cmd_status | PORT_CMD_ST | PORT_CMD_FRE,
-        );
+        let status = status_reg.read();
+        status_reg.write_once(status | PORT_CMD_ST | PORT_CMD_FRE);
 
         Ok(())
     }
 
     fn send_command(&self, cmd: &impl Command) -> KResult<()> {
-        let pages = cmd.pages();
-        let cmdtable_page = Page::alloc_one();
+        let mut cmdtable = CommandTable::new();
+        cmdtable.setup(cmd);
 
-        let command_fis: &mut FISH2D = cmdtable_page.as_cached().as_mut();
-        command_fis.setup(cmd.cmd(), cmd.lba(), cmd.count());
+        let slot_index = self.get_free_slot();
+        let slot = &self.slots[slot_index as usize];
 
-        let prdt: &mut [PRDTEntry; 248] = cmdtable_page.as_cached().offset(0x80).as_mut();
+        slot.prepare_command(&cmdtable, cmd.write());
+        self.save_working(slot_index);
 
-        for (idx, page) in pages.iter().enumerate() {
-            prdt[idx].setup(page);
-        }
-
-        let slot_index = self.get_free_slot() as usize;
-        let slot_object = &self.slots[slot_index];
-
-        let mut slot = slot_object.inner.lock_irq();
-
-        slot.setup(
-            cmdtable_page.as_phys() as u64,
-            pages.len() as u16,
-            cmd.write(),
-        );
-        slot.state = SlotState::Working;
+        let cmdissue_reg = self.command_issue();
 
         // should we clear received fis here?
-        debug_assert!(vread(self.command_issue()) & (1 << slot_index) == 0);
-        vwrite(self.command_issue(), 1 << slot_index);
-
-        if spinwait_clear(self.command_issue(), 1 << slot_index).is_err() {
-            let mut saved = false;
-            while slot.state == SlotState::Working {
-                if !saved {
-                    saved = true;
-                    self.save_working(slot_index as u32);
-                }
-                let mut wait = pin!(slot_object.wait_list.prepare_to_wait());
-                wait.as_mut().add_to_wait_list();
-                drop(slot);
-                Task::block_on(wait);
-                slot = slot_object.inner.lock_irq();
-            }
-        } else {
-            // TODO: check error
-            slot.state = SlotState::Finished;
-        }
+        debug_assert!(cmdissue_reg.read_once() & (1 << slot_index) == 0);
+        cmdissue_reg.write_once(1 << slot_index);
 
-        let state = slot.state;
-        slot.state = SlotState::Idle;
+        self.stats.inc_cmd_sent();
 
-        debug_assert_ne!(state, SlotState::Working);
-        self.release_free_slot(slot_index as u32);
+        if let Err(_) = Task::block_on(slot.wait_finish()) {
+            self.stats.inc_cmd_error();
+            return Err(EIO);
+        };
 
-        match state {
-            SlotState::Finished => {
-                self.stats.lock().cmd_sent += 1;
-                Ok(())
-            }
-            SlotState::Error => {
-                self.stats.lock().cmd_error += 1;
-                Err(EIO)
-            }
-            _ => panic!("Invalid slot state"),
-        }
+        self.release_free_slot(slot_index);
+        Ok(())
     }
 
     fn identify(&self) -> KResult<()> {
@@ -365,10 +243,11 @@ impl AdapterPort {
     pub fn init(&self) -> KResult<()> {
         self.stop_command()?;
 
-        vwrite(self.interrupt_enable(), PORT_IE_DEFAULT);
+        self.command_list_base()
+            .write(self.cmdlist_base.addr() as u64);
+        self.fis_base().write(self.fis_base.addr() as u64);
 
-        vwrite(self.command_list_base(), self.page.as_phys() as u64);
-        vwrite(self.fis_base(), self.page.as_phys() as u64 + 0x400);
+        self.interrupt_enable().write_once(PORT_IE_DEFAULT);
 
         self.start_command()?;
 
@@ -380,9 +259,17 @@ impl AdapterPort {
             Ok(_) => Ok(()),
         }
     }
+
+    pub fn print_stats(&self, writer: &mut impl Write) -> KResult<()> {
+        writeln!(writer, "cmd_sent: {}", self.stats.get_cmd_sent()).map_err(|_| EIO)?;
+        writeln!(writer, "cmd_error: {}", self.stats.get_cmd_error()).map_err(|_| EIO)?;
+        writeln!(writer, "int_fired: {}", self.stats.get_int_fired()).map_err(|_| EIO)?;
+
+        Ok(())
+    }
 }
 
-impl BlockRequestQueue for AdapterPort {
+impl BlockRequestQueue for AdapterPort<'_> {
     fn max_request_pages(&self) -> u64 {
         1024
     }

+ 58 - 0
src/driver/ahci/register.rs

@@ -0,0 +1,58 @@
+use crate::{
+    kernel::{constants::EIO, mem::PhysAccess as _},
+    sync::fence::memory_barrier,
+    KResult,
+};
+use core::ptr::NonNull;
+use eonix_mm::address::PAddr;
+
+pub struct Register<T: Copy> {
+    addr: NonNull<T>,
+}
+
+unsafe impl<T: Copy> Send for Register<T> {}
+unsafe impl<T: Copy> Sync for Register<T> {}
+
+impl<T: Copy> Register<T> {
+    pub fn new(addr: PAddr) -> Self {
+        Self {
+            addr: unsafe { addr.as_ptr() },
+        }
+    }
+
+    pub fn read(&self) -> T {
+        unsafe { self.addr.as_ptr().read_volatile() }
+    }
+
+    pub fn write(&self, value: T) {
+        unsafe { self.addr.as_ptr().write_volatile(value) }
+    }
+
+    pub fn read_once(&self) -> T {
+        let val = unsafe { self.addr.as_ptr().read_volatile() };
+        memory_barrier();
+        val
+    }
+
+    pub fn write_once(&self, value: T) {
+        unsafe { self.addr.as_ptr().write_volatile(value) };
+        memory_barrier();
+    }
+}
+
+impl Register<u32> {
+    pub fn spinwait_clear(&self, mask: u32) -> KResult<()> {
+        const SPINWAIT_MAX: usize = 1000;
+
+        for _ in 0..SPINWAIT_MAX {
+            if self.read() & mask == 0 {
+                memory_barrier();
+                return Ok(());
+            }
+        }
+
+        memory_barrier();
+        Err(EIO)
+    }
+}

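A sketch of how `Register` replaces the old `vread`/`vwrite` free functions in
driver code, written as if inside the ahci module where `Register` and
`KResult` are in scope. The offset and bits are made up for illustration;
`read`/`write` are plain volatile accesses, while the `_once` variants add a
fence:

```rust
use eonix_mm::address::PAddr;

fn start_engine(regs_base: PAddr) -> KResult<()> {
    let status = Register::<u32>::new(regs_base + 0x18);

    status.spinwait_clear(1 << 15)?; // wait for a hypothetical BUSY bit
    let val = status.read();
    status.write_once(val | 1); // set a hypothetical START bit, then fence
    Ok(())
}
```
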
+ 94 - 0
src/driver/ahci/slot.rs

@@ -0,0 +1,94 @@
+use super::{command_table::CommandTable, CommandHeader};
+use crate::KResult;
+use core::pin::pin;
+use eonix_mm::address::Addr as _;
+use eonix_sync::{Spin, WaitList};
+
+pub struct CommandSlot<'a> {
+    /// # Usage
+    /// `inner.cmdheader` might be used in irq handler. So in order to wait for
+    /// commands to finish, we should use `lock_irq` on `inner`
+    inner: Spin<CommandSlotInner<'a>>,
+    wait_list: WaitList,
+}
+
+struct CommandSlotInner<'a> {
+    state: SlotState,
+    cmdheader: &'a mut CommandHeader,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum SlotState {
+    Idle,
+    Working,
+    Finished,
+    Error,
+}
+
+impl<'a> CommandSlot<'a> {
+    pub fn new(cmdheader: &'a mut CommandHeader) -> Self {
+        Self {
+            inner: Spin::new(CommandSlotInner {
+                state: SlotState::Idle,
+                cmdheader,
+            }),
+            wait_list: WaitList::new(),
+        }
+    }
+
+    pub fn handle_irq(&self) {
+        let mut inner = self.inner.lock();
+        debug_assert_eq!(inner.state, SlotState::Working);
+
+        // TODO: Check errors.
+        inner.state = SlotState::Finished;
+        inner.cmdheader.bytes_transferred = 0;
+        inner.cmdheader.prdt_length = 0;
+
+        self.wait_list.notify_all();
+    }
+
+    pub fn prepare_command(&self, cmdtable: &CommandTable, write: bool) {
+        let mut inner = self.inner.lock_irq();
+        let cmdheader = &mut inner.cmdheader;
+
+        cmdheader.first = 0x05; // FIS type
+
+        if write {
+            cmdheader.first |= 0x40;
+        }
+
+        cmdheader.second = 0x00;
+
+        cmdheader.prdt_length = cmdtable.prdt_len();
+        cmdheader.bytes_transferred = 0;
+        cmdheader.command_table_base = cmdtable.base().addr() as u64;
+
+        cmdheader._reserved = [0; 4];
+
+        inner.state = SlotState::Working;
+    }
+
+    pub async fn wait_finish(&self) -> KResult<()> {
+        let mut inner = loop {
+            let inner = self.inner.lock_irq();
+            if inner.state != SlotState::Working {
+                break inner;
+            }
+
+            let mut wait = pin!(self.wait_list.prepare_to_wait());
+            wait.as_mut().add_to_wait_list();
+
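+            // Re-check after enqueueing: a wakeup racing between the first
+            // check and `add_to_wait_list()` would otherwise be lost.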
+            if inner.state != SlotState::Working {
+                break inner;
+            }
+
+            drop(inner);
+            wait.await;
+        };
+
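+        // NOTE: `handle_irq()` never sets `SlotState::Error` yet (see the
+        // TODO there), so this currently always reports success.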
+        inner.state = SlotState::Idle;
+
+        Ok(())
+    }
+}

+ 46 - 0
src/driver/ahci/stats.rs

@@ -0,0 +1,46 @@
+use core::sync::atomic::{AtomicUsize, Ordering};
+
+pub struct AdapterPortStats {
+    /// Number of commands sent
+    cmd_sent: AtomicUsize,
+
+    /// Number of transmission errors
+    cmd_error: AtomicUsize,
+
+    /// Number of interrupts fired
+    int_fired: AtomicUsize,
+}
+
+impl AdapterPortStats {
+    pub const fn new() -> Self {
+        Self {
+            cmd_sent: AtomicUsize::new(0),
+            cmd_error: AtomicUsize::new(0),
+            int_fired: AtomicUsize::new(0),
+        }
+    }
+
+    pub fn inc_int_fired(&self) {
+        self.int_fired.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn inc_cmd_sent(&self) {
+        self.cmd_sent.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn inc_cmd_error(&self) {
+        self.cmd_error.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn get_int_fired(&self) -> usize {
+        self.int_fired.load(Ordering::Relaxed)
+    }
+
+    pub fn get_cmd_sent(&self) -> usize {
+        self.cmd_sent.load(Ordering::Relaxed)
+    }
+
+    pub fn get_cmd_error(&self) -> usize {
+        self.cmd_error.load(Ordering::Relaxed)
+    }
+}

+ 434 - 434
src/driver/e1000e.rs

@@ -1,439 +1,439 @@
-use crate::prelude::*;
-
-use crate::bindings::root::kernel::hw::pci;
-use crate::kernel::interrupt::register_irq_handler;
-use crate::kernel::mem::{paging, phys};
-use crate::net::netdev;
-use alloc::boxed::Box;
-use alloc::vec::Vec;
-use bindings::EFAULT;
-use paging::Page;
-use phys::{NoCachePP, PhysPtr};
-
-use crate::bindings::root::{EAGAIN, EINVAL, EIO};
-
-mod defs;
-
-#[repr(C)]
-struct RxDescriptor {
-    buffer: u64,
-    length: u16,
-    checksum: u16,
-    status: u8,
-    errors: u8,
-    vlan: u16,
-}
-
-#[repr(C)]
-struct TxDescriptor {
-    buffer: u64,
-    length: u16,
-    cso: u8, // Checksum offset
-    cmd: u8,
-    status: u8,
-    css: u8, // Checksum start
-    vlan: u16,
-}
-
-const RX_DESC_SIZE: usize = 32;
-const TX_DESC_SIZE: usize = 32;
-
-struct E1000eDev {
-    mac: netdev::Mac,
-    status: netdev::LinkStatus,
-    speed: netdev::LinkSpeed,
-    id: u32,
-
-    base: NoCachePP,
-    rt_desc_page: Page,
-    rx_head: Option<u32>,
-    rx_tail: Option<u32>,
-    tx_tail: Option<u32>,
-
-    rx_buffers: Option<Box<Vec<Page>>>,
-    tx_buffers: Option<Box<Vec<Page>>>,
-}
-
-fn test(val: u32, bit: u32) -> bool {
-    (val & bit) == bit
-}
-
-struct PrintableBytes<'a>(&'a [u8]);
-
-impl core::fmt::Debug for PrintableBytes<'_> {
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        write!(f, "PrintableBytes {{")?;
-        for chunk in self.0.chunks(16) {
-            for &byte in chunk {
-                write!(f, "{byte} ")?;
-            }
-            write!(f, "\n")?;
-        }
-        write!(f, "}}")?;
-
-        Ok(())
-    }
-}
-
-impl netdev::Netdev for E1000eDev {
-    fn mac(&self) -> netdev::Mac {
-        self.mac
-    }
-
-    fn link_status(&self) -> netdev::LinkStatus {
-        self.status
-    }
-
-    fn link_speed(&self) -> netdev::LinkSpeed {
-        self.speed
-    }
-
-    fn id(&self) -> u32 {
-        self.id
-    }
-
-    fn up(&mut self) -> Result<(), u32> {
-        let ctrl = self.read(defs::REG_CTRL);
-        let status = self.read(defs::REG_STAT);
-
-        // check link up
-        if !test(ctrl, defs::CTRL_SLU) || !test(status, defs::STAT_LU) {
-            return Err(EIO);
-        }
-
-        // auto negotiation of speed
-        match status & defs::STAT_SPEED_MASK {
-            defs::STAT_SPEED_10M => self.speed = netdev::LinkSpeed::Speed10M,
-            defs::STAT_SPEED_100M => self.speed = netdev::LinkSpeed::Speed100M,
-            defs::STAT_SPEED_1000M => self.speed = netdev::LinkSpeed::Speed1000M,
-            _ => return Err(EINVAL),
-        }
-
-        // clear multicast table
-        for i in (0..128).step_by(4) {
-            self.write(defs::REG_MTA + i, 0);
-        }
-
-        self.clear_stats()?;
-
-        // setup interrupt handler
-        let device = netdev::get_netdev(self.id).unwrap();
-        let handler = move || {
-            eonix_runtime::task::Task::block_on(device.lock())
-                .fire()
-                .unwrap();
-        };
-
-        register_irq_handler(0xb, handler)?;
-
-        // enable interrupts
-        self.write(defs::REG_IMS, defs::ICR_NORMAL | defs::ICR_UP);
-
-        // read to clear any pending interrupts
-        self.read(defs::REG_ICR);
-
-        self.setup_rx()?;
-        self.setup_tx()?;
-
-        self.status = netdev::LinkStatus::Up;
-
-        Ok(())
-    }
-
-    fn fire(&mut self) -> Result<(), u32> {
-        let cause = self.read(defs::REG_ICR);
-        if !test(cause, defs::ICR_INT) {
-            return Ok(());
-        }
-
-        loop {
-            let tail = self.rx_tail.ok_or(EIO)?;
-            let next_tail = (tail + 1) % RX_DESC_SIZE as u32;
-
-            if next_tail == self.read(defs::REG_RDH) {
-                break;
-            }
-
-            let ref mut desc = self.rx_desc_table()[next_tail as usize];
-            if !test(desc.status as u32, defs::RXD_STAT_DD as u32) {
-                Err(EIO)?;
-            }
-
-            desc.status = 0;
-            let len = desc.length as usize;
-
-            let buffers = self.rx_buffers.as_mut().ok_or(EIO)?;
-            let data = &buffers[next_tail as usize].as_slice()[..len];
-
-            println_debug!("e1000e: received {len} bytes, {:?}", PrintableBytes(data));
-            self.rx_tail = Some(next_tail);
-        }
-
-        Ok(())
-    }
-
-    fn send(&mut self, buf: &[u8]) -> Result<(), u32> {
-        let tail = self.tx_tail.ok_or(EIO)?;
-        let head = self.read(defs::REG_TDH);
-        let next_tail = (tail + 1) % TX_DESC_SIZE as u32;
-
-        if next_tail == head {
-            return Err(EAGAIN);
-        }
-
-        let ref mut desc = self.tx_desc_table()[tail as usize];
-        if !test(desc.status as u32, defs::TXD_STAT_DD as u32) {
-            return Err(EIO);
-        }
-
-        let buffer_page = Page::alloc_one();
-        if buf.len() > buffer_page.len() {
-            return Err(EFAULT);
-        }
-        buffer_page.as_mut_slice()[..buf.len()].copy_from_slice(buf);
-
-        desc.buffer = buffer_page.as_phys() as u64;
-        desc.length = buf.len() as u16;
-        desc.cmd = defs::TXD_CMD_EOP | defs::TXD_CMD_IFCS | defs::TXD_CMD_RS;
-        desc.status = 0;
-
-        self.tx_tail = Some(next_tail);
-        self.write(defs::REG_TDT, next_tail);
-
-        // TODO: check if the packets are sent and update self.tx_head state
-
-        Ok(())
-    }
-}
-
-impl E1000eDev {
-    fn setup_rx(&mut self) -> Result<(), u32> {
-        if !self.rx_head.is_none() || !self.rx_tail.is_none() {
-            return Err(EINVAL);
-        }
-
-        let addr = self.rt_desc_page.as_phys();
-
-        self.write(defs::REG_RDBAL, addr as u32);
-        self.write(defs::REG_RDBAH, (addr >> 32) as u32);
-
-        self.write(
-            defs::REG_RDLEN,
-            (RX_DESC_SIZE * size_of::<RxDescriptor>()) as u32,
-        );
-
-        self.write(defs::REG_RDH, 0);
-        self.write(defs::REG_RDT, RX_DESC_SIZE as u32 - 1);
-
-        self.rx_head = Some(0);
-        self.rx_tail = Some(RX_DESC_SIZE as u32 - 1);
-
-        self.write(
-            defs::REG_RCTL,
-            defs::RCTL_EN
-                | defs::RCTL_MPE
-                | defs::RCTL_LPE
-                | defs::RCTL_LBM_NO
-                | defs::RCTL_DTYP_LEGACY
-                | defs::RCTL_BAM
-                | defs::RCTL_BSIZE_8192
-                | defs::RCTL_SECRC,
-        );
-
-        Ok(())
-    }
-
-    fn setup_tx(&mut self) -> Result<(), u32> {
-        if !self.tx_tail.is_none() {
-            return Err(EINVAL);
-        }
-
-        let addr = self.rt_desc_page.as_phys() + 0x200;
-
-        self.write(defs::REG_TDBAL, addr as u32);
-        self.write(defs::REG_TDBAH, (addr >> 32) as u32);
-
-        self.write(
-            defs::REG_TDLEN,
-            (TX_DESC_SIZE * size_of::<TxDescriptor>()) as u32,
-        );
-
-        self.write(defs::REG_TDH, 0);
-        self.write(defs::REG_TDT, 0);
-
-        self.tx_tail = Some(0);
-
-        self.write(
-            defs::REG_TCTL,
-            defs::TCTL_EN
-                | defs::TCTL_PSP
-                | (15 << defs::TCTL_CT_SHIFT)
-                | (64 << defs::TCTL_COLD_SHIFT)
-                | defs::TCTL_RTLC,
-        );
-
-        Ok(())
-    }
-
-    fn reset(&self) -> Result<(), u32> {
-        // disable interrupts so we won't mess things up
-        self.write(defs::REG_IMC, 0xffffffff);
-
-        let ctrl = self.read(defs::REG_CTRL);
-        self.write(defs::REG_CTRL, ctrl | defs::CTRL_GIOD);
-
-        while self.read(defs::REG_STAT) & defs::STAT_GIOE != 0 {
-            // wait for link up
-        }
-
-        let ctrl = self.read(defs::REG_CTRL);
-        self.write(defs::REG_CTRL, ctrl | defs::CTRL_RST);
-
-        while self.read(defs::REG_CTRL) & defs::CTRL_RST != 0 {
-            // wait for reset
-        }
-
-        // disable interrupts again
-        self.write(defs::REG_IMC, 0xffffffff);
-
-        Ok(())
-    }
-
-    fn clear_stats(&self) -> Result<(), u32> {
-        self.write(defs::REG_COLC, 0);
-        self.write(defs::REG_GPRC, 0);
-        self.write(defs::REG_MPRC, 0);
-        self.write(defs::REG_GPTC, 0);
-        self.write(defs::REG_GORCL, 0);
-        self.write(defs::REG_GORCH, 0);
-        self.write(defs::REG_GOTCL, 0);
-        self.write(defs::REG_GOTCH, 0);
-        Ok(())
-    }
-
-    pub fn new(base: NoCachePP) -> Result<Self, u32> {
-        let page = Page::alloc_one();
-
-        page.zero();
-
-        let mut dev = Self {
-            mac: [0; 6],
-            status: netdev::LinkStatus::Down,
-            speed: netdev::LinkSpeed::SpeedUnknown,
-            id: netdev::alloc_id(),
-            base,
-            rt_desc_page: page,
-            rx_head: None,
-            rx_tail: None,
-            tx_tail: None,
-            rx_buffers: None,
-            tx_buffers: None,
-        };
-
-        dev.reset()?;
-
-        dev.mac = unsafe { dev.base.offset(0x5400).as_ptr::<[u8; 6]>().read() };
-        dev.tx_buffers = Some(Box::new(Vec::with_capacity(TX_DESC_SIZE)));
-
-        let mut rx_buffers = Box::new(Vec::with_capacity(RX_DESC_SIZE));
-
-        for index in 0..RX_DESC_SIZE {
-            let page = Page::alloc_many(2);
-
-            let desc = &mut dev.rx_desc_table()[index];
-            desc.buffer = page.as_phys() as u64;
-            desc.status = 0;
-
-            rx_buffers.push(page);
-        }
-
-        for index in 0..TX_DESC_SIZE {
-            let desc = &mut dev.tx_desc_table()[index];
-            desc.status = defs::TXD_STAT_DD;
-        }
-
-        dev.rx_buffers = Some(rx_buffers);
-
-        Ok(dev)
-    }
-
-    fn read(&self, offset: u32) -> u32 {
-        unsafe {
-            self.base
-                .offset(offset as isize)
-                .as_ptr::<u32>()
-                .read_volatile()
-        }
-    }
-
-    fn write(&self, offset: u32, value: u32) {
-        unsafe {
-            self.base
-                .offset(offset as isize)
-                .as_ptr::<u32>()
-                .write_volatile(value)
-        }
-    }
-
-    fn rx_desc_table<'lt>(&'lt self) -> &'lt mut [RxDescriptor; RX_DESC_SIZE] {
-        self.rt_desc_page.as_cached().as_mut()
-    }
-
-    fn tx_desc_table<'lt>(&'lt self) -> &'lt mut [TxDescriptor; TX_DESC_SIZE] {
-        self.rt_desc_page.as_cached().offset(0x200).as_mut()
-    }
-}
-
-impl Drop for E1000eDev {
-    fn drop(&mut self) {
-        assert_eq!(self.status, netdev::LinkStatus::Down);
-
-        drop(self.rx_buffers.take());
-
-        // TODO: we should wait until all packets are sent
-        drop(self.tx_buffers.take());
-
-        let _ = self.rt_desc_page;
-    }
-}
-
-impl pci::pci_device {
-    fn header0(&self) -> &pci::device_header_type0 {
-        unsafe { self.header_type0().as_ref() }.unwrap()
-    }
-}
-
-fn do_probe_device(dev: &mut pci::pci_device) -> Result<(), u32> {
-    let bar0 = dev.header0().bars[0];
-
-    if bar0 & 0xf != 0 {
-        return Err(EINVAL);
-    }
-
-    unsafe { dev.enableBusMastering() };
-
-    let base = NoCachePP::new((bar0 & !0xf) as usize);
-    let e1000e = E1000eDev::new(base)?;
-
-    netdev::register_netdev(e1000e)?;
-
-    Ok(())
-}
-
-unsafe extern "C" fn probe_device(dev: *mut pci::pci_device) -> i32 {
-    let dev = dev.as_mut().unwrap();
-    match do_probe_device(dev) {
-        Ok(_) => 0,
-        Err(e) => -(e as i32),
-    }
-}
+// use crate::prelude::*;
+//
+// use crate::bindings::root::kernel::hw::pci;
+// use crate::kernel::interrupt::register_irq_handler;
+// use crate::kernel::mem::{paging, phys};
+// use crate::net::netdev;
+// use alloc::boxed::Box;
+// use alloc::vec::Vec;
+// use bindings::EFAULT;
+// use paging::Page;
+// use phys::{NoCachePP, PhysPtr};
+//
+// use crate::bindings::root::{EAGAIN, EINVAL, EIO};
+//
+// mod defs;
+//
+// #[repr(C)]
+// struct RxDescriptor {
+//     buffer: u64,
+//     length: u16,
+//     checksum: u16,
+//     status: u8,
+//     errors: u8,
+//     vlan: u16,
+// }
+//
+// #[repr(C)]
+// struct TxDescriptor {
+//     buffer: u64,
+//     length: u16,
+//     cso: u8, // Checksum offset
+//     cmd: u8,
+//     status: u8,
+//     css: u8, // Checksum start
+//     vlan: u16,
+// }
+//
+// const RX_DESC_SIZE: usize = 32;
+// const TX_DESC_SIZE: usize = 32;
+//
+// struct E1000eDev {
+//     mac: netdev::Mac,
+//     status: netdev::LinkStatus,
+//     speed: netdev::LinkSpeed,
+//     id: u32,
+//
+//     base: NoCachePP,
+//     rt_desc_page: Page,
+//     rx_head: Option<u32>,
+//     rx_tail: Option<u32>,
+//     tx_tail: Option<u32>,
+//
+//     rx_buffers: Option<Box<Vec<Page>>>,
+//     tx_buffers: Option<Box<Vec<Page>>>,
+// }
+//
+// fn test(val: u32, bit: u32) -> bool {
+//     (val & bit) == bit
+// }
+//
+// struct PrintableBytes<'a>(&'a [u8]);
+//
+// impl core::fmt::Debug for PrintableBytes<'_> {
+//     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+//         write!(f, "PrintableBytes {{")?;
+//         for chunk in self.0.chunks(16) {
+//             for &byte in chunk {
+//                 write!(f, "{byte} ")?;
+//             }
+//             write!(f, "\n")?;
+//         }
+//         write!(f, "}}")?;
+//
+//         Ok(())
+//     }
+// }
+//
+// impl netdev::Netdev for E1000eDev {
+//     fn mac(&self) -> netdev::Mac {
+//         self.mac
+//     }
+//
+//     fn link_status(&self) -> netdev::LinkStatus {
+//         self.status
+//     }
+//
+//     fn link_speed(&self) -> netdev::LinkSpeed {
+//         self.speed
+//     }
+//
+//     fn id(&self) -> u32 {
+//         self.id
+//     }
+//
+//     fn up(&mut self) -> Result<(), u32> {
+//         let ctrl = self.read(defs::REG_CTRL);
+//         let status = self.read(defs::REG_STAT);
+//
+//         // check link up
+//         if !test(ctrl, defs::CTRL_SLU) || !test(status, defs::STAT_LU) {
+//             return Err(EIO);
+//         }
+//
+//         // auto negotiation of speed
+//         match status & defs::STAT_SPEED_MASK {
+//             defs::STAT_SPEED_10M => self.speed = netdev::LinkSpeed::Speed10M,
+//             defs::STAT_SPEED_100M => self.speed = netdev::LinkSpeed::Speed100M,
+//             defs::STAT_SPEED_1000M => self.speed = netdev::LinkSpeed::Speed1000M,
+//             _ => return Err(EINVAL),
+//         }
+//
+//         // clear multicast table
+//         for i in (0..128).step_by(4) {
+//             self.write(defs::REG_MTA + i, 0);
+//         }
+//
+//         self.clear_stats()?;
+//
+//         // setup interrupt handler
+//         let device = netdev::get_netdev(self.id).unwrap();
+//         let handler = move || {
+//             eonix_runtime::task::Task::block_on(device.lock())
+//                 .fire()
+//                 .unwrap();
+//         };
+//
+//         register_irq_handler(0xb, handler)?;
+//
+//         // enable interrupts
+//         self.write(defs::REG_IMS, defs::ICR_NORMAL | defs::ICR_UP);
+//
+//         // read to clear any pending interrupts
+//         self.read(defs::REG_ICR);
+//
+//         self.setup_rx()?;
+//         self.setup_tx()?;
+//
+//         self.status = netdev::LinkStatus::Up;
+//
+//         Ok(())
+//     }
+//
+//     fn fire(&mut self) -> Result<(), u32> {
+//         let cause = self.read(defs::REG_ICR);
+//         if !test(cause, defs::ICR_INT) {
+//             return Ok(());
+//         }
+//
+//         loop {
+//             let tail = self.rx_tail.ok_or(EIO)?;
+//             let next_tail = (tail + 1) % RX_DESC_SIZE as u32;
+//
+//             if next_tail == self.read(defs::REG_RDH) {
+//                 break;
+//             }
+//
+//             let desc = &mut self.rx_desc_table()[next_tail as usize];
+//             if !test(desc.status as u32, defs::RXD_STAT_DD as u32) {
+//                 Err(EIO)?;
+//             }
+//
+//             desc.status = 0;
+//             let len = desc.length as usize;
+//
+//             let buffers = self.rx_buffers.as_mut().ok_or(EIO)?;
+//             let data = &buffers[next_tail as usize].as_slice()[..len];
+//
+//             println_debug!("e1000e: received {len} bytes, {:?}", PrintableBytes(data));
+//             self.rx_tail = Some(next_tail);
+//         }
+//
+//         Ok(())
+//     }
+//
+//     fn send(&mut self, buf: &[u8]) -> Result<(), u32> {
+//         let tail = self.tx_tail.ok_or(EIO)?;
+//         let head = self.read(defs::REG_TDH);
+//         let next_tail = (tail + 1) % TX_DESC_SIZE as u32;
+//
+//         if next_tail == head {
+//             return Err(EAGAIN);
+//         }
+//
+//         let desc = &mut self.tx_desc_table()[tail as usize];
+//         if !test(desc.status as u32, defs::TXD_STAT_DD as u32) {
+//             return Err(EIO);
+//         }
+//
+//         let buffer_page = Page::alloc_one();
+//         if buf.len() > buffer_page.len() {
+//             return Err(EFAULT);
+//         }
+//         buffer_page.as_mut_slice()[..buf.len()].copy_from_slice(buf);
+//
+//         desc.buffer = buffer_page.as_phys() as u64;
+//         desc.length = buf.len() as u16;
+//         desc.cmd = defs::TXD_CMD_EOP | defs::TXD_CMD_IFCS | defs::TXD_CMD_RS;
+//         desc.status = 0;
+//
+//         self.tx_tail = Some(next_tail);
+//         self.write(defs::REG_TDT, next_tail);
+//
+//         // TODO: check if the packets are sent and update self.tx_head state
+//
+//         Ok(())
+//     }
+// }
+//
+// impl E1000eDev {
+//     fn setup_rx(&mut self) -> Result<(), u32> {
+//         if self.rx_head.is_some() || self.rx_tail.is_some() {
+//             return Err(EINVAL);
+//         }
+//
+//         let addr = self.rt_desc_page.as_phys();
+//
+//         self.write(defs::REG_RDBAL, addr as u32);
+//         self.write(defs::REG_RDBAH, (addr >> 32) as u32);
+//
+//         self.write(
+//             defs::REG_RDLEN,
+//             (RX_DESC_SIZE * size_of::<RxDescriptor>()) as u32,
+//         );
+//
+//         self.write(defs::REG_RDH, 0);
+//         self.write(defs::REG_RDT, RX_DESC_SIZE as u32 - 1);
+//
+//         self.rx_head = Some(0);
+//         self.rx_tail = Some(RX_DESC_SIZE as u32 - 1);
+//
+//         self.write(
+//             defs::REG_RCTL,
+//             defs::RCTL_EN
+//                 | defs::RCTL_MPE
+//                 | defs::RCTL_LPE
+//                 | defs::RCTL_LBM_NO
+//                 | defs::RCTL_DTYP_LEGACY
+//                 | defs::RCTL_BAM
+//                 | defs::RCTL_BSIZE_8192
+//                 | defs::RCTL_SECRC,
+//         );
+//
+//         Ok(())
+//     }
+//
+//     fn setup_tx(&mut self) -> Result<(), u32> {
+//         if self.tx_tail.is_some() {
+//             return Err(EINVAL);
+//         }
+//
+//         let addr = self.rt_desc_page.as_phys() + 0x200;
+//
+//         self.write(defs::REG_TDBAL, addr as u32);
+//         self.write(defs::REG_TDBAH, (addr >> 32) as u32);
+//
+//         self.write(
+//             defs::REG_TDLEN,
+//             (TX_DESC_SIZE * size_of::<TxDescriptor>()) as u32,
+//         );
+//
+//         self.write(defs::REG_TDH, 0);
+//         self.write(defs::REG_TDT, 0);
+//
+//         self.tx_tail = Some(0);
+//
+//         self.write(
+//             defs::REG_TCTL,
+//             defs::TCTL_EN
+//                 | defs::TCTL_PSP
+//                 | (15 << defs::TCTL_CT_SHIFT)
+//                 | (64 << defs::TCTL_COLD_SHIFT)
+//                 | defs::TCTL_RTLC,
+//         );
+//
+//         Ok(())
+//     }
+//
+//     fn reset(&self) -> Result<(), u32> {
+//         // disable interrupts so we won't mess things up
+//         self.write(defs::REG_IMC, 0xffffffff);
+//
+//         let ctrl = self.read(defs::REG_CTRL);
+//         self.write(defs::REG_CTRL, ctrl | defs::CTRL_GIOD);
+//
+//         while self.read(defs::REG_STAT) & defs::STAT_GIOE != 0 {
+//             // wait for pending GIO master requests to complete
+//         }
+//
+//         let ctrl = self.read(defs::REG_CTRL);
+//         self.write(defs::REG_CTRL, ctrl | defs::CTRL_RST);
+//
+//         while self.read(defs::REG_CTRL) & defs::CTRL_RST != 0 {
+//             // wait for reset
+//         }
+//
+//         // disable interrupts again
+//         self.write(defs::REG_IMC, 0xffffffff);
+//
+//         Ok(())
+//     }
+//
+//     fn clear_stats(&self) -> Result<(), u32> {
+//         self.write(defs::REG_COLC, 0);
+//         self.write(defs::REG_GPRC, 0);
+//         self.write(defs::REG_MPRC, 0);
+//         self.write(defs::REG_GPTC, 0);
+//         self.write(defs::REG_GORCL, 0);
+//         self.write(defs::REG_GORCH, 0);
+//         self.write(defs::REG_GOTCL, 0);
+//         self.write(defs::REG_GOTCH, 0);
+//         Ok(())
+//     }
+//
+//     pub fn new(base: NoCachePP) -> Result<Self, u32> {
+//         let page = Page::alloc_one();
+//
+//         page.zero();
+//
+//         let mut dev = Self {
+//             mac: [0; 6],
+//             status: netdev::LinkStatus::Down,
+//             speed: netdev::LinkSpeed::SpeedUnknown,
+//             id: netdev::alloc_id(),
+//             base,
+//             rt_desc_page: page,
+//             rx_head: None,
+//             rx_tail: None,
+//             tx_tail: None,
+//             rx_buffers: None,
+//             tx_buffers: None,
+//         };
+//
+//         dev.reset()?;
+//
+//         dev.mac = unsafe { dev.base.offset(0x5400).as_ptr::<[u8; 6]>().read() };
+//         dev.tx_buffers = Some(Box::new(Vec::with_capacity(TX_DESC_SIZE)));
+//
+//         let mut rx_buffers = Box::new(Vec::with_capacity(RX_DESC_SIZE));
+//
+//         for index in 0..RX_DESC_SIZE {
+//             let page = Page::alloc_many(2);
+//
+//             let desc = &mut dev.rx_desc_table()[index];
+//             desc.buffer = page.as_phys() as u64;
+//             desc.status = 0;
+//
+//             rx_buffers.push(page);
+//         }
+//
+//         for index in 0..TX_DESC_SIZE {
+//             let desc = &mut dev.tx_desc_table()[index];
+//             desc.status = defs::TXD_STAT_DD;
+//         }
+//
+//         dev.rx_buffers = Some(rx_buffers);
+//
+//         Ok(dev)
+//     }
+//
+//     fn read(&self, offset: u32) -> u32 {
+//         unsafe {
+//             self.base
+//                 .offset(offset as isize)
+//                 .as_ptr::<u32>()
+//                 .read_volatile()
+//         }
+//     }
+//
+//     fn write(&self, offset: u32, value: u32) {
+//         unsafe {
+//             self.base
+//                 .offset(offset as isize)
+//                 .as_ptr::<u32>()
+//                 .write_volatile(value)
+//         }
+//     }
+//
+//     fn rx_desc_table<'lt>(&'lt self) -> &'lt mut [RxDescriptor; RX_DESC_SIZE] {
+//         self.rt_desc_page.as_cached().as_mut()
+//     }
+//
+//     fn tx_desc_table<'lt>(&'lt self) -> &'lt mut [TxDescriptor; TX_DESC_SIZE] {
+//         self.rt_desc_page.as_cached().offset(0x200).as_mut()
+//     }
+// }
+//
+// impl Drop for E1000eDev {
+//     fn drop(&mut self) {
+//         assert_eq!(self.status, netdev::LinkStatus::Down);
+//
+//         drop(self.rx_buffers.take());
+//
+//         // TODO: we should wait until all packets are sent
+//         drop(self.tx_buffers.take());
+//
+//         let _ = self.rt_desc_page;
+//     }
+// }
+//
+// impl pci::pci_device {
+//     fn header0(&self) -> &pci::device_header_type0 {
+//         unsafe { self.header_type0().as_ref() }.unwrap()
+//     }
+// }
+//
+// fn do_probe_device(dev: &mut pci::pci_device) -> Result<(), u32> {
+//     let bar0 = dev.header0().bars[0];
+//
+//     if bar0 & 0xf != 0 {
+//         return Err(EINVAL);
+//     }
+//
+//     unsafe { dev.enableBusMastering() };
+//
+//     let base = NoCachePP::new((bar0 & !0xf) as usize);
+//     let e1000e = E1000eDev::new(base)?;
+//
+//     netdev::register_netdev(e1000e)?;
+//
+//     Ok(())
+// }
+//
+// unsafe extern "C" fn probe_device(dev: *mut pci::pci_device) -> i32 {
+//     let dev = dev.as_mut().unwrap();
+//     match do_probe_device(dev) {
+//         Ok(_) => 0,
+//         Err(e) => -(e as i32),
+//     }
+// }
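
The whole driver is commented out by this commit, but the legacy descriptor-ring bookkeeping that `send` and `fire` relied on is easy to lose in the noise. Below is a minimal, self-contained sketch of that ring arithmetic; `Ring`, `claim_slot` and `N` are illustrative names, not the driver's API.

```rust
// Sketch of the ring-index arithmetic used by `send`/`fire` above.
const N: u32 = 32; // plays the role of TX_DESC_SIZE / RX_DESC_SIZE

struct Ring {
    tail: u32, // software-owned index, mirrored to REG_TDT
    head: u32, // hardware-owned index, read from REG_TDH
}

impl Ring {
    /// Returns the descriptor slot to fill, or `None` when advancing the
    /// tail would collide with the hardware head (the ring is full).
    fn claim_slot(&mut self) -> Option<u32> {
        let next_tail = (self.tail + 1) % N;
        if next_tail == self.head {
            return None; // the driver returns Err(EAGAIN) here
        }
        let slot = self.tail;
        self.tail = next_tail;
        Some(slot)
    }
}

fn main() {
    let mut ring = Ring { tail: 31, head: 0 };
    assert_eq!(ring.claim_slot(), None); // wrap-around would hit the head

    ring.head = 5; // hardware consumed a few descriptors
    assert_eq!(ring.claim_slot(), Some(31));
    assert_eq!(ring.tail, 0); // tail wrapped to the start of the ring
}
```

In the real driver there is one more gate: the descriptor's DD ("descriptor done") status bit must be set before a TX slot may be reused, which is why `new` pre-marks every TX descriptor with `TXD_STAT_DD`.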
 
 pub fn register_e1000e_driver() {
-    let dev_ids = [0x100e, 0x10d3, 0x10ea, 0x153a];
+    // let dev_ids = [0x100e, 0x10d3, 0x10ea, 0x153a];
 
-    for id in dev_ids.into_iter() {
-        let ret = unsafe { pci::register_driver_r(0x8086, id, Some(probe_device)) };
+    // for id in dev_ids.into_iter() {
+    //     let ret = unsafe { pci::register_driver_r(0x8086, id, Some(probe_device)) };
 
-        assert_eq!(ret, 0);
-    }
+    //     assert_eq!(ret, 0);
+    // }
 }

+ 0 - 1
src/driver/serial.rs

@@ -10,7 +10,6 @@ use alloc::{collections::vec_deque::VecDeque, format, sync::Arc};
 use bitflags::bitflags;
 use core::pin::pin;
 use eonix_runtime::{run::FutureRun, scheduler::Scheduler};
-use eonix_spin_irq::SpinIrq as _;
 use eonix_sync::WaitList;
 
 bitflags! {

+ 11 - 11
src/elf.rs

@@ -1,15 +1,15 @@
-use alloc::{ffi::CString, sync::Arc};
-use bitflags::bitflags;
-
 use crate::{
     io::{ByteBuffer, UninitBuffer},
     kernel::{
         constants::ENOEXEC,
-        mem::{FileMapping, MMList, Mapping, Permission, VAddr},
+        mem::{FileMapping, MMList, Mapping, Permission},
         vfs::dentry::Dentry,
     },
     prelude::*,
 };
+use alloc::{ffi::CString, sync::Arc};
+use bitflags::bitflags;
+use eonix_mm::address::{Addr as _, AddrOps as _, VAddr};
 
 #[repr(u8)]
 #[allow(dead_code)]
@@ -244,13 +244,13 @@ impl ParsedElf32 {
     pub fn load(self, args: Vec<CString>, envs: Vec<CString>) -> KResult<(VAddr, VAddr, MMList)> {
         let mm_list = MMList::new();
 
-        let mut data_segment_end = VAddr(0);
+        let mut data_segment_end = VAddr::NULL;
         for phent in self
             .phents
             .into_iter()
             .filter(|ent| ent.ph_type == Elf32PhType::Load)
         {
-            let vaddr_start = VAddr(phent.vaddr as usize);
+            let vaddr_start = VAddr::from(phent.vaddr as usize);
             let vmem_vaddr_end = vaddr_start + phent.mem_size as usize;
             let load_vaddr_end = vaddr_start + phent.file_size as usize;
 
@@ -296,8 +296,8 @@ impl ParsedElf32 {
 
         // Map stack area
         mm_list.mmap_fixed(
-            VAddr(0xc0000000 - 0x800000), // Stack bottom is at 0xc0000000
-            0x800000,                     // 8MB stack size
+            VAddr::from(0xc0000000 - 0x800000), // Stack bottom is at 0xc0000000
+            0x800000,                           // 8MB stack size
             Mapping::Anonymous,
             Permission {
                 write: true,
@@ -319,7 +319,7 @@ impl ParsedElf32 {
         longs.push(0); // AT_NULL
 
         sp = sp - longs.len() * size_of::<u32>();
-        sp = VAddr::from(usize::from(sp) & !0xf); // Align to 16 bytes
+        sp = sp.floor_to(16);
 
         mm_list.access_mut(sp, longs.len() * size_of::<u32>(), |offset, data| {
             data.copy_from_slice(unsafe {
@@ -330,7 +330,7 @@ impl ParsedElf32 {
             })
         })?;
 
-        Ok((VAddr(self.entry as usize), sp, mm_list))
+        Ok((VAddr::from(self.entry as usize), sp, mm_list))
     }
 }
 
@@ -342,7 +342,7 @@ fn push_strings(mm_list: &MMList, sp: &mut VAddr, strings: Vec<CString>) -> KRes
         mm_list.access_mut(*sp, len, |offset, data| {
             data.copy_from_slice(&string.as_bytes_with_nul()[offset..offset + data.len()])
         })?;
-        addrs.push(usize::from(*sp) as u32);
+        addrs.push(sp.addr() as u32);
     }
 
     Ok(addrs)
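
The loader now writes `sp = sp.floor_to(16)` instead of masking bits by hand. A small sketch of what the alignment helpers compute, written out on plain `usize` values for clarity (the real methods live on `VAddr`):

```rust
// Round down / up to a multiple of `size`, as `floor_to`/`ceil_to` do.
const fn floor_to(addr: usize, size: usize) -> usize {
    addr / size * size
}

const fn ceil_to(addr: usize, size: usize) -> usize {
    (addr + size - 1) / size * size
}

fn main() {
    // Aligning the stack pointer down to 16 bytes, as `load` does:
    assert_eq!(floor_to(0xbfff_fff7, 16), 0xbfff_fff0);
    // Rounding a segment end up to the next 4 KiB page boundary:
    assert_eq!(ceil_to(0x0804_8123, 0x1000), 0x0804_9000);
}
```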

+ 10 - 4
src/fs/fat32/file.rs

@@ -1,6 +1,8 @@
-use crate::{kernel::mem::Page, KResult};
-
 use super::{ClusterIterator, FatFs};
+use crate::{
+    kernel::mem::{AsMemoryBlock as _, Page},
+    KResult,
+};
 
 pub trait ClusterReadIterator<'data>: Iterator<Item = KResult<&'data [u8]>> + 'data {}
 impl<'a, I> ClusterReadIterator<'a> for I where I: Iterator<Item = KResult<&'a [u8]>> + 'a {}
@@ -22,11 +24,15 @@ impl<'data, 'fat: 'data> ClusterRead<'data> for ClusterIterator<'fat> {
         let skip_clusters = offset / cluster_size;
         let mut inner_offset = offset % cluster_size;
 
-        let buffer_page = Page::alloc_one();
+        // TODO: Use block cache.
+        let buffer_page = Page::alloc();
 
         self.skip(skip_clusters).map(move |cluster| {
             vfs.read_cluster(cluster, &buffer_page)?;
-            let data = &buffer_page.as_slice()[inner_offset..];
+            let data = unsafe {
+                // SAFETY: No one could be writing to it.
+                &buffer_page.as_memblk().as_bytes()[inner_offset..]
+            };
             inner_offset = 0;
             Ok(data)
         })
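
The cluster read path starts mid-cluster and then continues from cluster boundaries: `inner_offset` applies only to the first item the iterator yields and is reset to zero afterwards. A condensed sketch of that pattern, with `read_from` as a hypothetical stand-in for `ClusterRead::read` and plain byte slices in place of clusters:

```rust
// "Offset applies to the first chunk only" iterator pattern.
fn read_from<'a>(
    clusters: impl Iterator<Item = &'a [u8]> + 'a,
    offset: usize,
    cluster_size: usize,
) -> impl Iterator<Item = &'a [u8]> + 'a {
    let skip_clusters = offset / cluster_size;
    let mut inner_offset = offset % cluster_size;

    clusters.skip(skip_clusters).map(move |cluster| {
        let data = &cluster[inner_offset..];
        inner_offset = 0; // only the first cluster is entered mid-way
        data
    })
}

fn main() {
    let clusters: [&[u8]; 3] = [b"aaaa", b"bbbb", b"cccc"];
    // Reading from byte 6 with 4-byte clusters skips one cluster and
    // enters the second one at offset 2.
    let got: Vec<&[u8]> = read_from(clusters.into_iter(), 6, 4).collect();
    assert_eq!(got, [&b"bb"[..], &b"cccc"[..]]);
}
```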

+ 9 - 6
src/fs/procfs.rs

@@ -2,7 +2,7 @@ use crate::{
     io::Buffer,
     kernel::{
         constants::{S_IFDIR, S_IFREG},
-        mem::paging::{Page, PageBuffer},
+        mem::paging::PageBuffer,
         vfs::{
             dentry::Dentry,
             inode::{define_struct_inode, AtomicIno, Ino, Inode, InodeData},
@@ -100,10 +100,13 @@ impl Inode for FileInode {
             return Err(EACCES);
         }
 
-        let mut page_buffer = PageBuffer::new(Page::alloc_one());
-        let nread = self.file.read(&mut page_buffer)?;
+        let mut page_buffer = PageBuffer::new();
+        self.file.read(&mut page_buffer)?;
 
-        let data = split_len_offset(page_buffer.as_slice(), nread, offset);
+        let data = page_buffer
+            .data()
+            .split_at_checked(offset)
+            .map(|(_, tail)| tail);
 
         match data {
             None => Ok(0),
@@ -269,7 +272,7 @@ impl ProcFsFile for DumpMountsFile {
     fn read(&self, buffer: &mut PageBuffer) -> KResult<usize> {
         dump_mounts(&mut buffer.get_writer());
 
-        Ok(buffer.len())
+        Ok(buffer.data().len())
     }
 }
 
@@ -300,7 +303,7 @@ where
     }
 
     fn read(&self, buffer: &mut PageBuffer) -> KResult<usize> {
-        self.read_fn.as_ref().ok_or(EACCES)?(buffer).map(|_| buffer.len())
+        self.read_fn.as_ref().ok_or(EACCES)?(buffer).map(|_| buffer.data().len())
     }
 }
 

+ 40 - 4
src/io.rs

@@ -1,8 +1,6 @@
-use bindings::EFAULT;
-
 use crate::prelude::*;
-
-use core::mem::MaybeUninit;
+use bindings::EFAULT;
+use core::{cmp, mem::MaybeUninit};
 
 #[must_use]
 pub enum FillResult {
@@ -187,3 +185,41 @@ impl Buffer for ByteBuffer<'_> {
         self.cur
     }
 }
+
+/// Iterator that generates chunks of a given length from a start index
+/// until the end of the total length.
+///
+/// The iterator returns a tuple of (start, len) for each chunk.
+pub struct Chunks {
+    start: usize,
+    end: usize,
+    cur: usize,
+    chunk_len: usize,
+}
+
+impl Chunks {
+    pub const fn new(start: usize, total_len: usize, chunk_len: usize) -> Self {
+        Self {
+            start,
+            end: start + total_len,
+            cur: start,
+            chunk_len,
+        }
+    }
+}
+
+impl Iterator for Chunks {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cur >= self.end {
+            return None;
+        }
+
+        let start = self.cur;
+        let len = cmp::min(self.chunk_len, self.end - start);
+
+        self.cur += self.chunk_len;
+        Some((start, len))
+    }
+}
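
A usage sketch for the new `Chunks` iterator (assuming the type defined above is in scope): the final chunk is clipped to whatever remains before `start + total_len`.

```rust
fn main() {
    // Walk a 10 KiB span in 4 KiB steps starting at byte 512.
    let chunks: Vec<(usize, usize)> = Chunks::new(512, 10 * 1024, 4096).collect();
    assert_eq!(chunks, [(512, 4096), (4608, 4096), (8704, 2048)]);
}
```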

+ 10 - 5
src/kernel/block.rs

@@ -1,4 +1,8 @@
-use super::{constants::ENOENT, mem::paging::Page, vfs::DevId};
+use super::{
+    constants::ENOENT,
+    mem::{paging::Page, AsMemoryBlock as _},
+    vfs::DevId,
+};
 use crate::{
     io::{Buffer, FillResult, UninitBuffer},
     prelude::*,
@@ -218,14 +222,14 @@ impl BlockDevice {
                 count if count <= 8 => {
                     nread = count;
 
-                    let _page = Page::alloc_one();
+                    let _page = Page::alloc();
                     page = Some(_page);
                     pages = core::slice::from_ref(page.as_ref().unwrap());
                 }
                 count if count <= 16 => {
                     nread = count;
 
-                    let _pages = Page::alloc_many(1);
+                    let _pages = Page::alloc_order(1);
                     page = Some(_pages);
                     pages = core::slice::from_ref(page.as_ref().unwrap());
                 }
@@ -235,7 +239,7 @@ impl BlockDevice {
                     let npages = (nread + 15) / 16;
                     let mut _page_vec = Vec::with_capacity(npages as usize);
                     for _ in 0..npages {
-                        _page_vec.push(Page::alloc_many(1));
+                        _page_vec.push(Page::alloc_order(1));
                     }
                     page_vec = Some(_page_vec);
                     pages = page_vec.as_ref().unwrap().as_slice();
@@ -251,7 +255,8 @@ impl BlockDevice {
             self.read_raw(req)?;
 
             for page in pages.iter() {
-                let data = &page.as_slice()[first_sector_offset as usize..];
+                // SAFETY: We are the only owner of the page so no one could be mutating it.
+                let data = unsafe { &page.as_memblk().as_bytes()[first_sector_offset as usize..] };
                 first_sector_offset = 0;
 
                 match buffer.fill(data)? {
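                }

The match arms above size the scratch allocation from the sector count: with 512-byte sectors and 4 KiB pages, 8 sectors fit in one page, 16 in an order-1 (two-page) block, and larger reads take a vector of order-1 blocks. A sketch of that arithmetic, assuming those sector/page sizes:

```rust
const SECTOR_SIZE: usize = 512;
const PAGE_SIZE: usize = 4096;

/// How many order-1 (two-page, 16-sector) blocks a read of `sectors`
/// sectors needs once it exceeds the small-read fast paths.
fn order1_blocks(sectors: usize) -> usize {
    (sectors + 15) / 16 // same rounding as `(nread + 15) / 16` above
}

fn main() {
    assert_eq!(PAGE_SIZE / SECTOR_SIZE, 8); // one page covers 8 sectors
    assert_eq!(order1_blocks(17), 2); // 17 sectors: two two-page blocks
    assert_eq!(order1_blocks(32), 2);
    assert_eq!(order1_blocks(33), 3);
}
```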

+ 8 - 5
src/kernel/cpu.rs

@@ -1,6 +1,8 @@
-use super::mem::{paging::Page, phys::PhysPtr as _};
+use super::mem::AsMemoryBlock;
 use arch::{PercpuArea, CPU};
-use core::{alloc::Layout, mem::ManuallyDrop, pin::Pin, ptr::NonNull};
+use buddy_allocator::BuddyAllocator;
+use core::{alloc::Layout, pin::Pin, ptr::NonNull};
+use eonix_mm::paging::Page;
 use eonix_sync::LazyLock;
 
 #[arch::define_percpu]
@@ -16,10 +18,11 @@ pub unsafe fn local_cpu() -> Pin<&'static mut CPU> {
 pub fn percpu_allocate(layout: Layout) -> NonNull<u8> {
     // TODO: Use page size defined in `arch`.
     let page_count = layout.size().div_ceil(arch::PAGE_SIZE);
-    let page = ManuallyDrop::new(Page::early_alloc_ceil(page_count));
-    let pointer = page.as_cached().as_ptr();
+    let page = Page::<BuddyAllocator>::alloc_at_least(page_count);
+    let page_data = page.as_memblk().as_byte_ptr();
+    core::mem::forget(page);
 
-    NonNull::new(pointer).expect("Allocated page pfn should be non-null.")
+    page_data
 }
 
 pub fn init_localcpu() {
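
`percpu_allocate` rounds the layout up to whole pages, takes the pointer, then `mem::forget`s the page so the storage lives for the CPU's lifetime. A sketch of that "allocate, grab the pointer, leak" shape, using the global allocator instead of the kernel's page allocator:

```rust
use core::{alloc::Layout, ptr::NonNull};

// Hypothetical user-space analogue of `percpu_allocate`.
fn leak_allocate(layout: Layout) -> NonNull<u8> {
    // Round the request up to whole 4 KiB "pages", as the kernel does.
    let size = layout.size().div_ceil(4096) * 4096;
    let buf = vec![0u8; size].into_boxed_slice();
    // Leak intentionally: percpu areas are never freed.
    NonNull::new(Box::leak(buf).as_mut_ptr()).expect("allocation is non-null")
}

fn main() {
    let ptr = leak_allocate(Layout::from_size_align(100, 8).unwrap());
    // The backing storage stays valid for the rest of the program.
    unsafe { ptr.as_ptr().write(42) };
}
```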

+ 0 - 1
src/kernel/interrupt.rs

@@ -8,7 +8,6 @@ use crate::{driver::Port8, prelude::*};
 use alloc::sync::Arc;
 use arch::{ExtendedContext, InterruptContext};
 use eonix_runtime::task::Task;
-use eonix_spin_irq::SpinIrq as _;
 
 const PIC1_COMMAND: Port8 = Port8::new(0x20);
 const PIC1_DATA: Port8 = Port8::new(0x21);

+ 2 - 5
src/kernel/mem.rs

@@ -1,15 +1,12 @@
 pub mod paging;
-pub mod phys;
 
+mod access;
 mod address;
 mod mm_area;
 mod mm_list;
 mod page_alloc;
-mod page_table;
 
-#[allow(unused_imports)]
-pub use address::{PAddr, VAddr, VRange, PFN, VPN};
+pub use access::{AsMemoryBlock, MemoryBlock, PhysAccess};
 pub(self) use mm_area::MMArea;
 pub use mm_list::{handle_page_fault, FileMapping, MMList, Mapping, Permission};
-pub(self) use page_table::{PageTable, PTE};
 pub use paging::{Page, PageBuffer};

+ 158 - 0
src/kernel/mem/access.rs

@@ -0,0 +1,158 @@
+use core::{num::NonZero, ptr::NonNull};
+use eonix_mm::address::{Addr as _, PAddr};
+use eonix_mm::paging::{PageAccess, PageBlock, PFN};
+
+const PHYS_OFFSET: usize = 0xffff_ff00_0000_0000;
+
+/// A block of memory starting at a non-zero address and having a specific length.
+///
+/// This struct is used to represent a memory block that can be accessed
+/// in the kernel space.
+pub struct MemoryBlock {
+    addr: NonZero<usize>,
+    len: usize,
+}
+
+pub struct KernelPageAccess;
+
+pub trait AsMemoryBlock {
+    /// Translate the physical page the page object pointing to into kernel
+    /// accessible pointer. Use it with care.
+    fn as_memblk(&self) -> MemoryBlock;
+}
+
+pub trait PhysAccess {
+    /// Translate the data that this address points to into a
+    /// kernel-accessible pointer. Use it with care.
+    ///
+    /// # Panic
+    /// If the address is not properly aligned.
+    ///
+    /// # Safety
+    /// The caller must ensure that the data is of type `T`.
+    /// Otherwise, it may lead to undefined behavior.
+    unsafe fn as_ptr<T>(&self) -> NonNull<T>;
+}
+
+impl MemoryBlock {
+    /// Create a new `MemoryBlock` with the given address and length.
+    ///
+    /// # Safety
+    /// The caller must ensure that the address is valid.
+    /// Otherwise, it may lead to undefined behavior.
+    pub unsafe fn new(addr: NonZero<usize>, len: usize) -> Self {
+        Self { addr, len }
+    }
+
+    /// Get the start address of the memory block.
+    #[allow(dead_code)]
+    pub fn addr(&self) -> NonZero<usize> {
+        self.addr
+    }
+
+    /// Get the length of the memory block.
+    #[allow(dead_code)]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Split the memory block into two parts at the given offset.
+    pub fn split_at(&self, at: usize) -> (Self, Self) {
+        if at > self.len {
+            panic!("Out of bounds");
+        }
+
+        let rhs_start = self.addr.checked_add(at).expect("Overflow");
+
+        let lhs = unsafe { Self::new(self.addr, at) };
+        let rhs = unsafe { Self::new(rhs_start, self.len - at) };
+
+        (lhs, rhs)
+    }
+
+    /// Provide a pointer to the data.
+    ///
+    /// # Safety
+    /// Using the returned pointer is undefined behavior if the address is not
+    /// properly aligned or the size is not equal to the size of `T`.
+    pub unsafe fn as_ptr_unchecked<T>(&self) -> NonNull<T> {
+        // SAFETY: `self.addr` is a non-zero value.
+        NonNull::new_unchecked(self.addr.get() as *mut T)
+    }
+
+    /// Provide a pointer to the data.
+    ///
+    /// # Panic
+    /// Panic if the address is not properly aligned.
+    pub fn as_ptr<T>(&self) -> NonNull<T> {
+        let alignment = align_of::<T>();
+
+        if self.addr.get() % alignment != 0 {
+            panic!("Alignment error");
+        }
+
+        unsafe {
+            // SAFETY: We've checked that `self.addr` is properly aligned.
+            self.as_ptr_unchecked()
+        }
+    }
+
+    /// Provide a pointer to the bytes.
+    pub fn as_byte_ptr(&self) -> NonNull<u8> {
+        unsafe {
+            // SAFETY: No alignment check is needed for bytes.
+            self.as_ptr_unchecked()
+        }
+    }
+
+    /// Provide immutable access to the data it points to.
+    ///
+    /// # Safety
+    /// This function is unsafe because it returns an immutable reference with
+    /// a created lifetime.
+    ///
+    /// The caller must ensure that the data has no other mutable aliases while
+    /// the reference is in use. Otherwise, it may lead to undefined behavior.
+    pub unsafe fn as_bytes<'a>(&self) -> &'a [u8] {
+        core::slice::from_raw_parts(self.as_ptr_unchecked().as_ptr(), self.len)
+    }
+
+    /// Provide mutable access to the data it points to.
+    ///
+    /// # Safety
+    /// This function is unsafe because it returns a mutable reference with a
+    /// created lifetime.
+    ///
+    /// The caller must ensure that the data has no other immutable or mutable
+    /// aliases while the reference is in use.
+    /// Otherwise, it may lead to undefined behavior.
+    pub unsafe fn as_bytes_mut<'a>(&mut self) -> &'a mut [u8] {
+        core::slice::from_raw_parts_mut(self.as_ptr_unchecked().as_ptr(), self.len)
+    }
+}
+
+impl PhysAccess for PAddr {
+    unsafe fn as_ptr<T>(&self) -> NonNull<T> {
+        let alignment: usize = align_of::<T>();
+        assert!(self.addr() % alignment == 0, "Alignment error");
+
+        unsafe {
+            // SAFETY: We can assume that we'll never have `self.addr()` equals
+            //         to `-PHYS_OFFSET`. Otherwise, the kernel might be broken.
+            NonNull::new_unchecked((PHYS_OFFSET + self.addr()) as *mut T)
+        }
+    }
+}
+
+impl PageAccess for KernelPageAccess {
+    unsafe fn get_ptr_for_pfn(pfn: PFN) -> NonNull<PageBlock> {
+        unsafe {
+            // SAFETY: The physical address of a page must be aligned to the page size.
+            PAddr::from(pfn).as_ptr()
+        }
+    }
+}
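
The new `PhysAccess` impl relies on the whole of physical memory being mapped at a fixed kernel offset, so translation is one addition plus an alignment check. A sketch of that direct-map arithmetic with fake addresses (no dereferencing, so it runs anywhere):

```rust
use std::mem::align_of;

const PHYS_OFFSET: usize = 0xffff_ff00_0000_0000;

// Hypothetical analogue of `PhysAccess::as_ptr` for `PAddr`.
fn phys_to_virt<T>(paddr: usize) -> *mut T {
    assert_eq!(paddr % align_of::<T>(), 0, "Alignment error");
    (PHYS_OFFSET + paddr) as *mut T
}

fn main() {
    let p = phys_to_virt::<u32>(0x1000);
    assert_eq!(p as usize, 0xffff_ff00_0000_1000);
}
```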

+ 17 - 398
src/kernel/mem/address.rs

@@ -1,411 +1,30 @@
-use arch::PAGE_SIZE;
-use core::{
-    cmp::Ordering,
-    fmt::{self, Debug, Formatter},
-    ops::{Add, RangeBounds, Sub},
-};
+use eonix_mm::address::{VAddr, VRange};
 
-#[repr(transparent)]
-#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct PAddr(pub usize);
+const USER_SPACE_MEMORY_TOP: VAddr = VAddr::from(0x8000_0000_0000);
+const KERNEL_SPACE_MEMORY_BOTTOM: VAddr = VAddr::from(0xffff_8000_0000_0000);
 
-#[repr(transparent)]
-#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct VAddr(pub usize);
-
-#[repr(transparent)]
-#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct PFN(pub usize);
-
-#[repr(transparent)]
-#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct VPN(pub usize);
-
-const PAGE_SIZE_BITS: usize = 12;
-const USER_SPACE_MEMORY_TOP: VAddr = VAddr(0x8000_0000_0000);
-
-impl From<PAddr> for usize {
-    fn from(v: PAddr) -> Self {
-        v.0
-    }
-}
-
-impl From<PFN> for usize {
-    fn from(v: PFN) -> Self {
-        v.0
-    }
-}
-
-impl From<VAddr> for usize {
-    fn from(v: VAddr) -> Self {
-        v.0
-    }
-}
-
-impl From<VPN> for usize {
-    fn from(v: VPN) -> Self {
-        v.0
-    }
-}
-
-impl From<usize> for PAddr {
-    fn from(v: usize) -> Self {
-        Self(v)
-    }
-}
-
-impl From<usize> for PFN {
-    fn from(v: usize) -> Self {
-        Self(v)
-    }
-}
-
-impl From<usize> for VAddr {
-    fn from(v: usize) -> Self {
-        Self(v)
-    }
-}
-
-impl From<usize> for VPN {
-    fn from(v: usize) -> Self {
-        Self(v)
-    }
-}
-
-impl From<VPN> for VAddr {
-    fn from(v: VPN) -> Self {
-        Self(v.0 << PAGE_SIZE_BITS)
-    }
-}
-
-impl From<VAddr> for VPN {
-    fn from(v: VAddr) -> Self {
-        assert_eq!(v.page_offset(), 0);
-        v.floor_vpn()
-    }
+pub trait VAddrExt {
+    fn is_user(&self) -> bool;
 }
 
-impl From<PAddr> for PFN {
-    fn from(v: PAddr) -> Self {
-        assert_eq!(v.page_offset(), 0);
-        v.floor_pfn()
-    }
-}
-
-impl From<PFN> for PAddr {
-    fn from(v: PFN) -> Self {
-        Self(v.0 << PAGE_SIZE_BITS)
-    }
-}
-
-impl PAddr {
-    pub fn floor_pfn(&self) -> PFN {
-        PFN(self.0 / PAGE_SIZE)
-    }
-
-    pub fn ceil_pfn(&self) -> PFN {
-        PFN((self.0 + PAGE_SIZE - 1) / PAGE_SIZE)
-    }
-
-    pub fn page_offset(&self) -> usize {
-        self.0 & (PAGE_SIZE - 1)
-    }
-
-    pub fn is_aligned(&self) -> bool {
-        self.page_offset() == 0
-    }
-}
-
-impl PFN {
-    pub fn buddy_pfn(&self, order: u32) -> PFN {
-        PFN::from(self.0 ^ (1 << order))
-    }
-
-    pub fn combined_pfn(&self, buddy_pfn: PFN) -> PFN {
-        PFN::from(self.0 & buddy_pfn.0)
-    }
-}
-
-impl VAddr {
-    pub const NULL: Self = Self(0);
-
-    pub const fn floor_vpn(&self) -> VPN {
-        VPN(self.0 / PAGE_SIZE)
-    }
-
-    pub const fn ceil_vpn(&self) -> VPN {
-        VPN((self.0 - 1 + PAGE_SIZE) / PAGE_SIZE)
-    }
-
-    pub const fn page_offset(self) -> usize {
-        let Self(addr) = self;
-        addr & (PAGE_SIZE - 1)
-    }
-
-    pub const fn is_aligned(&self) -> bool {
-        self.page_offset() == 0
-    }
-
-    pub const fn is_user(self) -> bool {
-        const USER_SPACE_MEMORY_TOP_ADDR: usize = const { USER_SPACE_MEMORY_TOP.0 };
-
-        match self {
-            Self(0) => false,
-            Self(..USER_SPACE_MEMORY_TOP_ADDR) => true,
-            _ => false,
-        }
-    }
-
-    pub const fn floor(self) -> Self {
-        self.floor_to(PAGE_SIZE)
-    }
-
-    pub const fn ceil(self) -> Self {
-        self.ceil_to(PAGE_SIZE)
-    }
-
-    /// Aligns the address to the nearest lower multiple of `size`.
-    pub const fn floor_to(self, size: usize) -> Self {
-        let Self(addr) = self;
-        Self(addr / size * size)
-    }
-
-    /// Aligns the address to the nearest lower multiple of `size`.
-    pub const fn ceil_to(self, size: usize) -> Self {
-        let Self(addr) = self;
-        Self(addr.div_ceil(size) * size)
-    }
-}
-
-impl Sub for VAddr {
-    type Output = usize;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.0 - rhs.0
-    }
-}
-
-impl Sub<usize> for VAddr {
-    type Output = Self;
-
-    fn sub(self, rhs: usize) -> Self::Output {
-        VAddr(self.0 - rhs)
-    }
-}
-
-impl Add<usize> for VAddr {
-    type Output = Self;
-
-    fn add(self, rhs: usize) -> Self::Output {
-        VAddr(self.0 + rhs)
-    }
-}
-
-impl Sub for PAddr {
-    type Output = usize;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.0 - rhs.0
-    }
-}
-
-impl Sub<usize> for PAddr {
-    type Output = Self;
-
-    fn sub(self, rhs: usize) -> Self::Output {
-        PAddr(self.0 - rhs)
-    }
-}
-
-impl Add<usize> for PAddr {
-    type Output = Self;
-
-    fn add(self, rhs: usize) -> Self::Output {
-        PAddr(self.0 + rhs)
-    }
-}
-
-impl Debug for VAddr {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        write!(f, "VAddr{:#x}", self.0)
-    }
-}
-
-impl Debug for PAddr {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        write!(f, "PAddr{:#x}", self.0)
-    }
-}
-
-impl Add<usize> for PFN {
-    type Output = Self;
-
-    fn add(self, rhs: usize) -> Self::Output {
-        PFN(self.0 + rhs)
-    }
-}
-
-impl Sub for PFN {
-    type Output = usize;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.0 - rhs.0
-    }
-}
-
-impl Sub<usize> for PFN {
-    type Output = Self;
-
-    fn sub(self, rhs: usize) -> Self::Output {
-        PFN(self.0 - rhs)
-    }
-}
-
-impl Add<usize> for VPN {
-    type Output = Self;
-
-    fn add(self, rhs: usize) -> Self::Output {
-        VPN(self.0 + rhs)
-    }
-}
-
-impl Sub for VPN {
-    type Output = usize;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.0 - rhs.0
-    }
-}
-
-impl Sub<usize> for VPN {
-    type Output = Self;
-
-    fn sub(self, rhs: usize) -> Self::Output {
-        VPN(self.0 - rhs)
-    }
-}
-
-impl Debug for VPN {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        write!(f, "VPN{:#x}", self.0)
-    }
-}
-
-impl Debug for PFN {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        write!(f, "PFN{:#x}", self.0)
-    }
-}
-
-#[derive(Clone, Copy)]
-pub struct VRange {
-    start: VAddr,
-    end: VAddr,
-}
-
-impl Debug for VRange {
-    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        write!(f, "[{:?}, {:?})", self.start, self.end)
-    }
-}
-
-impl Eq for VRange {}
-impl PartialOrd for VRange {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl PartialEq for VRange {
-    fn eq(&self, other: &Self) -> bool {
-        self.cmp(other) == Ordering::Equal
-    }
-}
-
-/// Any two ranges that have one of them containing the other are considered equal.
-impl Ord for VRange {
-    fn cmp(&self, other: &Self) -> Ordering {
-        if self.start == other.start {
-            return Ordering::Equal;
-        }
-
-        if self.end == other.end {
-            if self.start == self.end {
-                return Ordering::Greater;
-            }
-            if other.start == other.end {
-                return Ordering::Less;
-            }
-            return Ordering::Equal;
-        }
-
-        if self.start < other.start {
-            if other.end < self.end {
-                return Ordering::Equal;
-            } else {
-                return Ordering::Less;
-            }
-        }
-
-        if other.start < self.start {
-            if self.end < other.end {
-                return Ordering::Equal;
-            } else {
-                return Ordering::Greater;
-            }
-        }
-
-        unreachable!()
-    }
+pub trait VRangeExt {
+    #[allow(dead_code)]
+    fn is_kernel(&self) -> bool;
+    fn is_user(&self) -> bool;
 }
 
-impl From<VAddr> for VRange {
-    fn from(addr: VAddr) -> Self {
-        VRange::new(addr, addr)
+impl VAddrExt for VAddr {
+    fn is_user(&self) -> bool {
+        (..USER_SPACE_MEMORY_TOP).contains(&self)
     }
 }
 
-impl VRange {
-    pub fn new(start: VAddr, end: VAddr) -> Self {
-        assert!(start <= end);
-        VRange { start, end }
-    }
-
-    #[allow(dead_code)]
-    pub fn is_overlapped(&self, other: &Self) -> bool {
-        self == other
-    }
-
-    pub fn is_user(&self) -> bool {
-        self.start < USER_SPACE_MEMORY_TOP && self.end <= USER_SPACE_MEMORY_TOP
-    }
-
-    pub fn start(&self) -> VAddr {
-        self.start
-    }
-
-    pub fn end(&self) -> VAddr {
-        self.end
-    }
-
-    pub fn len(&self) -> usize {
-        self.end.0 - self.start.0
-    }
-
-    pub fn shrink(&self, count: usize) -> Self {
-        assert!(count <= self.len());
-        VRange::new(self.start, self.end - count)
-    }
-
-    pub fn grow(&self, count: usize) -> Self {
-        VRange::new(self.start, self.end + count)
+impl VRangeExt for VRange {
+    fn is_user(&self) -> bool {
+        self.start() < USER_SPACE_MEMORY_TOP && self.end() <= USER_SPACE_MEMORY_TOP
     }
 
-    pub fn into_range(self) -> impl RangeBounds<Self> {
-        if self.len() == 0 {
-            VRange::from(self.start())..=VRange::from(self.start())
-        } else {
-            VRange::from(self.start())..=VRange::from(self.end() - 1)
-        }
+    fn is_kernel(&self) -> bool {
+        self.start() >= KERNEL_SPACE_MEMORY_BOTTOM
     }
 }
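
The removed `Ord for VRange` encoded the trick that `mm_list.rs` still depends on (now via `eonix_mm`'s `VRange`): ranges where one contains the other compare `Equal`, so `BTreeSet::get(&VRange::from(addr))` finds the area containing an address. A simplified sketch under illustrative names (`Span`, and overlap-counts-as-equal rather than the original's zero-length special cases):

```rust
use std::{cmp::Ordering, collections::BTreeSet};

#[derive(Clone, Copy)]
struct Span {
    start: usize,
    end: usize,
}

impl Ord for Span {
    fn cmp(&self, other: &Self) -> Ordering {
        // Disjoint spans order by their bounds; any overlap is `Equal`,
        // which makes point lookups find the containing span.
        if self.start >= other.end {
            Ordering::Greater
        } else if self.end <= other.start {
            Ordering::Less
        } else {
            Ordering::Equal
        }
    }
}

impl PartialOrd for Span {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for Span {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for Span {}

fn main() {
    let mut areas = BTreeSet::new();
    areas.insert(Span { start: 0x1000, end: 0x3000 });
    areas.insert(Span { start: 0x8000, end: 0x9000 });

    // Probe with a tiny range at an address, like `overlapping_addr`
    // does with `VRange::from(addr)`.
    let probe = Span { start: 0x2abc, end: 0x2abd };
    let hit = areas.get(&probe).unwrap();
    assert_eq!((hit.start, hit.end), (0x1000, 0x3000));
}
```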

+ 60 - 56
src/kernel/mem/mm_area.rs

@@ -1,12 +1,10 @@
-use crate::prelude::*;
-
-use bindings::PA_MMAP;
-
+use super::paging::AllocZeroed as _;
+use super::{AsMemoryBlock, Mapping, Page, Permission};
+use crate::io::ByteBuffer;
+use crate::KResult;
 use core::{borrow::Borrow, cell::UnsafeCell, cmp::Ordering};
-
-use crate::bindings::root::{PA_A, PA_ANON, PA_COW, PA_P, PA_RW};
-
-use super::{Mapping, Page, PageBuffer, Permission, VAddr, VRange, PTE};
+use eonix_mm::address::{AddrOps as _, VAddr, VRange};
+use eonix_mm::page_table::{PageAttribute, PTE};
 
 #[derive(Debug)]
 pub struct MMArea {
@@ -44,11 +42,6 @@ impl MMArea {
         *self.range_borrow()
     }
 
-    #[allow(dead_code)]
-    pub fn len(&self) -> usize {
-        self.range_borrow().len()
-    }
-
     /// # Safety
     /// This function should be called only when we can guarantee that the range
     /// won't overlap with any other range in some scope.
@@ -58,7 +51,7 @@ impl MMArea {
     }
 
     pub fn split(mut self, at: VAddr) -> (Option<Self>, Option<Self>) {
-        assert_eq!(at.floor(), at);
+        assert!(at.is_page_aligned());
 
         match self.range_borrow().cmp(&VRange::from(at)) {
             Ordering::Less => (Some(self), None),
@@ -86,83 +79,94 @@ impl MMArea {
 
     /// # Return
     /// Whether the whole handling process is done.
-    pub fn handle_cow(&self, pte: &mut PTE) -> bool {
-        let mut attributes = pte.attributes();
-        let mut pfn = pte.pfn();
+    pub fn handle_cow(&self, pte: &mut impl PTE) -> bool {
+        let mut page_attr = pte.get_attr();
+        let pfn = pte.get_pfn();
 
-        attributes &= !PA_COW as usize;
-        if self.permission.write {
-            attributes |= PA_RW as usize;
-        } else {
-            attributes &= !PA_RW as usize;
-        }
+        page_attr = page_attr.copy_on_write(false);
+        page_attr = page_attr.write(self.permission.write);
 
-        let page = unsafe { Page::take_pfn(pfn, 0) };
-        if unsafe { page.load_refcount() } == 1 {
+        let page = unsafe { Page::from_raw(pfn) };
+        if page.is_exclusive() {
             // SAFETY: This is actually safe. If the page reads as exclusive here and we
             // have the `MMList` lock held, there can be neither other processes sharing
             // the page nor other threads making the page CoW at the same time.
-            pte.set_attributes(attributes);
+            pte.set_attr(page_attr);
             core::mem::forget(page);
             return true;
         }
 
-        let new_page = Page::alloc_one();
-        if attributes & PA_ANON as usize != 0 {
-            new_page.zero();
+        let new_page;
+        if page_attr.is_anonymous() {
+            new_page = Page::zeroed();
         } else {
-            new_page.as_mut_slice().copy_from_slice(page.as_slice());
+            new_page = Page::alloc();
+
+            unsafe {
+                // SAFETY: `page` is CoW, which means that others won't write to it.
+                let old_page_data = page.as_memblk().as_bytes();
+
+                // SAFETY: `new_page` is exclusive owned by us.
+                let new_page_data = new_page.as_memblk().as_bytes_mut();
+
+                new_page_data.copy_from_slice(old_page_data);
+            };
         }
 
-        attributes &= !(PA_A | PA_ANON) as usize;
+        page_attr = page_attr.accessed(false);
+        page_attr = page_attr.anonymous(false);
 
-        pfn = new_page.into_pfn();
-        pte.set(pfn, attributes);
+        pte.set(new_page.into_raw(), page_attr);
 
         false
     }
 
     /// # Arguments
     /// * `offset`: The offset from the start of the mapping, aligned to 4KB boundary.
-    pub fn handle_mmap(&self, pte: &mut PTE, offset: usize) -> KResult<()> {
+    pub fn handle_mmap(&self, pte: &mut impl PTE, offset: usize) -> KResult<()> {
         // TODO: Implement shared mapping
-        let mut attributes = pte.attributes();
-        let pfn = pte.pfn();
-
-        attributes |= PA_P as usize;
+        let mut page_attr = pte.get_attr();
+        let pfn = pte.get_pfn();
 
         match &self.mapping {
             Mapping::File(mapping) if offset < mapping.length => {
-                // SAFETY: Since we are here, the `pfn` must refer to a valid buddy page.
-                let page = unsafe { Page::from_pfn(pfn, 0) };
-                let nread = mapping
-                    .file
-                    .read(&mut PageBuffer::new(page.clone()), mapping.offset + offset)?;
-
-                if nread < page.len() {
-                    page.as_mut_slice()[nread..].fill(0);
-                }
+                let page = unsafe {
+                    // SAFETY: Since we are here, the `pfn` must refer to a valid buddy page.
+                    Page::with_raw(pfn, |page| page.clone())
+                };
 
-                if mapping.length - offset < 0x1000 {
-                    let length_to_end = mapping.length - offset;
-                    page.as_mut_slice()[length_to_end..].fill(0);
-                }
+                let page_data = unsafe {
+                    // SAFETY: `page` is marked as mapped, so others trying to read or write to
+                    //         it will be blocked and enter the page fault handler, where they will
+                    //         be blocked by the mutex held by us.
+                    page.as_memblk().as_bytes_mut()
+                };
+
+                let cnt_to_read = (mapping.length - offset).min(0x1000);
+                let cnt_read = mapping.file.read(
+                    &mut ByteBuffer::new(&mut page_data[..cnt_to_read]),
+                    mapping.offset + offset,
+                )?;
+
+                page_data[cnt_read..].fill(0);
             }
             Mapping::File(_) => panic!("Offset out of range"),
             _ => panic!("Anonymous mapping should not be PA_MMAP"),
         }
 
-        attributes &= !PA_MMAP as usize;
-        pte.set_attributes(attributes);
+        page_attr = page_attr.present(true).mapped(false);
+        pte.set_attr(page_attr);
         Ok(())
     }
 
-    pub fn handle(&self, pte: &mut PTE, offset: usize) -> KResult<()> {
-        if pte.is_cow() {
+    pub fn handle(&self, pte: &mut impl PTE, offset: usize) -> KResult<()> {
+        let page_attr = pte.get_attr();
+
+        if page_attr.is_copy_on_write() {
             self.handle_cow(pte);
         }
 
-        if pte.is_mmap() {
+        if page_attr.is_mapped() {
             self.handle_mmap(pte, offset)?;
         }
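
The heart of `handle_cow` is a two-way decision: an exclusively-owned page can simply be made writable again; a shared page must be copied first. A condensed sketch of that decision, with `Arc<[u8; 4096]>` standing in for a refcounted physical page and `resolve_cow` as an illustrative name:

```rust
use std::sync::Arc;

fn resolve_cow(page: &mut Arc<[u8; 4096]>, writable: &mut bool) {
    if Arc::get_mut(page).is_some() {
        // Like `page.is_exclusive()` above: we are the only owner, so
        // just flip the write permission and keep the page.
        *writable = true;
    } else {
        // Shared: copy the contents into a fresh page and map the copy
        // writable. The old page's refcount drops by one.
        *page = Arc::new(**page);
        *writable = true;
    }
}

fn main() {
    let mut page = Arc::new([0u8; 4096]);
    let other_owner = page.clone(); // simulate a second process
    let mut writable = false;

    resolve_cow(&mut page, &mut writable);
    assert!(writable);
    assert!(!Arc::ptr_eq(&page, &other_owner)); // we got a private copy
}
```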
 

+ 318 - 145
src/kernel/mem/mm_list.rs

@@ -1,27 +1,36 @@
+mod mapping;
 mod page_fault;
 
-use super::{MMArea, Page, PageTable, VAddr, VRange};
-use crate::kernel::vfs::dentry::Dentry;
+use super::access::KernelPageAccess;
+use super::address::{VAddrExt as _, VRangeExt as _};
+use super::page_alloc::GlobalPageAlloc;
+use super::paging::{AllocZeroed as _, PageUnmanaged};
+use super::{AsMemoryBlock, MMArea, Page};
 use crate::{prelude::*, sync::ArcSwap};
-use alloc::{collections::btree_set::BTreeSet, sync::Arc};
-use bindings::{EEXIST, EFAULT, EINVAL, ENOMEM, KERNEL_PML4};
-use core::{
-    ops::Sub as _,
-    sync::atomic::{AtomicUsize, Ordering},
+use alloc::collections::btree_set::BTreeSet;
+use arch::DefaultPagingMode;
+use bindings::{EEXIST, EFAULT, EINVAL, ENOMEM};
+use core::fmt;
+use core::sync::atomic::{AtomicUsize, Ordering};
+use eonix_mm::address::{Addr as _, PAddr};
+use eonix_mm::page_table::PagingMode;
+use eonix_mm::paging::PFN;
+use eonix_mm::{
+    address::{AddrOps as _, VAddr, VRange},
+    page_table::{PageAttribute, PageTable, PTE},
+    paging::PAGE_SIZE,
 };
 use eonix_runtime::task::Task;
-use eonix_sync::Mutex;
+use eonix_sync::{LazyLock, Mutex};
 
+pub use mapping::{FileMapping, Mapping};
 pub use page_fault::handle_page_fault;
 
-#[derive(Debug, Clone)]
-pub struct FileMapping {
-    pub file: Arc<Dentry>,
-    /// Offset in the file, aligned to 4KB boundary.
-    pub offset: usize,
-    /// Length of the mapping. Exceeding part will be zeroed.
-    pub length: usize,
-}
+static EMPTY_PAGE: LazyLock<Page> = LazyLock::new(|| Page::zeroed());
+static KERNEL_ROOT_TABLE_PAGE: LazyLock<PageUnmanaged> = LazyLock::new(|| unsafe {
+    // SAFETY: The kernel page table is always valid.
+    PageUnmanaged::from_raw_unchecked(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN)
+});
 
 #[derive(Debug, Clone, Copy)]
 pub struct Permission {
@@ -29,51 +38,21 @@ pub struct Permission {
     pub execute: bool,
 }
 
-#[derive(Debug, Clone)]
-pub enum Mapping {
-    Anonymous,
-    File(FileMapping),
-}
-
-#[derive(Debug)]
-struct MMListInner {
+struct MMListInner<'a> {
     areas: BTreeSet<MMArea>,
-    page_table: PageTable,
+    page_table: PageTable<'a, DefaultPagingMode, GlobalPageAlloc, KernelPageAccess>,
     break_start: Option<VRange>,
     break_pos: Option<VAddr>,
 }
 
-#[derive(Debug)]
 pub struct MMList {
-    inner: ArcSwap<Mutex<MMListInner>>,
+    inner: ArcSwap<Mutex<MMListInner<'static>>>,
+    user_count: AtomicUsize,
     /// Only used in kernel space to switch page tables on context switch.
     root_page_table: AtomicUsize,
 }
 
-impl FileMapping {
-    pub fn new(file: Arc<Dentry>, offset: usize, length: usize) -> Self {
-        assert_eq!(offset & 0xfff, 0);
-        Self {
-            file,
-            offset,
-            length,
-        }
-    }
-
-    pub fn offset(&self, offset: usize) -> Self {
-        if self.length <= offset {
-            Self::new(self.file.clone(), self.offset + self.length, 0)
-        } else {
-            Self::new(
-                self.file.clone(),
-                self.offset + offset,
-                self.length - offset,
-            )
-        }
-    }
-}
-
-impl MMListInner {
+impl MMListInner<'_> {
     fn overlapping_addr(&self, addr: VAddr) -> Option<&MMArea> {
         self.areas.get(&VRange::from(addr))
     }
@@ -83,20 +62,27 @@ impl MMListInner {
     }
 
     fn overlapping_range(&self, range: VRange) -> impl DoubleEndedIterator<Item = &MMArea> + '_ {
-        self.areas.range(range.into_range())
+        self.areas.range(range.into_bounds())
     }
 
     fn check_overlapping_range(&self, range: VRange) -> bool {
         range.is_user() && self.overlapping_range(range).next().is_none()
     }
 
-    fn find_available(&self, hint: VAddr, len: usize) -> Option<VAddr> {
-        let mut range = if hint == VAddr::NULL {
-            VRange::new(VAddr(0x1234000), VAddr(0x1234000 + len).ceil())
+    fn random_start(&self) -> VAddr {
+        VAddr::from(0x1234000)
+    }
+
+    fn find_available(&self, mut hint: VAddr, len: usize) -> Option<VAddr> {
+        let len = len.div_ceil(PAGE_SIZE) * PAGE_SIZE;
+
+        if hint == VAddr::NULL {
+            hint = self.random_start();
         } else {
-            VRange::new(hint.floor(), (hint + len).ceil())
-        };
-        let len = range.len();
+            hint = hint.floor();
+        }
+
+        let mut range = VRange::from(hint).grow(len);
 
         loop {
             if !range.is_user() {
@@ -106,63 +92,85 @@ impl MMListInner {
             match self.overlapping_range(range).next_back() {
                 None => return Some(range.start()),
                 Some(area) => {
-                    range = VRange::new(area.range().end().ceil(), area.range().end().ceil() + len);
+                    range = VRange::from(area.range().end().ceil()).grow(len);
                 }
             }
         }
     }
 
-    fn unmap(&mut self, start: VAddr, len: usize) -> KResult<()> {
+    fn unmap(&mut self, start: VAddr, len: usize) -> KResult<Vec<Page>> {
         assert_eq!(start.floor(), start);
         let end = (start + len).ceil();
-        let range = VRange::new(start, end);
-        if !range.is_user() {
+        let range_to_unmap = VRange::new(start, end);
+        if !range_to_unmap.is_user() {
             return Err(EINVAL);
         }
 
-        let check_range = VRange::from(range.start())..VRange::from(range.end());
-        let mut front_remaining = None;
-        let mut back_remaining = None;
+        let mut left_remaining = None;
+        let mut right_remaining = None;
+
+        let mut pages_to_free = Vec::new();
+
+        // TODO: Write back dirty pages.
 
         self.areas.retain(|area| {
-            if !check_range.contains(&area.range()) {
+            let Some((left, mid, right)) = area.range().mask_with_checked(&range_to_unmap) else {
                 return true;
+            };
+
+            for pte in self.page_table.iter_user(mid) {
+                let (pfn, _) = pte.take();
+                pages_to_free.push(unsafe {
+                    // SAFETY: We got the pfn from a valid page table entry, so it should be valid.
+                    Page::from_raw(pfn)
+                });
             }
-            if area.range() == range.start().into() {
-                let (left, right) = area.clone().split(range.start());
-                self.page_table.unmap(&right.unwrap());
-
-                if let Some(left) = left {
-                    assert!(
-                        front_remaining.replace(left).is_none(),
-                        "There should be only one `front`."
-                    );
+
+            match (left, right) {
+                (None, None) => {}
+                (Some(left), None) => {
+                    assert!(left_remaining.is_none());
+                    let (Some(left), _) = area.clone().split(left.end()) else {
+                        unreachable!("`left.end()` is within the area");
+                    };
+
+                    left_remaining = Some(left);
+                }
+                (None, Some(right)) => {
+                    assert!(right_remaining.is_none());
+                    let (_, Some(right)) = area.clone().split(right.start()) else {
+                        unreachable!("`right.start()` is within the area");
+                    };
+
+                    right_remaining = Some(right);
+                }
+                (Some(left), Some(right)) => {
+                    assert!(left_remaining.is_none());
+                    assert!(right_remaining.is_none());
+                    let (Some(left), Some(mid)) = area.clone().split(left.end()) else {
+                        unreachable!("`left.end()` is within the area");
+                    };
+
+                    let (_, Some(right)) = mid.split(right.start()) else {
+                        unreachable!("`right.start()` is within the area");
+                    };
+
+                    left_remaining = Some(left);
+                    right_remaining = Some(right);
                 }
-            } else if area.range() == range.end().into() {
-                let (left, right) = area.clone().split(range.end());
-                self.page_table.unmap(&left.unwrap());
-
-                assert!(
-                    back_remaining
-                        .replace(right.expect("`right` should be valid"))
-                        .is_none(),
-                    "There should be only one `back`."
-                );
-            } else {
-                self.page_table.unmap(area);
             }
 
             false
         });
 
-        if let Some(front) = front_remaining {
+        if let Some(front) = left_remaining {
             self.areas.insert(front);
         }
-        if let Some(back) = back_remaining {
+        if let Some(back) = right_remaining {
             self.areas.insert(back);
         }
 
-        Ok(())
+        Ok(pages_to_free)
     }
 
     fn mmap(
@@ -192,10 +200,36 @@ impl MMListInner {
 }
 
 impl MMList {
+    async fn flush_user_tlbs(&self) {
+        match self.user_count.load(Ordering::Relaxed) {
+            0 => {
+                // If there are currently no users, we don't need to do anything.
+            }
+            1 => {
+                if PAddr::from(arch::get_root_page_table_pfn()).addr()
+                    == self.root_page_table.load(Ordering::Relaxed)
+                {
+                    // If there is only one user and we are using the page table,
+                    // flushing the TLB on the local CPU alone is enough.
+                    arch::flush_tlb_all();
+                } else {
+                    // Send a TLB flush request to the core that is using the page table.
+                    todo!();
+                }
+            }
+            _ => {
+                // If there is more than one user, we broadcast the TLB flush
+                // to all cores.
+                todo!()
+            }
+        }
+    }
+
     pub fn new() -> Self {
-        let page_table = PageTable::new();
+        let page_table = PageTable::new(&KERNEL_ROOT_TABLE_PAGE);
         Self {
-            root_page_table: AtomicUsize::from(page_table.root_page_table()),
+            root_page_table: AtomicUsize::from(page_table.addr().addr()),
+            user_count: AtomicUsize::new(0),
             inner: ArcSwap::new(Mutex::new(MMListInner {
                 areas: BTreeSet::new(),
                 page_table,
@@ -205,13 +239,14 @@ impl MMList {
         }
     }
 
-    pub fn new_cloned(&self) -> Self {
+    pub async fn new_cloned(&self) -> Self {
         let inner = self.inner.borrow();
-        let inner = Task::block_on(inner.lock());
+        let inner = inner.lock().await;
 
-        let page_table = PageTable::new();
+        let page_table = PageTable::new(&KERNEL_ROOT_TABLE_PAGE);
         let list = Self {
-            root_page_table: AtomicUsize::from(page_table.root_page_table()),
+            root_page_table: AtomicUsize::from(page_table.addr().addr()),
+            user_count: AtomicUsize::new(0),
             inner: ArcSwap::new(Mutex::new(MMListInner {
                 areas: inner.areas.clone(),
                 page_table,
@@ -222,68 +257,111 @@ impl MMList {
 
         {
             let list_inner = list.inner.borrow();
-            let list_inner = Task::block_on(list_inner.lock());
+            let list_inner = list_inner.lock().await;
 
             for area in list_inner.areas.iter() {
-                let new_iter = list_inner.page_table.iter_user(area.range()).unwrap();
-                let old_iter = inner.page_table.iter_user(area.range()).unwrap();
-
-                for (new, old) in new_iter.zip(old_iter) {
-                    new.setup_cow(old);
-                }
+                list_inner
+                    .page_table
+                    .set_copy_on_write(&inner.page_table, area.range());
             }
         }
 
-        // We set some pages as COW, so we need to invalidate TLB.
-        inner.page_table.lazy_invalidate_tlb_all();
+        // We've set some pages as CoW, so we need to invalidate all our users' TLBs.
+        self.flush_user_tlbs().await;
 
         list
     }
 
-    pub fn switch_page_table(&self) {
+    pub fn activate(&self) {
+        self.user_count.fetch_add(1, Ordering::Acquire);
+
         let root_page_table = self.root_page_table.load(Ordering::Relaxed);
         assert_ne!(root_page_table, 0);
-        arch::set_root_page_table(root_page_table);
+        arch::set_root_page_table_pfn(PFN::from(PAddr::from(root_page_table)));
     }
 
-    pub fn replace(&self, new: Self) {
-        // Switch to kernel page table in case we are using the page table to be swapped and released.
-        let mut switched = false;
-        if arch::get_root_page_table() == self.root_page_table.load(Ordering::Relaxed) {
-            arch::set_root_page_table(KERNEL_PML4 as usize);
-            switched = true;
-        }
+    pub fn deactivate(&self) {
+        arch::set_root_page_table_pfn(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN);
 
-        unsafe {
-            // SAFETY: Even if we're using the page table, we've switched to kernel page table.
-            // So it's safe to release the old memory list.
-            self.release();
-        }
+        let old_user_count = self.user_count.fetch_sub(1, Ordering::Release);
+        assert_ne!(old_user_count, 0);
+    }
 
-        // SAFETY: `self.inner` should be `None` after releasing.
-        self.inner.swap(Some(new.inner.borrow().clone()));
-        self.root_page_table.store(
-            new.root_page_table.load(Ordering::Relaxed),
-            Ordering::Relaxed,
-        );
+    /// Deactivate `self` and activate `to` with the root page table changed only
+    /// once, saving one of the two root page table writes.
+    #[allow(dead_code)]
+    pub fn switch(&self, to: &Self) {
+        to.user_count.fetch_add(1, Ordering::Acquire);
 
-        if switched {
-            self.switch_page_table();
-        }
+        let root_page_table = to.root_page_table.load(Ordering::Relaxed);
+        assert_ne!(root_page_table, 0);
+        arch::set_root_page_table_pfn(PFN::from(PAddr::from(root_page_table)));
+
+        let old_user_count = self.user_count.fetch_sub(1, Ordering::Release);
+        assert_ne!(old_user_count, 0);
     }
 
+    /// Replace the current page table with a new one.
+    ///
     /// # Safety
-    /// This function is unsafe because the caller should make sure that the `inner` is not currently used.
-    pub unsafe fn release(&self) {
-        // TODO: Check whether we should wake someone up if they've been put to sleep when calling `vfork`.
-        self.inner.swap(None);
+    /// This function should be called only when we are sure that the `MMList` is not
+    /// being used by any other thread.
+    pub unsafe fn replace(&self, new: Option<Self>) {
+        eonix_preempt::disable();
+
+        assert_eq!(
+            self.user_count.load(Ordering::Relaxed),
+            1,
+            "We should be the only user"
+        );
+
+        assert_eq!(
+            new.as_ref()
+                .map(|new_mm| new_mm.user_count.load(Ordering::Relaxed))
+                .unwrap_or(0),
+            0,
+            "`new` must not be used by anyone"
+        );
+
+        let old_root_page_table = self.root_page_table.load(Ordering::Relaxed);
+        let current_root_page_table = arch::get_root_page_table_pfn();
+        assert_eq!(
+            PAddr::from(current_root_page_table).addr(),
+            old_root_page_table,
+            "We should be the only user"
+        );
+
+        let new_root_page_table = match &new {
+            Some(new_mm) => new_mm.root_page_table.load(Ordering::Relaxed),
+            None => PAddr::from(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN).addr(),
+        };
+
+        arch::set_root_page_table_pfn(PFN::from(PAddr::from(new_root_page_table)));
+
         self.root_page_table
-            .swap(KERNEL_PML4 as _, Ordering::Relaxed);
+            .store(new_root_page_table, Ordering::Relaxed);
+
+        // TODO: Check whether we should wake someone up if they've been put
+        //       to sleep when calling `vfork`.
+        self.inner
+            .swap(new.map(|new_mm| new_mm.inner.swap(None)).flatten());
+
+        eonix_preempt::enable();
     }
 
     /// No need to do invalidation manually, `PageTable` already does it.
-    pub fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
-        Task::block_on(self.inner.borrow().lock()).unmap(start, len)
+    pub async fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
+        let pages_to_free = self.inner.borrow().lock().await.unmap(start, len)?;
+
+        // We need to ensure that the pages can no longer be accessed.
+        // Anyone still holding these pages in their TLB could read from or
+        // write to them, so flush the TLBs of all our users first.
+        self.flush_user_tlbs().await;
+
+        // Then free the pages.
+        drop(pages_to_free);
+
+        Ok(())
     }
 
     pub fn mmap_hint(
@@ -359,17 +437,19 @@ impl MMList {
             .get(&break_start)
             .expect("Program break area should be valid");
 
-        let len: usize = pos.sub(current_break);
+        let len = pos - current_break;
+        let range_to_grow = VRange::from(program_break.range().end()).grow(len);
+
+        program_break.grow(len);
+
         inner.page_table.set_anonymous(
-            VRange::from(program_break.range().end()).grow(len),
+            range_to_grow,
             Permission {
                 write: true,
                 execute: false,
             },
         );
 
-        program_break.grow(len);
-
         inner.break_pos = Some(pos);
         pos
     }
@@ -415,7 +495,7 @@ impl MMList {
 
             for (idx, pte) in inner
                 .page_table
-                .iter_user(VRange::new(current, access_end))?
+                .iter_user(VRange::new(current, access_end))
                 .enumerate()
             {
                 let page_start = current.floor() + idx * 0x1000;
@@ -438,11 +518,15 @@ impl MMList {
                 }
 
                 unsafe {
-                    let page = Page::from_pfn(pte.pfn(), 0);
-                    func(
-                        offset + idx * 0x1000,
-                        &mut page.as_mut_slice()[start_offset..end_offset],
-                    );
+                    // SAFETY: We are sure that the page is valid and we have the right to access it.
+                    Page::with_raw(pte.get_pfn(), |page| {
+                        // SAFETY: The caller guarantees that no one else is using the page.
+                        let page_data = page.as_memblk().as_bytes_mut();
+                        func(
+                            offset + idx * 0x1000,
+                            &mut page_data[start_offset..end_offset],
+                        );
+                    });
                 }
             }
 
@@ -454,3 +538,92 @@ impl MMList {
         Ok(())
     }
 }
+
+impl fmt::Debug for MMList {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("MMList").finish()
+    }
+}
+
+trait PageTableExt {
+    fn set_anonymous(&self, range: VRange, permission: Permission);
+    fn set_mmapped(&self, range: VRange, permission: Permission);
+    fn set_copy_on_write(&self, from: &Self, range: VRange);
+}
+
+impl PageTableExt for PageTable<'_, DefaultPagingMode, GlobalPageAlloc, KernelPageAccess> {
+    fn set_anonymous(&self, range: VRange, permission: Permission) {
+        for pte in self.iter_user(range) {
+            pte.set_anonymous(permission.execute);
+        }
+    }
+
+    fn set_mmapped(&self, range: VRange, permission: Permission) {
+        for pte in self.iter_user(range) {
+            pte.set_mapped(permission.execute);
+        }
+    }
+
+    fn set_copy_on_write(&self, from: &Self, range: VRange) {
+        let to_iter = self.iter_user(range);
+        let from_iter = from.iter_user(range);
+
+        for (to, from) in to_iter.zip(from_iter) {
+            to.set_copy_on_write(from);
+        }
+    }
+}
+
+trait PTEExt {
+    fn set_anonymous(&mut self, execute: bool);
+    fn set_mapped(&mut self, execute: bool);
+    fn set_copy_on_write(&mut self, from: &mut Self);
+}
+
+impl<T> PTEExt for T
+where
+    T: PTE,
+{
+    fn set_anonymous(&mut self, execute: bool) {
+        // Writable flag is set during page fault handling while executable flag is
+        // preserved across page faults, so we set executable flag now.
+        let attr = <Self as PTE>::Attr::new()
+            .present(true)
+            .user(true)
+            .copy_on_write(true)
+            .anonymous(true)
+            .execute(execute);
+
+        self.set(EMPTY_PAGE.clone().into_raw(), attr);
+    }
+
+    fn set_mapped(&mut self, execute: bool) {
+        // Writable flag is set during page fault handling while executable flag is
+        // preserved across page faults, so we set executable flag now.
+        let attr = <Self as PTE>::Attr::new()
+            .user(true)
+            .copy_on_write(true)
+            .anonymous(true)
+            .mapped(true)
+            .execute(execute);
+
+        self.set(EMPTY_PAGE.clone().into_raw(), attr);
+    }
+
+    fn set_copy_on_write(&mut self, from: &mut Self) {
+        let mut from_attr = from.get_attr();
+        if !from_attr.is_present() {
+            return;
+        }
+
+        from_attr = from_attr.write(false).copy_on_write(true);
+
+        let pfn = unsafe {
+            // SAFETY: We get the pfn from a valid page table entry, so it should be valid as well.
+            Page::with_raw(from.get_pfn(), |page| page.clone().into_raw())
+        };
+
+        self.set(pfn, from_attr.accessed(false));
+        from.set_attr(from_attr);
+    }
+}

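The `activate`/`deactivate` pair above replaces the old `switch_page_table`, with `user_count` tracking how many CPUs are currently running on an address space so that `flush_user_tlbs` can choose between a local flush and a broadcast. A minimal usage sketch, not part of this commit; `prev` and `next` are hypothetical scheduler-owned lists:

    // Switching address spaces on a context switch.
    fn switch_mm(prev: &MMList, next: &MMList) {
        // Two root table writes: `deactivate` falls back to the kernel root
        // table before `activate` loads the next one.
        prev.deactivate();
        next.activate();

        // Or with a single root table write, using the combined helper:
        // prev.switch(next);
    }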
+ 39 - 0
src/kernel/mem/mm_list/mapping.rs

@@ -0,0 +1,39 @@
+use crate::kernel::vfs::dentry::Dentry;
+use alloc::sync::Arc;
+
+#[derive(Debug, Clone)]
+pub struct FileMapping {
+    pub file: Arc<Dentry>,
+    /// Offset in the file, aligned to a 4 KiB boundary.
+    pub offset: usize,
+    /// Length of the mapping. The part exceeding it will be zeroed.
+    pub length: usize,
+}
+
+#[derive(Debug, Clone)]
+pub enum Mapping {
+    Anonymous,
+    File(FileMapping),
+}
+
+impl FileMapping {
+    pub fn new(file: Arc<Dentry>, offset: usize, length: usize) -> Self {
+        assert_eq!(offset & 0xfff, 0);
+        Self {
+            file,
+            offset,
+            length,
+        }
+    }
+
+    pub fn offset(&self, offset: usize) -> Self {
+        if self.length <= offset {
+            Self::new(self.file.clone(), self.offset + self.length, 0)
+        } else {
+            Self::new(
+                self.file.clone(),
+                self.offset + offset,
+                self.length - offset,
+            )
+        }
+    }
+}

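`FileMapping::offset` advances a mapping into the file and clamps at its end, so splitting an area never yields a mapping that reads past the original window. A worked sketch, assuming some `dentry: Arc<Dentry>` is at hand:

    let fm = FileMapping::new(dentry, 0x3000, 0x5000);

    // Skip the first two pages: the offset grows, the length shrinks.
    let tail = fm.offset(0x2000);
    assert_eq!((tail.offset, tail.length), (0x5000, 0x3000));

    // At or past the end, the result is an empty mapping right after the window.
    let empty = fm.offset(0x6000);
    assert_eq!((empty.offset, empty.length), (0x8000, 0));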
+ 9 - 8
src/kernel/mem/mm_list/page_fault.rs

@@ -1,9 +1,11 @@
 use super::{MMList, VAddr};
-use crate::kernel::mem::{Mapping, VRange};
+use crate::kernel::mem::Mapping;
 use crate::kernel::task::{ProcessList, Signal, Thread};
 use crate::prelude::*;
 use arch::InterruptContext;
 use bitflags::bitflags;
+use eonix_mm::address::{AddrOps as _, VRange};
+use eonix_mm::paging::PAGE_SIZE;
 use eonix_runtime::task::Task;
 
 bitflags! {
@@ -61,8 +63,7 @@ impl MMList {
 
         let pte = inner
             .page_table
-            .iter_user(VRange::new(addr.floor(), addr.floor() + 0x1000))
-            .unwrap()
+            .iter_user(VRange::from(addr.floor()).grow(PAGE_SIZE))
             .next()
             .expect("If we can find the mapped area, we should be able to find the PTE");
 
@@ -109,14 +110,14 @@ fn try_page_fault_fix(int_stack: &mut InterruptContext, addr: VAddr) {
 
 fn kernel_page_fault_die(vaddr: VAddr, ip: usize) -> ! {
     panic!(
-        "Invalid kernel mode memory access to {:#8x} while executing the instruction at {:#8x}",
-        vaddr.0, ip
+        "Invalid kernel mode memory access to {:?} while executing the instruction at {:#8x}",
+        vaddr, ip
     )
 }
 
 pub fn handle_page_fault(int_stack: &mut InterruptContext) {
     let error = PageFaultError::from_bits_truncate(int_stack.error_code);
-    let vaddr = VAddr(arch::get_page_fault_address());
+    let vaddr = arch::get_page_fault_address();
 
     let result = Thread::current()
         .process
@@ -125,8 +126,8 @@ pub fn handle_page_fault(int_stack: &mut InterruptContext) {
 
     if let Err(signal) = result {
         println_debug!(
-            "Page fault on {:#x} in user space at {:#x}",
-            vaddr.0,
+            "Page fault on {:?} in user space at {:#x}",
+            vaddr,
             int_stack.rip
         );
         ProcessList::kill_current(signal)

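The fault handler now builds its single-page range from the generic address helpers instead of a hand-written `VRange::new`; both forms below cover the same page (a sketch, assuming `addr: VAddr`):

    // The page containing `addr`, with the new helpers:
    let range = VRange::from(addr.floor()).grow(PAGE_SIZE);
    // ...equivalent to the old explicit construction:
    // VRange::new(addr.floor(), addr.floor() + 0x1000)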
+ 92 - 403
src/kernel/mem/page_alloc.rs

@@ -1,483 +1,172 @@
-use super::address::{PAddr, PFN};
-use crate::intrusive_list::Link;
-use crate::{container_of, prelude::*};
-use bitflags::bitflags;
-use core::sync::atomic::Ordering;
-use core::{ptr::NonNull, sync::atomic::AtomicU32};
-
-const MAX_PAGE_ORDER: u32 = 10;
-const PAGE_ALLOC_COSTLY_ORDER: u32 = 3;
+use super::{paging::AllocZeroed as _, Page};
+use buddy_allocator::{BuddyAllocator, FreeArea as BuddyFreeArea};
+use core::{ptr::NonNull, sync::atomic::Ordering};
+use eonix_mm::{
+    address::{AddrOps as _, PAddr},
+    paging::{PageAlloc, PageFlags, RawPagePtr, PFN},
+};
+
+const COSTLY_ORDER: u32 = 3;
 const BATCH_SIZE: u32 = 64;
-const PAGE_ARRAY: *mut Page = 0xffffff8040000000 as *mut Page;
-
-pub(super) type PagePtr = Ptr<Page>;
-
-#[repr(transparent)]
-pub struct Ptr<T>(Option<NonNull<T>>);
-
-impl<T> Clone for Ptr<T> {
-    fn clone(&self) -> Self {
-        Self(self.0)
-    }
-}
-
-impl<T> Copy for Ptr<T> {}
-
-impl<T> Ptr<T> {
-    pub const fn new(ptr: Option<NonNull<T>>) -> Self {
-        Self(ptr)
-    }
-
-    pub fn from_raw(ptr: *mut T) -> Self {
-        Self::new(NonNull::new(ptr))
-    }
-
-    pub fn null() -> Self {
-        Self::new(None)
-    }
-
-    pub fn is_none(&self) -> bool {
-        self.0.is_none()
-    }
-
-    pub fn is_some(&self) -> bool {
-        self.0.is_some()
-    }
-
-    pub fn as_ptr(&self) -> *mut T {
-        self.0.unwrap().as_ptr()
-    }
-
-    pub fn as_ref<'a>(&self) -> &'a T {
-        unsafe { &*self.as_ptr() }
-    }
-
-    pub fn as_mut<'a>(&self) -> &'a mut T {
-        unsafe { &mut *self.as_ptr() }
-    }
-}
-
-impl PagePtr {
-    pub unsafe fn increase_refcount(&self) -> u32 {
-        self.as_mut().increase_refcount()
-    }
-
-    pub unsafe fn decrease_refcount(&self) -> u32 {
-        self.as_mut().decrease_refcount()
-    }
-
-    pub unsafe fn load_refcount(&self) -> u32 {
-        self.as_ref().refcount.load(Ordering::Acquire)
-    }
-
-    fn get_order(&self) -> u32 {
-        self.as_ref().order
-    }
 
-    pub fn is_valid(&self, order: u32) -> bool {
-        self.is_some() && self.get_order() == order
-    }
-
-    fn offset(&self, count: usize) -> Self {
-        match self.0 {
-            Some(non_null_ptr) => {
-                let new_raw_ptr = unsafe { non_null_ptr.as_ptr().add(count) };
-                Self::from_raw(new_raw_ptr)
-            }
-            None => Self::null(),
-        }
-    }
-}
-
-impl Into<PFN> for PagePtr {
-    fn into(self) -> PFN {
-        unsafe { PFN::from(self.as_ptr().offset_from(PAGE_ARRAY) as usize) }
-    }
-}
-
-impl From<PFN> for PagePtr {
-    fn from(pfn: PFN) -> Self {
-        unsafe { Self::from_raw(PAGE_ARRAY.add(pfn.0)) }
-    }
-}
-
-bitflags! {
-    // TODO: Use atomic
-    struct PageFlags: usize {
-        const PRESENT = 1 << 0;
-        const LOCKED  = 1 << 1;
-        const BUDDY   = 1 << 2;
-        const SLAB    = 1 << 3;
-        const DIRTY   = 1 << 4;
-        const FREE    = 1 << 5;
-        const LOCAL   = 1 << 6;
-    }
-}
-
-pub(super) struct Page {
-    // Now only used for free page links in the buddy system.
-    // Can be used for LRU page swap in the future.
-    link: Link,
-    flags: PageFlags, // TODO: This should be atomic.
-    /// # Safety
-    /// This field is only used in buddy system, which is protected by the global lock.
-    order: u32,
-    refcount: AtomicU32,
-}
-
-struct FreeArea {
-    free_list: Link,
-    count: usize,
-}
+#[arch::define_percpu]
+static PERCPU_PAGE_ALLOC: PerCpuPageAlloc = PerCpuPageAlloc::new();
 
-/// Safety: `Zone` is `Send` because the `PAGE_ARRAY` is shared between cores.
-unsafe impl Send for Zone {}
-// /// Safety: TODO
-// unsafe impl Sync for Zone {}
+pub struct NoAlloc;
 
-struct Zone {
-    free_areas: [FreeArea; MAX_PAGE_ORDER as usize + 1],
-}
+pub struct GlobalPageAlloc;
 
-struct PerCpuPages {
+struct PerCpuPageAlloc {
     batch: u32,
-    _high: u32, // TODO: use in future
-    free_areas: [FreeArea; PAGE_ALLOC_COSTLY_ORDER as usize + 1],
+    // TODO: might be used in the future.
+    // high: u32,
+    free_areas: [BuddyFreeArea; COSTLY_ORDER as usize + 1],
 }
 
-impl PerCpuPages {
+impl PerCpuPageAlloc {
     const fn new() -> Self {
         Self {
             batch: BATCH_SIZE,
-            _high: 0,
-            free_areas: [const { FreeArea::new() }; PAGE_ALLOC_COSTLY_ORDER as usize + 1],
+            // high: 0,
+            free_areas: [const { BuddyFreeArea::new() }; COSTLY_ORDER as usize + 1],
         }
     }
 
-    fn get_free_pages(&mut self, order: u32) -> PagePtr {
-        assert!(order <= PAGE_ALLOC_COSTLY_ORDER);
+    fn do_alloc_order(&mut self, order: u32) -> Option<RawPagePtr> {
+        assert!(order <= COSTLY_ORDER);
+        let free_area = &mut self.free_areas[order as usize];
 
-        loop {
-            let pages_ptr = self.free_areas[order as usize].get_free_pages();
-            if pages_ptr.is_some() {
-                return pages_ptr;
-            }
+        let mut page_ptr = free_area.get_free_pages();
 
+        if page_ptr.is_none() {
             let batch = self.batch >> order;
-            ZONE.lock()
-                .get_bulk_free_pages(&mut self.free_areas[order as usize], order, batch);
-        }
-    }
-
-    fn free_pages(&mut self, pages_ptr: PagePtr, order: u32) {
-        assert!(order <= PAGE_ALLOC_COSTLY_ORDER);
-        assert_eq!(unsafe { pages_ptr.load_refcount() }, 0);
-        assert_eq!(pages_ptr.get_order(), order);
-
-        self.free_areas[order as usize].add_pages(pages_ptr);
-    }
-}
-
-impl Page {
-    fn set_flags(&mut self, flags: PageFlags) {
-        self.flags.insert(flags);
-    }
-
-    fn remove_flags(&mut self, flags: PageFlags) {
-        self.flags.remove(flags);
-    }
-
-    fn set_order(&mut self, order: u32) {
-        self.order = order;
-    }
-
-    unsafe fn increase_refcount(&mut self) -> u32 {
-        self.refcount.fetch_add(1, Ordering::Relaxed)
-    }
-
-    unsafe fn decrease_refcount(&mut self) -> u32 {
-        self.refcount.fetch_sub(1, Ordering::AcqRel)
-    }
-
-    pub fn is_buddy(&self) -> bool {
-        self.flags.contains(PageFlags::BUDDY)
-    }
-
-    #[allow(dead_code)]
-    pub fn is_slab(&self) -> bool {
-        self.flags.contains(PageFlags::SLAB)
-    }
-
-    pub fn is_present(&self) -> bool {
-        self.flags.contains(PageFlags::PRESENT)
-    }
-
-    pub fn is_free(&self) -> bool {
-        self.flags.contains(PageFlags::FREE)
-    }
-
-    pub fn is_local(&self) -> bool {
-        self.flags.contains(PageFlags::LOCAL)
-    }
-}
+            for _ in 0..batch {
+                if let Some(pages_ptr) = BuddyAllocator::alloc_order(order) {
+                    pages_ptr.flags().set(PageFlags::LOCAL);
+                    free_area.add_pages(pages_ptr);
+                } else {
+                    break;
+                }
+            }
 
-impl FreeArea {
-    const fn new() -> Self {
-        Self {
-            free_list: Link::new(),
-            count: 0,
+            page_ptr = free_area.get_free_pages();
         }
-    }
-
-    fn get_free_pages(&mut self) -> PagePtr {
-        if let Some(pages_link) = self.free_list.next_mut() {
-            assert_ne!(self.count, 0);
-
-            let pages_ptr = unsafe { container_of!(pages_link, Page, link) };
-            let pages_ptr = Ptr::from_raw(pages_ptr);
-
-            self.count -= 1;
-            pages_link.remove();
 
-            pages_ptr
-        } else {
-            PagePtr::null()
-        }
+        page_ptr.inspect(|page_ptr| page_ptr.flags().clear(PageFlags::FREE))
     }
 
-    fn add_pages(&mut self, pages_ptr: PagePtr) {
-        self.count += 1;
-        pages_ptr.as_mut().set_flags(PageFlags::FREE);
-        self.free_list.insert(&mut pages_ptr.as_mut().link)
-    }
+    fn free_pages(&mut self, pages_ptr: RawPagePtr, order: u32) {
+        assert_eq!(pages_ptr.order(), order);
+        assert_eq!(pages_ptr.refcount().load(Ordering::Relaxed), 0);
 
-    fn del_pages(&mut self, pages_ptr: PagePtr) {
-        assert!(self.count >= 1 && pages_ptr.as_ref().is_free());
-        self.count -= 1;
-        pages_ptr.as_mut().remove_flags(PageFlags::FREE);
-        pages_ptr.as_mut().link.remove();
+        // TODO: Temporary workaround here.
+        pages_ptr.refcount().store(1, Ordering::Relaxed);
+        self.free_areas[order as usize].add_pages(pages_ptr);
     }
 }
 
-impl Zone {
-    const fn new() -> Self {
-        Self {
-            free_areas: [const { FreeArea::new() }; MAX_PAGE_ORDER as usize + 1],
+impl PageAlloc for GlobalPageAlloc {
+    fn alloc_order(order: u32) -> Option<RawPagePtr> {
+        if order > COSTLY_ORDER {
+            BuddyAllocator::alloc_order(order)
+        } else {
+            PerCpuPageAlloc::alloc_order(order)
         }
     }
 
-    /// Only used for per-cpu pages
-    fn get_bulk_free_pages(&mut self, free_area: &mut FreeArea, order: u32, count: u32) -> u32 {
-        for i in 0..count {
-            let pages_ptr = self.get_free_pages(order);
-            if pages_ptr.is_none() {
-                return i;
-            }
-
-            pages_ptr.as_mut().set_flags(PageFlags::LOCAL);
-            free_area.add_pages(pages_ptr);
+    unsafe fn dealloc(page_ptr: RawPagePtr) {
+        if page_ptr.order() > COSTLY_ORDER {
+            BuddyAllocator::dealloc(page_ptr);
+        } else {
+            PerCpuPageAlloc::dealloc(page_ptr);
         }
-        count
     }
 
-    fn get_free_pages(&mut self, order: u32) -> PagePtr {
-        for current_order in order..=MAX_PAGE_ORDER {
-            let pages_ptr = self.free_areas[current_order as usize].get_free_pages();
-            if pages_ptr.is_none() {
-                continue;
-            }
-
-            pages_ptr.as_mut().set_order(order);
-
-            if current_order > order {
-                self.expand(pages_ptr, current_order, order);
-            }
-            assert!(pages_ptr.as_ref().is_present() && pages_ptr.as_ref().is_free());
-            return pages_ptr;
+    unsafe fn has_management_over(page_ptr: RawPagePtr) -> bool {
+        if page_ptr.order() > COSTLY_ORDER {
+            BuddyAllocator::has_management_over(page_ptr)
+        } else {
+            PerCpuPageAlloc::has_management_over(page_ptr)
         }
-        PagePtr::new(None)
     }
+}
 
-    fn expand(&mut self, pages_ptr: PagePtr, order: u32, target_order: u32) {
-        assert!(pages_ptr.is_some());
-        let mut offset = 1 << order;
-
-        for order in (target_order..order).rev() {
-            offset >>= 1;
-            let split_pages_ptr = pages_ptr.offset(offset);
-            split_pages_ptr.as_mut().set_order(order);
-            split_pages_ptr.as_mut().set_flags(PageFlags::BUDDY);
-            self.free_areas[order as usize].add_pages(split_pages_ptr);
-        }
+impl PageAlloc for NoAlloc {
+    fn alloc_order(_order: u32) -> Option<RawPagePtr> {
+        panic!("NoAlloc cannot allocate pages");
     }
 
-    fn free_pages(&mut self, mut pages_ptr: PagePtr, order: u32) {
-        assert_eq!(unsafe { pages_ptr.load_refcount() }, 0);
-        assert_eq!(pages_ptr.get_order(), order);
-
-        let mut pfn: PFN = pages_ptr.into();
-        let mut current_order = order;
-
-        while current_order < MAX_PAGE_ORDER {
-            let buddy_pfn = pfn.buddy_pfn(current_order);
-            let buddy_pages_ptr = PagePtr::from(buddy_pfn);
-
-            if !self.buddy_check(buddy_pages_ptr, current_order) {
-                break;
-            }
-
-            pages_ptr.as_mut().remove_flags(PageFlags::BUDDY);
-            buddy_pages_ptr.as_mut().remove_flags(PageFlags::BUDDY);
-            self.free_areas[current_order as usize].del_pages(buddy_pages_ptr);
-            pages_ptr = PagePtr::from(pfn.combined_pfn(buddy_pfn));
-            pages_ptr.as_mut().set_flags(PageFlags::BUDDY);
-            pfn = pfn.combined_pfn(buddy_pfn);
-            current_order += 1;
-        }
-
-        pages_ptr.as_mut().set_order(current_order);
-        self.free_areas[current_order as usize].add_pages(pages_ptr);
+    unsafe fn dealloc(_: RawPagePtr) {
+        panic!("NoAlloc cannot deallocate pages");
     }
 
-    /// This function checks whether a page is free && is the buddy
-    /// we can coalesce a page and its buddy if
-    /// - the buddy is valid(present) &&
-    /// - the buddy is right now in free_areas &&
-    /// - a page and its buddy have the same order &&
-    /// - a page and its buddy are in the same zone.    // check when smp
-    fn buddy_check(&self, pages_ptr: PagePtr, order: u32) -> bool {
-        if !pages_ptr.as_ref().is_present() {
-            return false;
-        }
-        if !(pages_ptr.as_ref().is_free()) {
-            return false;
-        }
-        if pages_ptr.as_ref().is_local() {
-            return false;
-        }
-        if pages_ptr.as_ref().order != order {
-            return false;
-        }
-
-        assert_eq!(unsafe { pages_ptr.load_refcount() }, 0);
+    unsafe fn has_management_over(_: RawPagePtr) -> bool {
         true
     }
-
-    /// Only used on buddy initialization
-    fn create_pages(&mut self, start: usize, end: usize) {
-        let mut start_pfn = PAddr::from(start).ceil_pfn();
-        let end_pfn = PAddr::from(end).floor_pfn();
-
-        while start_pfn < end_pfn {
-            let mut order = usize::from(start_pfn).trailing_zeros().min(MAX_PAGE_ORDER);
-
-            while start_pfn + order as usize > end_pfn {
-                order -= 1;
-            }
-            let page_ptr: PagePtr = start_pfn.into();
-            page_ptr.as_mut().set_flags(PageFlags::BUDDY);
-            self.free_areas[order as usize].add_pages(page_ptr);
-            start_pfn = start_pfn + (1 << order) as usize;
-        }
-    }
 }
 
-#[arch::define_percpu]
-static PER_CPU_PAGES: PerCpuPages = PerCpuPages::new();
-
-static ZONE: Spin<Zone> = Spin::new(Zone::new());
-
-fn __alloc_pages(order: u32) -> PagePtr {
-    let pages_ptr;
-
-    if order <= PAGE_ALLOC_COSTLY_ORDER {
+impl PageAlloc for PerCpuPageAlloc {
+    fn alloc_order(order: u32) -> Option<RawPagePtr> {
+        let page_ptr;
         unsafe {
             eonix_preempt::disable();
-            pages_ptr = PER_CPU_PAGES.as_mut().get_free_pages(order);
+            page_ptr = PERCPU_PAGE_ALLOC.as_mut().do_alloc_order(order);
             eonix_preempt::enable();
         }
-    } else {
-        pages_ptr = ZONE.lock().get_free_pages(order);
-    }
 
-    unsafe {
-        pages_ptr.as_mut().increase_refcount();
+        page_ptr
     }
-    pages_ptr.as_mut().remove_flags(PageFlags::FREE);
-    pages_ptr
-}
 
-fn __free_pages(pages_ptr: PagePtr, order: u32) {
-    if order <= PAGE_ALLOC_COSTLY_ORDER {
+    unsafe fn dealloc(page_ptr: RawPagePtr) {
+        let order = page_ptr.order();
+
         unsafe {
             eonix_preempt::disable();
-            PER_CPU_PAGES.as_mut().free_pages(pages_ptr, order);
+            PERCPU_PAGE_ALLOC.as_mut().free_pages(page_ptr, order);
             eonix_preempt::enable();
         }
-    } else {
-        ZONE.lock().free_pages(pages_ptr, order);
     }
-}
 
-pub(super) fn alloc_page() -> PagePtr {
-    __alloc_pages(0)
-}
-
-pub(super) fn alloc_pages(order: u32) -> PagePtr {
-    __alloc_pages(order)
-}
-
-pub(super) fn early_alloc_pages(order: u32) -> PagePtr {
-    let pages_ptr = ZONE.lock().get_free_pages(order);
-    unsafe {
-        pages_ptr.as_mut().increase_refcount();
+    unsafe fn has_management_over(page_ptr: RawPagePtr) -> bool {
+        BuddyAllocator::has_management_over(page_ptr) && page_ptr.flags().has(PageFlags::LOCAL)
     }
-    pages_ptr.as_mut().remove_flags(PageFlags::FREE);
-    pages_ptr
-}
-
-pub(super) fn free_pages(page_ptr: PagePtr, order: u32) {
-    __free_pages(page_ptr, order)
 }
 
 #[no_mangle]
 pub extern "C" fn mark_present(start: usize, end: usize) {
-    let mut start_pfn = PAddr::from(start).ceil_pfn();
-    let end_pfn = PAddr::from(end).floor_pfn();
+    let mut start_pfn = PFN::from(PAddr::from(start).ceil());
+    let end_pfn = PFN::from(PAddr::from(end).floor());
+
     while start_pfn < end_pfn {
-        PagePtr::from(start_pfn)
-            .as_mut()
-            .set_flags(PageFlags::PRESENT);
+        RawPagePtr::from(start_pfn).flags().set(PageFlags::PRESENT);
         start_pfn = start_pfn + 1;
     }
 }
 
 #[no_mangle]
-pub extern "C" fn create_pages(start: usize, end: usize) {
-    ZONE.lock().create_pages(start, end);
+pub extern "C" fn create_pages(start: PAddr, end: PAddr) {
+    BuddyAllocator::create_pages(start, end);
 }
 
 #[no_mangle]
-pub extern "C" fn page_to_pfn(page: *const Page) -> usize {
-    unsafe { page.offset_from(PAGE_ARRAY) as usize }
+pub extern "C" fn page_to_pfn(page: *const ()) -> PFN {
+    let page_ptr = RawPagePtr::new(NonNull::new(page as *mut _).unwrap());
+    PFN::from(page_ptr)
 }
 
 #[no_mangle]
-pub extern "C" fn c_alloc_page() -> *const Page {
-    alloc_page().as_ptr() as *const Page
+pub extern "C" fn c_alloc_page() -> *const () {
+    GlobalPageAlloc::alloc().expect("Out of memory").as_ptr() as *const _
 }
 
 #[no_mangle]
-pub extern "C" fn c_alloc_pages(order: u32) -> *const Page {
-    alloc_pages(order).as_ptr() as *const Page
+pub extern "C" fn c_alloc_pages(order: u32) -> *const () {
+    GlobalPageAlloc::alloc_order(order)
+        .expect("Out of memory")
+        .as_ptr() as *const _
 }
 
 #[no_mangle]
-pub extern "C" fn c_alloc_page_table() -> usize {
-    let pfn: PFN = alloc_page().into();
-    let paddr: usize = usize::from(pfn) << 12;
-    unsafe {
-        core::ptr::write_bytes(paddr as *mut u8, 0, 4096);
-    }
-    paddr
+pub extern "C" fn c_alloc_page_table() -> PAddr {
+    PAddr::from(Page::zeroed().into_raw())
 }

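The allocation order now selects the path through the allocator stack: small orders are served from the per-CPU free lists refilled in batches, while large orders go straight to the buddy allocator. A sketch:

    // Orders 0..=COSTLY_ORDER (3) hit the per-CPU lists, refilled from the
    // buddy allocator in batches of BATCH_SIZE >> order.
    let small = GlobalPageAlloc::alloc_order(2).expect("Out of memory"); // refill batch: 64 >> 2 = 16
    // Larger orders bypass the per-CPU cache entirely.
    let big = GlobalPageAlloc::alloc_order(9).expect("Out of memory");   // 512 pages = 2 MiB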
+ 0 - 316
src/kernel/mem/page_table.rs

@@ -1,316 +0,0 @@
-use super::{
-    paging::Page,
-    phys::{CachedPP, PhysPtr as _},
-    VAddr, VRange,
-};
-use super::{MMArea, Permission};
-use crate::bindings::root::{EINVAL, KERNEL_PML4};
-use crate::prelude::*;
-use eonix_sync::LazyLock;
-
-const PA_P: usize = 0x001;
-const PA_RW: usize = 0x002;
-const PA_US: usize = 0x004;
-#[allow(dead_code)]
-const PA_PWT: usize = 0x008;
-#[allow(dead_code)]
-const PA_PCD: usize = 0x010;
-const PA_A: usize = 0x020;
-const PA_D: usize = 0x040;
-#[allow(dead_code)]
-const PA_PS: usize = 0x080;
-const PA_G: usize = 0x100;
-const PA_COW: usize = 0x200;
-const PA_MMAP: usize = 0x400;
-const PA_ANON: usize = 0x800;
-const PA_NXE: usize = 0x8000_0000_0000_0000;
-const PA_MASK: usize = 0xfff0_0000_0000_0fff;
-
-#[repr(transparent)]
-#[derive(Debug, Clone, Copy)]
-pub struct PTE(usize);
-
-#[derive(Debug)]
-pub struct PageTable {
-    page: Page,
-}
-
-#[allow(dead_code)]
-pub struct PTEIterator<'lt, const KERNEL: bool> {
-    count: usize,
-    i4: u16,
-    i3: u16,
-    i2: u16,
-    i1: u16,
-    p4: CachedPP,
-    p3: CachedPP,
-    p2: CachedPP,
-    p1: CachedPP,
-
-    start: VAddr,
-    end: VAddr,
-    _phantom: core::marker::PhantomData<&'lt ()>,
-}
-
-static EMPTY_PAGE: LazyLock<Page> = LazyLock::new(|| {
-    let page = Page::alloc_one();
-    page.zero();
-    page
-});
-
-impl PTE {
-    pub fn is_user(&self) -> bool {
-        self.0 & PA_US != 0
-    }
-
-    pub fn is_present(&self) -> bool {
-        self.0 & PA_P != 0
-    }
-
-    pub fn is_cow(&self) -> bool {
-        self.0 & PA_COW != 0
-    }
-
-    pub fn is_mmap(&self) -> bool {
-        self.0 & PA_MMAP != 0
-    }
-
-    pub fn pfn(&self) -> usize {
-        self.0 & !PA_MASK
-    }
-
-    pub fn attributes(&self) -> usize {
-        self.0 & PA_MASK
-    }
-
-    pub fn set(&mut self, pfn: usize, attributes: usize) {
-        self.0 = pfn | attributes;
-    }
-
-    #[allow(dead_code)]
-    pub fn set_pfn(&mut self, pfn: usize) {
-        self.set(pfn, self.attributes())
-    }
-
-    pub fn set_attributes(&mut self, attributes: usize) {
-        self.set(self.pfn(), attributes)
-    }
-
-    fn parse_page_table(&mut self, kernel: bool) -> CachedPP {
-        let attributes = if kernel {
-            PA_P | PA_RW | PA_G
-        } else {
-            PA_P | PA_RW | PA_US
-        };
-
-        if self.is_present() {
-            CachedPP::new(self.pfn())
-        } else {
-            let page = Page::alloc_one();
-            let pp = page.as_cached();
-            page.zero();
-
-            self.set(page.into_pfn(), attributes);
-            pp
-        }
-    }
-
-    pub fn setup_cow(&mut self, from: &mut Self) {
-        self.set(
-            unsafe { Page::from_pfn(from.pfn(), 0) }.into_pfn(),
-            (from.attributes() & !(PA_RW | PA_A | PA_D)) | PA_COW,
-        );
-
-        from.set_attributes((from.attributes() & !PA_RW) | PA_COW);
-    }
-
-    pub fn clear(&mut self) {
-        self.set(0, 0)
-    }
-
-    /// Take the ownership of the page from the PTE, clear the PTE and return the page.
-    pub fn take(&mut self) -> Page {
-        // SAFETY: Acquire the ownership of the page from the page table and then
-        // clear the PTE so no one could be able to access the page from here later on.
-        let page = unsafe { Page::take_pfn(self.pfn(), 0) };
-        self.clear();
-        page
-    }
-}
-
-impl<'lt, const KERNEL: bool> PTEIterator<'lt, KERNEL> {
-    fn new(pt: &'lt Page, start: VAddr, end: VAddr) -> KResult<Self> {
-        if start > end {
-            return Err(EINVAL);
-        }
-
-        let p4 = pt.as_cached();
-        let p3 = p4.as_mut_slice::<PTE>(512)[Self::index(4, start)].parse_page_table(KERNEL);
-        let p2 = p3.as_mut_slice::<PTE>(512)[Self::index(3, start)].parse_page_table(KERNEL);
-        let p1 = p2.as_mut_slice::<PTE>(512)[Self::index(2, start)].parse_page_table(KERNEL);
-
-        Ok(Self {
-            count: (end.0 - start.0) >> 12,
-            i4: Self::index(4, start) as u16,
-            i3: Self::index(3, start) as u16,
-            i2: Self::index(2, start) as u16,
-            i1: Self::index(1, start) as u16,
-            p4,
-            p3,
-            p2,
-            p1,
-            start,
-            end,
-            _phantom: core::marker::PhantomData,
-        })
-    }
-
-    fn offset(level: u32) -> usize {
-        12 + (level as usize - 1) * 9
-    }
-
-    fn index(level: u32, vaddr: VAddr) -> usize {
-        (vaddr.0 >> Self::offset(level)) & 0x1ff
-    }
-}
-
-impl<'lt, const KERNEL: bool> Iterator for PTEIterator<'lt, KERNEL> {
-    type Item = &'lt mut PTE;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.count != 0 {
-            self.count -= 1;
-        } else {
-            return None;
-        }
-
-        let retval = &mut self.p1.as_mut_slice::<PTE>(512)[self.i1 as usize];
-        self.i1 = (self.i1 + 1) % 512;
-        if self.i1 == 0 {
-            self.i2 = (self.i2 + 1) % 512;
-            if self.i2 == 0 {
-                self.i3 = (self.i3 + 1) % 512;
-                if self.i3 == 0 {
-                    self.i4 = (self.i4 + 1) % 512;
-                    if self.i4 == 0 {
-                        panic!("PTEIterator: out of range");
-                    }
-                }
-                self.p3 =
-                    self.p4.as_mut_slice::<PTE>(512)[self.i4 as usize].parse_page_table(KERNEL);
-            }
-            self.p2 = self.p3.as_mut_slice::<PTE>(512)[self.i3 as usize].parse_page_table(KERNEL);
-        }
-        self.p1 = self.p2.as_mut_slice::<PTE>(512)[self.i2 as usize].parse_page_table(KERNEL);
-        Some(retval)
-    }
-}
-
-impl PageTable {
-    pub fn new() -> Self {
-        let page = Page::alloc_one();
-        page.zero();
-
-        // TODO: copy only the kernel space mappings.
-        let kernel_space_page_table = CachedPP::new(KERNEL_PML4 as usize);
-
-        page.as_cached().as_mut_slice::<u64>(512)[256..]
-            .copy_from_slice(&kernel_space_page_table.as_mut_slice(512)[256..]);
-
-        Self { page }
-    }
-
-    pub fn root_page_table(&self) -> usize {
-        self.page.as_phys()
-    }
-
-    pub fn iter_user(&self, range: VRange) -> KResult<PTEIterator<'_, false>> {
-        PTEIterator::new(&self.page, range.start().floor(), range.end().ceil())
-    }
-
-    #[allow(dead_code)]
-    pub fn iter_kernel(&self, range: VRange) -> KResult<PTEIterator<'_, true>> {
-        PTEIterator::new(&self.page, range.start().floor(), range.end().ceil())
-    }
-
-    pub fn unmap(&self, area: &MMArea) {
-        let range = area.range();
-        let use_invlpg = range.len() / 4096 < 4;
-        let iter = self.iter_user(range).unwrap();
-
-        if self.page.as_phys() != arch::get_root_page_table() {
-            for pte in iter {
-                pte.take();
-            }
-            return;
-        }
-
-        if use_invlpg {
-            for (offset_pages, pte) in iter.enumerate() {
-                pte.take();
-
-                let pfn = range.start().floor().0 + offset_pages * 4096;
-                arch::flush_tlb(pfn);
-            }
-        } else {
-            for pte in iter {
-                pte.take();
-            }
-            arch::flush_tlb_all();
-        }
-    }
-
-    pub fn lazy_invalidate_tlb_all(&self) {
-        if self.page.as_phys() == arch::get_root_page_table() {
-            arch::flush_tlb_all();
-        }
-    }
-
-    pub fn set_mmapped(&self, range: VRange, permission: Permission) {
-        // PA_RW is set during page fault handling.
-        // PA_NXE is preserved across page faults, so we set PA_NXE now.
-        let attributes = if permission.execute {
-            PA_US | PA_COW | PA_ANON | PA_MMAP
-        } else {
-            PA_US | PA_COW | PA_ANON | PA_MMAP | PA_NXE
-        };
-
-        for pte in self.iter_user(range).unwrap() {
-            pte.set(EMPTY_PAGE.clone().into_pfn(), attributes);
-        }
-    }
-
-    pub fn set_anonymous(&self, range: VRange, permission: Permission) {
-        // PA_RW is set during page fault handling.
-        // PA_NXE is preserved across page faults, so we set PA_NXE now.
-        let attributes = if permission.execute {
-            PA_P | PA_US | PA_COW | PA_ANON
-        } else {
-            PA_P | PA_US | PA_COW | PA_ANON | PA_NXE
-        };
-
-        for pte in self.iter_user(range).unwrap() {
-            pte.set(EMPTY_PAGE.clone().into_pfn(), attributes);
-        }
-    }
-}
-
-fn drop_page_table_recursive(pt: &Page, level: usize) {
-    for pte in pt
-        .as_cached()
-        .as_mut_slice::<PTE>(512)
-        .iter_mut()
-        .filter(|pte| pte.is_present() && pte.is_user())
-    {
-        let page = pte.take();
-        if level > 1 {
-            drop_page_table_recursive(&page, level - 1);
-        }
-    }
-}
-
-impl Drop for PageTable {
-    fn drop(&mut self) {
-        drop_page_table_recursive(&self.page, 4);
-    }
-}

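The removed x86-specific `PA_*` bits correspond roughly one-to-one to the new generic attribute builder used by the `PTEExt` helpers in mm_list.rs; a sketch of the mapping (the standalone function and its bounds are illustrative, not from this commit):

    // PA_P -> present, PA_US -> user, PA_RW -> write, PA_COW -> copy_on_write,
    // PA_ANON -> anonymous, PA_MMAP -> mapped, PA_NXE cleared -> execute(true).
    fn anonymous_attr<T: PTE>(execute: bool) -> T::Attr {
        T::Attr::new()
            .present(true)       // was PA_P
            .user(true)          // was PA_US
            .copy_on_write(true) // was PA_COW
            .anonymous(true)     // was PA_ANON
            .execute(execute)    // clears the old PA_NXE when true
    }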
+ 52 - 163
src/kernel/mem/paging.rs

@@ -1,180 +1,57 @@
-use super::address::PFN;
-use super::page_alloc::{alloc_page, alloc_pages, early_alloc_pages, free_pages, PagePtr};
-use super::phys::PhysPtr;
+use super::{
+    access::AsMemoryBlock,
+    page_alloc::{GlobalPageAlloc, NoAlloc},
+    MemoryBlock, PhysAccess,
+};
 use crate::io::{Buffer, FillResult};
-use crate::kernel::mem::phys;
-use core::fmt;
+use eonix_mm::paging::{Page as GenericPage, PageAlloc};
 
-pub struct Page {
-    page_ptr: PagePtr,
-    order: u32,
-}
-
-#[allow(dead_code)]
-impl Page {
-    pub fn alloc_one() -> Self {
-        let page_ptr = alloc_page();
-        Self { page_ptr, order: 0 }
-    }
-
-    pub fn alloc_many(order: u32) -> Self {
-        let page_ptr = alloc_pages(order);
-        Self { page_ptr, order }
-    }
-
-    /// Allocate a contiguous block of pages that can contain at least `count` pages.
-    pub fn alloc_ceil(count: usize) -> Self {
-        assert_ne!(count, 0);
-        let order = count.next_power_of_two().trailing_zeros();
-        Self::alloc_many(order)
-    }
-
-    pub fn early_alloc_ceil(count: usize) -> Self {
-        assert_ne!(count, 0);
-        let order = count.next_power_of_two().trailing_zeros();
-        let page_ptr = early_alloc_pages(order);
-        Self { page_ptr, order }
-    }
-
-    /// Get `Page` from `pfn`, acquiring the ownership of the page. `refcount` is not increased.
-    ///
-    /// # Safety
-    /// Caller must ensure that the pfn is no longer referenced by any other code.
-    pub unsafe fn take_pfn(pfn: usize, order: u32) -> Self {
-        let page_ptr: PagePtr = PFN::from(pfn >> 12).into();
-
-        // Only buddy pages can be used here.
-        // Also, check if the order is correct.
-        assert!(page_ptr.as_ref().is_buddy() && page_ptr.is_valid(order));
-
-        Self { page_ptr, order }
-    }
-
-    /// Get `Page` from `pfn` and increase the reference count.
-    ///
-    /// # Safety
-    /// Caller must ensure that `pfn` refers to a valid physical frame number with `refcount` > 0.
-    pub unsafe fn from_pfn(pfn: usize, order: u32) -> Self {
-        // SAFETY: `pfn` is a valid physical frame number with refcount > 0.
-        Self::increase_refcount(pfn);
-
-        // SAFETY: `pfn` has an increased refcount.
-        unsafe { Self::take_pfn(pfn, order) }
-    }
-
-    /// Consumes the `Page` and returns the physical frame number without dropping the reference
-    /// count the page holds.
-    pub fn into_pfn(self) -> usize {
-        let pfn: PFN = self.page_ptr.into();
-        core::mem::forget(self);
-        usize::from(pfn) << 12
-    }
-
-    pub fn len(&self) -> usize {
-        1 << (self.order + 12)
-    }
-
-    pub fn as_phys(&self) -> usize {
-        let pfn: PFN = self.page_ptr.into();
-        usize::from(pfn) << 12
-    }
-
-    pub fn as_cached(&self) -> phys::CachedPP {
-        phys::CachedPP::new(self.as_phys())
-    }
-
-    pub fn as_nocache(&self) -> phys::NoCachePP {
-        phys::NoCachePP::new(self.as_phys())
-    }
-
-    pub fn as_slice<'r, 'lt>(&'r self) -> &'lt [u8] {
-        self.as_cached().as_slice(self.len())
-    }
-
-    pub fn as_mut_slice<'r, 'lt>(&'r self) -> &'lt mut [u8] {
-        self.as_cached().as_mut_slice(self.len())
-    }
-
-    pub fn zero(&self) {
-        self.as_mut_slice().fill(0);
-    }
-
-    /// # Safety
-    /// Caller must ensure that the page is properly freed.
-    pub unsafe fn increase_refcount(pfn: usize) {
-        let page_ptr: PagePtr = PFN::from(pfn >> 12).into();
-        page_ptr.increase_refcount();
-    }
+pub type PageUnmanaged = GenericPage<NoAlloc>;
+pub type Page = GenericPage<GlobalPageAlloc>;
 
-    pub unsafe fn load_refcount(&self) -> usize {
-        self.page_ptr.load_refcount() as usize
-    }
+/// A buffer that wraps a page and provides a `Buffer` interface.
+pub struct PageBuffer {
+    page: Page,
+    offset: usize,
 }
 
-impl Clone for Page {
-    fn clone(&self) -> Self {
-        unsafe { self.page_ptr.increase_refcount() };
-
-        Self {
-            page_ptr: self.page_ptr,
-            order: self.order,
-        }
-    }
+pub trait AllocZeroed {
+    fn zeroed() -> Self;
 }
 
-impl Drop for Page {
-    fn drop(&mut self) {
-        match unsafe { self.page_ptr.decrease_refcount() } {
-            0 => panic!("In-use page refcount is 0"),
-            1 => free_pages(self.page_ptr, self.order),
-            _ => {}
+impl<A: PageAlloc> AsMemoryBlock for GenericPage<A> {
+    fn as_memblk(&self) -> MemoryBlock {
+        unsafe {
+            // SAFETY: `self.start()` points to valid memory of length `self.len()`.
+            MemoryBlock::new(self.start().as_ptr::<()>().addr(), self.len())
         }
     }
 }
 
-impl PartialEq for Page {
-    fn eq(&self, other: &Self) -> bool {
-        // assert!(self.page_ptr != other.page_ptr || self.order == other.order);
-
-        self.page_ptr.as_ptr() == other.page_ptr.as_ptr()
-    }
-}
-
-unsafe impl Sync for Page {}
-unsafe impl Send for Page {}
-
-impl fmt::Debug for Page {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let pfn = self.as_phys();
-        write!(f, "Page({:#x}, order={})", pfn, self.order)
-    }
-}
-
-pub struct PageBuffer {
-    page: Page,
-    offset: usize,
-}
-
-#[allow(dead_code)]
 impl PageBuffer {
-    pub fn new(page: Page) -> Self {
-        Self { page, offset: 0 }
-    }
-
-    pub fn len(&self) -> usize {
-        self.offset
+    pub fn new() -> Self {
+        Self {
+            page: Page::alloc(),
+            offset: 0,
+        }
     }
 
-    pub fn remaining(&self) -> usize {
-        self.page.len() - self.offset
+    pub fn all(&self) -> &[u8] {
+        unsafe {
+            // SAFETY: The page is exclusively owned by us.
+            self.page.as_memblk().as_bytes()
+        }
     }
 
-    pub fn as_slice(&self) -> &[u8] {
-        self.page.as_slice()
+    pub fn data(&self) -> &[u8] {
+        &self.all()[..self.offset]
     }
 
-    fn available_as_slice(&self) -> &mut [u8] {
-        &mut self.page.as_mut_slice()[self.offset..]
+    pub fn available_mut(&mut self) -> &mut [u8] {
+        unsafe {
+            // SAFETY: The page is exclusively owned by us.
+            &mut self.page.as_memblk().as_bytes_mut()[self.offset..]
+        }
     }
 }
 
@@ -184,16 +61,17 @@ impl Buffer for PageBuffer {
     }
 
     fn wrote(&self) -> usize {
-        self.len()
+        self.offset
     }
 
     fn fill(&mut self, data: &[u8]) -> crate::KResult<crate::io::FillResult> {
-        if self.remaining() == 0 {
+        let available = self.available_mut();
+        if available.is_empty() {
             return Ok(FillResult::Full);
         }
 
-        let len = core::cmp::min(data.len(), self.remaining());
-        self.available_as_slice()[..len].copy_from_slice(&data[..len]);
+        let len = core::cmp::min(data.len(), available.len());
+        available[..len].copy_from_slice(&data[..len]);
         self.offset += len;
 
         if len < data.len() {
@@ -203,3 +81,14 @@ impl Buffer for PageBuffer {
         }
     }
 }
+
+impl AllocZeroed for Page {
+    fn zeroed() -> Self {
+        let page = Self::alloc();
+        unsafe {
+            // SAFETY: The page is exclusively owned by us.
+            page.as_memblk().as_bytes_mut().fill(0);
+        }
+        page
+    }
+}

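The reworked `PageBuffer` owns its page and exposes the written prefix through `data()`; a usage sketch, inside a function returning `KResult`:

    let mut buf = PageBuffer::new();
    // `fill` copies as much as fits and advances the internal offset.
    buf.fill(b"hello")?;
    assert_eq!(buf.data(), &b"hello"[..]);
    assert_eq!(buf.wrote(), 5);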
+ 0 - 80
src/kernel/mem/phys.rs

@@ -1,80 +0,0 @@
-use core::fmt;
-
-pub trait PhysPtr {
-    fn as_ptr<T>(&self) -> *mut T;
-
-    #[allow(dead_code)]
-    fn as_ref<'lifetime, T>(&self) -> &'lifetime T {
-        unsafe { &*(self.as_ptr()) }
-    }
-
-    fn as_mut<'lifetime, T>(&self) -> &'lifetime mut T {
-        unsafe { &mut *(self.as_ptr()) }
-    }
-
-    fn as_slice<'lifetime, T>(&self, len: usize) -> &'lifetime [T] {
-        unsafe { core::slice::from_raw_parts(self.as_ptr(), len) }
-    }
-
-    fn as_mut_slice<'lifetime, T>(&self, len: usize) -> &'lifetime mut [T] {
-        unsafe { core::slice::from_raw_parts_mut(self.as_ptr(), len) }
-    }
-}
-
-#[derive(Clone, Copy, PartialEq, Eq)]
-pub struct CachedPP {
-    addr: usize,
-}
-
-#[derive(Clone, Copy, PartialEq, Eq)]
-pub struct NoCachePP {
-    addr: usize,
-}
-
-impl CachedPP {
-    pub const fn new(addr: usize) -> Self {
-        Self { addr }
-    }
-
-    pub const fn offset(&self, offset: usize) -> Self {
-        Self {
-            addr: self.addr + offset,
-        }
-    }
-}
-
-impl PhysPtr for CachedPP {
-    fn as_ptr<T>(&self) -> *mut T {
-        (self.addr + 0xffffff0000000000) as *mut T
-    }
-}
-
-impl fmt::Debug for CachedPP {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "CachedPP({:#x})", self.addr)
-    }
-}
-
-impl NoCachePP {
-    pub fn new(addr: usize) -> Self {
-        Self { addr }
-    }
-
-    pub fn offset(&self, offset: isize) -> Self {
-        Self {
-            addr: self.addr + offset as usize,
-        }
-    }
-}
-
-impl PhysPtr for NoCachePP {
-    fn as_ptr<T>(&self) -> *mut T {
-        (self.addr + 0xffffff4000000000) as *mut T
-    }
-}
-
-impl fmt::Debug for NoCachePP {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "NoCachePP({:#x})", self.addr)
-    }
-}

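With `CachedPP`/`NoCachePP` gone, direct-map access goes through `AsMemoryBlock` instead; the caller's safety obligation is unchanged. A sketch of the migrated idiom, given some `page: Page`:

    // Old: page.as_cached().as_mut_slice()
    let bytes = unsafe {
        // SAFETY: We exclusively own the page, exactly as the old code required.
        page.as_memblk().as_bytes_mut()
    };
    bytes.fill(0);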
+ 7 - 8
src/kernel/smp.rs

@@ -1,20 +1,19 @@
 use super::cpu::init_localcpu;
 use crate::{
-    kernel::{
-        cpu::local_cpu,
-        mem::{paging::Page, phys::PhysPtr as _},
-        task::KernelStack,
-    },
+    kernel::{cpu::local_cpu, mem::paging::Page, task::KernelStack},
     println_debug,
 };
 use arch::define_smp_bootstrap;
+use eonix_mm::address::Addr as _;
 use eonix_runtime::scheduler::Scheduler;
 
 define_smp_bootstrap!(4, ap_entry, {
-    let page = Page::alloc_many(9);
-    let stack_bottom = page.as_cached().as_ptr::<()>() as usize + page.len();
+    let page = Page::alloc_order(9);
+    let stack_bottom = page.range().end();
     core::mem::forget(page);
-    stack_bottom
+
+    // The APs start in their init state with paging disabled, so the physical address is used.
+    stack_bottom.addr() as u64
 });
 
 unsafe extern "C" fn ap_entry() -> ! {

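For reference, the order-9 allocation gives each AP a 2 MiB boot stack whose bottom is the end of the page range; a sketch of the arithmetic:

    let page = Page::alloc_order(9);
    // 1 << 9 pages, 4 KiB each: 2 MiB of stack per AP.
    assert_eq!(page.len(), (1 << 9) * 4096);
    let stack_bottom = page.range().end();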
+ 14 - 18
src/kernel/syscall/mm.rs

@@ -1,15 +1,15 @@
-use bindings::{EINVAL, ENOMEM};
-
+use super::{define_syscall32, register_syscall, MapArgument, MapArgumentImpl};
 use crate::{
     kernel::{
         constants::{UserMmapFlags, UserMmapProtocol},
-        mem::{Mapping, Permission, VAddr},
+        mem::{Mapping, Permission},
         task::Thread,
     },
     prelude::*,
 };
-
-use super::{define_syscall32, register_syscall, MapArgument, MapArgumentImpl};
+use bindings::{EINVAL, ENOMEM};
+use eonix_mm::address::{Addr as _, AddrOps as _, VAddr};
+use eonix_runtime::task::Task;
 
 /// Check whether we are doing an implemented function.
 /// If `condition` is false, return `Err(err)`.
@@ -29,8 +29,8 @@ fn do_mmap_pgoff(
     fd: u32,
     pgoffset: usize,
 ) -> KResult<usize> {
-    let addr = VAddr(addr);
-    if addr.floor() != addr || len == 0 {
+    let addr = VAddr::from(addr);
+    if !addr.is_page_aligned() || len == 0 {
         return Err(EINVAL);
     }
 
@@ -45,7 +45,7 @@ fn do_mmap_pgoff(
 
     // PROT_NONE, we do unmapping.
     if prot.is_empty() {
-        mm_list.unmap(addr, len).map(|_| 0)?;
+        Task::block_on(mm_list.unmap(addr, len)).map(|_| 0)?;
         return Ok(0);
     }
     // Otherwise, do mmapping.
@@ -74,26 +74,22 @@ fn do_mmap_pgoff(
         )
     };
 
-    addr.map(|addr| addr.0)
+    addr.map(|addr| addr.addr())
 }
 
 fn do_munmap(addr: usize, len: usize) -> KResult<usize> {
-    let addr = VAddr(addr);
-    if addr.floor() != addr || len == 0 {
+    let addr = VAddr::from(addr);
+    if !addr.is_page_aligned() || len == 0 {
         return Err(EINVAL);
     }
 
     let len = (len + 0xfff) & !0xfff;
-    Thread::current()
-        .process
-        .mm_list
-        .unmap(addr, len)
-        .map(|_| 0)
+    Task::block_on(Thread::current().process.mm_list.unmap(addr, len)).map(|_| 0)
 }
 
 fn do_brk(addr: usize) -> KResult<usize> {
-    let vaddr = if addr == 0 { None } else { Some(VAddr(addr)) };
-    Ok(Thread::current().process.mm_list.set_break(vaddr).0)
+    let vaddr = if addr == 0 { None } else { Some(VAddr::from(addr)) };
+    Ok(Thread::current().process.mm_list.set_break(vaddr).addr())
 }
 
 impl MapArgument<'_, UserMmapProtocol> for MapArgumentImpl {

+ 11 - 8
src/kernel/syscall/procops.rs

@@ -5,7 +5,7 @@ use crate::io::Buffer;
 use crate::kernel::constants::{
     ENOSYS, PR_GET_NAME, PR_SET_NAME, RLIMIT_STACK, SIG_BLOCK, SIG_SETMASK, SIG_UNBLOCK,
 };
-use crate::kernel::mem::{Page, PageBuffer, VAddr};
+use crate::kernel::mem::PageBuffer;
 use crate::kernel::task::{
     KernelStack, ProcessBuilder, ProcessList, Signal, SignalAction, SignalMask, Thread,
     ThreadBuilder, ThreadRunnable, UserDescriptor, WaitObject, WaitType,
@@ -22,6 +22,7 @@ use alloc::ffi::CString;
 use arch::{ExtendedContext, InterruptContext};
 use bindings::{EINVAL, ENOENT, ENOTDIR, ERANGE, ESRCH};
 use bitflags::bitflags;
+use eonix_mm::address::{Addr as _, VAddr};
 use eonix_runtime::scheduler::Scheduler;
 use eonix_runtime::task::Task;
 use eonix_sync::AsProof as _;
@@ -39,11 +40,10 @@ fn do_umask(mask: u32) -> KResult<u32> {
 fn do_getcwd(buffer: *mut u8, bufsize: usize) -> KResult<usize> {
     let context = FsContext::get_current();
     let mut user_buffer = UserBuffer::new(buffer, bufsize)?;
+    let mut buffer = PageBuffer::new();
 
-    let page = Page::alloc_one();
-    let mut buffer = PageBuffer::new(page.clone());
     context.cwd.lock().get_path(&context, &mut buffer)?;
-    user_buffer.fill(page.as_slice())?.ok_or(ERANGE)?;
+    user_buffer.fill(buffer.data())?.ok_or(ERANGE)?;
 
     Ok(buffer.wrote())
 }
@@ -99,7 +99,10 @@ fn do_execve(exec: &[u8], argv: Vec<CString>, envp: Vec<CString>) -> KResult<(VA
     let elf = ParsedElf32::parse(dentry.clone())?;
     let result = elf.load(argv, envp);
     if let Ok((ip, sp, mm_list)) = result {
-        Thread::current().process.mm_list.replace(mm_list);
+        unsafe {
+            // SAFETY: We are in execve; all other threads have been terminated.
+            Thread::current().process.mm_list.replace(Some(mm_list));
+        }
         Thread::current().files.on_exec();
         Thread::current().signal_list.clear_non_ignore();
         Thread::current().set_name(dentry.name().clone());
@@ -149,8 +152,8 @@ fn sys_execve(int_stack: &mut InterruptContext, _: &mut ExtendedContext) -> usiz
 
         let (ip, sp) = do_execve(exec.as_cstr().to_bytes(), argv_vec, envp_vec)?;
 
-        int_stack.rip = ip.0 as u64;
-        int_stack.rsp = sp.0 as u64;
+        int_stack.rip = ip.addr() as u64;
+        int_stack.rsp = sp.addr() as u64;
         Ok(())
     })() {
         Ok(_) => 0,
@@ -569,7 +572,7 @@ fn sys_fork(int_stack: &mut InterruptContext, _: &mut ExtendedContext) -> usize
 
     let thread_builder = ThreadBuilder::new().fork_from(&current);
     let (new_thread, new_process) = ProcessBuilder::new()
-        .mm_list(current_process.mm_list.new_cloned())
+        .mm_list(Task::block_on(current_process.mm_list.new_cloned()))
         .parent(current_process)
         .pgroup(current_pgroup)
         .session(current_session)
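
Two patterns recur in this file: `PageBuffer::new()` now owns its backing page instead of borrowing one allocated at the call site, and address-space operations such as `unmap` and `new_cloned` have become async, so synchronous syscall paths bridge into them with `Task::block_on`. A toy sketch of such a bridge (the polling loop is invented for illustration; the real `Task::block_on` integrates with the kernel scheduler):

    use std::future::Future;
    use std::pin::pin;
    use std::task::{Context, Poll, Waker};

    // Toy block_on: poll the future to completion on the current thread.
    fn block_on<F: Future>(fut: F) -> F::Output {
        let mut fut = pin!(fut);
        let mut cx = Context::from_waker(Waker::noop());
        loop {
            if let Poll::Ready(out) = fut.as_mut().poll(&mut cx) {
                return out;
            }
        }
    }

    fn main() {
        // An async address-space operation reduced to a trivial future.
        assert_eq!(block_on(async { 42 }), 42);
    }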

+ 11 - 7
src/kernel/task/kernel_stack.rs

@@ -1,10 +1,11 @@
-use crate::kernel::mem::{paging::Page, phys::PhysPtr};
+use crate::kernel::mem::{paging::Page, PhysAccess as _};
+use core::{num::NonZero, ptr::NonNull};
 use eonix_runtime::executor::Stack;
 
 #[derive(Debug)]
 pub struct KernelStack {
     _pages: Page,
-    bottom: usize,
+    bottom: NonZero<usize>,
 }
 
 impl KernelStack {
@@ -13,8 +14,11 @@ impl KernelStack {
     const KERNEL_STACK_ORDER: u32 = 7;
 
     pub fn new() -> Self {
-        let pages = Page::alloc_many(Self::KERNEL_STACK_ORDER);
-        let bottom = pages.as_cached().offset(pages.len()).as_ptr::<u8>() as usize;
+        let pages = Page::alloc_order(Self::KERNEL_STACK_ORDER);
+        let bottom = unsafe {
+            // SAFETY: The address comes from a page we just allocated, so it is valid.
+            pages.range().end().as_ptr::<u8>().addr()
+        };
 
         Self {
             _pages: pages,
@@ -28,8 +32,8 @@ impl Stack for KernelStack {
         Self::new()
     }
 
-    fn get_bottom(&self) -> &() {
-        // SAFETY: We hold the ownership of a valid stack.
-        unsafe { &*(self.bottom as *const ()) }
+    fn get_bottom(&self) -> NonNull<()> {
+        // SAFETY: The stack is allocated and `bottom` is non-zero.
+        unsafe { NonNull::new_unchecked(self.bottom.get() as *mut _) }
     }
 }
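
Switching `bottom` from `usize` to `NonZero<usize>` and the return type from `&()` to `NonNull<()>` encodes the non-null invariant in the types and avoids manufacturing a reference out of a raw address. A self-contained illustration of the pattern:

    use core::{mem::size_of, num::NonZero, ptr::NonNull};

    struct StackHandle {
        bottom: NonZero<usize>,
    }

    impl StackHandle {
        fn get_bottom(&self) -> NonNull<()> {
            // SAFETY: `bottom` is NonZero, so the pointer can never be null.
            unsafe { NonNull::new_unchecked(self.bottom.get() as *mut ()) }
        }
    }

    fn main() {
        // The unused zero value gives Option<StackHandle> a free niche.
        assert_eq!(size_of::<Option<StackHandle>>(), size_of::<StackHandle>());

        let handle = StackHandle { bottom: NonZero::new(0x1000).unwrap() };
        assert_eq!(handle.get_bottom().addr().get(), 0x1000);
    }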

+ 3 - 9
src/kernel/task/process_list.rs

@@ -1,10 +1,9 @@
 use super::{Process, ProcessGroup, Session, Signal, Thread, WaitObject, WaitType};
-use crate::{prelude::*, rcu::rcu_sync};
+use crate::rcu::rcu_sync;
 use alloc::{
     collections::btree_map::BTreeMap,
     sync::{Arc, Weak},
 };
-use bindings::KERNEL_PML4;
 use eonix_runtime::task::Task;
 use eonix_sync::{AsProof as _, AsProofMut as _, RwLock};
 
@@ -142,17 +141,12 @@ impl ProcessList {
             }
         }
 
-        eonix_preempt::disable();
-
         // Release the MMList as well as the page table.
-        // Before we release the page table, we need to switch to the kernel page table.
-        arch::set_root_page_table(KERNEL_PML4 as usize);
         unsafe {
-            process.mm_list.release();
+            // SAFETY: The process is exiting, so nothing else can be using its address space.
+            process.mm_list.replace(None);
         }
 
-        eonix_preempt::enable();
-
         // Make children orphans (adopted by init)
         {
             let init = self.init_process();
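
The exit path no longer disables preemption and switches to `KERNEL_PML4` by hand; `mm_list.replace(None)` is expected to own that ordering. The invariant is that the CPU must be pointed away from a page table before the table is freed. A hedged sketch of the shape (names and fields assumed, not the real MMList):

    // Illustrative only: a container that never frees its active root
    // table without first switching away from it.
    struct AddressSpace {
        root_table: Option<Box<[u64]>>, // stands in for the PML4 pages
    }

    impl AddressSpace {
        fn replace(&mut self, new: Option<Box<[u64]>>) {
            // 1. Point the CPU at a table that will stay alive (the kernel
            //    table or the new one) *before* dropping the old pages.
            load_root_table(new.as_deref());
            // 2. Only now is it safe to free the previous table.
            self.root_table = new;
        }
    }

    fn load_root_table(_table: Option<&[u64]>) {
        // Arch-specific CR3 write elided in this sketch.
    }

    fn main() {
        let mut space = AddressSpace {
            root_table: Some(vec![0u64; 512].into_boxed_slice()),
        };
        space.replace(None); // exit path: fall back to the kernel table
        assert!(space.root_table.is_none());
    }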

+ 10 - 10
src/kernel/task/signal/signal_action.rs

@@ -3,7 +3,6 @@ use crate::{
     io::BufferFill as _,
     kernel::{
         constants::{EFAULT, EINVAL, ENOSYS},
-        mem::VAddr,
         user::UserBuffer,
     },
     SIGNAL_NOW,
@@ -11,6 +10,7 @@ use crate::{
 use alloc::collections::btree_map::BTreeMap;
 use arch::{ExtendedContext, InterruptContext};
 use core::num::NonZero;
+use eonix_mm::address::{Addr as _, AddrOps as _, VAddr};
 use posix_types::signal::{SigAction, TryFromSigAction};
 
 #[derive(Debug, Clone, Copy)]
@@ -93,9 +93,9 @@ impl SignalAction {
                 // TODO!!!: Determine the size of the return address
                 let sp = VAddr::from(int_stack.rsp as usize - 128 - CONTEXT_SIZE).floor_to(16)
                     - size_of::<u32>();
-                let restorer_address = usize::from(restorer) as u32;
+                let restorer_address = restorer.addr() as u32;
                 let mut stack =
-                    UserBuffer::new(usize::from(sp) as *mut u8, CONTEXT_SIZE + size_of::<u32>())?;
+                    UserBuffer::new(sp.addr() as *mut u8, CONTEXT_SIZE + size_of::<u32>())?;
 
                 stack.copy(&restorer_address)?.ok_or(EFAULT)?; // Restorer address
                 stack.copy(&u32::from(signal))?.ok_or(EFAULT)?; // `signum`
@@ -103,8 +103,8 @@ impl SignalAction {
                 stack.copy(ext_ctx)?.ok_or(EFAULT)?; // MMX registers
                 stack.copy(int_stack)?.ok_or(EFAULT)?; // Interrupt stack
 
-                int_stack.rip = usize::from(handler) as u64;
-                int_stack.rsp = usize::from(sp) as u64;
+                int_stack.rip = handler.addr() as u64;
+                int_stack.rsp = sp.addr() as u64;
                 Ok(())
             }
         }
@@ -138,7 +138,7 @@ impl TryFromSigAction for SignalAction {
 
     fn new() -> Self {
         Self::SimpleHandler {
-            handler: VAddr(0),
+            handler: VAddr::NULL,
             restorer: None,
             mask: SignalMask::empty(),
         }
@@ -150,7 +150,7 @@ impl TryFromSigAction for SignalAction {
 
     fn handler(mut self, handler_addr: usize) -> Result<Self, Self::Error> {
         if let Self::SimpleHandler { handler, .. } = &mut self {
-            *handler = VAddr(handler_addr);
+            *handler = VAddr::from(handler_addr);
             Ok(self)
         } else {
             unreachable!()
@@ -159,7 +159,7 @@ impl TryFromSigAction for SignalAction {
 
     fn restorer(mut self, restorer_addr: usize) -> Result<Self, Self::Error> {
         if let Self::SimpleHandler { restorer, .. } = &mut self {
-            *restorer = NonZero::new(restorer_addr).map(|x| VAddr(x.get()));
+            *restorer = NonZero::new(restorer_addr).map(|x| VAddr::from(x.get()));
             Ok(self)
         } else {
             unreachable!()
@@ -187,11 +187,11 @@ impl From<SignalAction> for SigAction {
                 mask,
             } => {
                 let action = SigAction::new()
-                    .handler(usize::from(handler))
+                    .handler(handler.addr())
                     .mask(u64::from(mask));
 
                 if let Some(restorer) = restorer {
-                    action.restorer(usize::from(restorer))
+                    action.restorer(restorer.addr())
                 } else {
                     action
                 }
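
The signal-frame address computation drops below the 128-byte red zone, aligns down to 16 with `floor_to`, then backs off `size_of::<u32>()` so the stack looks as it would right after a 32-bit `call`: the restorer (return) address on top and the rest of the frame 16-byte aligned. A worked sketch of the arithmetic (`CONTEXT_SIZE` here is a placeholder value):

    use std::mem::size_of;

    // Align an address down to `align` (a power of two).
    fn floor_to(addr: usize, align: usize) -> usize {
        debug_assert!(align.is_power_of_two());
        addr & !(align - 1)
    }

    fn main() {
        const RED_ZONE: usize = 128;
        const CONTEXT_SIZE: usize = 0x200; // placeholder size of the saved context

        let rsp: usize = 0x7fff_1234;
        let sp = floor_to(rsp - RED_ZONE - CONTEXT_SIZE, 16) - size_of::<u32>();

        // 16-byte aligned just past the 4-byte restorer address, mirroring
        // the stack right after a 32-bit `call`.
        assert_eq!((sp + size_of::<u32>()) % 16, 0);
    }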

+ 5 - 8
src/kernel/task/thread.rs

@@ -5,7 +5,6 @@ use super::{
 use crate::{
     kernel::{
         cpu::local_cpu,
-        mem::VAddr,
         user::dataflow::CheckedUserPointer,
         vfs::{filearray::FileArray, FsContext},
     },
@@ -13,7 +12,6 @@ use crate::{
 };
 use alloc::sync::Arc;
 use arch::{InterruptContext, UserTLS, _arch_fork_return};
-use bindings::KERNEL_PML4;
 use core::{
     arch::asm,
     pin::Pin,
@@ -21,6 +19,7 @@ use core::{
     sync::atomic::{AtomicUsize, Ordering},
     task::Waker,
 };
+use eonix_mm::address::{Addr as _, VAddr};
 use eonix_runtime::{
     context::ExecutionContext,
     run::{Contexted, Run, RunState},
@@ -298,11 +297,9 @@ impl Thread {
 
 impl ThreadRunnable {
     pub fn new(thread: Arc<Thread>, entry: VAddr, stack_pointer: VAddr) -> Self {
-        let (VAddr(entry), VAddr(stack_pointer)) = (entry, stack_pointer);
-
         let mut interrupt_context = InterruptContext::default();
-        interrupt_context.set_return_address(entry as _, true);
-        interrupt_context.set_stack_pointer(stack_pointer as _, true);
+        interrupt_context.set_return_address(entry.addr() as _, true);
+        interrupt_context.set_stack_pointer(stack_pointer.addr() as _, true);
         interrupt_context.set_interrupt_enabled(true);
 
         Self {
@@ -347,7 +344,7 @@ impl Contexted for ThreadRunnable {
             CURRENT_THREAD.swap(Some(current_thread));
         }
 
-        thread.process.mm_list.switch_page_table();
+        thread.process.mm_list.activate();
 
         unsafe {
             // SAFETY: Preemption is disabled.
@@ -356,7 +353,7 @@ impl Contexted for ThreadRunnable {
     }
 
     fn restore_running_context(&self) {
-        arch::set_root_page_table(KERNEL_PML4 as usize);
+        self.thread.process.mm_list.deactivate();
     }
 }
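
With `activate()`/`deactivate()`, the context-switch code never names the kernel root table directly; the address space brackets its own time on the CPU. A minimal sketch of the pairing (the trait and call sites are assumptions for illustration):

    // Sketch: the scheduler brackets a task's run with address-space
    // hooks, so no call site performs raw root-table writes.
    trait AddressSpace {
        fn activate(&self);   // install this task's root page table
        fn deactivate(&self); // fall back to the shared kernel table
    }

    struct UserMm(&'static str);

    impl AddressSpace for UserMm {
        fn activate(&self) {
            println!("activate {}", self.0);
        }
        fn deactivate(&self) {
            println!("deactivate {}", self.0);
        }
    }

    fn run_once(mm: &dyn AddressSpace, body: impl FnOnce()) {
        mm.activate();
        body();
        mm.deactivate();
    }

    fn main() {
        run_once(&UserMm("init"), || println!("user code runs here"));
    }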
 

+ 15 - 22
src/kernel/vfs/file.rs

@@ -4,10 +4,10 @@ use super::{
     s_isblk, s_isdir, s_isreg,
 };
 use crate::{
-    io::{Buffer, BufferFill, ByteBuffer},
+    io::{Buffer, BufferFill, ByteBuffer, Chunks},
     kernel::{
         constants::{TCGETS, TCSETS, TIOCGPGRP, TIOCGWINSZ, TIOCSPGRP},
-        mem::paging::Page,
+        mem::{paging::Page, AsMemoryBlock as _},
         task::{Signal, Thread},
         terminal::{Terminal, TerminalIORequest},
         user::{UserPointer, UserPointerMut},
@@ -498,38 +498,31 @@ impl File {
     }
 
     pub async fn sendfile(&self, dest_file: &Self, count: usize) -> KResult<usize> {
-        let buffer_page = Page::alloc_one();
+        let buffer_page = Page::alloc();
+        // SAFETY: We are the only owner of the page.
+        let buffer = unsafe { buffer_page.as_memblk().as_bytes_mut() };
 
         match self {
             File::Inode(file) if s_isblk(file.mode) || s_isreg(file.mode) => (),
             _ => return Err(EINVAL),
         }
 
-        // TODO!!!: zero copy implementation with mmap
-        let mut tot = 0usize;
-        while tot < count {
+        for (cur, len) in Chunks::new(0, count, buffer.len()) {
             if Thread::current().signal_list.has_pending_signal() {
-                if tot == 0 {
-                    return Err(EINTR);
-                } else {
-                    return Ok(tot);
-                }
+                return if cur == 0 { Err(EINTR) } else { Ok(cur) };
             }
-
-            let batch_size = usize::min(count - tot, buffer_page.len());
-            let slice = &mut buffer_page.as_mut_slice()[..batch_size];
-            let mut buffer = ByteBuffer::new(slice);
-
-            let nwrote = self.read(&mut buffer).await?;
-
-            if nwrote == 0 {
+            let nread = self.read(&mut ByteBuffer::new(&mut buffer[..len])).await?;
+            if nread == 0 {
-                break;
+                return Ok(cur);
             }
 
-            tot += dest_file.write(&slice[..nwrote]).await?;
+            let nwrote = dest_file.write(&buffer[..nread]).await?;
+            if nwrote != nread {
+                return Ok(cur + nwrote);
+            }
         }
 
-        Ok(tot)
+        Ok(count)
     }
 
     pub fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
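
The rewritten `sendfile` loop leans on a `Chunks` iterator that, judging by the call site, yields `(offset, len)` pairs covering `count` bytes in buffer-sized steps, so `cur` is always the number of bytes fully transferred before the current chunk. A stand-in with those assumed semantics (the real type lives in src/io.rs):

    // Stand-in for the kernel's Chunks iterator; semantics assumed from
    // the call site: yield (offset, len) pairs covering `total` bytes
    // from `start` in steps of at most `chunk`.
    struct Chunks {
        cur: usize,
        end: usize,
        chunk: usize,
    }

    impl Chunks {
        fn new(start: usize, total: usize, chunk: usize) -> Self {
            Self { cur: start, end: start + total, chunk }
        }
    }

    impl Iterator for Chunks {
        type Item = (usize, usize);

        fn next(&mut self) -> Option<(usize, usize)> {
            if self.cur >= self.end {
                return None;
            }
            let len = usize::min(self.chunk, self.end - self.cur);
            let item = (self.cur, len);
            self.cur += len;
            Some(item)
        }
    }

    fn main() {
        let parts: Vec<_> = Chunks::new(0, 10000, 4096).collect();
        assert_eq!(parts, [(0, 4096), (4096, 4096), (8192, 1808)]);
    }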

+ 6 - 4
src/lib.rs

@@ -28,6 +28,7 @@ mod sync;
 use alloc::{ffi::CString, sync::Arc};
 use core::alloc::{GlobalAlloc, Layout};
 use elf::ParsedElf32;
+use eonix_mm::{address::PAddr, paging::PFN};
 use eonix_runtime::{run::FutureRun, scheduler::Scheduler, task::Task};
 use kernel::{
     cpu::init_localcpu,
@@ -95,7 +96,7 @@ extern "C" {
 }
 
 #[no_mangle]
-pub extern "C" fn rust_kinit(early_kstack_pfn: usize) -> ! {
+pub extern "C" fn rust_kinit(early_kstack_paddr: PAddr) -> ! {
     // We don't call global constructors.
     // Rust doesn't need that, and we're not going to use global variables in C++.
     init_localcpu();
@@ -114,7 +115,8 @@ pub extern "C" fn rust_kinit(early_kstack_pfn: usize) -> ! {
     // So call `init_vfs` first, then `init_multitasking`.
     Scheduler::init_local_scheduler::<KernelStack>();
 
-    Scheduler::get().spawn::<KernelStack, _>(FutureRun::new(init_process(early_kstack_pfn)));
+    Scheduler::get()
+        .spawn::<KernelStack, _>(FutureRun::new(init_process(PFN::from(early_kstack_paddr))));
 
     unsafe {
         // SAFETY: `preempt::count()` == 1.
@@ -122,8 +124,8 @@ pub extern "C" fn rust_kinit(early_kstack_pfn: usize) -> ! {
     }
 }
 
-async fn init_process(early_kstack_pfn: usize) {
-    unsafe { Page::take_pfn(early_kstack_pfn, 9) };
+async fn init_process(early_kstack_pfn: PFN) {
+    unsafe { Page::from_raw(early_kstack_pfn) };
 
     kernel::syscall::register_syscalls();
     CharDevice::init().unwrap();
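
`rust_kinit` now receives a typed `PAddr`, and the `PFN` handed to `init_process` identifies the early boot stack pages so they can be reclaimed (taken back as a `Page` and dropped) once init runs on a proper kernel stack. The `PFN::from(PAddr)` conversion is presumably a shift by the page-size log; a sketch under that assumption, with stand-in newtypes:

    // Assumed 4 KiB pages: a PFN is the physical address with the low
    // 12 offset bits stripped. Newtypes stand in for the eonix_mm ones.
    #[derive(Clone, Copy, Debug, PartialEq)]
    struct PAddr(usize);

    #[derive(Clone, Copy, Debug, PartialEq)]
    struct PFN(usize);

    const PAGE_SHIFT: u32 = 12;

    impl From<PAddr> for PFN {
        fn from(paddr: PAddr) -> Self {
            PFN(paddr.0 >> PAGE_SHIFT)
        }
    }

    fn main() {
        assert_eq!(PFN::from(PAddr(0x0030_5000)), PFN(0x305));
    }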

+ 1 - 0
src/sync.rs

@@ -1,5 +1,6 @@
 mod arcswap;
 mod condvar;
+pub mod fence;
 
 pub use arcswap::ArcSwap;
 pub use eonix_sync::Spin;

+ 34 - 0
src/sync/fence.rs

@@ -0,0 +1,34 @@
+use core::sync::atomic::{compiler_fence, Ordering};
+
+/// A full memory barrier that prevents reordering of all memory operations.
+pub fn memory_barrier() {
+    // Keep the compiler from reordering memory accesses across the barrier.
+    compiler_fence(Ordering::SeqCst);
+
+    arch::memory_barrier();
+
+    // Keep the compiler from reordering memory accesses across the barrier.
+    compiler_fence(Ordering::SeqCst);
+}
+
+/// A read memory barrier that prevents reordering of read operations.
+pub fn read_memory_barrier() {
+    // Keep the compiler from reordering memory accesses across the barrier.
+    compiler_fence(Ordering::SeqCst);
+
+    arch::read_memory_barrier();
+
+    // Keep the compiler from reordering memory accesses across the barrier.
+    compiler_fence(Ordering::SeqCst);
+}
+
+/// A write memory barrier that prevents reordering of write operations.
+pub fn write_memory_barrier() {
+    // Keep the compiler from reordering memory accesses across the barrier.
+    compiler_fence(Ordering::SeqCst);
+
+    arch::write_memory_barrier();
+
+    // Keep the compiler from reordering memory accesses across the barrier.
+    compiler_fence(Ordering::SeqCst);
+}
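
The sandwich of `compiler_fence(Ordering::SeqCst)` around each `arch::*` barrier is deliberate: the hardware instruction orders what the CPU does, while the compiler fences keep rustc/LLVM from migrating memory accesses across the call in either direction. A hypothetical driver-side use of `write_memory_barrier` (the device, descriptor layout, and doorbell pointer are made up):

    use crate::sync::fence::write_memory_barrier;

    // Hypothetical MMIO-style caller: the descriptor must be fully
    // written before the device is notified via the doorbell register.
    fn submit(descriptor: &mut [u32; 4], doorbell: *mut u32) {
        descriptor[0] = 0xdead_beef; // request payload (made-up layout)
        descriptor[3] = 1;           // mark the descriptor valid

        // All writes above must reach memory before the doorbell write.
        write_memory_barrier();

        unsafe {
            // SAFETY: the caller guarantees `doorbell` maps a device register.
            doorbell.write_volatile(1);
        }
    }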