
Merge pull request #46 from greatbridf/task-rework

Tasking subsystem rework and ext4 crate replacement.

Suggested-By: Heinz <aurelianob808@gmail.com>
Signed-off-by: greatbridf <greatbridf@icloud.com>
greatbridf committed 6 days ago
Commit
affe0c764b
90 files changed, 4070 additions and 2713 deletions
  1. 1 0
      .cargo/config.toml
  2. 46 16
      Cargo.lock
  3. 14 6
      Cargo.toml
  4. 9 1
      Makefile.src
  5. 5 5
      configure
  6. 1 0
      crates/eonix_hal/eonix_hal_traits/src/trap.rs
  7. 2 2
      crates/eonix_hal/src/arch/loongarch64/link.x
  8. 0 2
      crates/eonix_hal/src/arch/loongarch64/memory.x
  9. 2 0
      crates/eonix_hal/src/arch/loongarch64/mm.rs
  10. 11 2
      crates/eonix_hal/src/arch/loongarch64/trap/mod.rs
  11. 13 2
      crates/eonix_hal/src/arch/loongarch64/trap/trap_context.rs
  12. 0 8
      crates/eonix_hal/src/arch/riscv64/bootstrap.rs
  13. 11 8
      crates/eonix_hal/src/arch/riscv64/cpu.rs
  14. 3 2
      crates/eonix_hal/src/arch/riscv64/link.x
  15. 0 2
      crates/eonix_hal/src/arch/riscv64/memory.x
  16. 2 0
      crates/eonix_hal/src/arch/riscv64/mm.rs
  17. 177 0
      crates/eonix_hal/src/arch/riscv64/trap/captured.rs
  18. 134 0
      crates/eonix_hal/src/arch/riscv64/trap/default.rs
  19. 24 265
      crates/eonix_hal/src/arch/riscv64/trap/mod.rs
  20. 57 51
      crates/eonix_hal/src/arch/riscv64/trap/trap_context.rs
  21. 14 9
      crates/eonix_hal/src/link.x.in
  22. 1 1
      crates/eonix_mm/src/page_table/page_table.rs
  23. 61 92
      crates/eonix_runtime/src/executor.rs
  24. 1 5
      crates/eonix_runtime/src/executor/builder.rs
  25. 0 4
      crates/eonix_runtime/src/executor/execute_status.rs
  26. 0 1
      crates/eonix_runtime/src/lib.rs
  27. 0 34
      crates/eonix_runtime/src/run.rs
  28. 0 34
      crates/eonix_runtime/src/run/future_run.rs
  29. 132 182
      crates/eonix_runtime/src/scheduler.rs
  30. 48 127
      crates/eonix_runtime/src/task.rs
  31. 2 1
      crates/eonix_runtime/src/task/adapter.rs
  32. 11 17
      crates/eonix_runtime/src/task/task_state.rs
  33. 3 0
      crates/eonix_sync/eonix_sync_base/src/locked/proof.rs
  34. 28 0
      crates/posix_types/src/getdent.rs
  35. 1 0
      crates/posix_types/src/lib.rs
  36. 10 0
      crates/posix_types/src/result.rs
  37. 40 21
      macros/src/lib.rs
  38. 2 2
      src/driver/ahci/mod.rs
  39. 3 3
      src/driver/ahci/port.rs
  40. 3 3
      src/driver/serial.rs
  41. 2 2
      src/driver/virtio/loongarch64.rs
  42. 3 9
      src/driver/virtio/riscv64.rs
  43. 17 1
      src/driver/virtio/virtio_blk.rs
  44. 456 70
      src/fs/ext4.rs
  45. 12 13
      src/fs/fat32.rs
  46. 11 11
      src/fs/procfs.rs
  47. 47 59
      src/fs/tmpfs.rs
  48. 4 2
      src/io.rs
  49. 97 15
      src/kernel/block.rs
  50. 7 11
      src/kernel/chardev.rs
  51. 5 12
      src/kernel/interrupt.rs
  52. 1 1
      src/kernel/mem.rs
  53. 5 3
      src/kernel/mem/mm_area.rs
  54. 16 14
      src/kernel/mem/mm_list.rs
  55. 11 8
      src/kernel/mem/mm_list/page_fault.rs
  56. 0 1
      src/kernel/mem/page_alloc/raw_page.rs
  57. 60 8
      src/kernel/mem/page_cache.rs
  58. 163 8
      src/kernel/syscall.rs
  59. 173 130
      src/kernel/syscall/file_rw.rs
  60. 64 44
      src/kernel/syscall/mm.rs
  61. 1 1
      src/kernel/syscall/net.rs
  62. 169 183
      src/kernel/syscall/procops.rs
  63. 9 8
      src/kernel/syscall/sysinfo.rs
  64. 209 1
      src/kernel/task.rs
  65. 13 15
      src/kernel/task/clone.rs
  66. 7 6
      src/kernel/task/futex.rs
  67. 103 86
      src/kernel/task/loader/elf.rs
  68. 2 2
      src/kernel/task/loader/mod.rs
  69. 45 40
      src/kernel/task/process.rs
  70. 8 11
      src/kernel/task/process_list.rs
  71. 2 2
      src/kernel/task/session.rs
  72. 8 11
      src/kernel/task/signal.rs
  73. 3 2
      src/kernel/task/signal/signal_action.rs
  74. 86 95
      src/kernel/task/thread.rs
  75. 16 17
      src/kernel/terminal.rs
  76. 2 6
      src/kernel/user.rs
  77. 70 45
      src/kernel/user/dataflow.rs
  78. 12 12
      src/kernel/vfs/dentry.rs
  79. 6 6
      src/kernel/vfs/dentry/dcache.rs
  80. 0 636
      src/kernel/vfs/file.rs
  81. 223 0
      src/kernel/vfs/file/inode_file.rs
  82. 232 0
      src/kernel/vfs/file/mod.rs
  83. 211 0
      src/kernel/vfs/file/pipe.rs
  84. 55 0
      src/kernel/vfs/file/terminal_file.rs
  85. 268 130
      src/kernel/vfs/filearray.rs
  86. 150 12
      src/kernel/vfs/inode.rs
  87. 4 23
      src/kernel/vfs/mod.rs
  88. 72 24
      src/lib.rs
  89. 29 0
      src/panic.rs
  90. 29 9
      src/rcu.rs

+ 1 - 0
.cargo/config.toml

@@ -1,6 +1,7 @@
 [build]
 target = "riscv64gc-unknown-none-elf"
 target-dir = 'build'
+rustflags = ["-C", "force-unwind-tables"]
 
 [unstable]
 build-std-features = ['compiler-builtins-mem']

+ 46 - 16
Cargo.lock

@@ -19,6 +19,15 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c1c330e503236d0b06386ae6cc42a513ef1ccc23c52b603c1b52f018564faf44"
 
+[[package]]
+name = "another_ext4"
+version = "0.1.0"
+source = "git+https://github.com/SMS-Derfflinger/another_ext4?branch=main#ed6d91718db721eb4a744483c289cc44a6f34bf4"
+dependencies = [
+ "bitflags",
+ "log",
+]
+
 [[package]]
 name = "atomic_unique_refcell"
 version = "0.1.0"
@@ -134,6 +143,7 @@ version = "0.1.0"
 dependencies = [
  "acpi",
  "align_ext",
+ "another_ext4",
  "atomic_unique_refcell",
  "bitflags",
  "buddy_allocator",
@@ -145,13 +155,14 @@ dependencies = [
  "eonix_preempt",
  "eonix_runtime",
  "eonix_sync",
- "ext4_rs",
- "intrusive-collections",
+ "intrusive-collections 0.9.8",
  "intrusive_list",
  "itertools",
  "pointers",
  "posix_types",
  "slab_allocator",
+ "stalloc",
+ "unwinding",
  "virtio-drivers",
  "xmas-elf",
 ]
@@ -212,7 +223,7 @@ dependencies = [
  "eonix_percpu",
  "eonix_preempt",
  "eonix_sync",
- "intrusive-collections",
+ "intrusive-collections 0.9.7",
  "pointers",
 ]
 
@@ -245,17 +256,7 @@ dependencies = [
  "eonix_preempt",
  "eonix_spin",
  "eonix_sync_base",
- "intrusive-collections",
-]
-
-[[package]]
-name = "ext4_rs"
-version = "1.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a1a97344bde15b0ace15e265dab27228d4bdc37a0bfa8548c5645d7cfa6a144"
-dependencies = [
- "bitflags",
- "log",
+ "intrusive-collections 0.9.7",
 ]
 
 [[package]]
@@ -264,6 +265,12 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "784a4df722dc6267a04af36895398f59d21d07dce47232adf31ec0ff2fa45e67"
 
+[[package]]
+name = "gimli"
+version = "0.32.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93563d740bc9ef04104f9ed6f86f1e3275c2cdafb95664e26584b9ca807a8ffe"
+
 [[package]]
 name = "intrusive-collections"
 version = "0.9.7"
@@ -273,6 +280,14 @@ dependencies = [
  "memoffset",
 ]
 
+[[package]]
+name = "intrusive-collections"
+version = "0.9.8"
+source = "git+https://github.com/greatbridf/intrusive-rs#0e2d88bffc9df606566fba2d61d1217182b06975"
+dependencies = [
+ "memoffset",
+]
+
 [[package]]
 name = "intrusive_list"
 version = "0.1.0"
@@ -401,11 +416,17 @@ dependencies = [
  "intrusive_list",
 ]
 
+[[package]]
+name = "stalloc"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a37f0ead4094eeb54c6893316aa139e48b252f1c07511e5124fa1f9414df5b6c"
+
 [[package]]
 name = "syn"
-version = "2.0.103"
+version = "2.0.104"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e4307e30089d6fd6aff212f2da3a1f9e32f3223b1f010fb09b7c95f90f3ca1e8"
+checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -438,6 +459,15 @@ version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
 
+[[package]]
+name = "unwinding"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60612c845ef41699f39dc8c5391f252942c0a88b7d15da672eff0d14101bbd6d"
+dependencies = [
+ "gimli",
+]
+
 [[package]]
 name = "virtio-drivers"
 version = "0.11.0"

+ 14 - 6
Cargo.toml

@@ -25,21 +25,32 @@ posix_types = { path = "./crates/posix_types" }
 slab_allocator = { path = "./crates/slab_allocator" }
 
 bitflags = "2.6.0"
-intrusive-collections = "0.9.7"
+intrusive-collections = { version = "0.9.8", git = "https://github.com/greatbridf/intrusive-rs" }
 itertools = { version = "0.13.0", default-features = false }
 acpi = "5.2.0"
 align_ext = "0.1.0"
 xmas-elf = "0.10.0"
-ext4_rs = "1.3.2"
+another_ext4 = { git = "https://github.com/SMS-Derfflinger/another_ext4", branch = "main" }
+stalloc = { version = "0.6.1", default-features = false, features = [
+    "allocator-api",
+] }
 
 [target.'cfg(any(target_arch = "riscv64", target_arch = "loongarch64"))'.dependencies]
 virtio-drivers = { version = "0.11.0" }
 
+[target.'cfg(target_arch = "riscv64")'.dependencies]
+unwinding = { version = "0.2.8", default-features = false, features = [
+    "unwinder",
+    "fde-static",
+    "personality",
+    "panic",
+] }
+
 [features]
 default = []
 trace_pci = []
 trace_syscall = []
-trace_scheduler = []
+trace_scheduler = ["eonix_runtime/trace_scheduler"]
 log_trace = ["trace_pci", "trace_syscall", "trace_scheduler"]
 log_debug = []
 smp = []
@@ -47,9 +58,6 @@ smp = []
 [profile.release]
 debug = true
 
-[profile.dev]
-panic = "abort"
-
 [profile.dev.package.eonix_preempt]
 opt-level = "s"
 

+ 9 - 1
Makefile.src

@@ -22,7 +22,15 @@ KERNEL_CARGO_MANIFESTS += $(shell find src macros crates -name Cargo.toml -type
 KERNEL_DEPS := $(KERNEL_SOURCES) $(KERNEL_CARGO_MANIFESTS)
 
 QEMU_ARGS ?= -no-reboot
-CARGO_FLAGS := --profile $(PROFILE) --features $(FEATURES)$(if $(SMP),$(COMMA)smp,)
+CARGO_FLAGS := --profile $(PROFILE)
+
+ifneq ($(SMP),)
+CARGO_FLAGS += --features smp
+endif
+
+ifneq ($(FEATURES),)
+CARGO_FLAGS += --features $(FEATURES)
+endif
 
 ifeq ($(HOST),darwin)
 QEMU_ACCEL ?= -accel tcg

+ 5 - 5
configure

@@ -11,7 +11,7 @@ event() {
     printf "$1... "
 }
 
-ARCH=${ARCH:-x86_64}
+ARCH=${ARCH:-"$DEFAULT_ARCH"}
 
 # Define toolchain and QEMU/GDB settings for per architecture
 event "target architecture"
@@ -40,7 +40,7 @@ esac
 
 if [ "$QEMU" = "" ]; then
     event "checking default qemu"
-    QEMU="qemu-system-$DEFAULT_ARCH"
+    QEMU="qemu-system-$ARCH"
     if $QEMU --version > /dev/null 2>&1; then
         QEMU="qemu-system-\$(ARCH)"
         break
@@ -65,7 +65,7 @@ check_gdb_arch() {
     local item="$1"
     if $item --init-eval-command 'set arch' \
              --init-eval-command 'q' 2>&1 \
-             | grep "$DEFAULT_ARCH" >/dev/null 2>&1; then
+             | grep "$ARCH" >/dev/null 2>&1; then
         return 0
     else
         return 1
@@ -74,7 +74,7 @@ check_gdb_arch() {
 
 if [ "$GDB" = "" ]; then
     event "checking default gdb"
-    if check_gdb_arch "$DEFAULT_ARCH-elf-gdb"; then
+    if check_gdb_arch "$ARCH-elf-gdb"; then
         GDB="\$(ARCH)-elf-gdb"
         break
     fi
@@ -126,7 +126,7 @@ else
 fi
 
 cp Makefile.src "$OUT"
-sed -i '' -e "s|##DEFAULT_ARCH##|$DEFAULT_ARCH|" "$OUT" > /dev/null 2>&1
+sed -i '' -e "s|##DEFAULT_ARCH##|$ARCH|" "$OUT" > /dev/null 2>&1
 sed -i '' -e "s|##GDB##|$GDB|" "$OUT" > /dev/null 2>&1
 sed -i '' -e "s|##QEMU##|$QEMU|" "$OUT" > /dev/null 2>&1
 sed -i '' -e "s|##FDISK##|$FDISK|" "$OUT" > /dev/null 2>&1

+ 1 - 0
crates/eonix_hal/eonix_hal_traits/src/trap.rs

@@ -66,6 +66,7 @@ where
 {
     Syscall { no: usize, args: [usize; 6] },
     Fault(Fault),
+    Breakpoint,
     Irq { callback: FIrq },
     Timer { callback: FTimer },
 }

+ 2 - 2
crates/eonix_hal/src/arch/loongarch64/link.x

@@ -91,6 +91,6 @@ SECTIONS {
     } > VDSO AT> RAM
 
     VDSO_PADDR = LOADADDR(.vdso);
-    __kernel_end = ABSOLUTE(LOADADDR(.vdso) + SIZEOF(.vdso));
+    __kernel_end = __edata;
 }
-INSERT BEFORE .bss;
+INSERT BEFORE .data.after;

+ 0 - 2
crates/eonix_hal/src/arch/loongarch64/memory.x

@@ -12,12 +12,10 @@ REGION_ALIAS("REGION_TEXT", KIMAGE);
 REGION_ALIAS("REGION_RODATA", KIMAGE);
 REGION_ALIAS("REGION_DATA", KIMAGE);
 REGION_ALIAS("REGION_BSS", KBSS);
-REGION_ALIAS("REGION_EHFRAME", KIMAGE);
 
 REGION_ALIAS("LINK_REGION_TEXT", RAM);
 REGION_ALIAS("LINK_REGION_RODATA", RAM);
 REGION_ALIAS("LINK_REGION_DATA", RAM);
 REGION_ALIAS("LINK_REGION_BSS", RAM);
-REGION_ALIAS("LINK_REGION_EHFRAME", RAM);
 
 _stext = ORIGIN(REGION_TEXT) + LOADADDR(.text) - ORIGIN(RAM);

+ 2 - 0
crates/eonix_hal/src/arch/loongarch64/mm.rs

@@ -87,6 +87,8 @@ impl PagingMode for PagingMode48 {
 
 pub type ArchPagingMode = PagingMode48;
 
+unsafe impl Send for RawPageTable48<'_> {}
+
 impl<'a> RawPageTable<'a> for RawPageTable48<'a> {
     type Entry = PTE64;
 

+ 11 - 2
crates/eonix_hal/src/arch/loongarch64/trap/mod.rs

@@ -278,11 +278,18 @@ impl TrapReturn for TrapContext {
         to_ctx.set_interrupt_enabled(false);
 
         unsafe {
+            let mut old_trap_ctx: usize;
+            let mut old_task_ctx: usize;
+
             asm!(
+                "csrrd {old_trap_ctx}, {CSR_CAPTURED_TRAP_CONTEXT_ADDR}",
+                "csrrd {old_task_ctx}, {CSR_CAPTURER_TASK_CONTEXT_ADDR}",
                 "csrwr {captured_trap_context}, {CSR_CAPTURED_TRAP_CONTEXT_ADDR}",
                 "csrwr {capturer_task_context}, {CSR_CAPTURER_TASK_CONTEXT_ADDR}",
                 captured_trap_context = inout(reg) &raw mut *self => _,
                 capturer_task_context = inout(reg) &raw mut capturer_ctx => _,
+                old_trap_ctx = out(reg) old_trap_ctx,
+                old_task_ctx = out(reg) old_task_ctx,
                 CSR_CAPTURED_TRAP_CONTEXT_ADDR = const CSR_CAPTURED_TRAP_CONTEXT_ADDR,
                 CSR_CAPTURER_TASK_CONTEXT_ADDR = const CSR_CAPTURER_TASK_CONTEXT_ADDR,
                 options(nomem, nostack, preserves_flags),
@@ -291,8 +298,10 @@ impl TrapReturn for TrapContext {
             TaskContext::switch(&mut capturer_ctx, &mut to_ctx);
 
             asm!(
-                "csrwr $zero, {CSR_CAPTURED_TRAP_CONTEXT_ADDR}",
-                "csrwr $zero, {CSR_CAPTURER_TASK_CONTEXT_ADDR}",
+                "csrwr {old_trap_ctx}, {CSR_CAPTURED_TRAP_CONTEXT_ADDR}",
+                "csrwr {old_task_ctx}, {CSR_CAPTURER_TASK_CONTEXT_ADDR}",
+                old_trap_ctx = inout(reg) old_trap_ctx,
+                old_task_ctx = inout(reg) old_task_ctx,
                 CSR_CAPTURED_TRAP_CONTEXT_ADDR = const CSR_CAPTURED_TRAP_CONTEXT_ADDR,
                 CSR_CAPTURER_TASK_CONTEXT_ADDR = const CSR_CAPTURER_TASK_CONTEXT_ADDR,
                 options(nomem, nostack, preserves_flags),

+ 13 - 2
crates/eonix_hal/src/arch/loongarch64/trap/trap_context.rs

@@ -1,4 +1,4 @@
-use crate::processor::CPU;
+use crate::{arch::trap::CSR_KERNEL_TP, processor::CPU};
 use core::{arch::asm, mem::offset_of};
 use eonix_hal_traits::{
     fault::{Fault, PageFaultErrorCode},
@@ -173,6 +173,7 @@ impl RawTrapContext for TrapContext {
                 | Exception::MemoryAccessAddressError
                 | Exception::PagePrivilegeIllegal,
             ) => TrapType::Fault(Fault::BadAccess),
+            Trap::Exception(Exception::Breakpoint) => TrapType::Breakpoint,
             Trap::Exception(Exception::InstructionNotExist) => TrapType::Fault(Fault::InvalidOp),
             Trap::Exception(Exception::Syscall) => TrapType::Syscall {
                 no: self.syscall_no(),
@@ -226,7 +227,17 @@ impl RawTrapContext for TrapContext {
     fn set_user_mode(&mut self, user: bool) {
         match user {
             true => self.prmd |= 0x3,
-            false => self.prmd &= !0x3,
+            false => {
+                unsafe {
+                    asm!(
+                        "csrrd {tp}, {CSR_KERNEL_TP}",
+                        tp = out(reg) self.regs.tp,
+                        CSR_KERNEL_TP = const CSR_KERNEL_TP,
+                        options(nomem, nostack, preserves_flags),
+                    )
+                }
+                self.prmd &= !0x3;
+            }
         }
     }
 

+ 0 - 8
crates/eonix_hal/src/arch/riscv64/bootstrap.rs

@@ -3,7 +3,6 @@ use super::{
     console::write_str,
     cpu::{CPUID, CPU_COUNT},
     time::set_next_timer,
-    trap::TRAP_SCRATCH,
 };
 use crate::{
     arch::{
@@ -234,13 +233,6 @@ fn setup_cpu(alloc: impl PageAlloc, hart_id: usize) {
     }
 
     percpu_area.register(cpu.cpuid());
-
-    unsafe {
-        // SAFETY: Interrupts are disabled.
-        TRAP_SCRATCH
-            .as_mut()
-            .set_kernel_tp(PercpuArea::get_for(cpu.cpuid()).unwrap().cast());
-    }
 }
 
 fn get_ap_start_addr() -> usize {

+ 11 - 8
crates/eonix_hal/src/arch/riscv64/cpu.rs

@@ -1,9 +1,13 @@
 use super::{
     interrupt::InterruptControl,
-    trap::{setup_trap, TRAP_SCRATCH},
+    trap::{setup_trap, TrapContext},
 };
 use crate::arch::fdt::{FdtExt, FDT};
-use core::{arch::asm, pin::Pin, ptr::NonNull, sync::atomic::AtomicUsize};
+use core::{
+    arch::asm, cell::UnsafeCell, mem::MaybeUninit, pin::Pin, ptr::NonNull,
+    sync::atomic::AtomicUsize,
+};
+use eonix_hal_traits::trap::RawTrapContext;
 use eonix_preempt::PreemptGuard;
 use eonix_sync_base::LazyLock;
 use riscv::register::{
@@ -17,6 +21,9 @@ pub static CPU_COUNT: AtomicUsize = AtomicUsize::new(0);
 #[eonix_percpu::define_percpu]
 pub static CPUID: usize = 0;
 
+#[eonix_percpu::define_percpu]
+static DEFAULT_TRAP_CONTEXT: MaybeUninit<TrapContext> = MaybeUninit::uninit();
+
 #[eonix_percpu::define_percpu]
 static LOCAL_CPU: LazyLock<CPU> = LazyLock::new(|| CPU::new(CPUID.get()));
 
@@ -56,14 +63,10 @@ impl CPU {
         interrupt.init();
 
         sstatus::set_sum();
-        sscratch::write(TRAP_SCRATCH.as_ptr() as usize);
+        sscratch::write(DEFAULT_TRAP_CONTEXT.as_ptr() as usize);
     }
 
-    pub unsafe fn load_interrupt_stack(self: Pin<&mut Self>, sp: u64) {
-        TRAP_SCRATCH
-            .as_mut()
-            .set_trap_context(NonNull::new(sp as *mut _).unwrap());
-    }
+    pub unsafe fn load_interrupt_stack(self: Pin<&mut Self>, sp: u64) {}
 
     pub fn set_tls32(self: Pin<&mut Self>, _user_tls: &UserTLS) {
         // nothing

+ 3 - 2
crates/eonix_hal/src/arch/riscv64/link.x

@@ -43,7 +43,6 @@ SECTIONS {
 
     KIMAGE_PAGES = (__edata - _stext + 0x1000 - 1) / 0x1000;
     KIMAGE_32K_COUNT = (KIMAGE_PAGES + 8 - 1) / 8;
-    __kernel_end = .;
 
     BSS_LENGTH = ABSOLUTE(__ebss - __sbss);
 }
@@ -89,4 +88,6 @@ SECTIONS {
 
     VDSO_PADDR = LOADADDR(.vdso);
 }
-INSERT AFTER .data;
+INSERT BEFORE .data.after;
+
+__kernel_end = __edata;

+ 0 - 2
crates/eonix_hal/src/arch/riscv64/memory.x

@@ -12,12 +12,10 @@ REGION_ALIAS("REGION_TEXT", KIMAGE);
 REGION_ALIAS("REGION_RODATA", KIMAGE);
 REGION_ALIAS("REGION_DATA", KIMAGE);
 REGION_ALIAS("REGION_BSS", KBSS);
-REGION_ALIAS("REGION_EHFRAME", KIMAGE);
 
 REGION_ALIAS("LINK_REGION_TEXT", RAM);
 REGION_ALIAS("LINK_REGION_RODATA", RAM);
 REGION_ALIAS("LINK_REGION_DATA", RAM);
 REGION_ALIAS("LINK_REGION_BSS", RAM);
-REGION_ALIAS("LINK_REGION_EHFRAME", RAM);
 
 _stext = ORIGIN(REGION_TEXT) + LOADADDR(.text) - ORIGIN(RAM);

+ 2 - 0
crates/eonix_hal/src/arch/riscv64/mm.rs

@@ -88,6 +88,8 @@ impl PagingMode for PagingModeSv48 {
 
 pub type ArchPagingMode = PagingModeSv48;
 
+unsafe impl Send for RawPageTableSv48<'_> {}
+
 impl<'a> RawPageTable<'a> for RawPageTableSv48<'a> {
     type Entry = PTE64;
 

+ 177 - 0
crates/eonix_hal/src/arch/riscv64/trap/captured.rs

@@ -0,0 +1,177 @@
+use crate::{arch::trap::Registers, context::TaskContext, trap::TrapContext};
+use core::{arch::naked_asm, mem::MaybeUninit};
+use eonix_hal_traits::context::RawTaskContext;
+
+static mut DIRTY_TASK_CONTEXT: MaybeUninit<TaskContext> = MaybeUninit::uninit();
+
+// If captured trap context is present, we use it directly.
+// We need to restore the kernel tp from that TrapContext but sp is
+// fine since we will use TaskContext::switch.
+#[unsafe(naked)]
+pub(super) unsafe extern "C" fn _captured_trap_entry() -> ! {
+    naked_asm!(
+        "csrrw t0, sscratch, t0",
+        "sd    tp, {tp}(t0)",
+        "ld    tp, {ra}(t0)", // Load kernel tp from trap_ctx.ra
+        "sd    ra, {ra}(t0)",
+        "ld    ra, {sp}(t0)", // Load capturer task context from trap_ctx.sp
+        "sd    sp, {sp}(t0)",
+        "sd    gp, {gp}(t0)",
+        "sd    a0, {a0}(t0)",
+        "sd    a1, {a1}(t0)",
+        "sd    a2, {a2}(t0)",
+        "sd    a3, {a3}(t0)",
+        "sd    a4, {a4}(t0)",
+        "sd    t1, {t1}(t0)",
+        "sd    a5, {a5}(t0)",
+        "sd    a6, {a6}(t0)",
+        "sd    a7, {a7}(t0)",
+        "sd    t3, {t3}(t0)",
+        "sd    t4, {t4}(t0)",
+        "sd    t5, {t5}(t0)",
+        "sd    t2, {t2}(t0)",
+        "sd    t6, {t6}(t0)",
+        "sd    s0, {s0}(t0)",
+        "sd    s1, {s1}(t0)",
+        "sd    s2, {s2}(t0)",
+        "sd    s3, {s3}(t0)",
+        "sd    s4, {s4}(t0)",
+        "sd    s5, {s5}(t0)",
+        "sd    s6, {s6}(t0)",
+        "sd    s7, {s7}(t0)",
+        "sd    s8, {s8}(t0)",
+        "sd    s9, {s9}(t0)",
+        "sd    s10, {s10}(t0)",
+        "sd    s11, {s11}(t0)",
+        "csrr  t2, sstatus",
+        "csrr  t3, sepc",
+        "csrr  t4, scause",
+        "csrr  t5, stval",
+        "csrrw t6, sscratch, t0",
+        "sd    t6, {t0}(t0)",
+        "sd    t2, {sstatus}(t0)",
+        "sd    t3, {sepc}(t0)",
+        "sd    t4, {scause}(t0)",
+        "sd    t5, {stval}(t0)",
+        "la    a0, {dirty_task_context}",
+        "mv    a1, ra",
+        "j     {task_context_switch}",
+        ra = const Registers::OFFSET_RA,
+        sp = const Registers::OFFSET_SP,
+        gp = const Registers::OFFSET_GP,
+        tp = const Registers::OFFSET_TP,
+        t1 = const Registers::OFFSET_T1,
+        t2 = const Registers::OFFSET_T2,
+        t0 = const Registers::OFFSET_T0,
+        a0 = const Registers::OFFSET_A0,
+        a1 = const Registers::OFFSET_A1,
+        a2 = const Registers::OFFSET_A2,
+        a3 = const Registers::OFFSET_A3,
+        a4 = const Registers::OFFSET_A4,
+        a5 = const Registers::OFFSET_A5,
+        a6 = const Registers::OFFSET_A6,
+        a7 = const Registers::OFFSET_A7,
+        t3 = const Registers::OFFSET_T3,
+        t4 = const Registers::OFFSET_T4,
+        t5 = const Registers::OFFSET_T5,
+        t6 = const Registers::OFFSET_T6,
+        s0 = const Registers::OFFSET_S0,
+        s1 = const Registers::OFFSET_S1,
+        s2 = const Registers::OFFSET_S2,
+        s3 = const Registers::OFFSET_S3,
+        s4 = const Registers::OFFSET_S4,
+        s5 = const Registers::OFFSET_S5,
+        s6 = const Registers::OFFSET_S6,
+        s7 = const Registers::OFFSET_S7,
+        s8 = const Registers::OFFSET_S8,
+        s9 = const Registers::OFFSET_S9,
+        s10 = const Registers::OFFSET_S10,
+        s11 = const Registers::OFFSET_S11,
+        sstatus = const TrapContext::OFFSET_SSTATUS,
+        sepc = const TrapContext::OFFSET_SEPC,
+        scause = const TrapContext::OFFSET_SCAUSE,
+        stval = const TrapContext::OFFSET_STVAL,
+        dirty_task_context = sym DIRTY_TASK_CONTEXT,
+        task_context_switch = sym TaskContext::switch,
+    );
+}
+
+#[unsafe(naked)]
+pub(super) unsafe extern "C" fn _captured_trap_return(ctx: &mut TrapContext) -> ! {
+    naked_asm!(
+        "csrr   t0,  sscratch",
+        "ld     t1,  {sstatus}(t0)",
+        "ld     t2,  {sepc}(t0)",
+        "csrw   sstatus, t1",
+        "csrw   sepc, t2",
+        "mv     t4,  tp",
+        "mv     t5,  sp",
+        "ld     tp,  {tp}(t0)",
+        "ld     ra,  {ra}(t0)",
+        "ld     sp,  {sp}(t0)",
+        "sd     t4,  {ra}(t0)", // Store kernel tp to trap_ctx.ra
+        "sd     t5,  {sp}(t0)", // Store capturer task context to trap_ctx.sp
+        "ld     gp,  {gp}(t0)",
+        "ld     a0,  {a0}(t0)",
+        "ld     a1,  {a1}(t0)",
+        "ld     a2,  {a2}(t0)",
+        "ld     a3,  {a3}(t0)",
+        "ld     a4,  {a4}(t0)",
+        "ld     t1,  {t1}(t0)",
+        "ld     a5,  {a5}(t0)",
+        "ld     a6,  {a6}(t0)",
+        "ld     a7,  {a7}(t0)",
+        "ld     t3,  {t3}(t0)",
+        "ld     t4,  {t4}(t0)",
+        "ld     t5,  {t5}(t0)",
+        "ld     t2,  {t2}(t0)",
+        "ld     t6,  {t6}(t0)",
+        "ld     s0,  {s0}(t0)",
+        "ld     s1,  {s1}(t0)",
+        "ld     s2,  {s2}(t0)",
+        "ld     s3,  {s3}(t0)",
+        "ld     s4,  {s4}(t0)",
+        "ld     s5,  {s5}(t0)",
+        "ld     s6,  {s6}(t0)",
+        "ld     s7,  {s7}(t0)",
+        "ld     s8,  {s8}(t0)",
+        "ld     s9,  {s9}(t0)",
+        "ld     s10, {s10}(t0)",
+        "ld     s11, {s11}(t0)",
+        "ld     t0,  {t0}(t0)",
+        "sret",
+        ra = const Registers::OFFSET_RA,
+        sp = const Registers::OFFSET_SP,
+        gp = const Registers::OFFSET_GP,
+        tp = const Registers::OFFSET_TP,
+        t1 = const Registers::OFFSET_T1,
+        t2 = const Registers::OFFSET_T2,
+        t0 = const Registers::OFFSET_T0,
+        a0 = const Registers::OFFSET_A0,
+        a1 = const Registers::OFFSET_A1,
+        a2 = const Registers::OFFSET_A2,
+        a3 = const Registers::OFFSET_A3,
+        a4 = const Registers::OFFSET_A4,
+        a5 = const Registers::OFFSET_A5,
+        a6 = const Registers::OFFSET_A6,
+        a7 = const Registers::OFFSET_A7,
+        t3 = const Registers::OFFSET_T3,
+        t4 = const Registers::OFFSET_T4,
+        t5 = const Registers::OFFSET_T5,
+        t6 = const Registers::OFFSET_T6,
+        s0 = const Registers::OFFSET_S0,
+        s1 = const Registers::OFFSET_S1,
+        s2 = const Registers::OFFSET_S2,
+        s3 = const Registers::OFFSET_S3,
+        s4 = const Registers::OFFSET_S4,
+        s5 = const Registers::OFFSET_S5,
+        s6 = const Registers::OFFSET_S6,
+        s7 = const Registers::OFFSET_S7,
+        s8 = const Registers::OFFSET_S8,
+        s9 = const Registers::OFFSET_S9,
+        s10 = const Registers::OFFSET_S10,
+        s11 = const Registers::OFFSET_S11,
+        sstatus = const TrapContext::OFFSET_SSTATUS,
+        sepc = const TrapContext::OFFSET_SEPC,
+    );
+}

+ 134 - 0
crates/eonix_hal/src/arch/riscv64/trap/default.rs

@@ -0,0 +1,134 @@
+use super::Registers;
+use crate::trap::TrapContext;
+use core::arch::naked_asm;
+
+unsafe extern "C" {
+    fn _default_trap_handler(trap_context: &mut TrapContext);
+}
+
+#[unsafe(naked)]
+pub(super) unsafe extern "C" fn _default_trap_entry() -> ! {
+    naked_asm!(
+        "csrrw t0,      sscratch, t0",
+        "sd    tp,      {tp}(t0)",
+        "sd    ra,      {ra}(t0)",
+        "sd    sp,      {sp}(t0)",
+        "sd    gp,      {gp}(t0)",
+        "sd    a0,      {a0}(t0)",
+        "sd    a1,      {a1}(t0)",
+        "sd    a2,      {a2}(t0)",
+        "sd    a3,      {a3}(t0)",
+        "sd    a4,      {a4}(t0)",
+        "sd    t1,      {t1}(t0)",
+        "sd    a5,      {a5}(t0)",
+        "sd    a6,      {a6}(t0)",
+        "sd    a7,      {a7}(t0)",
+        "sd    t3,      {t3}(t0)",
+        "sd    t4,      {t4}(t0)",
+        "sd    t5,      {t5}(t0)",
+        "sd    t2,      {t2}(t0)",
+        "sd    t6,      {t6}(t0)",
+        "sd    s0,      {s0}(t0)",
+        "sd    s1,      {s1}(t0)",
+        "sd    s2,      {s2}(t0)",
+        "sd    s3,      {s3}(t0)",
+        "sd    s4,      {s4}(t0)",
+        "sd    s5,      {s5}(t0)",
+        "sd    s6,      {s6}(t0)",
+        "sd    s7,      {s7}(t0)",
+        "sd    s8,      {s8}(t0)",
+        "sd    s9,      {s9}(t0)",
+        "sd    s10,     {s10}(t0)",
+        "sd    s11,     {s11}(t0)",
+        "mv    a0,      t0",
+        "csrrw t0,      sscratch, t0",
+        "sd    t0,      {t0}(a0)",
+        "csrr  t0,      sepc",
+        "csrr  t1,      scause",
+        "csrr  t2,      sstatus",
+        "csrr  t3,      stval",
+        "sd    t0,      {sepc}(a0)",
+        "sd    t1,      {scause}(a0)",
+        "sd    t2,      {sstatus}(a0)",
+        "sd    t3,      {stval}(a0)",
+
+        "la    t0,      {default_trap_handler}",
+        "jalr  t0",
+
+        "csrr  t0,      sscratch",
+        "ld    t1,      {sepc}(t0)",
+        "ld    t2,      {sstatus}(t0)",
+        "ld    tp,      {tp}(t0)",
+        "ld    ra,      {ra}(t0)",
+        "ld    sp,      {sp}(t0)",
+        "ld    gp,      {gp}(t0)",
+        "ld    a0,      {a0}(t0)",
+        "ld    a1,      {a1}(t0)",
+        "ld    a2,      {a2}(t0)",
+        "ld    a3,      {a3}(t0)",
+        "ld    a4,      {a4}(t0)",
+
+        "csrw  sepc,    t1",
+        "csrw  sstatus, t2",
+
+        "ld    t1,      {t1}(t0)",
+        "ld    a5,      {a5}(t0)",
+        "ld    a6,      {a6}(t0)",
+        "ld    a7,      {a7}(t0)",
+        "ld    t3,      {t3}(t0)",
+        "ld    t4,      {t4}(t0)",
+        "ld    t5,      {t5}(t0)",
+        "ld    t2,      {t2}(t0)",
+        "ld    t6,      {t6}(t0)",
+        "ld    s0,      {s0}(t0)",
+        "ld    s1,      {s1}(t0)",
+        "ld    s2,      {s2}(t0)",
+        "ld    s3,      {s3}(t0)",
+        "ld    s4,      {s4}(t0)",
+        "ld    s5,      {s5}(t0)",
+        "ld    s6,      {s6}(t0)",
+        "ld    s7,      {s7}(t0)",
+        "ld    s8,      {s8}(t0)",
+        "ld    s9,      {s9}(t0)",
+        "ld    s10,     {s10}(t0)",
+        "ld    s11,     {s11}(t0)",
+        "ld    t0,      {t0}(t0)",
+        "sret",
+        tp = const Registers::OFFSET_TP,
+        ra = const Registers::OFFSET_RA,
+        sp = const Registers::OFFSET_SP,
+        gp = const Registers::OFFSET_GP,
+        t0 = const Registers::OFFSET_T0,
+        t1 = const Registers::OFFSET_T1,
+        t2 = const Registers::OFFSET_T2,
+        t3 = const Registers::OFFSET_T3,
+        t4 = const Registers::OFFSET_T4,
+        t5 = const Registers::OFFSET_T5,
+        t6 = const Registers::OFFSET_T6,
+        a0 = const Registers::OFFSET_A0,
+        a1 = const Registers::OFFSET_A1,
+        a2 = const Registers::OFFSET_A2,
+        a3 = const Registers::OFFSET_A3,
+        a4 = const Registers::OFFSET_A4,
+        a5 = const Registers::OFFSET_A5,
+        a6 = const Registers::OFFSET_A6,
+        a7 = const Registers::OFFSET_A7,
+        s0 = const Registers::OFFSET_S0,
+        s1 = const Registers::OFFSET_S1,
+        s2 = const Registers::OFFSET_S2,
+        s3 = const Registers::OFFSET_S3,
+        s4 = const Registers::OFFSET_S4,
+        s5 = const Registers::OFFSET_S5,
+        s6 = const Registers::OFFSET_S6,
+        s7 = const Registers::OFFSET_S7,
+        s8 = const Registers::OFFSET_S8,
+        s9 = const Registers::OFFSET_S9,
+        s10 = const Registers::OFFSET_S10,
+        s11 = const Registers::OFFSET_S11,
+        sepc = const TrapContext::OFFSET_SEPC,
+        scause = const TrapContext::OFFSET_SCAUSE,
+        sstatus = const TrapContext::OFFSET_SSTATUS,
+        stval = const TrapContext::OFFSET_STVAL,
+        default_trap_handler = sym _default_trap_handler,
+    );
+}

+ 24 - 265
crates/eonix_hal/src/arch/riscv64/trap/mod.rs

@@ -1,18 +1,22 @@
+mod captured;
+mod default;
 mod trap_context;
 
 use super::config::platform::virt::*;
 use super::context::TaskContext;
+use captured::{_captured_trap_entry, _captured_trap_return};
 use core::arch::{global_asm, naked_asm};
 use core::mem::{offset_of, size_of};
 use core::num::NonZero;
 use core::ptr::NonNull;
+use default::_default_trap_entry;
 use eonix_hal_traits::{
     context::RawTaskContext,
     trap::{IrqState as IrqStateTrait, TrapReturn},
 };
 use riscv::register::sstatus::{self, Sstatus};
 use riscv::register::stvec::TrapMode;
-use riscv::register::{scause, sepc, stval};
+use riscv::register::{scause, sepc, sscratch, stval};
 use riscv::{
     asm::sfence_vma_all,
     register::stvec::{self, Stvec},
@@ -21,281 +25,36 @@ use sbi::SbiError;
 
 pub use trap_context::*;
 
-#[repr(C)]
-pub struct TrapScratch {
-    t1: u64,
-    t2: u64,
-    kernel_tp: Option<NonZero<u64>>,
-    trap_context: Option<NonNull<TrapContext>>,
-    handler: unsafe extern "C" fn(),
-    capturer_context: TaskContext,
-}
-
-#[eonix_percpu::define_percpu]
-pub(crate) static TRAP_SCRATCH: TrapScratch = TrapScratch {
-    t1: 0,
-    t2: 0,
-    kernel_tp: None,
-    trap_context: None,
-    handler: default_trap_handler,
-    capturer_context: TaskContext::new(),
-};
-
-static mut DIRTY_TASK_CONTEXT: TaskContext = TaskContext::new();
-
-#[unsafe(naked)]
-unsafe extern "C" fn _raw_trap_entry() -> ! {
-    naked_asm!(
-        "csrrw t0, sscratch, t0", // Swap t0 and sscratch
-        "sd    t1, 0(t0)",
-        "sd    t2, 8(t0)",
-        "csrr  t1, sstatus",
-        "andi  t1, t1, 0x100",
-        "beqz  t1, 2f",
-        // else SPP = 1, supervisor mode
-        "addi  t1, sp, -{trap_context_size}",
-        "mv    t2, tp",
-        "sd    ra, {ra}(t1)",
-        "sd    sp, {sp}(t1)",
-        "mv    sp, t1",
-        "j     4f",
-        // SPP = 0, user mode
-        "2:",
-        "ld    t1, 24(t0)", // Load captured TrapContext address
-        "mv    t2, tp",
-        "ld    tp, 16(t0)", // Restore kernel tp
-        // t0: &mut TrapScratch, t1: &mut TrapContext, t2: tp before trap
-        "3:",
-        "sd    ra, {ra}(t1)",
-        "sd    sp, {sp}(t1)",
-        "4:",
-        "sd    gp, {gp}(t1)",
-        "sd    t2, {tp}(t1)",
-        "ld    ra, 0(t0)",
-        "ld    t2, 8(t0)",
-        "sd    ra, {t1}(t1)",     // Save t1
-        "sd    t2, {t2}(t1)",     // Save t2
-        "ld    ra, 32(t0)",       // Load handler address
-        "csrrw t2, sscratch, t0", // Swap t0 and sscratch
-        "sd    t2, {t0}(t1)",
-        "sd    a0, {a0}(t1)",
-        "sd    a1, {a1}(t1)",
-        "sd    a2, {a2}(t1)",
-        "sd    a3, {a3}(t1)",
-        "sd    a4, {a4}(t1)",
-        "sd    a5, {a5}(t1)",
-        "sd    a6, {a6}(t1)",
-        "sd    a7, {a7}(t1)",
-        "sd    t3, {t3}(t1)",
-        "sd    t4, {t4}(t1)",
-        "sd    t5, {t5}(t1)",
-        "sd    t6, {t6}(t1)",
-        "sd    s0, {s0}(t1)",
-        "sd    s1, {s1}(t1)",
-        "sd    s2, {s2}(t1)",
-        "sd    s3, {s3}(t1)",
-        "sd    s4, {s4}(t1)",
-        "sd    s5, {s5}(t1)",
-        "sd    s6, {s6}(t1)",
-        "sd    s7, {s7}(t1)",
-        "sd    s8, {s8}(t1)",
-        "sd    s9, {s9}(t1)",
-        "sd    s10, {s10}(t1)",
-        "sd    s11, {s11}(t1)",
-        "csrr  t2, sstatus",
-        "csrr  t3, sepc",
-        "csrr  t4, scause",
-        "sd    t2, {sstatus}(t1)",
-        "sd    t3, {sepc}(t1)",
-        "sd    t4, {scause}(t1)",
-        "ret",
-        trap_context_size = const size_of::<TrapContext>(),
-        ra = const Registers::OFFSET_RA,
-        sp = const Registers::OFFSET_SP,
-        gp = const Registers::OFFSET_GP,
-        tp = const Registers::OFFSET_TP,
-        t1 = const Registers::OFFSET_T1,
-        t2 = const Registers::OFFSET_T2,
-        t0 = const Registers::OFFSET_T0,
-        a0 = const Registers::OFFSET_A0,
-        a1 = const Registers::OFFSET_A1,
-        a2 = const Registers::OFFSET_A2,
-        a3 = const Registers::OFFSET_A3,
-        a4 = const Registers::OFFSET_A4,
-        a5 = const Registers::OFFSET_A5,
-        a6 = const Registers::OFFSET_A6,
-        a7 = const Registers::OFFSET_A7,
-        t3 = const Registers::OFFSET_T3,
-        t4 = const Registers::OFFSET_T4,
-        t5 = const Registers::OFFSET_T5,
-        t6 = const Registers::OFFSET_T6,
-        s0 = const Registers::OFFSET_S0,
-        s1 = const Registers::OFFSET_S1,
-        s2 = const Registers::OFFSET_S2,
-        s3 = const Registers::OFFSET_S3,
-        s4 = const Registers::OFFSET_S4,
-        s5 = const Registers::OFFSET_S5,
-        s6 = const Registers::OFFSET_S6,
-        s7 = const Registers::OFFSET_S7,
-        s8 = const Registers::OFFSET_S8,
-        s9 = const Registers::OFFSET_S9,
-        s10 = const Registers::OFFSET_S10,
-        s11 = const Registers::OFFSET_S11,
-        sstatus = const TrapContext::OFFSET_SSTATUS,
-        sepc = const TrapContext::OFFSET_SEPC,
-        scause = const TrapContext::OFFSET_SCAUSE,
-    );
-}
-
-#[unsafe(naked)]
-unsafe extern "C" fn _raw_trap_return(ctx: &mut TrapContext) -> ! {
-    naked_asm!(
-        "ld ra, {ra}(a0)",
-        "ld sp, {sp}(a0)",
-        "ld gp, {gp}(a0)",
-        "ld tp, {tp}(a0)",
-        "ld t1, {t1}(a0)",
-        "ld t2, {t2}(a0)",
-        "ld t0, {t0}(a0)",
-        "ld a1, {a1}(a0)",
-        "ld a2, {a2}(a0)",
-        "ld a3, {a3}(a0)",
-        "ld a4, {a4}(a0)",
-        "ld a5, {a5}(a0)",
-        "ld a6, {a6}(a0)",
-        "ld a7, {a7}(a0)",
-        "ld t3, {t3}(a0)",
-        "ld t4, {sepc}(a0)",    // Load sepc from TrapContext
-        "ld t5, {sstatus}(a0)", // Load sstatus from TrapContext
-        "ld s0, {s0}(a0)",
-        "ld s1, {s1}(a0)",
-        "ld s2, {s2}(a0)",
-        "ld s3, {s3}(a0)",
-        "ld s4, {s4}(a0)",
-        "ld s5, {s5}(a0)",
-        "ld s6, {s6}(a0)",
-        "ld s7, {s7}(a0)",
-        "ld s8, {s8}(a0)",
-        "ld s9, {s9}(a0)",
-        "ld s10, {s10}(a0)",
-        "ld s11, {s11}(a0)",
-        "csrw sepc, t4",        // Restore sepc
-        "csrw sstatus, t5",     // Restore sstatus
-        "ld t4, {t4}(a0)",
-        "ld t5, {t5}(a0)",
-        "ld t6, {t6}(a0)",
-        "ld a0, {a0}(a0)",
-        "sret",
-        ra = const Registers::OFFSET_RA,
-        sp = const Registers::OFFSET_SP,
-        gp = const Registers::OFFSET_GP,
-        tp = const Registers::OFFSET_TP,
-        t1 = const Registers::OFFSET_T1,
-        t2 = const Registers::OFFSET_T2,
-        t0 = const Registers::OFFSET_T0,
-        a0 = const Registers::OFFSET_A0,
-        a1 = const Registers::OFFSET_A1,
-        a2 = const Registers::OFFSET_A2,
-        a3 = const Registers::OFFSET_A3,
-        a4 = const Registers::OFFSET_A4,
-        a5 = const Registers::OFFSET_A5,
-        a6 = const Registers::OFFSET_A6,
-        a7 = const Registers::OFFSET_A7,
-        t3 = const Registers::OFFSET_T3,
-        t4 = const Registers::OFFSET_T4,
-        t5 = const Registers::OFFSET_T5,
-        t6 = const Registers::OFFSET_T6,
-        s0 = const Registers::OFFSET_S0,
-        s1 = const Registers::OFFSET_S1,
-        s2 = const Registers::OFFSET_S2,
-        s3 = const Registers::OFFSET_S3,
-        s4 = const Registers::OFFSET_S4,
-        s5 = const Registers::OFFSET_S5,
-        s6 = const Registers::OFFSET_S6,
-        s7 = const Registers::OFFSET_S7,
-        s8 = const Registers::OFFSET_S8,
-        s9 = const Registers::OFFSET_S9,
-        s10 = const Registers::OFFSET_S10,
-        s11 = const Registers::OFFSET_S11,
-        sstatus = const TrapContext::OFFSET_SSTATUS,
-        sepc = const TrapContext::OFFSET_SEPC,
-    );
-}
-
-#[unsafe(naked)]
-unsafe extern "C" fn default_trap_handler() {
-    unsafe extern "C" {
-        fn _default_trap_handler(trap_context: &mut TrapContext);
-    }
-
-    naked_asm!(
-        "andi sp, sp, -16", // Align stack pointer to 16 bytes
-        "addi sp, sp, -16",
-        "mv   a0, t1",      // TrapContext pointer in t1
-        "sd   a0, 0(sp)",   // Save TrapContext pointer
-        "",
-        "call {default_handler}",
-        "",
-        "ld   a0, 0(sp)",   // Restore TrapContext pointer
-        "j {trap_return}",
-        default_handler = sym _default_trap_handler,
-        trap_return = sym _raw_trap_return,
-    );
-}
-
-#[unsafe(naked)]
-unsafe extern "C" fn captured_trap_handler() {
-    naked_asm!(
-        "la   a0, {dirty_task_context}",
-        "addi a1, t0, {capturer_context_offset}",
-        "j {switch}",
-        dirty_task_context = sym DIRTY_TASK_CONTEXT,
-        capturer_context_offset = const offset_of!(TrapScratch, capturer_context),
-        switch = sym TaskContext::switch,
-    );
-}
-
-#[unsafe(naked)]
-unsafe extern "C" fn captured_trap_return(trap_context: usize) -> ! {
-    naked_asm!(
-        "mv a0, sp",
-        "j {raw_trap_return}",
-        raw_trap_return = sym _raw_trap_return,
-    );
-}
-
-impl TrapScratch {
-    pub fn set_trap_context(&mut self, ctx: NonNull<TrapContext>) {
-        self.trap_context = Some(ctx);
-    }
-
-    pub fn clear_trap_context(&mut self) {
-        self.trap_context = None;
-    }
-
-    pub fn set_kernel_tp(&mut self, tp: NonNull<u8>) {
-        self.kernel_tp = Some(NonZero::new(tp.addr().get() as u64).unwrap());
-    }
-}
-
 impl TrapReturn for TrapContext {
     type TaskContext = TaskContext;
 
     unsafe fn trap_return(&mut self) {
         let irq_states = disable_irqs_save();
-        let old_handler =
-            core::mem::replace(&mut TRAP_SCRATCH.as_mut().handler, captured_trap_handler);
 
+        let old_stvec = stvec::read();
+        stvec::write({
+            let mut stvec_val = Stvec::from_bits(0);
+            stvec_val.set_address(_captured_trap_entry as usize);
+            stvec_val.set_trap_mode(TrapMode::Direct);
+            stvec_val
+        });
+
+        let old_trap_ctx = sscratch::read();
+        sscratch::write(&raw mut *self as usize);
+
+        let mut from_ctx = TaskContext::new();
         let mut to_ctx = TaskContext::new();
-        to_ctx.set_program_counter(captured_trap_return as usize);
-        to_ctx.set_stack_pointer(&raw mut *self as usize);
+        to_ctx.set_program_counter(_captured_trap_return as usize);
+        to_ctx.set_stack_pointer(&raw mut from_ctx as usize);
         to_ctx.set_interrupt_enabled(false);
 
         unsafe {
-            TaskContext::switch(&mut TRAP_SCRATCH.as_mut().capturer_context, &mut to_ctx);
+            TaskContext::switch(&mut from_ctx, &mut to_ctx);
         }
 
-        TRAP_SCRATCH.as_mut().handler = old_handler;
+        sscratch::write(old_trap_ctx);
+        stvec::write(old_stvec);
+
         irq_states.restore();
     }
 }
@@ -311,7 +70,7 @@ fn setup_trap_handler(trap_entry_addr: usize) {
 }
 
 pub fn setup_trap() {
-    setup_trap_handler(_raw_trap_entry as usize);
+    setup_trap_handler(_default_trap_entry as usize);
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]

+ 57 - 51
crates/eonix_hal/src/arch/riscv64/trap/trap_context.rs

@@ -1,5 +1,5 @@
 use crate::{arch::time::set_next_timer, processor::CPU};
-use core::arch::asm;
+use core::{arch::asm, mem::offset_of};
 use eonix_hal_traits::{
     fault::{Fault, PageFaultErrorCode},
     trap::{RawTrapContext, TrapType},
@@ -18,24 +18,23 @@ use riscv::{
 #[repr(C)]
 #[derive(Default, Clone, Copy)]
 pub struct Registers {
+    tp: u64,
     ra: u64,
     sp: u64,
     gp: u64,
-    tp: u64,
-    t1: u64,
-    t2: u64,
-    t0: u64,
     a0: u64,
     a1: u64,
     a2: u64,
     a3: u64,
     a4: u64,
+    t1: u64,
     a5: u64,
     a6: u64,
     a7: u64,
     t3: u64,
     t4: u64,
     t5: u64,
+    t2: u64,
     t6: u64,
     s0: u64,
     s1: u64,
@@ -49,10 +48,11 @@ pub struct Registers {
     s9: u64,
     s10: u64,
     s11: u64,
+    t0: u64,
 }
 
 /// Saved CPU context when a trap (interrupt or exception) occurs on RISC-V 64.
-#[repr(C)]
+#[repr(C, align(16))]
 #[derive(Clone, Copy)]
 pub struct TrapContext {
     regs: Registers,
@@ -60,46 +60,48 @@ pub struct TrapContext {
     sstatus: Sstatus,
     sepc: usize,
     scause: Scause,
+    stval: usize,
 }
 
 impl Registers {
-    pub const OFFSET_RA: usize = 0 * 8;
-    pub const OFFSET_SP: usize = 1 * 8;
-    pub const OFFSET_GP: usize = 2 * 8;
-    pub const OFFSET_TP: usize = 3 * 8;
-    pub const OFFSET_T1: usize = 4 * 8;
-    pub const OFFSET_T2: usize = 5 * 8;
-    pub const OFFSET_T0: usize = 6 * 8;
-    pub const OFFSET_A0: usize = 7 * 8;
-    pub const OFFSET_A1: usize = 8 * 8;
-    pub const OFFSET_A2: usize = 9 * 8;
-    pub const OFFSET_A3: usize = 10 * 8;
-    pub const OFFSET_A4: usize = 11 * 8;
-    pub const OFFSET_A5: usize = 12 * 8;
-    pub const OFFSET_A6: usize = 13 * 8;
-    pub const OFFSET_A7: usize = 14 * 8;
-    pub const OFFSET_T3: usize = 15 * 8;
-    pub const OFFSET_T4: usize = 16 * 8;
-    pub const OFFSET_T5: usize = 17 * 8;
-    pub const OFFSET_T6: usize = 18 * 8;
-    pub const OFFSET_S0: usize = 19 * 8;
-    pub const OFFSET_S1: usize = 20 * 8;
-    pub const OFFSET_S2: usize = 21 * 8;
-    pub const OFFSET_S3: usize = 22 * 8;
-    pub const OFFSET_S4: usize = 23 * 8;
-    pub const OFFSET_S5: usize = 24 * 8;
-    pub const OFFSET_S6: usize = 25 * 8;
-    pub const OFFSET_S7: usize = 26 * 8;
-    pub const OFFSET_S8: usize = 27 * 8;
-    pub const OFFSET_S9: usize = 28 * 8;
-    pub const OFFSET_S10: usize = 29 * 8;
-    pub const OFFSET_S11: usize = 30 * 8;
+    pub const OFFSET_TP: usize = offset_of!(Registers, tp);
+    pub const OFFSET_SP: usize = offset_of!(Registers, sp);
+    pub const OFFSET_RA: usize = offset_of!(Registers, ra);
+    pub const OFFSET_GP: usize = offset_of!(Registers, gp);
+    pub const OFFSET_T1: usize = offset_of!(Registers, t1);
+    pub const OFFSET_T2: usize = offset_of!(Registers, t2);
+    pub const OFFSET_T0: usize = offset_of!(Registers, t0);
+    pub const OFFSET_A0: usize = offset_of!(Registers, a0);
+    pub const OFFSET_A1: usize = offset_of!(Registers, a1);
+    pub const OFFSET_A2: usize = offset_of!(Registers, a2);
+    pub const OFFSET_A3: usize = offset_of!(Registers, a3);
+    pub const OFFSET_A4: usize = offset_of!(Registers, a4);
+    pub const OFFSET_A5: usize = offset_of!(Registers, a5);
+    pub const OFFSET_A6: usize = offset_of!(Registers, a6);
+    pub const OFFSET_A7: usize = offset_of!(Registers, a7);
+    pub const OFFSET_T3: usize = offset_of!(Registers, t3);
+    pub const OFFSET_T4: usize = offset_of!(Registers, t4);
+    pub const OFFSET_T5: usize = offset_of!(Registers, t5);
+    pub const OFFSET_T6: usize = offset_of!(Registers, t6);
+    pub const OFFSET_S0: usize = offset_of!(Registers, s0);
+    pub const OFFSET_S1: usize = offset_of!(Registers, s1);
+    pub const OFFSET_S2: usize = offset_of!(Registers, s2);
+    pub const OFFSET_S3: usize = offset_of!(Registers, s3);
+    pub const OFFSET_S4: usize = offset_of!(Registers, s4);
+    pub const OFFSET_S5: usize = offset_of!(Registers, s5);
+    pub const OFFSET_S6: usize = offset_of!(Registers, s6);
+    pub const OFFSET_S7: usize = offset_of!(Registers, s7);
+    pub const OFFSET_S8: usize = offset_of!(Registers, s8);
+    pub const OFFSET_S9: usize = offset_of!(Registers, s9);
+    pub const OFFSET_S10: usize = offset_of!(Registers, s10);
+    pub const OFFSET_S11: usize = offset_of!(Registers, s11);
 }
 
 impl TrapContext {
-    pub const OFFSET_SSTATUS: usize = 31 * 8;
-    pub const OFFSET_SEPC: usize = 32 * 8;
-    pub const OFFSET_SCAUSE: usize = 33 * 8;
+    pub const OFFSET_SSTATUS: usize = offset_of!(TrapContext, sstatus);
+    pub const OFFSET_SEPC: usize = offset_of!(TrapContext, sepc);
+    pub const OFFSET_SCAUSE: usize = offset_of!(TrapContext, scause);
+    pub const OFFSET_STVAL: usize = offset_of!(TrapContext, stval);
 
     fn syscall_no(&self) -> usize {
         self.regs.a7 as usize
@@ -131,6 +133,7 @@ impl RawTrapContext for TrapContext {
             sstatus,
             sepc: 0,
             scause: Scause::from_bits(0),
+            stval: 0,
         }
     }
 
@@ -163,6 +166,7 @@ impl RawTrapContext for TrapContext {
             }
             Trap::Exception(e) => {
                 match Exception::from_number(e).unwrap() {
+                    Exception::Breakpoint => TrapType::Breakpoint,
                     Exception::InstructionMisaligned
                     | Exception::LoadMisaligned
                     | Exception::InstructionFault
@@ -176,16 +180,10 @@ impl RawTrapContext for TrapContext {
                     },
                     exception @ (Exception::InstructionPageFault
                     | Exception::LoadPageFault
-                    | Exception::StorePageFault) => {
-                        #[inline(always)]
-                        fn get_page_fault_address() -> VAddr {
-                            VAddr::from(stval::read())
-                        }
-                        TrapType::Fault(Fault::PageFault {
-                            error_code: self.get_page_fault_error_code(exception),
-                            address: get_page_fault_address(),
-                        })
-                    }
+                    | Exception::StorePageFault) => TrapType::Fault(Fault::PageFault {
+                        error_code: self.get_page_fault_error_code(exception),
+                        address: VAddr::from(self.stval),
+                    }),
                     // breakpoint and supervisor env call
                     _ => TrapType::Fault(Fault::Unknown(e)),
                 }
@@ -224,7 +222,15 @@ impl RawTrapContext for TrapContext {
     fn set_user_mode(&mut self, user: bool) {
         match user {
             true => self.sstatus.set_spp(SPP::User),
-            false => self.sstatus.set_spp(SPP::Supervisor),
+            false => {
+                unsafe {
+                    core::arch::asm!(
+                        "mv {}, tp",
+                        out(reg) self.regs.tp,
+                    );
+                };
+                self.sstatus.set_spp(SPP::Supervisor);
+            }
         }
     }
 

+ 14 - 9
crates/eonix_hal/src/link.x.in

@@ -18,6 +18,15 @@ SECTIONS {
         __srodata = .;
 
         *(.rodata .rodata.*);
+        
+        . = ALIGN(8);
+
+        PROVIDE(__eh_frame = .);
+        PROVIDE(__executable_start = __stext);
+
+        KEEP(*(.eh_frame_hdr));
+        KEEP(*(.eh_frame));
+        KEEP(*(.eh_frame.*));
 
     } > REGION_RODATA AT> LINK_REGION_RODATA
 
@@ -32,6 +41,11 @@ SECTIONS {
 
     } > REGION_DATA AT> LINK_REGION_DATA
 
+    .data.after :
+    {
+        __data_after = .;
+    } > REGION_DATA AT> LINK_REGION_DATA
+
     __edata = .;
 
     .bss (NOLOAD) : ALIGN(16)
@@ -45,16 +59,7 @@ SECTIONS {
 
     __ebss = .;
 
-    .eh_frame : ALIGN(16)
-    {
-        __seh_frame = .;
-
-        KEEP(*(.eh_frame .eh_frame*));
-
-    } > REGION_EHFRAME AT> LINK_REGION_EHFRAME
-
     . = ALIGN(0x1000);
-    __eeh_frame = .;
 }
 
 SECTIONS {

+ 1 - 1
crates/eonix_mm/src/page_table/page_table.rs

@@ -11,7 +11,7 @@ use crate::{
 };
 use core::{marker::PhantomData, ptr::NonNull};
 
-pub trait RawPageTable<'a>: 'a {
+pub trait RawPageTable<'a>: Send + 'a {
     type Entry: PTE + 'a;
 
     /// Return the entry at the given index.
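
Note: making `Send` a supertrait of `RawPageTable` is what forces the `unsafe impl Send` blocks added to the loongarch64 and riscv64 `mm.rs` files above, since the raw page-table types hold raw pointers, which are `!Send` by default. A simplified, self-contained sketch of the pattern; the names and the safety justification are assumptions for illustration, not statements about this codebase:

```rust
use std::marker::PhantomData;

// Simplified analog of the change: the page-table trait now requires Send...
trait RawPageTable<'a>: Send + 'a {
    fn root(&self) -> *mut u64;
}

// ...so implementors holding raw pointers (which are !Send by default) must
// opt in explicitly, as the loongarch64/riscv64 mm.rs hunks do.
struct RawPageTable48<'a> {
    root: *mut u64,
    _marker: PhantomData<&'a mut u64>,
}

// SAFETY (assumed for this sketch): the table is only touched by the task
// owning the address space, so moving it across threads is sound.
unsafe impl Send for RawPageTable48<'_> {}

impl<'a> RawPageTable<'a> for RawPageTable48<'a> {
    fn root(&self) -> *mut u64 {
        self.root
    }
}

fn main() {
    let table = RawPageTable48 {
        root: std::ptr::null_mut(),
        _marker: PhantomData,
    };
    // The Send bound is what allows the table to cross thread boundaries.
    std::thread::spawn(move || {
        let _ = table.root();
    })
    .join()
    .unwrap();
}
```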

+ 61 - 92
crates/eonix_runtime/src/executor.rs

@@ -1,125 +1,94 @@
-mod builder;
-mod execute_status;
+// mod builder;
 mod output_handle;
 mod stack;
 
-use crate::{
-    run::{Contexted, Run, RunState},
-    scheduler::Scheduler,
-    task::Task,
+use alloc::{
+    boxed::Box,
+    sync::{Arc, Weak},
 };
-use alloc::sync::Weak;
 use core::{
+    marker::PhantomData,
     pin::Pin,
-    sync::atomic::{compiler_fence, fence, AtomicBool, Ordering},
-    task::Waker,
+    task::{Context, Poll},
 };
 use eonix_sync::Spin;
 
-pub use builder::ExecutorBuilder;
-pub use execute_status::ExecuteStatus;
 pub use output_handle::OutputHandle;
 pub use stack::Stack;
 
-/// An `Executor` executes a `Run` object in a separate thread of execution
-/// where we have a dedicated stack and context.
-pub trait Executor: Send {
-    fn progress(&self) -> ExecuteStatus;
+/// An `Executor` executes a Future object in a separate thread of execution.
+///
+/// When the Future is finished, the `Executor` will call the `OutputHandle` to commit the output.
+/// Then the `Executor` will release the resources associated with the Future.
+pub struct Executor(Option<Pin<Box<dyn TypeErasedExecutor>>>);
+
+trait TypeErasedExecutor: Send {
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()>;
 }
 
-struct RealExecutor<S, R>
+struct RealExecutor<'a, F>
 where
-    R: Run + Send + Contexted + 'static,
-    R::Output: Send,
+    F: Future + Send + 'a,
+    F::Output: Send + 'a,
 {
-    _stack: S,
-    runnable: R,
-    output_handle: Weak<Spin<OutputHandle<R::Output>>>,
-    finished: AtomicBool,
+    future: F,
+    output_handle: Weak<Spin<OutputHandle<F::Output>>>,
+    _phantom: PhantomData<&'a ()>,
 }
 
-impl<S, R> RealExecutor<S, R>
+impl<F> TypeErasedExecutor for RealExecutor<'_, F>
 where
-    R: Run + Send + Contexted + 'static,
-    R::Output: Send,
+    F: Future + Send,
+    F::Output: Send,
 {
-    extern "C" fn execute(self: Pin<&Self>) -> ! {
-        // We get here with preempt count == 1.
-        eonix_preempt::enable();
-
-        {
-            let waker = Waker::from(Task::current().clone());
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
+        if self.output_handle.as_ptr().is_null() {
+            return Poll::Ready(());
+        }
 
-            let output_data = loop {
-                // TODO!!!!!!: CHANGE THIS.
-                let runnable_pointer = &raw const self.get_ref().runnable;
+        let future = unsafe {
+            // SAFETY: We don't move the future.
+            self.as_mut().map_unchecked_mut(|me| &mut me.future)
+        };
 
-                // SAFETY: We don't move the runnable object and we MIGHT not be using the
-                //         part that is used in `pinned_run` in the runnable...?
-                let mut pinned_runnable =
-                    unsafe { Pin::new_unchecked(&mut *(runnable_pointer as *mut R)) };
+        future.poll(cx).map(|output| {
+            if let Some(output_handle) = self.output_handle.upgrade() {
+                output_handle.lock().commit_output(output);
 
-                match pinned_runnable.as_mut().run(&waker) {
-                    RunState::Finished(output) => break output,
-                    RunState::Running => Task::park(),
+                unsafe {
+                    // SAFETY: `output_handle` is Unpin.
+                    self.get_unchecked_mut().output_handle = Weak::new();
                 }
-            };
-
-            if let Some(output_handle) = self.output_handle.upgrade() {
-                output_handle.lock().commit_output(output_data);
             }
-        }
-
-        // SAFETY: We are on the same CPU as the task.
-        self.finished.store(true, Ordering::Relaxed);
-
-        unsafe {
-            // SAFETY: `preempt::count()` == 1.
-            eonix_preempt::disable();
-            Scheduler::goto_scheduler_noreturn()
-        }
+        })
     }
 }
 
-impl<S, R> Executor for RealExecutor<S, R>
-where
-    S: Send,
-    R: Run + Contexted + Send,
-    R::Output: Send,
-{
-    fn progress(&self) -> ExecuteStatus {
-        // TODO!!!: If the task comes from another cpu, we need to sync.
-        //
-        // The other cpu should see the changes of kernel stack of the target thread
-        // made in this cpu.
-        //
-        // Can we find a better way other than `fence`s?
-        //
-        // An alternative way is to use an atomic variable to store the cpu id of
-        // the current task. Then we can use acquire release swap to ensure that the
-        // other cpu sees the changes.
-        fence(Ordering::SeqCst);
-        compiler_fence(Ordering::SeqCst);
-
-        // TODO!!!: We should load the context only if the previous task is
-        // different from the current task.
-
-        self.runnable.load_running_context();
-
-        unsafe {
-            // SAFETY: We are in the scheduler context and we are not preempted.
-            Scheduler::go_from_scheduler(&Task::current().execution_context);
-        }
-
-        self.runnable.restore_running_context();
-
-        compiler_fence(Ordering::SeqCst);
-        fence(Ordering::SeqCst);
+impl Executor {
+    pub fn new<F>(future: F) -> (Self, Arc<Spin<OutputHandle<F::Output>>>)
+    where
+        F: Future + Send + 'static,
+        F::Output: Send + 'static,
+    {
+        let output_handle = OutputHandle::new();
+
+        (
+            Executor(Some(Box::pin(RealExecutor {
+                future,
+                output_handle: Arc::downgrade(&output_handle),
+                _phantom: PhantomData,
+            }))),
+            output_handle,
+        )
+    }
 
-        if self.finished.load(Ordering::Acquire) {
-            ExecuteStatus::Finished
+    pub fn poll(&mut self, cx: &mut Context<'_>) -> Poll<()> {
+        if let Some(executor) = self.0.as_mut() {
+            executor.as_mut().poll(cx).map(|_| {
+                self.0.take();
+            })
         } else {
-            ExecuteStatus::Executing
+            Poll::Ready(())
         }
     }
 }

+ 1 - 5
crates/eonix_runtime/src/executor/builder.rs

@@ -1,8 +1,5 @@
 use super::{Executor, OutputHandle, RealExecutor, Stack};
-use crate::{
-    context::ExecutionContext,
-    run::{Contexted, Run},
-};
+use crate::context::ExecutionContext;
 use alloc::{boxed::Box, sync::Arc};
 use core::{pin::Pin, sync::atomic::AtomicBool};
 use eonix_sync::Spin;
@@ -15,7 +12,6 @@ pub struct ExecutorBuilder<S, R> {
 impl<S, R> ExecutorBuilder<S, R>
 where
     S: Stack,
-    R: Run + Contexted + Send + 'static,
     R::Output: Send,
 {
     pub fn new() -> Self {

+ 0 - 4
crates/eonix_runtime/src/executor/execute_status.rs

@@ -1,4 +0,0 @@
-pub enum ExecuteStatus {
-    Executing,
-    Finished,
-}

+ 0 - 1
crates/eonix_runtime/src/lib.rs

@@ -3,7 +3,6 @@
 pub mod context;
 pub mod executor;
 mod ready_queue;
-pub mod run;
 pub mod scheduler;
 pub mod task;
 

+ 0 - 34
crates/eonix_runtime/src/run.rs

@@ -1,34 +0,0 @@
-mod future_run;
-
-use core::{pin::Pin, task::Waker};
-pub use future_run::FutureRun;
-
-pub enum RunState<Output> {
-    Running,
-    Finished(Output),
-}
-
-pub trait Contexted {
-    /// # Safety
-    /// This function should be called in a preemption disabled context.
-    fn load_running_context(&self) {}
-
-    /// # Safety
-    /// This function should be called in a preemption disabled context.
-    fn restore_running_context(&self) {}
-}
-
-pub trait Run {
-    type Output;
-
-    fn run(self: Pin<&mut Self>, waker: &Waker) -> RunState<Self::Output>;
-
-    fn join(mut self: Pin<&mut Self>, waker: &Waker) -> Self::Output {
-        loop {
-            match self.as_mut().run(waker) {
-                RunState::Running => continue,
-                RunState::Finished(output) => break output,
-            }
-        }
-    }
-}

+ 0 - 34
crates/eonix_runtime/src/run/future_run.rs

@@ -1,34 +0,0 @@
-use super::{Contexted, Run, RunState};
-use core::{
-    pin::Pin,
-    task::{Context, Poll, Waker},
-};
-
-pub struct FutureRun<F: Future>(F);
-
-impl<F> FutureRun<F>
-where
-    F: Future,
-{
-    pub const fn new(future: F) -> Self {
-        Self(future)
-    }
-}
-
-impl<F> Contexted for FutureRun<F> where F: Future {}
-impl<F> Run for FutureRun<F>
-where
-    F: Future + 'static,
-{
-    type Output = F::Output;
-
-    fn run(self: Pin<&mut Self>, waker: &Waker) -> RunState<Self::Output> {
-        let mut future = unsafe { self.map_unchecked_mut(|me| &mut me.0) };
-        let mut context = Context::from_waker(waker);
-
-        match future.as_mut().poll(&mut context) {
-            Poll::Ready(output) => RunState::Finished(output),
-            Poll::Pending => RunState::Running,
-        }
-    }
-}

+ 132 - 182
crates/eonix_runtime/src/scheduler.rs

@@ -1,20 +1,16 @@
 use crate::{
-    context::ExecutionContext,
-    executor::{ExecuteStatus, OutputHandle, Stack},
-    ready_queue::{cpu_rq, local_rq},
-    run::{Contexted, Run},
-    task::{Task, TaskAdapter, TaskHandle},
+    executor::OutputHandle,
+    ready_queue::{local_rq, ReadyQueue},
+    task::{Task, TaskAdapter, TaskHandle, TaskState},
 };
-use alloc::sync::Arc;
+use alloc::{sync::Arc, task::Wake};
 use core::{
-    mem::forget,
+    ops::{Deref, DerefMut},
     ptr::NonNull,
-    sync::atomic::{compiler_fence, Ordering},
-    task::Waker,
+    task::{Context, Poll, Waker},
 };
 use eonix_hal::processor::halt;
 use eonix_log::println_trace;
-use eonix_preempt::assert_preempt_count_eq;
 use eonix_sync::{LazyLock, Spin, SpinIrq as _};
 use intrusive_collections::RBTree;
 use pointers::BorrowedArc;
@@ -22,13 +18,12 @@ use pointers::BorrowedArc;
 #[eonix_percpu::define_percpu]
 static CURRENT_TASK: Option<NonNull<Task>> = None;
 
-#[eonix_percpu::define_percpu]
-static LOCAL_SCHEDULER_CONTEXT: ExecutionContext = ExecutionContext::new();
-
 static TASKS: LazyLock<Spin<RBTree<TaskAdapter>>> =
     LazyLock::new(|| Spin::new(RBTree::new(TaskAdapter::new())));
 
-pub struct Scheduler;
+pub static RUNTIME: Runtime = Runtime();
+
+pub struct Runtime();
 
 pub struct JoinHandle<Output>(Arc<Spin<OutputHandle<Output>>>)
 where
@@ -68,209 +63,164 @@ where
     }
 }
 
-impl Scheduler {
-    /// `Scheduler` might be used in various places. Do not hold it for a long time.
-    ///
-    /// # Safety
-    /// The lock returned by this function should be locked with `lock_irq` to prevent
-    /// rescheduling during access to the scheduler. Disabling preemption will do the same.
-    ///
-    /// Drop the lock before calling `schedule`.
-    pub fn get() -> &'static Self {
-        static GLOBAL_SCHEDULER: Scheduler = Scheduler;
-        &GLOBAL_SCHEDULER
-    }
-
-    pub fn init_local_scheduler<S>()
+impl Runtime {
+    pub fn spawn<F>(&self, future: F) -> JoinHandle<F::Output>
     where
-        S: Stack,
+        F: Future + Send + 'static,
+        F::Output: Send + 'static,
     {
-        let stack = S::new();
-
-        unsafe {
-            eonix_preempt::disable();
-            // SAFETY: Preemption is disabled.
-            let context: &mut ExecutionContext = LOCAL_SCHEDULER_CONTEXT.as_mut();
-            context.set_ip(local_scheduler as _);
-            context.set_sp(stack.get_bottom().addr().get() as usize);
-            context.set_interrupt(true);
-            eonix_preempt::enable();
-        }
-
-        // We don't need to keep the stack around.
-        forget(stack);
-    }
+        let TaskHandle {
+            task,
+            output_handle,
+        } = Task::new(future);
 
-    /// # Safety
-    /// This function must not be called inside of the scheduler context.
-    ///
-    /// The caller must ensure that `preempt::count` == 1.
-    pub unsafe fn go_from_scheduler(to: &ExecutionContext) {
-        // SAFETY: Preemption is disabled.
-        unsafe { LOCAL_SCHEDULER_CONTEXT.as_ref() }.switch_to(to);
-    }
+        self.add_task(task.clone());
+        task.wake_by_ref();
 
-    /// # Safety
-    /// This function must not be called inside of the scheduler context.
-    ///
-    /// The caller must ensure that `preempt::count` == 1.
-    pub unsafe fn goto_scheduler(from: &ExecutionContext) {
-        // SAFETY: Preemption is disabled.
-        from.switch_to(unsafe { LOCAL_SCHEDULER_CONTEXT.as_ref() });
+        JoinHandle(output_handle)
     }
 
-    /// # Safety
-    /// This function must not be called inside of the scheduler context.
-    ///
-    /// The caller must ensure that `preempt::count` == 1.
-    pub unsafe fn goto_scheduler_noreturn() -> ! {
-        // SAFETY: Preemption is disabled.
-        unsafe { LOCAL_SCHEDULER_CONTEXT.as_ref().switch_noreturn() }
+    fn add_task(&self, task: Arc<Task>) {
+        TASKS.lock_irq().insert(task);
     }
 
-    fn add_task(task: Arc<Task>) {
-        TASKS.lock().insert(task);
+    fn remove_task(&self, task: &impl Deref<Target = Arc<Task>>) {
+        unsafe {
+            TASKS
+                .lock_irq()
+                .cursor_mut_from_ptr(Arc::as_ptr(task))
+                .remove();
+        }
     }
 
-    fn remove_task(task: &Task) {
-        unsafe { TASKS.lock().cursor_mut_from_ptr(task as *const _).remove() };
+    fn current(&self) -> Option<BorrowedArc<Task>> {
+        CURRENT_TASK
+            .get()
+            .map(|ptr| unsafe { BorrowedArc::from_raw(ptr) })
     }
 
-    fn select_cpu_for_task(&self, task: &Task) -> usize {
-        task.cpu.load(Ordering::Relaxed) as _
-    }
+    fn remove_and_enqueue_current(&self, rq: &mut impl DerefMut<Target = dyn ReadyQueue>) {
+        let Some(current) = CURRENT_TASK
+            .swap(None)
+            .map(|cur| unsafe { Arc::from_raw(cur.as_ptr()) })
+        else {
+            return;
+        };
+
+        match current.state.update(|state| match state {
+            TaskState::READY_RUNNING => Some(TaskState::READY),
+            TaskState::RUNNING => Some(TaskState::BLOCKED),
+            _ => {
+                unreachable!("Current task should be at least in RUNNING state, but got {state:?}")
+            }
+        }) {
+            Ok(TaskState::READY_RUNNING) => {
+                println_trace!(
+                    "trace_scheduler",
+                    "Re-enqueueing task {:?} (CPU{})",
+                    current.id,
+                    eonix_hal::processor::CPU::local().cpuid(),
+                );
 
-    pub fn activate(&self, task: &Arc<Task>) {
-        // Only one cpu can be activating the task at a time.
-        // TODO: Add some checks.
-
-        if task.on_rq.swap(true, Ordering::Acquire) {
-            // Lock the rq and check whether the task is on the rq again.
-            let cpuid = task.cpu.load(Ordering::Acquire);
-            let mut rq = cpu_rq(cpuid as _).lock_irq();
-
-            if !task.on_rq.load(Ordering::Acquire) {
-                // Task has just got off the rq. Put it back.
-                rq.put(task.clone());
-            } else {
-                // Task is already on the rq. Do nothing.
-                return;
+                rq.put(current);
+            }
+            Ok(_) => {
+                println_trace!(
+                    "trace_scheduler",
+                    "Current task {:?} (CPU{}) is blocked, not re-enqueueing",
+                    current.id,
+                    eonix_hal::processor::CPU::local().cpuid(),
+                );
             }
-        } else {
-            // Task not on some rq. Select one and put it here.
-            let cpu = self.select_cpu_for_task(&task);
-            let mut rq = cpu_rq(cpu).lock_irq();
-            task.cpu.store(cpu as _, Ordering::Release);
-            rq.put(task.clone());
+            _ => unreachable!(),
         }
     }
 
-    pub fn spawn<S, R>(&self, runnable: R) -> JoinHandle<R::Output>
-    where
-        S: Stack + 'static,
-        R: Run + Contexted + Send + 'static,
-        R::Output: Send + 'static,
-    {
-        let TaskHandle {
-            task,
-            output_handle,
-        } = Task::new::<S, _>(runnable);
+    pub fn block_till_woken(set_waker: impl FnOnce(&Waker)) -> impl Future<Output = ()> {
+        struct BlockTillWoken<F: FnOnce(&Waker)> {
+            set_waker: Option<F>,
+            slept: bool,
+        }
 
-        Self::add_task(task.clone());
-        self.activate(&task);
+        impl<F: FnOnce(&Waker)> Future for BlockTillWoken<F> {
+            type Output = ();
 
-        JoinHandle(output_handle)
-    }
+            fn poll(self: core::pin::Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
+                if self.slept {
+                    Poll::Ready(())
+                } else {
+                    let (set_waker, slept) = unsafe {
+                        let me = self.get_unchecked_mut();
+                        (me.set_waker.take().unwrap(), &mut me.slept)
+                    };
+
+                    set_waker(cx.waker());
+                    *slept = true;
+                    Poll::Pending
+                }
+            }
+        }
 
-    /// Go to idle task. Call this with `preempt_count == 1`.
-    /// The preempt count will be decremented by this function.
-    ///
-    /// # Safety
-    /// We might never return from here.
-    /// Drop all variables that take ownership of some resource before calling this function.
-    pub fn schedule() {
-        assert_preempt_count_eq!(1, "Scheduler::schedule");
-
-        // Make sure all works are done before scheduling.
-        compiler_fence(Ordering::SeqCst);
-
-        // TODO!!!!!: Use of reference here needs further consideration.
-        //
-        // Since we might never return to here, we can't take ownership of `current()`.
-        // Is it safe to believe that `current()` will never change across calls?
-        unsafe {
-            // SAFETY: Preemption is disabled.
-            Scheduler::goto_scheduler(&Task::current().execution_context);
+        BlockTillWoken {
+            set_waker: Some(set_waker),
+            slept: false,
         }
-        eonix_preempt::enable();
     }
-}
 
-extern "C" fn local_scheduler() -> ! {
-    loop {
-        assert_preempt_count_eq!(1, "Scheduler::idle_task");
-        let mut rq = local_rq().lock_irq();
+    /// Enter the runtime with an "init" future and run till its completion.
+    ///
+    /// The "init" future has the highest priority and when it completes,
+    /// the runtime will exit immediately and yield its output.
+    pub fn enter(&self) {
+        loop {
+            let mut rq = local_rq().lock_irq();
 
-        let previous_task = CURRENT_TASK
-            .get()
-            .map(|ptr| unsafe { Arc::from_raw(ptr.as_ptr()) });
-        let next_task = rq.get();
+            self.remove_and_enqueue_current(&mut rq);
 
-        match (previous_task, next_task) {
-            (None, None) => {
-                // Nothing to do, halt the cpu and rerun the loop.
+            let Some(next) = rq.get() else {
                 drop(rq);
                 halt();
                 continue;
+            };
+
+            println_trace!(
+                "trace_scheduler",
+                "Switching to task {:?} (CPU{})",
+                next.id,
+                eonix_hal::processor::CPU::local().cpuid(),
+            );
+
+            let old_state = next.state.swap(TaskState::RUNNING);
+            assert_eq!(
+                old_state,
+                TaskState::READY,
+                "Next task should be in READY state"
+            );
+
+            unsafe {
+                CURRENT_TASK.set(Some(NonNull::new_unchecked(Arc::into_raw(next) as *mut _)));
             }
-            (None, Some(next)) => {
-                CURRENT_TASK.set(NonNull::new(Arc::into_raw(next) as *mut _));
-            }
-            (Some(previous), None) => {
-                if previous.state.is_running() {
-                    // Previous thread is `Running`, return to the current running thread.
-                    println_trace!(
-                        "trace_scheduler",
-                        "Returning to task id({}) without doing context switch",
-                        previous.id
-                    );
-                    CURRENT_TASK.set(NonNull::new(Arc::into_raw(previous) as *mut _));
-                } else {
-                    // Nothing to do, halt the cpu and rerun the loop.
-                    CURRENT_TASK.set(NonNull::new(Arc::into_raw(previous) as *mut _));
-                    drop(rq);
-                    halt();
-                    continue;
-                }
-            }
-            (Some(previous), Some(next)) => {
+
+            drop(rq);
+
+            // TODO: MAYBE we can move the release of finished tasks to some worker thread.
+            if Task::current().poll().is_ready() {
+                let old_state = Task::current().state.swap(TaskState::DEAD);
+                assert!(
+                    old_state & TaskState::RUNNING != 0,
+                    "Current task should be at least in RUNNING state"
+                );
+
                 println_trace!(
                     "trace_scheduler",
-                    "Switching from task id({}) to task id({})",
-                    previous.id,
-                    next.id
+                    "Task {:?} finished execution, removing...",
+                    Task::current().id,
                 );
 
-                debug_assert_ne!(previous.id, next.id, "Switching to the same task");
+                self.remove_task(&Task::current());
 
-                if previous.state.is_running() || !previous.state.try_park() {
-                    rq.put(previous);
-                } else {
-                    previous.on_rq.store(false, Ordering::Release);
-                }
-
-                CURRENT_TASK.set(NonNull::new(Arc::into_raw(next) as *mut _));
+                CURRENT_TASK.set(None);
             }
         }
-
-        drop(rq);
-        // TODO: We can move the release of finished tasks to some worker thread.
-        if let ExecuteStatus::Finished = Task::current().run() {
-            let current = CURRENT_TASK
-                .swap(None)
-                .map(|ptr| unsafe { Arc::from_raw(ptr.as_ptr()) })
-                .expect("Current task should be present");
-            Scheduler::remove_task(&current);
-        }
     }
 }
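
Runtime::block_till_woken builds a one-shot future that hands the caller's Waker to a
user-supplied closure, returns Pending exactly once, and completes on the next poll.
Below is a runnable, user-space sketch of that contract, slightly simplified (an
Option::take replaces the explicit `slept` flag); all names are illustrative and not
the crate's API.

    use std::future::Future;
    use std::pin::Pin;
    use std::sync::{Arc, Mutex};
    use std::task::{Context, Poll, Wake, Waker};

    struct NoopWake;
    impl Wake for NoopWake {
        fn wake(self: Arc<Self>) {}
    }

    struct BlockTillWoken<F: FnOnce(&Waker)> {
        set_waker: Option<F>,
    }

    impl<F: FnOnce(&Waker) + Unpin> Future for BlockTillWoken<F> {
        type Output = ();

        fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
            match self.set_waker.take() {
                Some(set_waker) => {
                    set_waker(cx.waker()); // publish the waker, then sleep once
                    Poll::Pending
                }
                None => Poll::Ready(()), // second poll: we have been woken
            }
        }
    }

    fn main() {
        let slot: Arc<Mutex<Option<Waker>>> = Arc::new(Mutex::new(None));
        let slot2 = slot.clone();
        let mut fut = BlockTillWoken {
            set_waker: Some(move |w: &Waker| *slot2.lock().unwrap() = Some(w.clone())),
        };

        let waker = Waker::from(Arc::new(NoopWake));
        let mut cx = Context::from_waker(&waker);

        assert!(Pin::new(&mut fut).poll(&mut cx).is_pending());
        assert!(slot.lock().unwrap().is_some()); // the waker was handed out
        assert!(Pin::new(&mut fut).poll(&mut cx).is_ready());
    }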

+ 48 - 127
crates/eonix_runtime/src/task.rs

@@ -2,25 +2,22 @@ mod adapter;
 mod task_state;
 
 use crate::{
-    context::ExecutionContext,
-    executor::{ExecuteStatus, Executor, ExecutorBuilder, OutputHandle, Stack},
-    run::{Contexted, Run},
-    scheduler::Scheduler,
+    executor::{Executor, OutputHandle},
+    ready_queue::{cpu_rq, ReadyQueue},
 };
-use alloc::{boxed::Box, sync::Arc, task::Wake};
+use alloc::{sync::Arc, task::Wake};
 use atomic_unique_refcell::AtomicUniqueRefCell;
 use core::{
-    pin::{pin, Pin},
-    sync::atomic::{AtomicBool, AtomicU32, Ordering},
+    ops::DerefMut,
+    sync::atomic::{AtomicU32, Ordering},
     task::{Context, Poll, Waker},
 };
 use eonix_hal::processor::CPU;
-use eonix_preempt::assert_preempt_enabled;
-use eonix_sync::Spin;
-use intrusive_collections::RBTreeAtomicLink;
-use task_state::TaskState;
+use eonix_sync::{Spin, SpinIrq};
+use intrusive_collections::{LinkedListAtomicLink, RBTreeAtomicLink};
 
-pub use adapter::TaskAdapter;
+pub use adapter::{TaskAdapter, TaskRqAdapter};
+pub(crate) use task_state::TaskState;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub struct TaskId(u32);
@@ -33,160 +30,72 @@ where
     pub(crate) output_handle: Arc<Spin<OutputHandle<Output>>>,
 }
 
-/// A `Task` represents a schedulable unit.
-///
-/// Initial: state = Running, unparked = false
-///
-/// Task::park() => swap state <- Parking, assert prev == Running
-///              => swap unparked <- false
-///              -> true => store state <- Running => return
-///              -> false => goto scheduler => get rq lock => load state
-///                                                        -> Running => enqueue
-///                                                        -> Parking => cmpxchg Parking -> Parked
-///                                                                   -> Running => enqueue
-///                                                                   -> Parking => on_rq <- false
-///                                                                   -> Parked => ???
-///
-/// Task::unpark() => swap unparked <- true
-///                -> true => return
-///                -> false => swap state <- Running
-///                         -> Running => return
-///                         -> Parking | Parked => Scheduler::activate
 pub struct Task {
     /// Unique identifier of the task.
     pub id: TaskId,
-    /// Whether the task is on some run queue (a.k.a ready).
-    pub(crate) on_rq: AtomicBool,
-    /// Whether someone has called `unpark` on this task.
-    pub(crate) unparked: AtomicBool,
     /// The last cpu that the task was executed on.
     /// If `on_rq` is `false`, we can't assume that this task is still on the cpu.
     pub(crate) cpu: AtomicU32,
     /// Task state.
     pub(crate) state: TaskState,
-    /// Task execution context.
-    pub(crate) execution_context: ExecutionContext,
     /// Executor object.
-    executor: AtomicUniqueRefCell<Option<Pin<Box<dyn Executor>>>>,
+    executor: AtomicUniqueRefCell<Executor>,
     /// Link in the global task list.
     link_task_list: RBTreeAtomicLink,
+    /// Link in the ready queue.
+    link_ready_queue: LinkedListAtomicLink,
 }
 
 impl Task {
-    pub fn new<S, R>(runnable: R) -> TaskHandle<R::Output>
+    pub fn new<F>(future: F) -> TaskHandle<F::Output>
     where
-        S: Stack + 'static,
-        R: Run + Contexted + Send + 'static,
-        R::Output: Send + 'static,
+        F: Future + Send + 'static,
+        F::Output: Send + 'static,
     {
         static ID: AtomicU32 = AtomicU32::new(0);
 
-        let (executor, execution_context, output) = ExecutorBuilder::new()
-            .stack(S::new())
-            .runnable(runnable)
-            .build();
+        let (executor, output_handle) = Executor::new(future);
 
         let task = Arc::new(Self {
             id: TaskId(ID.fetch_add(1, Ordering::Relaxed)),
-            on_rq: AtomicBool::new(false),
-            unparked: AtomicBool::new(false),
             cpu: AtomicU32::new(CPU::local().cpuid() as u32),
-            state: TaskState::new(TaskState::RUNNING),
-            executor: AtomicUniqueRefCell::new(Some(executor)),
-            execution_context,
+            state: TaskState::new(TaskState::BLOCKED),
+            executor: AtomicUniqueRefCell::new(executor),
             link_task_list: RBTreeAtomicLink::new(),
+            link_ready_queue: LinkedListAtomicLink::new(),
         });
 
         TaskHandle {
             task,
-            output_handle: output,
+            output_handle,
         }
     }
 
-    pub fn run(&self) -> ExecuteStatus {
+    pub fn poll(self: &Arc<Self>) -> Poll<()> {
         let mut executor_borrow = self.executor.borrow();
+        let waker = Waker::from(self.clone());
+        let mut cx = Context::from_waker(&waker);
 
-        let executor = executor_borrow
-            .as_ref()
-            .expect("Executor should be present")
-            .as_ref()
-            .get_ref();
-
-        if let ExecuteStatus::Finished = executor.progress() {
-            executor_borrow.take();
-            ExecuteStatus::Finished
-        } else {
-            ExecuteStatus::Executing
-        }
+        executor_borrow.poll(&mut cx)
     }
 
-    pub fn unpark(self: &Arc<Self>) {
-        if self.unparked.swap(true, Ordering::Release) {
-            return;
-        }
-
-        eonix_preempt::disable();
+    /// Get the stabilized lock for the task's run queue.
+    pub fn rq(&self) -> impl DerefMut<Target = dyn ReadyQueue> + 'static {
+        loop {
+            let cpu = self.cpu.load(Ordering::Relaxed);
+            let rq = cpu_rq(cpu as usize).lock_irq();
 
-        match self.state.swap(TaskState::RUNNING) {
-            TaskState::RUNNING => {}
-            TaskState::PARKED | TaskState::PARKING => {
-                // We are waking up from sleep or someone else is parking this task.
-                // Try to wake it up.
-                Scheduler::get().activate(self);
+            // We stabilize the task cpu with the cpu rq here for now.
+            if cpu != self.cpu.load(Ordering::Acquire) {
+                continue;
             }
-            _ => unreachable!(),
-        }
 
-        eonix_preempt::enable();
-    }
-
-    pub fn park() {
-        eonix_preempt::disable();
-        Self::park_preempt_disabled();
-    }
-
-    /// Park the current task with `preempt::count() == 1`.
-    pub fn park_preempt_disabled() {
-        let task = Task::current();
-
-        let old_state = task.state.swap(TaskState::PARKING);
-        assert_eq!(
-            old_state,
-            TaskState::RUNNING,
-            "Parking a task that is not running."
-        );
-
-        if task.unparked.swap(false, Ordering::AcqRel) {
-            // Someone has called `unpark` on this task previously.
-            task.state.swap(TaskState::RUNNING);
-        } else {
-            unsafe {
-                // SAFETY: Preemption is disabled.
-                Scheduler::goto_scheduler(&Task::current().execution_context)
-            };
-            assert!(task.unparked.swap(false, Ordering::Acquire));
+            return rq;
         }
-
-        eonix_preempt::enable();
     }
 
-    pub fn block_on<F>(future: F) -> F::Output
-    where
-        F: Future,
-    {
-        assert_preempt_enabled!("block_on() must be called with preemption enabled");
-
-        let waker = Waker::from(Task::current().clone());
-        let mut context = Context::from_waker(&waker);
-        let mut future = pin!(future);
-
-        loop {
-            if let Poll::Ready(output) = future.as_mut().poll(&mut context) {
-                break output;
-            }
-
-            Task::park();
-        }
+    pub fn is_ready(&self) -> bool {
+        self.state.is_ready()
     }
 }
 
@@ -196,6 +105,18 @@ impl Wake for Task {
     }
 
     fn wake_by_ref(self: &Arc<Self>) {
-        self.unpark();
+        let Ok(old) = self.state.update(|state| match state {
+            TaskState::BLOCKED => Some(TaskState::READY),
+            TaskState::RUNNING => Some(TaskState::READY | TaskState::RUNNING),
+            TaskState::READY | TaskState::READY_RUNNING => None,
+            state => unreachable!("Waking a {state:?} task"),
+        }) else {
+            return;
+        };
+
+        if old == TaskState::BLOCKED {
+            // If the task was blocked, we need to put it back to the ready queue.
+            self.rq().put(self.clone());
+        }
     }
 }
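
Task::rq() pins down the task's run queue with a lock-then-revalidate loop: read the
task's home CPU, lock that CPU's queue, and retry if the task migrated in between. A
self-contained sketch of the same pattern, with illustrative types only:

    use std::sync::atomic::{AtomicU32, Ordering};
    use std::sync::{Mutex, MutexGuard};

    struct Queues {
        per_cpu: Vec<Mutex<Vec<u32 /* task id */>>>,
    }

    fn stable_queue<'a>(queues: &'a Queues, task_cpu: &AtomicU32) -> MutexGuard<'a, Vec<u32>> {
        loop {
            let cpu = task_cpu.load(Ordering::Relaxed) as usize;
            let guard = queues.per_cpu[cpu].lock().unwrap();
            // Re-check under the lock: if the task migrated meanwhile, retry.
            if cpu == task_cpu.load(Ordering::Acquire) as usize {
                return guard;
            }
            // `guard` is dropped here and the loop retries with the new CPU.
        }
    }

    fn main() {
        let queues = Queues {
            per_cpu: (0..2).map(|_| Mutex::new(Vec::new())).collect(),
        };
        let task_cpu = AtomicU32::new(1);
        stable_queue(&queues, &task_cpu).push(42); // enqueue on the task's home CPU
        assert_eq!(queues.per_cpu[1].lock().unwrap().as_slice(), &[42]);
    }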

+ 2 - 1
crates/eonix_runtime/src/task/adapter.rs

@@ -1,8 +1,9 @@
 use super::{Task, TaskId};
 use alloc::sync::Arc;
-use intrusive_collections::{intrusive_adapter, KeyAdapter, RBTreeAtomicLink};
+use intrusive_collections::{intrusive_adapter, KeyAdapter, LinkedListAtomicLink, RBTreeAtomicLink};
 
 intrusive_adapter!(pub TaskAdapter = Arc<Task>: Task { link_task_list: RBTreeAtomicLink });
+intrusive_adapter!(pub TaskRqAdapter = Arc<Task>: Task { link_ready_queue: LinkedListAtomicLink });
 
 impl<'a> KeyAdapter<'a> for TaskAdapter {
     type Key = TaskId;

+ 11 - 17
crates/eonix_runtime/src/task/task_state.rs

@@ -4,32 +4,26 @@ use core::sync::atomic::{AtomicU32, Ordering};
 pub struct TaskState(AtomicU32);
 
 impl TaskState {
-    pub const RUNNING: u32 = 0;
-    pub const PARKING: u32 = 1;
-    pub const PARKED: u32 = 2;
+    pub const BLOCKED: u32 = 0;
+    pub const READY: u32 = 1;
+    pub const RUNNING: u32 = 2;
+    pub const READY_RUNNING: u32 = TaskState::READY | TaskState::RUNNING;
+    pub const DEAD: u32 = 1 << 31;
 
     pub(crate) const fn new(state: u32) -> Self {
         Self(AtomicU32::new(state))
     }
 
     pub(crate) fn swap(&self, state: u32) -> u32 {
-        self.0.swap(state, Ordering::AcqRel)
+        self.0.swap(state, Ordering::SeqCst)
     }
 
-    pub(crate) fn try_park(&self) -> bool {
-        match self.0.compare_exchange(
-            TaskState::PARKING,
-            TaskState::PARKED,
-            Ordering::AcqRel,
-            Ordering::Acquire,
-        ) {
-            Ok(_) => true,
-            Err(TaskState::RUNNING) => false,
-            Err(_) => unreachable!("Invalid task state while trying to park."),
-        }
+    pub(crate) fn update(&self, func: impl FnMut(u32) -> Option<u32>) -> Result<u32, u32> {
+        self.0
+            .fetch_update(Ordering::SeqCst, Ordering::SeqCst, func)
     }
 
-    pub(crate) fn is_running(&self) -> bool {
-        self.0.load(Ordering::Acquire) == Self::RUNNING
+    pub(crate) fn is_ready(&self) -> bool {
+        self.0.load(Ordering::SeqCst) & Self::READY == Self::READY
     }
 }
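
The reworked TaskState is a small bitflag state machine (BLOCKED = 0, READY = 1,
RUNNING = 2, READY_RUNNING = READY | RUNNING, DEAD = 1 << 31) driven by fetch_update.
The standalone sketch below mirrors the wake transition used by wake_by_ref: only a
BLOCKED -> READY transition obliges the caller to enqueue the task; DEAD is omitted
here since the real code treats it as unreachable on this path.

    use std::sync::atomic::{AtomicU32, Ordering};

    const BLOCKED: u32 = 0;
    const READY: u32 = 1;
    const RUNNING: u32 = 2;

    /// Returns `true` if the caller has to put the task back on a ready queue.
    fn wake(state: &AtomicU32) -> bool {
        let res = state.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |s| match s {
            BLOCKED => Some(READY),
            RUNNING => Some(READY | RUNNING),
            _ => None, // already READY or READY_RUNNING: nothing to do
        });
        matches!(res, Ok(BLOCKED))
    }

    fn main() {
        let state = AtomicU32::new(BLOCKED);
        assert!(wake(&state)); // blocked -> ready, needs enqueueing
        assert!(!wake(&state)); // already ready, no-op
        state.store(RUNNING, Ordering::SeqCst);
        assert!(!wake(&state)); // running -> ready|running, the scheduler re-enqueues it
    }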

+ 3 - 0
crates/eonix_sync/eonix_sync_base/src/locked/proof.rs

@@ -25,6 +25,9 @@ where
     _phantom: PhantomData<&'pos ()>,
 }
 
+unsafe impl<T: ?Sized> Send for Proof<'_, T> {}
+unsafe impl<T: ?Sized> Send for ProofMut<'_, T> {}
+
 /// A trait for types that can be converted to a proof of mutable access.
 ///
 /// This is used to prove that a mutable reference is valid for the lifetime `'pos`

+ 28 - 0
crates/posix_types/src/getdent.rs

@@ -0,0 +1,28 @@
+#[derive(Copy, Clone, Debug)]
+#[repr(C, packed)]
+pub struct UserDirent64 {
+    /// Inode number
+    pub d_ino: u64,
+    /// Implementation defined. We ignore it
+    pub d_off: u64,
+    /// Length of this record
+    pub d_reclen: u16,
+    /// File type. Set to 0
+    pub d_type: u8,
+    /// Filename with a padding '\0'
+    pub d_name: [u8; 0],
+}
+
+/// File type is at offset `d_reclen - 1`. Set it to 0
+#[derive(Copy, Clone, Debug)]
+#[repr(C, packed)]
+pub struct UserDirent {
+    /// Inode number
+    pub d_ino: u32,
+    /// Implementation defined. We ignore it
+    pub d_off: u32,
+    /// Length of this record
+    pub d_reclen: u16,
+    /// Filename with a padding '\0'
+    pub d_name: [u8; 0],
+}
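
UserDirent64 mirrors the usual linux_dirent64 layout: a packed fixed header followed
by an inline, NUL-terminated name. As a hedged sketch (an assumption about the
intended ABI, not something stated in this patch), an emitter would typically size
each record as the 19-byte header plus the name and its NUL, rounded up to an 8-byte
boundary:

    // Illustrative only: reclen calculation for a UserDirent64-shaped record.
    fn dirent64_reclen(name_len: usize) -> usize {
        const HEADER: usize = 8 + 8 + 2 + 1; // d_ino, d_off, d_reclen, d_type
        (HEADER + name_len + 1 + 7) & !7 // + NUL terminator, align to 8 bytes
    }

    fn main() {
        assert_eq!(dirent64_reclen(1), 24); // "a"
        assert_eq!(dirent64_reclen(5), 32); // "hello"
    }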

+ 1 - 0
crates/posix_types/src/lib.rs

@@ -2,6 +2,7 @@
 
 pub mod constants;
 pub mod ctypes;
+pub mod getdent;
 pub mod namei;
 pub mod open;
 pub mod poll;

+ 10 - 0
crates/posix_types/src/result.rs

@@ -13,3 +13,13 @@ impl From<PosixError> for u32 {
         }
     }
 }
+
+impl core::fmt::Debug for PosixError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self {
+            Self::EFAULT => write!(f, "EFAULT"),
+            Self::EXDEV => write!(f, "EXDEV"),
+            Self::EINVAL => write!(f, "EINVAL"),
+        }
+    }
+}

+ 40 - 21
macros/src/lib.rs

@@ -18,6 +18,11 @@ fn define_syscall_impl(attrs: TokenStream, item: TokenStream) -> TokenStream {
     let args = item.sig.inputs.iter();
     let ty_ret = item.sig.output;
 
+    assert!(
+        item.sig.asyncness.is_some(),
+        "Syscall must be an async function"
+    );
+
     let args_mapped = item
         .sig
         .inputs
@@ -100,36 +105,50 @@ fn define_syscall_impl(attrs: TokenStream, item: TokenStream) -> TokenStream {
             };
 
         #[link_section = #syscall_fn_section]
-        fn #helper_fn (
-            thd: &crate::kernel::task::Thread,
+        fn #helper_fn <'thd, 'alloc>(
+            thd: &'thd crate::kernel::task::Thread,
+            thd_alloc: crate::kernel::task::ThreadAlloc<'alloc>,
             args: [usize; 6]
-        ) -> Option<usize> {
+        ) -> core::pin::Pin<Box<
+            dyn core::future::Future<Output = Option<usize>> + Send + 'thd,
+            crate::kernel::task::ThreadAlloc<'alloc>
+        >> {
             use crate::kernel::syscall::{FromSyscallArg, SyscallRetVal};
+            use alloc::boxed::Box;
 
             #(#args_mapped)*
 
-            eonix_log::println_trace!(
-                "trace_syscall",
-                "tid{}: {}({}) => {{",
-                thd.tid,
-                #syscall_name_str,
-                format_args!(#trace_format_string, #trace_format_args),
-            );
-
-            let retval = #real_fn(thd, #(#args_call),*).into_retval();
-
-            eonix_log::println_trace!(
-                "trace_syscall",
-                "}} => {:x?}",
-                retval,
-            );
-
-            retval
+            unsafe {
+                core::pin::Pin::new_unchecked(
+                    Box::new_in(
+                        async move {
+                            eonix_log::println_trace!(
+                                "trace_syscall",
+                                "tid{}: {}({}) => {{",
+                                thd.tid,
+                                #syscall_name_str,
+                                format_args!(#trace_format_string, #trace_format_args),
+                            );
+
+                            let retval = #real_fn(thd, #(#args_call),*).await.into_retval();
+
+                            eonix_log::println_trace!(
+                                "trace_syscall",
+                                "}} => {:x?}",
+                                retval,
+                            );
+
+                            retval
+                        },
+                        thd_alloc
+                    )
+                )
+            }
         }
 
         #(#attrs)*
         #[link_section = #syscall_fn_section]
-        #vis fn #real_fn(
+        #vis async fn #real_fn(
             thread: &crate::kernel::task::Thread,
             #(#args),*
         ) #ty_ret #body
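
With this change every syscall body becomes an async fn, and the generated helper
boxes the traced body into a type-erased
core::pin::Pin<Box<dyn Future<Output = Option<usize>> + Send + '_>> allocated from the
per-thread ThreadAlloc. The sketch below shows the same erasure with the custom
allocator replaced by the global one; sys_getpid, Thread and getpid_helper are
hypothetical names used only for illustration.

    use std::future::Future;
    use std::pin::Pin;

    struct Thread {
        tid: u32,
    }

    // A stand-in syscall body; the real ones also go through into_retval().
    async fn sys_getpid(thread: &Thread) -> Option<usize> {
        Some(thread.tid as usize)
    }

    // What the macro-generated helper boils down to, minus ThreadAlloc and tracing.
    fn getpid_helper<'thd>(
        thread: &'thd Thread,
        _args: [usize; 6],
    ) -> Pin<Box<dyn Future<Output = Option<usize>> + Send + 'thd>> {
        Box::pin(async move { sys_getpid(thread).await })
    }

    fn main() {
        // Driving the boxed future needs an executor; here we only show that every
        // syscall helper now shares one erased signature for the dispatch table.
        let _fut = getpid_helper(&Thread { tid: 1 }, [0; 6]);
    }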

+ 2 - 2
src/driver/ahci/mod.rs

@@ -6,6 +6,7 @@ use crate::{
         constants::{EINVAL, EIO},
         interrupt::register_irq_handler,
         pcie::{self, Header, PCIDevice, PCIDriver, PciError},
+        task::block_on,
     },
     prelude::*,
 };
@@ -13,7 +14,6 @@ use alloc::{format, sync::Arc};
 use control::AdapterControl;
 use defs::*;
 use eonix_mm::address::{AddrOps as _, PAddr};
-use eonix_runtime::task::Task;
 use eonix_sync::SpinIrq as _;
 use port::AdapterPort;
 
@@ -133,7 +133,7 @@ impl Device<'static> {
                     port,
                 )?;
 
-                Task::block_on(port.partprobe())?;
+                block_on(port.partprobe())?;
 
                 Ok(())
             })() {

+ 3 - 3
src/driver/ahci/port.rs

@@ -9,11 +9,11 @@ use crate::kernel::block::{BlockDeviceRequest, BlockRequestQueue};
 use crate::kernel::constants::{EINVAL, EIO};
 use crate::kernel::mem::paging::Page;
 use crate::kernel::mem::AsMemoryBlock as _;
+use crate::kernel::task::block_on;
 use crate::prelude::*;
 use alloc::collections::vec_deque::VecDeque;
 use core::pin::pin;
 use eonix_mm::address::{Addr as _, PAddr};
-use eonix_runtime::task::Task;
 use eonix_sync::{SpinIrq as _, WaitList};
 
 /// An `AdapterPort` is an HBA device in AHCI mode.
@@ -156,7 +156,7 @@ impl AdapterPort<'_> {
             wait.as_mut().add_to_wait_list();
             drop(free_list);
 
-            Task::block_on(wait);
+            block_on(wait);
         }
     }
 
@@ -222,7 +222,7 @@ impl AdapterPort<'_> {
 
         self.stats.inc_cmd_sent();
 
-        if let Err(_) = Task::block_on(slot.wait_finish()) {
+        if let Err(_) = block_on(slot.wait_finish()) {
             self.stats.inc_cmd_error();
             return Err(EIO);
         };

+ 3 - 3
src/driver/serial.rs

@@ -3,14 +3,14 @@ mod io;
 use crate::{
     kernel::{
         block::make_device, console::set_console, constants::EIO, interrupt::register_irq_handler,
-        task::KernelStack, CharDevice, CharDeviceType, Terminal, TerminalDevice,
+        CharDevice, CharDeviceType, Terminal, TerminalDevice,
     },
     prelude::*,
 };
 use alloc::{collections::vec_deque::VecDeque, format, sync::Arc};
 use bitflags::bitflags;
 use core::pin::pin;
-use eonix_runtime::{run::FutureRun, scheduler::Scheduler};
+use eonix_runtime::scheduler::RUNTIME;
 use eonix_sync::{SpinIrq as _, WaitList};
 use io::SerialIO;
 
@@ -161,7 +161,7 @@ impl Serial {
             })?;
         }
 
-        Scheduler::get().spawn::<KernelStack, _>(FutureRun::new(Self::worker(port.clone())));
+        RUNTIME.spawn(Self::worker(port.clone()));
 
         let _ = set_console(terminal.clone());
         eonix_log::set_console(terminal.clone());

+ 2 - 2
src/driver/virtio/loongarch64.rs

@@ -3,13 +3,13 @@ use crate::kernel::{
     block::{make_device, BlockDevice},
     constants::EIO,
     pcie::{self, PCIDevice, PCIDriver, PciError, SegmentGroup},
+    task::block_on,
 };
 use alloc::sync::Arc;
 use core::sync::atomic::{AtomicUsize, Ordering};
 use eonix_hal::{fence::memory_barrier, mm::ArchPhysAccess};
 use eonix_log::println_warn;
 use eonix_mm::address::PhysAccess;
-use eonix_runtime::task::Task;
 use eonix_sync::Spin;
 use virtio_drivers::{
     device::blk::VirtIOBlk,
@@ -134,7 +134,7 @@ impl PCIDriver for VirtIODriver {
             Arc::new(Spin::new(virtio_block)),
         )?;
 
-        Task::block_on(block_device.partprobe()).map_err(|err| {
+        block_on(block_device.partprobe()).map_err(|err| {
             println_warn!(
                 "Failed to probe partitions for VirtIO Block device: {}",
                 err

+ 3 - 9
src/driver/virtio/riscv64.rs

@@ -1,23 +1,17 @@
 use super::virtio_blk::HAL;
 use crate::kernel::{
     block::{make_device, BlockDevice},
-    mem::{AsMemoryBlock, MemoryBlock, Page},
+    task::block_on,
 };
 use alloc::{sync::Arc, vec::Vec};
-use core::num::NonZero;
 use eonix_hal::arch_exported::fdt::FDT;
 use eonix_hal::mm::ArchPhysAccess;
 use eonix_log::{println_info, println_warn};
-use eonix_mm::{
-    address::{Addr, PAddr, PhysAccess},
-    paging::PFN,
-};
-use eonix_runtime::task::Task;
+use eonix_mm::address::{PAddr, PhysAccess};
 use eonix_sync::Spin;
 use virtio_drivers::{
     device::blk::VirtIOBlk,
     transport::{mmio::MmioTransport, Transport},
-    Hal,
 };
 
 pub fn init() {
@@ -55,7 +49,7 @@ pub fn init() {
                     )
                     .expect("Failed to register VirtIO Block device");
 
-                    Task::block_on(block_device.partprobe())
+                    block_on(block_device.partprobe())
                         .expect("Failed to probe partitions for VirtIO Block device");
 
                     disk_id += 1;

+ 17 - 1
src/driver/virtio/virtio_blk.rs

@@ -84,7 +84,23 @@ where
 
     fn submit(&self, req: BlockDeviceRequest) -> KResult<()> {
         match req {
-            BlockDeviceRequest::Write { .. } => todo!(),
+            BlockDeviceRequest::Write {
+                sector,
+                count,
+                buffer,
+            } => {
+                let mut dev = self.lock();
+                for ((start, len), buffer_page) in
+                    Chunks::new(sector as usize, count as usize, 8).zip(buffer.iter())
+                {
+                    let buffer = unsafe {
+                        // SAFETY: Pages in `req.buffer` are guaranteed to be exclusively owned by us.
+                        &buffer_page.as_memblk().as_bytes()[..len as usize * 512]
+                    };
+
+                    dev.write_blocks(start, buffer).map_err(|_| EIO)?;
+                }
+            }
             BlockDeviceRequest::Read {
                 sector,
                 count,
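
The new write arm splits a request into chunks of at most 8 sectors (one 4 KiB page at
the 512-byte sector size used above), one chunk per buffer page. A small,
assumption-labelled sketch of what a Chunks::new(sector, count, 8)-style iterator
would yield; `chunks` here is illustrative, not the kernel's type:

    // Illustrative stand-in for the kernel's Chunks iterator.
    fn chunks(start: usize, count: usize, max: usize) -> impl Iterator<Item = (usize, usize)> {
        (0..count)
            .step_by(max)
            .map(move |off| (start + off, core::cmp::min(max, count - off)))
    }

    fn main() {
        let parts: Vec<_> = chunks(100, 20, 8).collect();
        assert_eq!(parts, vec![(100, 8), (108, 8), (116, 4)]); // (start_sector, sector_count)
    }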

+ 456 - 70
src/fs/ext4.rs

@@ -1,17 +1,21 @@
 use core::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 
-use crate::kernel::mem::{PageCache, PageCacheBackend};
+use crate::kernel::mem::{CachePage, CachePageStream, PageCache, PageCacheBackend};
+use crate::kernel::task::block_on;
+use crate::kernel::timer::Ticks;
+use crate::kernel::vfs::inode::{AtomicMode, Mode};
 use crate::{
-    io::{Buffer, ByteBuffer},
+    io::{Buffer, ByteBuffer, Stream},
     kernel::{
         block::BlockDevice,
-        constants::EIO,
+        constants::{EEXIST, EINVAL, EIO, ENOSYS},
         timer::Instant,
         vfs::{
-            dentry::Dentry,
-            inode::{define_struct_inode, AtomicNlink, Ino, Inode, InodeData},
+            dentry::{dcache, Dentry},
+            inode::{
+                define_struct_inode, AtomicNlink, Ino, Inode, InodeData, RenameData, WriteOffset,
+            },
             mount::{register_filesystem, Mount, MountCreator},
-            s_isdir, s_isreg,
             vfs::Vfs,
             DevId, FsContext,
         },
@@ -24,10 +28,10 @@ use alloc::{
     collections::btree_map::{BTreeMap, Entry},
     sync::Arc,
 };
-use eonix_runtime::task::Task;
+use another_ext4::{
+    Block, BlockDevice as Ext4BlockDeviceTrait, Ext4, FileType, InodeMode, PBlockId,
+};
 use eonix_sync::RwLock;
-use ext4_rs::{BlockDevice as Ext4BlockDeviceTrait, Ext4Error};
-use ext4_rs::{Errno, Ext4};
 
 pub struct Ext4BlockDevice {
     device: Arc<BlockDevice>,
@@ -40,20 +44,25 @@ impl Ext4BlockDevice {
 }
 
 impl Ext4BlockDeviceTrait for Ext4BlockDevice {
-    fn read_offset(&self, offset: usize) -> Vec<u8> {
-        let mut buffer = vec![0u8; 4096];
+    fn read_block(&self, block_id: PBlockId) -> Block {
+        let mut buffer = [0u8; 4096];
         let mut byte_buffer = ByteBuffer::new(buffer.as_mut_slice());
 
         let _ = self
             .device
-            .read_some(offset, &mut byte_buffer)
+            .read_some((block_id as usize) * 4096, &mut byte_buffer)
             .expect("Failed to read from block device");
 
-        buffer
+        Block {
+            id: block_id,
+            data: buffer,
+        }
     }
 
-    fn write_offset(&self, _offset: usize, _data: &[u8]) {
-        todo!()
+    fn write_block(&self, block: &another_ext4::Block) {
+        let _ = self
+            .device
+            .write_some((block.id as usize) * 4096, &block.data);
     }
 }
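
another_ext4 addresses the device in fixed 4 KiB blocks while read_some/write_some
take byte offsets, so the adapter only needs a multiplication in each direction. A
trivial sketch of that arithmetic (the 512-byte sector size is an assumption about
the layer underneath, not something this hunk states):

    const EXT4_BLOCK_SIZE: usize = 4096;
    const SECTOR_SIZE: usize = 512;

    // Byte offset handed to the kernel block layer for a given ext4 block id.
    fn byte_offset(block_id: u64) -> usize {
        block_id as usize * EXT4_BLOCK_SIZE
    }

    fn main() {
        assert_eq!(byte_offset(3), 12288);
        assert_eq!(EXT4_BLOCK_SIZE / SECTOR_SIZE, 8); // sectors per ext4 block
    }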
 
@@ -74,7 +83,7 @@ impl Vfs for Ext4Fs {
     }
 
     fn is_read_only(&self) -> bool {
-        true
+        false
     }
 }
 
@@ -83,33 +92,67 @@ impl Ext4Fs {
         icache.get(&ino).cloned().map(Ext4Inode::into_inner)
     }
 
+    fn modify_inode_stat(&self, ino: u32, size: Option<u64>, mtime: u32) {
+        let _ = self
+            .inner
+            .setattr(ino, None, None, None, size, None, Some(mtime), None, None);
+    }
+
+    fn create_inode_stat(&self, parent: u32, child: u32, mtime: u32) {
+        let _ = self.inner.setattr(
+            parent,
+            None,
+            None,
+            None,
+            None,
+            None,
+            Some(mtime),
+            None,
+            None,
+        );
+        let _ = self
+            .inner
+            .setattr(child, None, None, None, None, None, Some(mtime), None, None);
+    }
+
+    fn chmod_stat(&self, ino: u32, new_mode: u16, ctime: u32) {
+        let _ = self.inner.setattr(
+            ino,
+            Some(InodeMode::from_bits_retain(new_mode.try_into().unwrap())),
+            None,
+            None,
+            None,
+            None,
+            None,
+            Some(ctime),
+            None,
+        );
+    }
+
     fn get_or_insert(
         &self,
         icache: &mut BTreeMap<Ino, Ext4Inode>,
-        mut idata: InodeData,
+        idata: InodeData,
     ) -> Arc<dyn Inode> {
         match icache.entry(idata.ino) {
             Entry::Occupied(occupied) => occupied.get().clone().into_inner(),
-            Entry::Vacant(vacant) => {
-                let mode = *idata.mode.get_mut();
-                if s_isreg(mode) {
-                    vacant
-                        .insert(Ext4Inode::File(FileInode::new(idata)))
-                        .clone()
-                        .into_inner()
-                } else if s_isdir(mode) {
+            Entry::Vacant(vacant) => match idata.mode.load().format() {
+                Mode::REG => vacant
+                    .insert(Ext4Inode::File(FileInode::with_idata(idata)))
+                    .clone()
+                    .into_inner(),
+                Mode::DIR => vacant
+                    .insert(Ext4Inode::Dir(Arc::new(DirInode { idata })))
+                    .clone()
+                    .into_inner(),
+                mode => {
+                    println_warn!("ext4: Unsupported inode type: {:#o}", mode.format_bits());
                     vacant
-                        .insert(Ext4Inode::Dir(Arc::new(DirInode { idata })))
-                        .clone()
-                        .into_inner()
-                } else {
-                    println_warn!("ext4: Unsupported inode type: {mode:#o}");
-                    vacant
-                        .insert(Ext4Inode::File(FileInode::new(idata)))
+                        .insert(Ext4Inode::File(FileInode::with_idata(idata)))
                         .clone()
                         .into_inner()
                 }
-            }
+            },
         }
     }
 }
@@ -117,7 +160,7 @@ impl Ext4Fs {
 impl Ext4Fs {
     pub fn create(device: Arc<BlockDevice>) -> KResult<(Arc<Self>, Arc<dyn Inode>)> {
         let ext4_device = Ext4BlockDevice::new(device.clone());
-        let ext4 = Ext4::open(Arc::new(ext4_device));
+        let ext4 = Ext4::load(Arc::new(ext4_device)).unwrap();
 
         let ext4fs = Arc::new(Self {
             inner: ext4,
@@ -126,29 +169,29 @@ impl Ext4Fs {
         });
 
         let root_inode = {
-            let mut icache = Task::block_on(ext4fs.icache.write());
-            let root_inode = ext4fs.inner.get_inode_ref(2);
+            let mut icache = block_on(ext4fs.icache.write());
+            let root_inode = ext4fs.inner.read_root_inode();
 
             ext4fs.get_or_insert(
                 &mut icache,
                 InodeData {
-                    ino: root_inode.inode_num as Ino,
+                    ino: root_inode.id as Ino,
                     size: AtomicU64::new(root_inode.inode.size()),
-                    nlink: AtomicNlink::new(root_inode.inode.links_count() as _),
+                    nlink: AtomicNlink::new(root_inode.inode.link_count() as _),
                     uid: AtomicU32::new(root_inode.inode.uid() as _),
                     gid: AtomicU32::new(root_inode.inode.gid() as _),
-                    mode: AtomicU32::new(root_inode.inode.mode() as _),
+                    mode: AtomicMode::new(root_inode.inode.mode().bits() as _),
                     atime: Spin::new(Instant::new(
                         root_inode.inode.atime() as _,
-                        root_inode.inode.i_atime_extra() as _,
+                        root_inode.inode.atime_extra() as _,
                     )),
                     ctime: Spin::new(Instant::new(
                         root_inode.inode.ctime() as _,
-                        root_inode.inode.i_ctime_extra() as _,
+                        root_inode.inode.ctime_extra() as _,
                     )),
                     mtime: Spin::new(Instant::new(
                         root_inode.inode.mtime() as _,
-                        root_inode.inode.i_mtime_extra() as _,
+                        root_inode.inode.mtime_extra() as _,
                     )),
                     rwsem: RwLock::new(()),
                     vfs: Arc::downgrade(&ext4fs) as _,
@@ -177,6 +220,7 @@ impl Ext4Inode {
 
 define_struct_inode! {
     struct FileInode {
+        last_sync: AtomicU64,
         page_cache: PageCache,
     }
 }
@@ -186,23 +230,49 @@ define_struct_inode! {
 }
 
 impl FileInode {
-    fn new(idata: InodeData) -> Arc<Self> {
+    fn with_idata(idata: InodeData) -> Arc<Self> {
         let inode = Arc::new_cyclic(|weak_self: &Weak<FileInode>| Self {
             idata,
+            last_sync: AtomicU64::new(0),
             page_cache: PageCache::new(weak_self.clone()),
         });
 
         inode
     }
+
+    pub fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
+        Arc::new_cyclic(|weak_self: &Weak<FileInode>| Self {
+            idata: {
+                let inode_data = InodeData::new(ino, vfs);
+                inode_data.mode.store(Mode::REG.perm(mode.bits()));
+                inode_data.nlink.store(1, Ordering::Relaxed);
+                inode_data
+            },
+            last_sync: AtomicU64::new(0),
+            page_cache: PageCache::new(weak_self.clone()),
+        })
+    }
+
+    fn sync_if_needed(&self) {
+        let now = Ticks::now().in_secs();
+        let last = self.last_sync.load(Ordering::Relaxed);
+
+        // TODO: this is a temporary implementation;
+        // consider replacing it with an update strategy such as LRU in the future.
+        if now - last > 10 {
+            self.last_sync.store(now, Ordering::Relaxed);
+            let _ = block_on(self.page_cache.fsync());
+        }
+    }
 }
 
 impl PageCacheBackend for FileInode {
-    fn read_page(&self, page: &mut crate::kernel::mem::CachePage, offset: usize) -> KResult<usize> {
+    fn read_page(&self, page: &mut CachePage, offset: usize) -> KResult<usize> {
         self.read_direct(page, offset)
     }
 
-    fn write_page(&self, page: &crate::kernel::mem::CachePage, offset: usize) -> KResult<usize> {
-        todo!()
+    fn write_page(&self, page: &mut CachePageStream, offset: usize) -> KResult<usize> {
+        self.write_direct(page, offset)
     }
 
     fn size(&self) -> usize {
@@ -216,7 +286,7 @@ impl Inode for FileInode {
     }
 
     fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
-        Task::block_on(self.page_cache.read(buffer, offset))
+        block_on(self.page_cache.read(buffer, offset))
     }
 
     fn read_direct(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
@@ -224,13 +294,135 @@ impl Inode for FileInode {
         let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
 
         let mut temp_buf = vec![0u8; buffer.total()];
-        match ext4fs.inner.read_at(self.ino as u32, offset, &mut temp_buf) {
+        match ext4fs.inner.read(self.ino as u32, offset, &mut temp_buf) {
             Ok(bytes_read) => {
                 let _ = buffer.fill(&temp_buf[..bytes_read])?;
                 Ok(buffer.wrote())
             }
-            Err(e) => Err(e.error() as u32),
+            Err(e) => Err(e.code() as u32),
+        }
+    }
+
+    fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+        let _lock = block_on(self.rwsem.write());
+
+        let mut store_new_end = None;
+        let offset = match offset {
+            WriteOffset::Position(offset) => offset,
+            // TODO: additional handling may be needed here
+            WriteOffset::End(end) => {
+                store_new_end = Some(end);
+                self.size.load(Ordering::Relaxed) as usize
+            }
+        };
+
+        let total_written = block_on(self.page_cache.write(stream, offset))?;
+        let cursor_end = offset + total_written;
+        if let Some(store_end) = store_new_end {
+            *store_end = cursor_end;
+        }
+
+        let mtime = Instant::now();
+        *self.mtime.lock() = mtime;
+        self.size.store(cursor_end as u64, Ordering::Relaxed);
+
+        self.sync_if_needed();
+
+        Ok(total_written)
+    }
+
+    fn write_direct(&self, stream: &mut dyn Stream, offset: usize) -> KResult<usize> {
+        //let _lock = Task::block_on(self.rwsem.write());
+
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
+
+        let mut temp_buf = vec![0u8; 4096];
+        let mut total_written = 0;
+
+        while let Some(data) = stream.poll_data(&mut temp_buf)? {
+            let written = ext4fs
+                .inner
+                .write(self.ino as u32, offset + total_written, data)
+                .unwrap();
+            total_written += written;
+            if written < data.len() {
+                break;
+            }
         }
+
+        ext4fs.modify_inode_stat(
+            self.ino as u32,
+            Some(self.size() as u64),
+            self.mtime.lock().since_epoch().as_secs() as u32,
+        );
+
+        Ok(total_written)
+    }
+
+    fn chmod(&self, mode: Mode) -> KResult<()> {
+        let _lock = block_on(self.rwsem.write());
+
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
+        let old_mode = self.mode.load();
+        let new_mode = old_mode.perm(mode.bits());
+
+        let now = Instant::now();
+        ext4fs.chmod_stat(
+            self.ino as u32,
+            new_mode.bits() as u16,
+            now.since_epoch().as_secs() as u32,
+        );
+
+        // SAFETY: `rwsem` has done the synchronization
+        self.mode.store(new_mode);
+        *self.ctime.lock() = now;
+
+        Ok(())
+    }
+
+    // TODO
+    fn truncate(&self, _length: usize) -> KResult<()> {
+        Ok(())
+    }
+}
+
+impl DirInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
+        Arc::new_cyclic(|_| DirInode {
+            idata: {
+                let inode_data = InodeData::new(ino, vfs);
+                inode_data.mode.store(Mode::DIR.perm(mode.bits()));
+                inode_data.nlink.store(2, Ordering::Relaxed);
+                inode_data.size.store(4096, Ordering::Relaxed);
+                inode_data
+            },
+        })
+    }
+
+    fn update_time(&self, time: Instant) {
+        *self.ctime.lock() = time;
+        *self.mtime.lock() = time;
+    }
+
+    fn update_child_time(&self, child: &dyn Inode, time: Instant) {
+        self.update_time(time);
+        *child.ctime.lock() = time;
+        *child.mtime.lock() = time;
+    }
+
+    fn link_file(&self) {
+        self.size.fetch_add(1, Ordering::Relaxed);
+    }
+
+    fn link_dir(&self) {
+        self.nlink.fetch_add(1, Ordering::Relaxed);
+        self.size.fetch_add(1, Ordering::Relaxed);
+    }
+
+    fn unlink_dir(&self) {
+        self.nlink.fetch_sub(1, Ordering::Relaxed);
     }
 }
 
@@ -241,36 +433,46 @@ impl Inode for DirInode {
 
         let name = dentry.get_name();
         let name = String::from_utf8_lossy(&name);
-        let lookup_result = ext4fs.inner.fuse_lookup(self.ino, &name);
+        let lookup_result = ext4fs.inner.lookup(self.ino as u32, &name);
 
-        const EXT4_ERROR_ENOENT: Ext4Error = Ext4Error::new(Errno::ENOENT);
+        // TODO: map an ENOENT error from lookup back to Ok(None), as the old code did
+        //const EXT4_ERROR_ENOENT: Ext4Error_ = Ext4Error_::new(ErrCode::ENOENT);
         let attr = match lookup_result {
-            Ok(attr) => attr,
-            Err(EXT4_ERROR_ENOENT) => return Ok(None),
-            Err(error) => return Err(error.error() as u32),
+            Ok(inode_id) => ext4fs.inner.getattr(inode_id).unwrap(),
+            //Err(EXT4_ERROR_ENOENT) => return Ok(None),
+            Err(error) => return Err(error.code() as u32),
         };
 
         // Fast path: if the inode is already in the cache, return it.
-        if let Some(inode) = ext4fs.try_get(&Task::block_on(ext4fs.icache.read()), attr.ino as u64)
-        {
+        if let Some(inode) = ext4fs.try_get(&block_on(ext4fs.icache.read()), attr.ino as u64) {
             return Ok(Some(inode));
         }
 
-        let extra_perm = attr.perm.bits() as u32 & 0o7000;
-        let perm = attr.perm.bits() as u32 & 0o0700;
-        let real_perm = extra_perm | perm | perm >> 3 | perm >> 6;
+        let file_type_bits = match attr.ftype {
+            FileType::RegularFile => InodeMode::FILE.bits(),
+            FileType::Directory => InodeMode::DIRECTORY.bits(),
+            FileType::CharacterDev => InodeMode::CHARDEV.bits(),
+            FileType::BlockDev => InodeMode::BLOCKDEV.bits(),
+            FileType::Fifo => InodeMode::FIFO.bits(),
+            FileType::Socket => InodeMode::SOCKET.bits(),
+            FileType::SymLink => InodeMode::SOFTLINK.bits(),
+            FileType::Unknown => 0,
+        };
+
+        let perm_bits = attr.perm.bits() & InodeMode::PERM_MASK.bits();
+        let mode = file_type_bits | perm_bits;
 
         // Create a new inode based on the attributes.
-        let mut icache = Task::block_on(ext4fs.icache.write());
+        let mut icache = block_on(ext4fs.icache.write());
         let inode = ext4fs.get_or_insert(
             &mut icache,
             InodeData {
                 ino: attr.ino as Ino,
                 size: AtomicU64::new(attr.size),
-                nlink: AtomicNlink::new(attr.nlink as _),
+                nlink: AtomicNlink::new(attr.links as _),
                 uid: AtomicU32::new(attr.uid),
                 gid: AtomicU32::new(attr.gid),
-                mode: AtomicU32::new(attr.kind.bits() as u32 | real_perm),
+                mode: AtomicMode::new(mode as _),
                 atime: Spin::new(Instant::new(attr.atime as _, 0)),
                 ctime: Spin::new(Instant::new(attr.ctime as _, 0)),
                 mtime: Spin::new(Instant::new(attr.mtime as _, 0)),
@@ -292,22 +494,206 @@ impl Inode for DirInode {
 
         let entries = ext4fs
             .inner
-            .fuse_readdir(self.ino as u64, 0, offset as i64)
-            .map_err(|err| err.error() as u32)?;
-        let mut current_offset = 0;
+            .listdir(self.ino as u32)
+            .map_err(|err| err.code() as u32)?;
 
-        for entry in entries {
-            let name_len = entry.name_len as usize;
-            let name = &entry.name[..name_len];
+        let entries_to_process = if offset < entries.len() {
+            &entries[offset..]
+        } else {
+            &entries[0..0]
+        };
+        let mut current_offset = 0;
+        for entry in entries_to_process {
+            let name_string = entry.name();
+            let name = name_string.as_bytes();
+            let inode = entry.inode() as Ino;
 
-            if callback(name, entry.inode as Ino)?.is_break() {
+            if callback(name, inode)?.is_break() {
                 break;
             }
-
             current_offset += 1;
         }
         Ok(current_offset)
     }
+
+    fn creat(&self, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
+        let _lock = block_on(self.rwsem.write());
+
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
+
+        let name = at.get_name();
+        let name = String::from_utf8_lossy(&name);
+
+        let new_ino = ext4fs
+            .inner
+            .create(
+                self.ino as u32,
+                &name,
+                InodeMode::from_bits_retain(Mode::REG.perm(mode.bits()).bits() as u16),
+            )
+            .unwrap();
+
+        let file = FileInode::new(new_ino as u64, self.vfs.clone(), mode);
+        let now = Instant::now();
+        self.update_child_time(file.as_ref(), now);
+        self.link_file();
+
+        ext4fs.create_inode_stat(self.ino as u32, new_ino, now.since_epoch().as_secs() as u32);
+
+        at.save_reg(file)
+    }
+
+    fn mkdir(&self, at: &Dentry, mode: Mode) -> KResult<()> {
+        let _lock = block_on(self.rwsem.write());
+
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
+
+        let name = at.get_name();
+        let name = String::from_utf8_lossy(&name);
+
+        let new_ino = ext4fs
+            .inner
+            .mkdir(
+                self.ino as u32,
+                &name,
+                InodeMode::from(Mode::DIR.perm(mode.bits())),
+            )
+            .unwrap();
+
+        let new_dir = DirInode::new(new_ino as u64, self.vfs.clone(), mode);
+        let now = Instant::now();
+        self.update_child_time(new_dir.as_ref(), now);
+        self.link_dir();
+
+        ext4fs.create_inode_stat(self.ino as u32, new_ino, now.since_epoch().as_secs() as u32);
+
+        at.save_dir(new_dir)
+    }
+
+    fn unlink(&self, at: &Arc<Dentry>) -> KResult<()> {
+        let _dir_lock = block_on(self.rwsem.write());
+
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
+
+        let file = at.get_inode()?;
+
+        let name = at.get_name();
+        let name = String::from_utf8_lossy(&name);
+        let _file_lock = block_on(file.rwsem.write());
+
+        if file.is_dir() {
+            let _ = ext4fs.inner.rmdir(self.ino as u32, &name);
+            self.unlink_dir();
+        } else {
+            let _ = ext4fs.inner.unlink(self.ino as u32, &name);
+        }
+        let now = Instant::now();
+        self.update_time(now);
+        ext4fs.modify_inode_stat(self.ino as u32, None, now.since_epoch().as_secs() as u32);
+
+        dcache::d_remove(at);
+
+        Ok(())
+    }
+
+    fn chmod(&self, mode: Mode) -> KResult<()> {
+        let _lock = block_on(self.rwsem.write());
+
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
+        let old_mode = self.mode.load();
+        let new_mode = old_mode.perm(mode.bits());
+
+        let now = Instant::now();
+        ext4fs.chmod_stat(
+            self.ino as u32,
+            new_mode.non_format_bits() as _,
+            now.since_epoch().as_secs() as u32,
+        );
+
+        // SAFETY: `rwsem` has done the synchronization
+        self.mode.store(new_mode);
+        *self.ctime.lock() = now;
+
+        Ok(())
+    }
+
+    fn rename(&self, rename_data: RenameData) -> KResult<()> {
+        let RenameData {
+            old_dentry,
+            new_dentry,
+            new_parent,
+            is_exchange,
+            no_replace,
+            ..
+        } = rename_data;
+
+        if is_exchange {
+            println_warn!("Ext4Fs does not support exchange rename for now");
+            return Err(ENOSYS);
+        }
+
+        // TODO: may need another lock
+        let _lock = block_on(self.rwsem.write());
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
+
+        let old_file = old_dentry.get_inode()?;
+        let new_file = new_dentry.get_inode();
+        if no_replace && new_file.is_ok() {
+            return Err(EEXIST);
+        }
+
+        let name = old_dentry.name();
+        let name = core::str::from_utf8(&*name).map_err(|_| EINVAL)?;
+        let new_name = new_dentry.name();
+        let new_name = core::str::from_utf8(&*new_name).map_err(|_| EINVAL)?;
+
+        ext4fs
+            .inner
+            .rename(self.ino as u32, name, new_parent.ino as u32, new_name)
+            .map_err(|err| err.code() as u32)?;
+
+        // TODO: may need more operations
+        let now = Instant::now();
+        *old_file.ctime.lock() = now;
+        *self.mtime.lock() = now;
+
+        let same_parent = Arc::as_ptr(&new_parent) == &raw const *self;
+        if !same_parent {
+            *new_parent.mtime.lock() = now;
+            if old_file.is_dir() {
+                self.nlink.fetch_sub(1, Ordering::Relaxed);
+                new_parent.nlink.fetch_add(1, Ordering::Relaxed);
+            }
+        }
+
+        if let Ok(replaced_file) = new_dentry.get_inode() {
+            if !no_replace {
+                *replaced_file.ctime.lock() = now;
+                replaced_file.nlink.fetch_sub(1, Ordering::Relaxed);
+            }
+        }
+
+        block_on(dcache::d_exchange(old_dentry, new_dentry));
+
+        Ok(())
+    }
+}
+
+impl From<Mode> for InodeMode {
+    fn from(value: Mode) -> Self {
+        InodeMode::from_bits_retain(value.bits() as u16)
+    }
+}
+
+impl From<InodeMode> for Mode {
+    fn from(value: InodeMode) -> Self {
+        Mode::new(value.bits() as u32)
+    }
 }
 
 struct Ext4MountCreator;
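
The readdir path above switches from `fuse_readdir` to `listdir`: the full entry list comes back from the ext4 crate, the first `offset` entries are skipped, and the number of entries handed to the callback is returned so the next call can resume where this one stopped (the entry that triggers a break is not counted). A small standalone sketch of that convention, with plain strings standing in for directory entries (the function and names below are illustrative, not the kernel's API):

```rust
// Sketch of the offset-based readdir convention: skip `offset` entries, feed the
// rest to the callback, and report how many entries were consumed.
fn readdir_from<F>(entries: &[&str], offset: usize, mut callback: F) -> usize
where
    F: FnMut(&str) -> bool, // returns false to stop early, like ControlFlow::Break
{
    let mut consumed = 0;
    for entry in entries.iter().skip(offset) {
        if !callback(entry) {
            break; // the breaking entry is not counted, mirroring the code above
        }
        consumed += 1;
    }
    consumed
}

fn main() {
    let entries = [".", "..", "bin", "etc", "home"];
    let first = readdir_from(&entries, 0, |name| {
        println!("{name}");
        name != ".." // stop after seeing ".."
    });
    // Resume from where the previous call left off.
    let rest = readdir_from(&entries, first, |_| true);
    println!("consumed {first} entries, then {rest} more");
}
```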

+ 12 - 13
src/fs/fat32.rs

@@ -3,13 +3,13 @@ mod file;
 
 use crate::io::Stream;
 use crate::kernel::constants::EIO;
-use crate::kernel::mem::AsMemoryBlock;
-use crate::kernel::vfs::inode::WriteOffset;
+use crate::kernel::mem::{AsMemoryBlock, CachePageStream};
+use crate::kernel::task::block_on;
+use crate::kernel::vfs::inode::{Mode, WriteOffset};
 use crate::{
     io::{Buffer, ByteBuffer, UninitBuffer},
     kernel::{
         block::{make_device, BlockDevice, BlockDeviceRequest},
-        constants::{S_IFDIR, S_IFREG},
         mem::{
             paging::Page,
             {CachePage, PageCache, PageCacheBackend},
@@ -32,7 +32,6 @@ use alloc::{
 };
 use core::{ops::ControlFlow, sync::atomic::Ordering};
 use dir::Dirs as _;
-use eonix_runtime::task::Task;
 use eonix_sync::RwLock;
 use file::ClusterRead;
 
@@ -253,7 +252,7 @@ impl FileInode {
 
         // Safety: We are initializing the inode
         inode.nlink.store(1, Ordering::Relaxed);
-        inode.mode.store(S_IFREG | 0o777, Ordering::Relaxed);
+        inode.mode.store(Mode::REG.perm(0o777));
         inode.size.store(size as u64, Ordering::Relaxed);
 
         inode
@@ -266,13 +265,13 @@ impl Inode for FileInode {
     }
 
     fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
-        Task::block_on(self.page_cache.read(buffer, offset))
+        block_on(self.page_cache.read(buffer, offset))
     }
 
     fn read_direct(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
         let vfs = self.vfs.upgrade().ok_or(EIO)?;
         let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
-        let fat = Task::block_on(vfs.fat.read());
+        let fat = block_on(vfs.fat.read());
 
         if self.size.load(Ordering::Relaxed) as usize == 0 {
             return Ok(0);
@@ -308,11 +307,11 @@ impl Inode for FileInode {
         Ok(buffer.wrote())
     }
 
-    fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+    fn write(&self, _stream: &mut dyn Stream, _offset: WriteOffset) -> KResult<usize> {
         todo!()
     }
 
-    fn write_direct(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+    fn write_direct(&self, _stream: &mut dyn Stream, _offset: usize) -> KResult<usize> {
         todo!()
     }
 }
@@ -322,7 +321,7 @@ impl PageCacheBackend for FileInode {
         self.read_direct(page, offset)
     }
 
-    fn write_page(&self, page: &CachePage, offset: usize) -> KResult<usize> {
+    fn write_page(&self, _page: &mut CachePageStream, _offset: usize) -> KResult<usize> {
         todo!()
     }
 
@@ -343,7 +342,7 @@ impl DirInode {
 
         // Safety: We are initializing the inode
         inode.nlink.store(2, Ordering::Relaxed);
-        inode.mode.store(S_IFDIR | 0o777, Ordering::Relaxed);
+        inode.mode.store(Mode::DIR.perm(0o777));
         inode.size.store(size as u64, Ordering::Relaxed);
 
         inode
@@ -354,7 +353,7 @@ impl Inode for DirInode {
     fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
         let vfs = self.vfs.upgrade().ok_or(EIO)?;
         let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
-        let fat = Task::block_on(vfs.fat.read());
+        let fat = block_on(vfs.fat.read());
 
         let mut entries = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo)
             .read(vfs, 0)
@@ -385,7 +384,7 @@ impl Inode for DirInode {
     ) -> KResult<usize> {
         let vfs = self.vfs.upgrade().ok_or(EIO)?;
         let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
-        let fat = Task::block_on(vfs.fat.read());
+        let fat = block_on(vfs.fat.read());
 
         let cluster_iter = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo)
             .read(vfs, offset)

+ 11 - 11
src/fs/procfs.rs

@@ -1,9 +1,10 @@
 use crate::kernel::constants::{EACCES, ENOTDIR};
+use crate::kernel::task::block_on;
 use crate::kernel::timer::Instant;
+use crate::kernel::vfs::inode::{AtomicMode, Mode};
 use crate::{
     io::Buffer,
     kernel::{
-        constants::{S_IFDIR, S_IFREG},
         mem::paging::PageBuffer,
         vfs::{
             dentry::Dentry,
@@ -17,7 +18,6 @@ use crate::{
 };
 use alloc::sync::{Arc, Weak};
 use core::{ops::ControlFlow, sync::atomic::Ordering};
-use eonix_runtime::task::Task;
 use eonix_sync::{AsProof as _, AsProofMut as _, LazyLock, Locked};
 use itertools::Itertools;
 
@@ -69,12 +69,12 @@ define_struct_inode! {
 
 impl FileInode {
     pub fn new(ino: Ino, vfs: Weak<ProcFs>, file: Box<dyn ProcFsFile>) -> Arc<Self> {
-        let mut mode = S_IFREG;
+        let mut mode = Mode::REG;
         if file.can_read() {
-            mode |= 0o444;
+            mode.set_perm(0o444);
         }
         if file.can_write() {
-            mode |= 0o200;
+            mode.set_perm(0o222);
         }
 
         let mut inode = Self {
@@ -82,7 +82,7 @@ impl FileInode {
             file,
         };
 
-        inode.idata.mode.store(mode, Ordering::Relaxed);
+        inode.idata.mode.store(mode);
         inode.idata.nlink.store(1, Ordering::Relaxed);
         *inode.ctime.get_mut() = Instant::now();
         *inode.mtime.get_mut() = Instant::now();
@@ -123,7 +123,7 @@ impl DirInode {
     pub fn new(ino: Ino, vfs: Weak<ProcFs>) -> Arc<Self> {
         Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
             addr_of_mut_field!(inode, entries).write(Locked::new(vec![], rwsem));
-            addr_of_mut_field!(&mut *inode, mode).write((S_IFDIR | 0o755).into());
+            addr_of_mut_field!(&mut *inode, mode).write(AtomicMode::from(Mode::DIR.perm(0o755)));
             addr_of_mut_field!(&mut *inode, nlink).write(1.into());
             addr_of_mut_field!(&mut *inode, ctime).write(Spin::new(Instant::now()));
             addr_of_mut_field!(&mut *inode, mtime).write(Spin::new(Instant::now()));
@@ -134,7 +134,7 @@ impl DirInode {
 
 impl Inode for DirInode {
     fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
-        let lock = Task::block_on(self.rwsem.read());
+        let lock = block_on(self.rwsem.read());
         Ok(self
             .entries
             .access(lock.prove())
@@ -147,7 +147,7 @@ impl Inode for DirInode {
         offset: usize,
         callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        let lock = Task::block_on(self.rwsem.read());
+        let lock = block_on(self.rwsem.read());
         self.entries
             .access(lock.prove())
             .iter()
@@ -234,7 +234,7 @@ pub fn creat(
     let inode = FileInode::new(ino, Arc::downgrade(&fs), file);
 
     {
-        let lock = Task::block_on(parent.idata.rwsem.write());
+        let lock = block_on(parent.idata.rwsem.write());
         parent
             .entries
             .access_mut(lock.prove_mut())
@@ -258,7 +258,7 @@ pub fn mkdir(parent: &ProcFsNode, name: &[u8]) -> KResult<ProcFsNode> {
 
     parent
         .entries
-        .access_mut(Task::block_on(inode.rwsem.write()).prove_mut())
+        .access_mut(block_on(inode.rwsem.write()).prove_mut())
         .push((Arc::from(name), ProcFsNode::Dir(inode.clone())));
 
     Ok(ProcFsNode::Dir(inode))

+ 47 - 59
src/fs/tmpfs.rs

@@ -1,17 +1,16 @@
 use crate::io::Stream;
 use crate::kernel::constants::{EEXIST, EINVAL, EIO, EISDIR, ENOENT, ENOSYS, ENOTDIR};
-use crate::kernel::mem::{CachePage, PageCache, PageCacheBackend};
+use crate::kernel::mem::{CachePage, CachePageStream, PageCache, PageCacheBackend};
+use crate::kernel::task::block_on;
 use crate::kernel::timer::Instant;
-use crate::kernel::vfs::inode::InodeData;
 use crate::kernel::vfs::inode::RenameData;
+use crate::kernel::vfs::inode::{AtomicMode, InodeData};
 use crate::{
     io::Buffer,
-    kernel::constants::{S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFREG},
     kernel::vfs::{
         dentry::{dcache, Dentry},
         inode::{define_struct_inode, AtomicIno, Ino, Inode, Mode, WriteOffset},
         mount::{register_filesystem, Mount, MountCreator, MS_RDONLY},
-        s_isblk, s_ischr,
         vfs::Vfs,
         DevId,
     },
@@ -21,7 +20,6 @@ use alloc::sync::{Arc, Weak};
 use core::fmt::Debug;
 use core::{ops::ControlFlow, sync::atomic::Ordering};
 use eonix_mm::paging::PAGE_SIZE;
-use eonix_runtime::task::Task;
 use eonix_sync::{AsProof as _, AsProofMut as _, Locked, Mutex, ProofMut};
 use itertools::Itertools;
 
@@ -46,7 +44,7 @@ impl NodeInode {
         Self::new_locked(ino, vfs, |inode, _| unsafe {
             addr_of_mut_field!(inode, devid).write(devid);
 
-            addr_of_mut_field!(&mut *inode, mode).write(mode.into());
+            addr_of_mut_field!(&mut *inode, mode).write(AtomicMode::from(mode));
             addr_of_mut_field!(&mut *inode, nlink).write(1.into());
             addr_of_mut_field!(&mut *inode, ctime).write(Spin::new(Instant::now()));
             addr_of_mut_field!(&mut *inode, mtime).write(Spin::new(Instant::now()));
@@ -74,7 +72,8 @@ impl DirectoryInode {
                 .write(Locked::new(vec![(Arc::from(b".".as_slice()), ino)], rwsem));
 
             addr_of_mut_field!(&mut *inode, size).write(1.into());
-            addr_of_mut_field!(&mut *inode, mode).write((S_IFDIR | (mode & 0o777)).into());
+            addr_of_mut_field!(&mut *inode, mode)
+                .write(AtomicMode::from(Mode::DIR.perm(mode.non_format_bits())));
             addr_of_mut_field!(&mut *inode, nlink).write(1.into()); // link from `.` to itself
             addr_of_mut_field!(&mut *inode, ctime).write(Spin::new(Instant::now()));
             addr_of_mut_field!(&mut *inode, mtime).write(Spin::new(Instant::now()));
@@ -108,7 +107,7 @@ impl DirectoryInode {
         _file_lock: ProofMut<()>,
     ) -> KResult<()> {
         // SAFETY: `file_lock` has done the synchronization
-        if file.mode.load(Ordering::Relaxed) & S_IFDIR != 0 {
+        if file.mode.load().is_dir() {
             return Err(EISDIR);
         }
 
@@ -138,7 +137,7 @@ impl Inode for DirectoryInode {
         offset: usize,
         callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        let lock = Task::block_on(self.rwsem.read());
+        let lock = block_on(self.rwsem.read());
         self.entries
             .access(lock.prove())
             .iter()
@@ -153,7 +152,7 @@ impl Inode for DirectoryInode {
         let vfs = acquire(&self.vfs)?;
         let vfs = astmp(&vfs);
 
-        let rwsem = Task::block_on(self.rwsem.write());
+        let rwsem = block_on(self.rwsem.write());
 
         let ino = vfs.assign_ino();
         let file = FileInode::new(ino, self.vfs.clone(), 0, mode);
@@ -163,22 +162,17 @@ impl Inode for DirectoryInode {
     }
 
     fn mknod(&self, at: &Dentry, mode: Mode, dev: DevId) -> KResult<()> {
-        if !s_ischr(mode) && !s_isblk(mode) {
+        if !mode.is_chr() && !mode.is_blk() {
             return Err(EINVAL);
         }
 
         let vfs = acquire(&self.vfs)?;
         let vfs = astmp(&vfs);
 
-        let rwsem = Task::block_on(self.rwsem.write());
+        let rwsem = block_on(self.rwsem.write());
 
         let ino = vfs.assign_ino();
-        let file = NodeInode::new(
-            ino,
-            self.vfs.clone(),
-            mode & (0o777 | S_IFBLK | S_IFCHR),
-            dev,
-        );
+        let file = NodeInode::new(ino, self.vfs.clone(), mode, dev);
 
         self.link(at.get_name(), file.as_ref(), rwsem.prove_mut());
         at.save_reg(file)
@@ -188,7 +182,7 @@ impl Inode for DirectoryInode {
         let vfs = acquire(&self.vfs)?;
         let vfs = astmp(&vfs);
 
-        let rwsem = Task::block_on(self.rwsem.write());
+        let rwsem = block_on(self.rwsem.write());
 
         let ino = vfs.assign_ino();
         let file = SymlinkInode::new(ino, self.vfs.clone(), target.into());
@@ -201,7 +195,7 @@ impl Inode for DirectoryInode {
         let vfs = acquire(&self.vfs)?;
         let vfs = astmp(&vfs);
 
-        let rwsem = Task::block_on(self.rwsem.write());
+        let rwsem = block_on(self.rwsem.write());
 
         let ino = vfs.assign_ino();
         let newdir = DirectoryInode::new(ino, self.vfs.clone(), mode);
@@ -213,11 +207,11 @@ impl Inode for DirectoryInode {
     fn unlink(&self, at: &Arc<Dentry>) -> KResult<()> {
         let _vfs = acquire(&self.vfs)?;
 
-        let dir_lock = Task::block_on(self.rwsem.write());
+        let dir_lock = block_on(self.rwsem.write());
 
         let file = at.get_inode()?;
         let filename = at.get_name();
-        let file_lock = Task::block_on(file.rwsem.write());
+        let file_lock = block_on(file.rwsem.write());
 
         let entries = self.entries.access_mut(dir_lock.prove_mut());
 
@@ -240,12 +234,11 @@ impl Inode for DirectoryInode {
 
     fn chmod(&self, mode: Mode) -> KResult<()> {
         let _vfs = acquire(&self.vfs)?;
-        let _lock = Task::block_on(self.rwsem.write());
+        let _lock = block_on(self.rwsem.write());
 
         // SAFETY: `rwsem` has done the synchronization
-        let old = self.mode.load(Ordering::Relaxed);
-        self.mode
-            .store((old & !0o777) | (mode & 0o777), Ordering::Relaxed);
+        let old = self.mode.load();
+        self.mode.store(old.perm(mode.non_format_bits()));
         *self.ctime.lock() = Instant::now();
 
         Ok(())
@@ -271,7 +264,7 @@ impl Inode for DirectoryInode {
             .downcast_ref::<TmpFs>()
             .expect("vfs must be a TmpFs");
 
-        let _rename_lock = Task::block_on(vfs.rename_lock.lock());
+        let _rename_lock = block_on(vfs.rename_lock.lock());
 
         let old_file = old_dentry.get_inode()?;
         let new_file = new_dentry.get_inode();
@@ -284,7 +277,7 @@ impl Inode for DirectoryInode {
         if same_parent {
             // Same directory rename
             // Remove from old location and add to new location
-            let parent_lock = Task::block_on(self.rwsem.write());
+            let parent_lock = block_on(self.rwsem.write());
             let entries = self.entries.access_mut(parent_lock.prove_mut());
 
             fn rename_old(
@@ -328,15 +321,13 @@ impl Inode for DirectoryInode {
             if let Some(new_idx) = new_entry_idx {
                 // Replace existing file (i.e. rename the old and unlink the new)
                 let new_file = new_file.unwrap();
-                let _new_file_lock = Task::block_on(new_file.rwsem.write());
+                let _new_file_lock = block_on(new_file.rwsem.write());
 
                 // SAFETY: `new_file_lock` has done the synchronization
-                if new_file.mode.load(Ordering::Relaxed) & S_IFDIR != 0 {
-                    return Err(EISDIR);
-                } else {
-                    if old_file.mode.load(Ordering::Relaxed) & S_IFDIR != 0 {
-                        return Err(ENOTDIR);
-                    }
+                if new_file.mode.load().is_dir() {
+                    return Err(EISDIR);
+                } else if old_file.mode.load().is_dir() {
+                    return Err(ENOTDIR);
                 }
 
                 entries.remove(new_idx);
@@ -364,8 +355,8 @@ impl Inode for DirectoryInode {
                 .downcast_ref::<DirectoryInode>()
                 .expect("new parent must be a DirectoryInode");
 
-            let old_parent_lock = Task::block_on(self.rwsem.write());
-            let new_parent_lock = Task::block_on(new_parent_inode.rwsem.write());
+            let old_parent_lock = block_on(self.rwsem.write());
+            let new_parent_lock = block_on(new_parent_inode.rwsem.write());
 
             let old_ino = old_file.ino;
             let new_ino = new_file.as_ref().ok().map(|f| f.ino);
@@ -391,12 +382,12 @@ impl Inode for DirectoryInode {
             if has_new {
                 // Replace existing file (i.e. move the old and unlink the new)
                 let new_file = new_file.unwrap();
-                let new_file_lock = Task::block_on(new_file.rwsem.write());
+                let new_file_lock = block_on(new_file.rwsem.write());
 
-                if old_file.mode.load(Ordering::Relaxed) & S_IFDIR != 0
-                    && new_file.mode.load(Ordering::Relaxed) & S_IFDIR == 0
-                {
-                    return Err(ENOTDIR);
+                let old_is_dir = old_file.mode.load().is_dir();
+                let new_is_dir = new_file.mode.load().is_dir();
+                if old_is_dir && !new_is_dir {
+                    return Err(ENOTDIR);
                 }
 
                 // Unlink the old file that was replaced
@@ -424,7 +415,7 @@ impl Inode for DirectoryInode {
             *old_file.ctime.lock() = now;
         }
 
-        Task::block_on(dcache::d_exchange(old_dentry, new_dentry));
+        block_on(dcache::d_exchange(old_dentry, new_dentry));
 
         Ok(())
     }
@@ -442,7 +433,7 @@ impl SymlinkInode {
             let len = target.len();
             addr_of_mut_field!(inode, target).write(target);
 
-            addr_of_mut_field!(&mut *inode, mode).write((S_IFLNK | 0o777).into());
+            addr_of_mut_field!(&mut *inode, mode).write(AtomicMode::from(Mode::LNK.perm(0o777)));
             addr_of_mut_field!(&mut *inode, size).write((len as u64).into());
             addr_of_mut_field!(&mut *inode, ctime).write(Spin::new(Instant::now()));
             addr_of_mut_field!(&mut *inode, mtime).write(Spin::new(Instant::now()));
@@ -482,9 +473,7 @@ impl FileInode {
             pages: PageCache::new(weak_self.clone()),
         });
 
-        inode
-            .mode
-            .store(S_IFREG | (mode & 0o777), Ordering::Relaxed);
+        inode.mode.store(Mode::REG.perm(mode.non_format_bits()));
         inode.nlink.store(1, Ordering::Relaxed);
         inode.size.store(size as u64, Ordering::Relaxed);
         inode
@@ -496,7 +485,7 @@ impl PageCacheBackend for FileInode {
         Ok(PAGE_SIZE)
     }
 
-    fn write_page(&self, _page: &CachePage, _offset: usize) -> KResult<usize> {
+    fn write_page(&self, _page: &mut CachePageStream, _offset: usize) -> KResult<usize> {
         Ok(PAGE_SIZE)
     }
 
@@ -511,13 +500,13 @@ impl Inode for FileInode {
     }
 
     fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
-        let lock = Task::block_on(self.rwsem.write());
-        Task::block_on(self.pages.read(buffer, offset))
+        let _lock = block_on(self.rwsem.write());
+        block_on(self.pages.read(buffer, offset))
     }
 
     fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
         // TODO: We don't need that strong guarantee, find some way to avoid locks
-        let lock = Task::block_on(self.rwsem.write());
+        let _lock = block_on(self.rwsem.write());
 
         let mut store_new_end = None;
         let offset = match offset {
@@ -530,7 +519,7 @@ impl Inode for FileInode {
             }
         };
 
-        let wrote = Task::block_on(self.pages.write(stream, offset))?;
+        let wrote = block_on(self.pages.write(stream, offset))?;
         let cursor_end = offset + wrote;
 
         if let Some(store_end) = store_new_end {
@@ -545,8 +534,8 @@ impl Inode for FileInode {
     }
 
     fn truncate(&self, length: usize) -> KResult<()> {
-        let lock = Task::block_on(self.rwsem.write());
-        Task::block_on(self.pages.resize(length))?;
+        let _lock = block_on(self.rwsem.write());
+        block_on(self.pages.resize(length))?;
         self.size.store(length as u64, Ordering::Relaxed);
         *self.mtime.lock() = Instant::now();
         Ok(())
@@ -554,12 +543,11 @@ impl Inode for FileInode {
 
     fn chmod(&self, mode: Mode) -> KResult<()> {
         let _vfs = acquire(&self.vfs)?;
-        let _lock = Task::block_on(self.rwsem.write());
+        let _lock = block_on(self.rwsem.write());
 
         // SAFETY: `rwsem` has done the synchronization
-        let old = self.mode.load(Ordering::Relaxed);
-        self.mode
-            .store((old & !0o777) | (mode & 0o777), Ordering::Relaxed);
+        let old = self.mode.load();
+        self.mode.store(old.perm(mode.non_format_bits()));
         *self.ctime.lock() = Instant::now();
 
         Ok(())
@@ -600,7 +588,7 @@ impl TmpFs {
         });
 
         let weak = Arc::downgrade(&tmpfs);
-        let root_dir = DirectoryInode::new(0, weak, 0o755);
+        let root_dir = DirectoryInode::new(0, weak, Mode::new(0o755));
 
         Ok((tmpfs, root_dir))
     }

+ 4 - 2
src/io.rs

@@ -30,7 +30,7 @@ impl FillResult {
     }
 }
 
-pub trait Buffer {
+pub trait Buffer: Send {
     fn total(&self) -> usize;
     fn wrote(&self) -> usize;
 
@@ -49,7 +49,7 @@ pub trait Buffer {
     }
 }
 
-pub trait Stream {
+pub trait Stream: Send {
     fn poll_data<'a>(&mut self, buf: &'a mut [u8]) -> KResult<Option<&'a mut [u8]>>;
     fn ignore(&mut self, len: usize) -> KResult<Option<usize>>;
 }
@@ -131,6 +131,8 @@ pub struct UninitBuffer<'lt, T: Copy + Sized> {
     buffer: ByteBuffer<'lt>,
 }
 
+unsafe impl<'lt, T: Copy> Send for UninitBuffer<'lt, T> {}
+
 impl<'lt, T: Copy + Sized> UninitBuffer<'lt, T> {
     pub fn new() -> Self {
         let mut data = Box::new(MaybeUninit::uninit());
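
`Buffer` and `Stream` gain a `Send` supertrait here because, later in this diff, syscall handlers return boxed `Send` futures that capture `&mut dyn Buffer` and `&mut dyn Stream` across await points. A standalone sketch of why the supertrait is needed, using `std` types and the `futures` crate's executor as stand-ins (names are illustrative):

```rust
// Sketch: a boxed `dyn Future + Send` can only capture trait objects that are
// themselves Send, which the `Buffer: Send` supertrait guarantees.
use std::future::Future;
use std::pin::Pin;

trait Buffer: Send {
    fn fill(&mut self, data: &[u8]);
}

struct VecBuffer(Vec<u8>);

impl Buffer for VecBuffer {
    fn fill(&mut self, data: &[u8]) {
        self.0.extend_from_slice(data);
    }
}

// Compiles only because `dyn Buffer` is Send via the supertrait; otherwise the
// returned future could not carry the `+ Send` bound.
fn read_into(buf: &mut dyn Buffer) -> Pin<Box<dyn Future<Output = usize> + Send + '_>> {
    Box::pin(async move {
        buf.fill(b"hello");
        5
    })
}

fn main() {
    fn assert_send<T: Send>(value: T) -> T {
        value
    }

    let mut buf = VecBuffer(Vec::new());
    // The handler's future is Send, so a scheduler may move it between CPUs.
    let fut = assert_send(read_into(&mut buf));
    let wrote = futures::executor::block_on(fut);
    println!("buffered {wrote} bytes");
}
```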

+ 97 - 15
src/kernel/block.rs

@@ -48,21 +48,6 @@ enum BlockDeviceType {
     },
 }
 
-#[derive(Debug, Clone)]
-pub enum FileSystemType {
-    Ext4,
-    Fat32,
-}
-
-impl FileSystemType {
-    pub fn as_str(&self) -> &'static str {
-        match self {
-            FileSystemType::Ext4 => "ext4",
-            FileSystemType::Fat32 => "fat32",
-        }
-    }
-}
-
 pub struct BlockDevice {
     /// Unique device identifier, major and minor numbers
     devid: DevId,
@@ -285,6 +270,103 @@ impl BlockDevice {
             Ok(FillResult::Partial(nfilled))
         }
     }
+
+    /// Write some data to the block device. This may involve copying the data
+    /// into temporary pages and splitting the write into multiple requests.
+    ///
+    /// # Arguments
+    /// * `offset` - offset in bytes
+    /// * `data` - data to write
+    pub fn write_some(&self, offset: usize, data: &[u8]) -> KResult<usize> {
+        let mut sector_start = offset as u64 / 512;
+        let mut first_sector_offset = offset as u64 % 512;
+        let mut remaining_data = data;
+        let mut nwritten = 0;
+
+        while !remaining_data.is_empty() {
+            let pages: &[Page];
+            let page: Option<Page>;
+            let page_vec: Option<Vec<Page>>;
+
+            // Calculate sectors needed for this write
+            let write_end = first_sector_offset + remaining_data.len() as u64;
+            let sector_count = ((write_end + 511) / 512).min(self.queue().max_request_pages());
+
+            match sector_count {
+                count if count <= 8 => {
+                    let _page = Page::alloc();
+                    page = Some(_page);
+                    pages = core::slice::from_ref(page.as_ref().unwrap());
+                }
+                count if count <= 16 => {
+                    let _pages = Page::alloc_order(1);
+                    page = Some(_pages);
+                    pages = core::slice::from_ref(page.as_ref().unwrap());
+                }
+                count => {
+                    let npages = (count + 15) / 16;
+                    let mut _page_vec = Vec::with_capacity(npages as usize);
+                    for _ in 0..npages {
+                        _page_vec.push(Page::alloc_order(1));
+                    }
+                    page_vec = Some(_page_vec);
+                    pages = page_vec.as_ref().unwrap().as_slice();
+                }
+            }
+
+            if first_sector_offset != 0 || remaining_data.len() < (sector_count * 512) as usize {
+                let read_req = BlockDeviceRequest::Read {
+                    sector: sector_start,
+                    count: sector_count,
+                    buffer: pages,
+                };
+                self.commit_request(read_req)?;
+            }
+
+            let mut data_offset = 0;
+            let mut page_offset = first_sector_offset as usize;
+
+            for page in pages.iter() {
+                // SAFETY: We own the page and can modify it
+                let page_data = unsafe {
+                    let memblk = page.as_memblk();
+                    core::slice::from_raw_parts_mut(memblk.addr().get() as *mut u8, memblk.len())
+                };
+
+                let copy_len =
+                    (remaining_data.len() - data_offset).min(page_data.len() - page_offset);
+
+                if copy_len == 0 {
+                    break;
+                }
+
+                page_data[page_offset..page_offset + copy_len]
+                    .copy_from_slice(&remaining_data[data_offset..data_offset + copy_len]);
+
+                data_offset += copy_len;
+                page_offset = 0; // Only first page has offset
+
+                if data_offset >= remaining_data.len() {
+                    break;
+                }
+            }
+
+            let write_req = BlockDeviceRequest::Write {
+                sector: sector_start,
+                count: sector_count,
+                buffer: pages,
+            };
+            self.commit_request(write_req)?;
+
+            let bytes_written = data_offset;
+            nwritten += bytes_written;
+            remaining_data = &remaining_data[bytes_written..];
+            sector_start += sector_count;
+            first_sector_offset = 0;
+        }
+
+        Ok(nwritten)
+    }
 }
 
 pub enum BlockDeviceRequest<'lt> {

+ 7 - 11
src/kernel/chardev.rs

@@ -2,12 +2,9 @@ use super::{
     block::make_device,
     console::get_console,
     constants::{EEXIST, EIO},
-    task::{ProcessList, Thread},
+    task::{block_on, ProcessList, Thread},
     terminal::Terminal,
-    vfs::{
-        file::{File, FileType, TerminalFile},
-        DevId,
-    },
+    vfs::{DevId, File, FileType, TerminalFile},
 };
 use crate::{
     io::{Buffer, Stream, StreamRead},
@@ -18,7 +15,6 @@ use alloc::{
     collections::btree_map::{BTreeMap, Entry},
     sync::Arc,
 };
-use eonix_runtime::task::Task;
 use eonix_sync::AsProof as _;
 use posix_types::open::OpenFlags;
 
@@ -43,7 +39,7 @@ static CHAR_DEVICES: Spin<BTreeMap<DevId, Arc<CharDevice>>> = Spin::new(BTreeMap
 impl CharDevice {
     pub fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
         match &self.device {
-            CharDeviceType::Terminal(terminal) => Task::block_on(terminal.read(buffer)),
+            CharDeviceType::Terminal(terminal) => block_on(terminal.read(buffer)),
             CharDeviceType::Virtual(device) => device.read(buffer),
         }
     }
@@ -72,16 +68,16 @@ impl CharDevice {
         }
     }
 
-    pub fn open(self: &Arc<Self>, flags: OpenFlags) -> KResult<Arc<File>> {
+    pub fn open(self: &Arc<Self>, flags: OpenFlags) -> KResult<File> {
         Ok(match &self.device {
             CharDeviceType::Terminal(terminal) => {
-                let procs = Task::block_on(ProcessList::get().read());
+                let procs = block_on(ProcessList::get().read());
                 let current = Thread::current();
                 let session = current.process.session(procs.prove());
                 // We only set the control terminal if the process is the session leader.
                 if session.sid == Thread::current().process.pid {
                     // Silently fail if we can't set the control terminal.
-                    dont_check!(Task::block_on(session.set_control_terminal(
+                    dont_check!(block_on(session.set_control_terminal(
                         &terminal,
                         false,
                         procs.prove()
@@ -123,7 +119,7 @@ struct ConsoleDevice;
 impl VirtualCharDevice for ConsoleDevice {
     fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
         let console_terminal = get_console().ok_or(EIO)?;
-        Task::block_on(console_terminal.read(buffer))
+        block_on(console_terminal.read(buffer))
     }
 
     fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {

+ 5 - 12
src/kernel/interrupt.rs

@@ -1,5 +1,6 @@
 use super::mem::handle_kernel_page_fault;
-use super::timer::{should_reschedule, timer_interrupt};
+use super::task::block_on;
+use super::timer::timer_interrupt;
 use crate::kernel::constants::EINVAL;
 use crate::prelude::*;
 use alloc::sync::Arc;
@@ -7,7 +8,6 @@ use eonix_hal::traits::fault::Fault;
 use eonix_hal::traits::trap::{RawTrapContext, TrapType};
 use eonix_hal::trap::TrapContext;
 use eonix_mm::address::{Addr as _, VAddr};
-use eonix_runtime::scheduler::Scheduler;
 use eonix_sync::SpinIrq as _;
 
 static IRQ_HANDLERS: Spin<[Vec<Arc<dyn Fn() + Send + Sync>>; 16]> =
@@ -37,7 +37,7 @@ pub fn default_fault_handler(fault_type: Fault, trap_ctx: &mut TrapContext) {
         } => {
             let fault_pc = VAddr::from(trap_ctx.get_program_counter());
 
-            if let Some(new_pc) = handle_kernel_page_fault(fault_pc, vaddr, error_code) {
+            if let Some(new_pc) = block_on(handle_kernel_page_fault(fault_pc, vaddr, error_code)) {
                 trap_ctx.set_program_counter(new_pc.addr());
             }
         }
@@ -49,17 +49,10 @@ pub fn default_fault_handler(fault_type: Fault, trap_ctx: &mut TrapContext) {
 pub fn interrupt_handler(trap_ctx: &mut TrapContext) {
     match trap_ctx.trap_type() {
         TrapType::Syscall { no, .. } => unreachable!("Syscall {} in kernel space.", no),
+        TrapType::Breakpoint => unreachable!("Breakpoint in kernel space."),
         TrapType::Fault(fault) => default_fault_handler(fault, trap_ctx),
         TrapType::Irq { callback } => callback(default_irq_handler),
-        TrapType::Timer { callback } => {
-            callback(timer_interrupt);
-
-            if eonix_preempt::count() == 0 && should_reschedule() {
-                // To make scheduler satisfied.
-                eonix_preempt::disable();
-                Scheduler::schedule();
-            }
-        }
+        TrapType::Timer { callback } => callback(timer_interrupt),
     }
 }
 

+ 1 - 1
src/kernel/mem.rs

@@ -12,5 +12,5 @@ pub use access::{AsMemoryBlock, MemoryBlock, PhysAccess};
 pub(self) use mm_area::MMArea;
 pub use mm_list::{handle_kernel_page_fault, FileMapping, MMList, Mapping, Permission};
 pub use page_alloc::{GlobalPageAlloc, RawPage};
-pub use page_cache::{CachePage, PageCache, PageCacheBackend};
+pub use page_cache::{CachePage, CachePageStream, PageCache, PageCacheBackend};
 pub use paging::{Page, PageBuffer};

+ 5 - 3
src/kernel/mem/mm_area.rs

@@ -9,7 +9,6 @@ use core::cmp;
 use eonix_mm::address::{AddrOps as _, VAddr, VRange};
 use eonix_mm::page_table::{PageAttribute, RawAttribute, PTE};
 use eonix_mm::paging::{PAGE_SIZE, PFN};
-use eonix_runtime::task::Task;
 
 #[derive(Debug)]
 pub struct MMArea {
@@ -19,6 +18,9 @@ pub struct MMArea {
     pub is_shared: bool,
 }
 
+unsafe impl Send for MMArea {}
+unsafe impl Sync for MMArea {}
+
 impl Clone for MMArea {
     fn clone(&self) -> Self {
         Self {
@@ -200,7 +202,7 @@ impl MMArea {
         Ok(())
     }
 
-    pub fn handle(&self, pte: &mut impl PTE, offset: usize, write: bool) -> KResult<()> {
+    pub async fn handle(&self, pte: &mut impl PTE, offset: usize, write: bool) -> KResult<()> {
         let mut attr = pte.get_attr().as_page_attr().expect("Not a page attribute");
         let mut pfn = pte.get_pfn();
 
@@ -209,7 +211,7 @@ impl MMArea {
         }
 
         if attr.contains(PageAttribute::MAPPED) {
-            Task::block_on(self.handle_mmap(&mut pfn, &mut attr, offset, write))?;
+            self.handle_mmap(&mut pfn, &mut attr, offset, write).await?;
         }
 
         attr.insert(PageAttribute::ACCESSED);

+ 16 - 14
src/kernel/mem/mm_list.rs

@@ -23,7 +23,6 @@ use eonix_mm::{
     page_table::{PageTable, RawAttribute, PTE},
     paging::PAGE_SIZE,
 };
-use eonix_runtime::task::Task;
 use eonix_sync::{LazyLock, Mutex};
 
 pub use mapping::{FileMapping, Mapping};
@@ -488,7 +487,7 @@ impl MMList {
         Ok(())
     }
 
-    pub fn map_vdso(&self) -> KResult<()> {
+    pub async fn map_vdso(&self) -> KResult<()> {
         unsafe extern "C" {
             fn VDSO_PADDR();
         }
@@ -507,7 +506,7 @@ impl MMList {
         const VDSO_SIZE: usize = 0x1000;
 
         let inner = self.inner.borrow();
-        let inner = Task::block_on(inner.lock());
+        let inner = inner.lock().await;
 
         let mut pte_iter = inner
             .page_table
@@ -529,7 +528,7 @@ impl MMList {
         Ok(())
     }
 
-    pub fn mmap_hint(
+    pub async fn mmap_hint(
         &self,
         hint: VAddr,
         len: usize,
@@ -538,7 +537,7 @@ impl MMList {
         is_shared: bool,
     ) -> KResult<VAddr> {
         let inner = self.inner.borrow();
-        let mut inner = Task::block_on(inner.lock());
+        let mut inner = inner.lock().await;
 
         if hint == VAddr::NULL {
             let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
@@ -557,7 +556,7 @@ impl MMList {
         }
     }
 
-    pub fn mmap_fixed(
+    pub async fn mmap_fixed(
         &self,
         at: VAddr,
         len: usize,
@@ -565,14 +564,17 @@ impl MMList {
         permission: Permission,
         is_shared: bool,
     ) -> KResult<VAddr> {
-        Task::block_on(self.inner.borrow().lock())
+        self.inner
+            .borrow()
+            .lock()
+            .await
             .mmap(at, len, mapping.clone(), permission, is_shared)
             .map(|_| at)
     }
 
-    pub fn set_break(&self, pos: Option<VAddr>) -> VAddr {
+    pub async fn set_break(&self, pos: Option<VAddr>) -> VAddr {
         let inner = self.inner.borrow();
-        let mut inner = Task::block_on(inner.lock());
+        let mut inner = inner.lock().await;
 
         // SAFETY: `set_break` is only called in syscalls, where program break should be valid.
         assert!(inner.break_start.is_some() && inner.break_pos.is_some());
@@ -629,9 +631,9 @@ impl MMList {
     }
 
     /// This should be called only **once** for every thread.
-    pub fn register_break(&self, start: VAddr) {
+    pub async fn register_break(&self, start: VAddr) {
         let inner = self.inner.borrow();
-        let mut inner = Task::block_on(inner.lock());
+        let mut inner = inner.lock().await;
         assert!(inner.break_start.is_none() && inner.break_pos.is_none());
 
         inner.break_start = Some(start.into());
@@ -640,7 +642,7 @@ impl MMList {
 
     /// Access the memory area with the given function.
     /// The function will be called with the offset of the area and the slice of the area.
-    pub fn access_mut<F>(&self, start: VAddr, len: usize, func: F) -> KResult<()>
+    pub async fn access_mut<F>(&self, start: VAddr, len: usize, func: F) -> KResult<()>
     where
         F: Fn(usize, &mut [u8]),
     {
@@ -651,7 +653,7 @@ impl MMList {
         }
 
         let inner = self.inner.borrow();
-        let inner = Task::block_on(inner.lock());
+        let inner = inner.lock().await;
 
         let mut offset = 0;
         let mut remaining = len;
@@ -676,7 +678,7 @@ impl MMList {
                 let page_end = page_start + 0x1000;
 
                 // Prepare for the worst case that we might write to the page...
-                area.handle(pte, page_start - area_start, true)?;
+                area.handle(pte, page_start - area_start, true).await?;
 
                 let start_offset;
                 if page_start < current {
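
The `MMList` methods above turn from synchronous functions that wrapped every lock acquisition in `Task::block_on` into `async fn`s that await the lock, leaving `block_on` to the few remaining synchronous entry points. A minimal sketch of that conversion pattern, using the `futures` crate's `Mutex` and `block_on` as stand-ins for the kernel's `eonix_sync` primitives (names are illustrative):

```rust
// Sketch of the sync-to-async conversion applied throughout this commit:
// the method awaits its own lock, and only boundary callers block on it.
use futures::{executor::block_on, lock::Mutex};

struct MmList {
    break_pos: Mutex<usize>,
}

impl MmList {
    // After the rework: an async method that awaits the guard directly instead
    // of calling block_on(self.break_pos.lock()) inside a sync fn.
    async fn set_break(&self, pos: usize) -> usize {
        let mut guard = self.break_pos.lock().await;
        *guard = pos.max(*guard);
        *guard
    }
}

fn main() {
    let mm = MmList {
        break_pos: Mutex::new(0x1000),
    };
    // Blocking now happens only at synchronous entry points.
    let new_break = block_on(mm.set_break(0x2000));
    println!("program break is now {:#x}", new_break);
}
```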

+ 11 - 8
src/kernel/mem/mm_list/page_fault.rs

@@ -4,7 +4,6 @@ use eonix_hal::mm::flush_tlb;
 use eonix_hal::traits::fault::PageFaultErrorCode;
 use eonix_mm::address::{Addr as _, AddrOps as _, VRange};
 use eonix_mm::paging::PAGE_SIZE;
-use eonix_runtime::task::Task;
 use posix_types::signal::Signal;
 
 #[repr(C)]
@@ -95,6 +94,7 @@ impl MMList {
             addr.floor() - area.range().start(),
             error.contains(PageFaultErrorCode::Write),
         )
+        .await
         .map_err(|_| Signal::SIGBUS)?;
 
         flush_tlb(addr.floor().addr());
@@ -129,7 +129,7 @@ fn kernel_page_fault_die(vaddr: VAddr, pc: VAddr) -> ! {
     )
 }
 
-pub fn handle_kernel_page_fault(
+pub async fn handle_kernel_page_fault(
     fault_pc: VAddr,
     addr: VAddr,
     error: PageFaultErrorCode,
@@ -149,7 +149,7 @@ pub fn handle_kernel_page_fault(
 
     let mms = &Thread::current().process.mm_list;
     let inner = mms.inner.borrow();
-    let inner = Task::block_on(inner.lock());
+    let inner = inner.lock().await;
 
     let area = match inner.areas.get(&VRange::from(addr)) {
         Some(area) => area,
@@ -164,11 +164,14 @@ pub fn handle_kernel_page_fault(
         .next()
         .expect("If we can find the mapped area, we should be able to find the PTE");
 
-    if let Err(_) = area.handle(
-        pte,
-        addr.floor() - area.range().start(),
-        error.contains(PageFaultErrorCode::Write),
-    ) {
+    if let Err(_) = area
+        .handle(
+            pte,
+            addr.floor() - area.range().start(),
+            error.contains(PageFaultErrorCode::Write),
+        )
+        .await
+    {
         return Some(try_page_fault_fix(fault_pc, addr));
     }
 

+ 0 - 1
src/kernel/mem/page_alloc/raw_page.rs

@@ -6,7 +6,6 @@ use core::{
     sync::atomic::{AtomicU32, AtomicUsize, Ordering},
 };
 use eonix_hal::mm::ArchPhysAccess;
-use eonix_mm::paging::PAGE_SIZE;
 use eonix_mm::{
     address::{PAddr, PhysAccess as _},
     paging::{RawPage as RawPageTrait, PFN},

+ 60 - 8
src/kernel/mem/page_cache.rs

@@ -26,6 +26,8 @@ unsafe impl Sync for PageCache {}
 #[derive(Clone, Copy)]
 pub struct CachePage(RawPagePtr);
 
+unsafe impl Send for CachePage {}
+
 impl Buffer for CachePage {
     fn total(&self) -> usize {
         PAGE_SIZE
@@ -125,27 +127,32 @@ impl PageCache {
 
     pub async fn read(&self, buffer: &mut dyn Buffer, mut offset: usize) -> KResult<usize> {
         let mut pages = self.pages.lock().await;
+        let size = self.backend.upgrade().unwrap().size();
 
         loop {
+            if offset >= size {
+                break;
+            }
             let page_id = offset >> PAGE_SIZE_BITS;
             let page = pages.get(&page_id);
 
             match page {
                 Some(page) => {
                     let inner_offset = offset % PAGE_SIZE;
+                    let available_in_file = size.saturating_sub(offset);
 
                     // TODO: still cause unnecessary IO if valid_size < PAGESIZE
                     //       and fill result is Done
-                    if page.valid_size() == 0
-                        || buffer
-                            .fill(&page.valid_data()[inner_offset..])?
-                            .should_stop()
+                    let page_data = &page.valid_data()[inner_offset..];
+                    let read_size = page_data.len().min(available_in_file);
+
+                    if read_size == 0
+                        || buffer.fill(&page_data[..read_size])?.should_stop()
                         || buffer.available() == 0
                     {
                         break;
                     }
-
-                    offset += PAGE_SIZE - inner_offset;
+                    offset += read_size;
                 }
                 None => {
                     let mut new_page = CachePage::new();
@@ -217,7 +224,7 @@ impl PageCache {
                 self.backend
                     .upgrade()
                     .unwrap()
-                    .write_page(page, page_id << PAGE_SIZE_BITS)?;
+                    .write_page(&mut CachePageStream::new(*page), page_id << PAGE_SIZE_BITS)?;
                 page.clear_dirty();
             }
         }
@@ -293,6 +300,51 @@ impl PageCache {
     }
 }
 
+pub struct CachePageStream {
+    page: CachePage,
+    cur: usize,
+}
+
+impl CachePageStream {
+    pub fn new(page: CachePage) -> Self {
+        Self { page, cur: 0 }
+    }
+
+    pub fn remaining(&self) -> usize {
+        self.page.valid_size().saturating_sub(self.cur)
+    }
+
+    pub fn is_drained(&self) -> bool {
+        self.cur >= self.page.valid_size()
+    }
+}
+
+impl Stream for CachePageStream {
+    fn poll_data<'a>(&mut self, buf: &'a mut [u8]) -> KResult<Option<&'a mut [u8]>> {
+        if self.cur >= self.page.valid_size() {
+            return Ok(None);
+        }
+
+        let page_data = &self.page.all()[self.cur..self.page.valid_size()];
+        let to_read = buf.len().min(page_data.len());
+
+        buf[..to_read].copy_from_slice(&page_data[..to_read]);
+        self.cur += to_read;
+
+        Ok(Some(&mut buf[..to_read]))
+    }
+
+    fn ignore(&mut self, len: usize) -> KResult<Option<usize>> {
+        if self.cur >= self.page.valid_size() {
+            return Ok(None);
+        }
+
+        let to_ignore = len.min(self.page.valid_size() - self.cur);
+        self.cur += to_ignore;
+        Ok(Some(to_ignore))
+    }
+}
+
 // with this trait, "page cache" and "block cache" are unified,
 // for fs, offset is file offset (floor align to PAGE_SIZE)
 // for blkdev, offset is block idx (floor align to PAGE_SIZE / BLK_SIZE)
@@ -300,7 +352,7 @@ impl PageCache {
 pub trait PageCacheBackend {
     fn read_page(&self, page: &mut CachePage, offset: usize) -> KResult<usize>;
 
-    fn write_page(&self, page: &CachePage, offset: usize) -> KResult<usize>;
+    fn write_page(&self, page: &mut CachePageStream, offset: usize) -> KResult<usize>;
 
     fn size(&self) -> usize;
 }
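
`write_page` now receives a `CachePageStream` instead of a `&CachePage`, so backends drain dirty page data through the same `Stream` interface used for ordinary writes. A standalone sketch of the cursor pattern behind it, with a byte slice standing in for the cache page (names are illustrative):

```rust
// Sketch of the CachePageStream idea: a cursor over the valid bytes of a page,
// handing out successive chunks until the data is drained.
struct SliceStream<'a> {
    data: &'a [u8],
    cur: usize,
}

impl<'a> SliceStream<'a> {
    fn poll_data<'b>(&mut self, buf: &'b mut [u8]) -> Option<&'b mut [u8]> {
        if self.cur >= self.data.len() {
            return None; // drained, like `is_drained()` above
        }
        let n = buf.len().min(self.data.len() - self.cur);
        buf[..n].copy_from_slice(&self.data[self.cur..self.cur + n]);
        self.cur += n;
        Some(&mut buf[..n])
    }
}

fn main() {
    let page = *b"dirty page contents to flush";
    let mut stream = SliceStream {
        data: &page[..],
        cur: 0,
    };
    let mut buf = [0u8; 8];
    while let Some(chunk) = stream.poll_data(&mut buf) {
        println!("flushed {} bytes", chunk.len());
    }
}
```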

+ 163 - 8
src/kernel/syscall.rs

@@ -1,5 +1,10 @@
+use super::task::ThreadAlloc;
 use crate::kernel::task::Thread;
+use alloc::boxed::Box;
+use core::{future::Future, marker::PhantomData, ops::Deref, pin::Pin};
+use eonix_mm::address::{Addr, VAddr};
 use eonix_sync::LazyLock;
+use posix_types::ctypes::PtrT;
 
 pub mod file_rw;
 pub mod mm;
@@ -12,15 +17,33 @@ const MAX_SYSCALL_NO: usize = 512;
 #[derive(Debug, Clone, Copy)]
 pub struct SyscallNoReturn;
 
+#[derive(Clone, Copy)]
+pub struct User<T>(VAddr, PhantomData<T>);
+
+#[derive(Clone, Copy)]
+pub struct UserMut<T>(VAddr, PhantomData<T>);
+
 #[repr(C)]
 pub(self) struct RawSyscallHandler {
     no: usize,
-    handler: fn(&Thread, [usize; 6]) -> Option<usize>,
+    handler: for<'thd, 'alloc> fn(
+        &'thd Thread,
+        ThreadAlloc<'alloc>,
+        [usize; 6],
+    ) -> Pin<
+        Box<dyn Future<Output = Option<usize>> + Send + 'thd, ThreadAlloc<'alloc>>,
+    >,
     name: &'static str,
 }
 
 pub struct SyscallHandler {
-    pub handler: fn(&Thread, [usize; 6]) -> Option<usize>,
+    pub handler: for<'thd, 'alloc> fn(
+        &'thd Thread,
+        ThreadAlloc<'alloc>,
+        [usize; 6],
+    ) -> Pin<
+        Box<dyn Future<Output = Option<usize>> + Send + 'thd, ThreadAlloc<'alloc>>,
+    >,
     pub name: &'static str,
 }
 
@@ -80,6 +103,18 @@ impl SyscallRetVal for SyscallNoReturn {
     }
 }
 
+impl<T> SyscallRetVal for User<T> {
+    fn into_retval(self) -> Option<usize> {
+        Some(self.0.addr())
+    }
+}
+
+impl<T> SyscallRetVal for UserMut<T> {
+    fn into_retval(self) -> Option<usize> {
+        Some(self.0.addr())
+    }
+}
+
 #[cfg(not(target_arch = "x86_64"))]
 impl SyscallRetVal for u64 {
     fn into_retval(self) -> Option<usize> {
@@ -112,15 +147,135 @@ impl FromSyscallArg for usize {
     }
 }
 
-impl<T> FromSyscallArg for *const T {
-    fn from_arg(value: usize) -> *const T {
-        value as *const T
+impl FromSyscallArg for PtrT {
+    fn from_arg(value: usize) -> Self {
+        PtrT::new(value).expect("Invalid user pointer value")
+    }
+}
+
+impl<T> FromSyscallArg for User<T> {
+    fn from_arg(value: usize) -> User<T> {
+        User(VAddr::from(value), PhantomData)
+    }
+}
+
+impl<T> FromSyscallArg for UserMut<T> {
+    fn from_arg(value: usize) -> UserMut<T> {
+        UserMut(VAddr::from(value), PhantomData)
+    }
+}
+
+impl<T> User<T> {
+    pub const fn new(addr: VAddr) -> Self {
+        Self(addr, PhantomData)
+    }
+
+    pub const fn with_addr(addr: usize) -> Self {
+        Self::new(VAddr::from(addr))
+    }
+
+    pub const fn null() -> Self {
+        Self(VAddr::NULL, PhantomData)
+    }
+
+    pub fn is_null(&self) -> bool {
+        self.0.addr() == 0
+    }
+
+    pub const fn cast<U>(self) -> User<U> {
+        User(self.0, PhantomData)
+    }
+
+    pub fn offset(self, off: isize) -> Self {
+        Self(
+            VAddr::from(
+                self.0
+                    .addr()
+                    .checked_add_signed(off)
+                    .expect("offset overflow"),
+            ),
+            PhantomData,
+        )
+    }
+
+    pub const unsafe fn as_mut(self) -> UserMut<T> {
+        UserMut(self.0, PhantomData)
     }
 }
 
-impl<T> FromSyscallArg for *mut T {
-    fn from_arg(value: usize) -> *mut T {
-        value as *mut T
+impl<T> UserMut<T> {
+    pub const fn new(addr: VAddr) -> Self {
+        Self(addr, PhantomData)
+    }
+
+    pub const fn with_addr(addr: usize) -> Self {
+        Self::new(VAddr::from(addr))
+    }
+
+    pub const fn null() -> Self {
+        Self(VAddr::NULL, PhantomData)
+    }
+
+    pub fn is_null(&self) -> bool {
+        self.0.addr() == 0
+    }
+
+    pub const fn cast<U>(self) -> UserMut<U> {
+        UserMut(self.0, PhantomData)
+    }
+
+    pub fn offset(self, off: isize) -> Self {
+        Self(
+            VAddr::from(
+                self.0
+                    .addr()
+                    .checked_add_signed(off)
+                    .expect("offset overflow"),
+            ),
+            PhantomData,
+        )
+    }
+
+    pub const fn as_const(self) -> User<T> {
+        User(self.0, PhantomData)
+    }
+
+    pub const fn vaddr(&self) -> VAddr {
+        self.0
+    }
+}
+
+impl<T> Deref for User<T> {
+    type Target = VAddr;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> Deref for UserMut<T> {
+    type Target = VAddr;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> core::fmt::Debug for User<T> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self.0 {
+            VAddr::NULL => write!(f, "User(NULL)"),
+            _ => write!(f, "User({:#018x?})", self.0.addr()),
+        }
+    }
+}
+
+impl<T> core::fmt::Debug for UserMut<T> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self.0 {
+            VAddr::NULL => write!(f, "UserMut(NULL)"),
+            _ => write!(f, "UserMut({:#018x?})", self.0.addr()),
+        }
     }
 }
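
`User<T>` and `UserMut<T>` replace raw `*const T` / `*mut T` syscall arguments with typed wrappers around a `VAddr` that are never dereferenced directly. A minimal standalone sketch of the same pattern, with a plain `usize` in place of `VAddr` (all names below are illustrative, not the kernel's API):

```rust
// Sketch of the typed user-pointer pattern: the PhantomData marker carries the
// pointee type through syscall signatures without ever creating a real pointer.
use core::marker::PhantomData;

#[derive(Clone, Copy, Debug)]
struct UserPtr<T>(usize, PhantomData<T>);

impl<T> UserPtr<T> {
    const fn with_addr(addr: usize) -> Self {
        Self(addr, PhantomData)
    }

    fn is_null(&self) -> bool {
        self.0 == 0
    }

    const fn cast<U>(self) -> UserPtr<U> {
        UserPtr(self.0, PhantomData)
    }

    fn offset(self, off: isize) -> Self {
        Self(
            self.0.checked_add_signed(off).expect("offset overflow"),
            PhantomData,
        )
    }
}

fn main() {
    // A syscall receives a raw usize and reinterprets it as a typed user address.
    let raw_arg: usize = 0x7fff_0000;
    let stat_buf: UserPtr<u64> = UserPtr::with_addr(raw_arg);
    let bytes: UserPtr<u8> = stat_buf.cast(); // cast<U>() changes only the marker
    assert!(!bytes.offset(8).is_null());
    println!("{:?}", bytes);
}
```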
 

+ 173 - 130
src/kernel/syscall/file_rw.rs

@@ -1,30 +1,25 @@
-use core::time::Duration;
-
-use super::FromSyscallArg;
+use super::{FromSyscallArg, User};
 use crate::io::IntoStream;
 use crate::kernel::constants::{
-    EBADF, EFAULT, EINVAL, ENOENT, ENOSYS, ENOTDIR, SEEK_CUR, SEEK_END, SEEK_SET, S_IFBLK, S_IFCHR,
+    EBADF, EFAULT, EINVAL, ENOENT, ENOSYS, ENOTDIR, SEEK_CUR, SEEK_END, SEEK_SET,
 };
+use crate::kernel::syscall::UserMut;
 use crate::kernel::task::Thread;
 use crate::kernel::timer::sleep;
 use crate::kernel::vfs::filearray::FD;
+use crate::kernel::vfs::inode::Mode;
+use crate::kernel::vfs::{PollEvent, SeekOption};
 use crate::{
     io::{Buffer, BufferFill},
     kernel::{
-        user::{
-            dataflow::{CheckedUserPointer, UserBuffer, UserString},
-            UserPointer, UserPointerMut,
-        },
-        vfs::{
-            dentry::Dentry,
-            file::{PollEvent, SeekOption},
-        },
+        user::{CheckedUserPointer, UserBuffer, UserPointer, UserPointerMut, UserString},
+        vfs::dentry::Dentry,
     },
     path::Path,
     prelude::*,
 };
 use alloc::sync::Arc;
-use eonix_runtime::task::Task;
+use core::time::Duration;
 use posix_types::ctypes::{Long, PtrT};
 use posix_types::namei::RenameFlags;
 use posix_types::open::{AtFlags, OpenFlags};
@@ -49,7 +44,7 @@ impl FromSyscallArg for AtFlags {
 fn dentry_from(
     thread: &Thread,
     dirfd: FD,
-    pathname: *const u8,
+    pathname: User<u8>,
     follow_symlink: bool,
 ) -> KResult<Arc<Dentry>> {
     let path = UserString::new(pathname)?;
@@ -74,83 +69,95 @@ fn dentry_from(
 }
 
 #[eonix_macros::define_syscall(SYS_READ)]
-fn read(fd: FD, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+async fn read(fd: FD, buffer: UserMut<u8>, bufsize: usize) -> KResult<usize> {
     let mut buffer = UserBuffer::new(buffer, bufsize)?;
 
-    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.read(&mut buffer, None))
+    thread
+        .files
+        .get(fd)
+        .ok_or(EBADF)?
+        .read(&mut buffer, None)
+        .await
 }
 
 #[eonix_macros::define_syscall(SYS_PREAD64)]
-fn pread64(fd: FD, buffer: *mut u8, bufsize: usize, offset: usize) -> KResult<usize> {
+async fn pread64(fd: FD, buffer: UserMut<u8>, bufsize: usize, offset: usize) -> KResult<usize> {
     let mut buffer = UserBuffer::new(buffer, bufsize)?;
 
-    Task::block_on(
-        thread
-            .files
-            .get(fd)
-            .ok_or(EBADF)?
-            .read(&mut buffer, Some(offset)),
-    )
+    thread
+        .files
+        .get(fd)
+        .ok_or(EBADF)?
+        .read(&mut buffer, Some(offset))
+        .await
 }
 
 #[eonix_macros::define_syscall(SYS_WRITE)]
-fn write(fd: FD, buffer: *const u8, count: usize) -> KResult<usize> {
+async fn write(fd: FD, buffer: User<u8>, count: usize) -> KResult<usize> {
     let buffer = CheckedUserPointer::new(buffer, count)?;
     let mut stream = buffer.into_stream();
 
-    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.write(&mut stream, None))
+    thread
+        .files
+        .get(fd)
+        .ok_or(EBADF)?
+        .write(&mut stream, None)
+        .await
 }
 
 #[eonix_macros::define_syscall(SYS_PWRITE64)]
-fn pwrite64(fd: FD, buffer: *const u8, count: usize, offset: usize) -> KResult<usize> {
+async fn pwrite64(fd: FD, buffer: User<u8>, count: usize, offset: usize) -> KResult<usize> {
     let buffer = CheckedUserPointer::new(buffer, count)?;
     let mut stream = buffer.into_stream();
 
-    Task::block_on(
-        thread
-            .files
-            .get(fd)
-            .ok_or(EBADF)?
-            .write(&mut stream, Some(offset)),
-    )
+    thread
+        .files
+        .get(fd)
+        .ok_or(EBADF)?
+        .write(&mut stream, Some(offset))
+        .await
 }
 
 #[eonix_macros::define_syscall(SYS_OPENAT)]
-fn openat(dirfd: FD, pathname: *const u8, flags: OpenFlags, mode: u32) -> KResult<FD> {
+async fn openat(dirfd: FD, pathname: User<u8>, flags: OpenFlags, mut mode: Mode) -> KResult<FD> {
     let dentry = dentry_from(thread, dirfd, pathname, flags.follow_symlink())?;
+
+    let umask = *thread.fs_context.umask.lock();
+    mode.mask_perm(!umask.non_format_bits());
+
     thread.files.open(&dentry, flags, mode)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_OPEN)]
-fn open(path: *const u8, flags: OpenFlags, mode: u32) -> KResult<FD> {
-    sys_openat(thread, FD::AT_FDCWD, path, flags, mode)
+async fn open(path: User<u8>, flags: OpenFlags, mode: u32) -> KResult<FD> {
+    sys_openat(thread, FD::AT_FDCWD, path, flags, mode).await
 }
 
 #[eonix_macros::define_syscall(SYS_CLOSE)]
-fn close(fd: FD) -> KResult<()> {
-    thread.files.close(fd)
+async fn close(fd: FD) -> KResult<()> {
+    thread.files.close(fd).await
 }
 
 #[eonix_macros::define_syscall(SYS_DUP)]
-fn dup(fd: FD) -> KResult<FD> {
+async fn dup(fd: FD) -> KResult<FD> {
     thread.files.dup(fd)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_DUP2)]
-fn dup2(old_fd: FD, new_fd: FD) -> KResult<FD> {
+async fn dup2(old_fd: FD, new_fd: FD) -> KResult<FD> {
     thread.files.dup_to(old_fd, new_fd, OpenFlags::empty())
 }
 
 #[eonix_macros::define_syscall(SYS_DUP3)]
-fn dup3(old_fd: FD, new_fd: FD, flags: OpenFlags) -> KResult<FD> {
-    thread.files.dup_to(old_fd, new_fd, flags)
+async fn dup3(old_fd: FD, new_fd: FD, flags: OpenFlags) -> KResult<FD> {
+    thread.files.dup_to(old_fd, new_fd, flags).await
 }
 
 #[eonix_macros::define_syscall(SYS_PIPE2)]
-fn pipe2(pipe_fd: *mut [FD; 2], flags: OpenFlags) -> KResult<()> {
-    let mut buffer = UserBuffer::new(pipe_fd as *mut u8, core::mem::size_of::<[FD; 2]>())?;
+async fn pipe2(pipe_fd: UserMut<[FD; 2]>, flags: OpenFlags) -> KResult<()> {
+    let mut buffer = UserBuffer::new(pipe_fd.cast(), core::mem::size_of::<[FD; 2]>())?;
     let (read_fd, write_fd) = thread.files.pipe(flags)?;
 
     buffer.copy(&[read_fd, write_fd])?.ok_or(EFAULT)
@@ -158,13 +165,13 @@ fn pipe2(pipe_fd: *mut [FD; 2], flags: OpenFlags) -> KResult<()> {
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_PIPE)]
-fn pipe(pipe_fd: *mut [FD; 2]) -> KResult<()> {
-    sys_pipe2(thread, pipe_fd, OpenFlags::empty())
+async fn pipe(pipe_fd: UserMut<[FD; 2]>) -> KResult<()> {
+    sys_pipe2(thread, pipe_fd, OpenFlags::empty()).await
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_GETDENTS)]
-fn getdents(fd: FD, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+async fn getdents(fd: FD, buffer: UserMut<u8>, bufsize: usize) -> KResult<usize> {
     let mut buffer = UserBuffer::new(buffer, bufsize)?;
 
     thread.files.get(fd).ok_or(EBADF)?.getdents(&mut buffer)?;
@@ -172,10 +179,15 @@ fn getdents(fd: FD, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
 }
 
 #[eonix_macros::define_syscall(SYS_GETDENTS64)]
-fn getdents64(fd: FD, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+async fn getdents64(fd: FD, buffer: UserMut<u8>, bufsize: usize) -> KResult<usize> {
     let mut buffer = UserBuffer::new(buffer, bufsize)?;
 
-    thread.files.get(fd).ok_or(EBADF)?.getdents64(&mut buffer)?;
+    thread
+        .files
+        .get(fd)
+        .ok_or(EBADF)?
+        .getdents64(&mut buffer)
+        .await?;
     Ok(buffer.wrote())
 }
 
@@ -184,7 +196,12 @@ fn getdents64(fd: FD, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
     eonix_macros::define_syscall(SYS_NEWFSTATAT)
 )]
 #[cfg_attr(target_arch = "x86_64", eonix_macros::define_syscall(SYS_FSTATAT64))]
-fn newfstatat(dirfd: FD, pathname: *const u8, statbuf: *mut Stat, flags: AtFlags) -> KResult<()> {
+async fn newfstatat(
+    dirfd: FD,
+    pathname: User<u8>,
+    statbuf: UserMut<Stat>,
+    flags: AtFlags,
+) -> KResult<()> {
     let dentry = if flags.at_empty_path() {
         let file = thread.files.get(dirfd).ok_or(EBADF)?;
         file.as_path().ok_or(EBADF)?.clone()
@@ -207,23 +224,17 @@ fn newfstatat(dirfd: FD, pathname: *const u8, statbuf: *mut Stat, flags: AtFlags
     eonix_macros::define_syscall(SYS_NEWFSTAT)
 )]
 #[cfg_attr(target_arch = "x86_64", eonix_macros::define_syscall(SYS_FSTAT64))]
-fn newfstat(fd: FD, statbuf: *mut Stat) -> KResult<()> {
-    sys_newfstatat(
-        thread,
-        fd,
-        core::ptr::null(),
-        statbuf,
-        AtFlags::AT_EMPTY_PATH,
-    )
+async fn newfstat(fd: FD, statbuf: UserMut<Stat>) -> KResult<()> {
+    sys_newfstatat(thread, fd, User::null(), statbuf, AtFlags::AT_EMPTY_PATH).await
 }
 
 #[eonix_macros::define_syscall(SYS_STATX)]
-fn statx(
+async fn statx(
     dirfd: FD,
-    pathname: *const u8,
+    pathname: User<u8>,
     flags: AtFlags,
     mask: u32,
-    buffer: *mut StatX,
+    buffer: UserMut<StatX>,
 ) -> KResult<()> {
     if !flags.statx_default_sync() {
         unimplemented!("statx with no default sync flags: {:?}", flags);
@@ -246,9 +257,9 @@ fn statx(
 }
 
 #[eonix_macros::define_syscall(SYS_MKDIRAT)]
-fn mkdirat(dirfd: FD, pathname: *const u8, mode: u32) -> KResult<()> {
+async fn mkdirat(dirfd: FD, pathname: User<u8>, mut mode: Mode) -> KResult<()> {
     let umask = *thread.fs_context.umask.lock();
-    let mode = mode & !umask & 0o777;
+    mode.mask_perm(!umask.non_format_bits());
 
     let dentry = dentry_from(thread, dirfd, pathname, true)?;
     dentry.mkdir(mode)
@@ -256,19 +267,19 @@ fn mkdirat(dirfd: FD, pathname: *const u8, mode: u32) -> KResult<()> {
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_MKDIR)]
-fn mkdir(pathname: *const u8, mode: u32) -> KResult<()> {
-    sys_mkdirat(thread, FD::AT_FDCWD, pathname, mode)
+async fn mkdir(pathname: User<u8>, mode: u32) -> KResult<()> {
+    sys_mkdirat(thread, FD::AT_FDCWD, pathname, mode).await
 }
 
 #[eonix_macros::define_syscall(SYS_FTRUNCATE64)]
-fn truncate64(fd: FD, length: usize) -> KResult<()> {
+async fn truncate64(fd: FD, length: usize) -> KResult<()> {
     let file = thread.files.get(fd).ok_or(EBADF)?;
     file.as_path().ok_or(EBADF)?.truncate(length)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_TRUNCATE)]
-fn truncate(pathname: *const u8, length: usize) -> KResult<()> {
+async fn truncate(pathname: User<u8>, length: usize) -> KResult<()> {
     let path = UserString::new(pathname)?;
     let path = Path::new(path.as_cstr().to_bytes())?;
 
@@ -278,18 +289,18 @@ fn truncate(pathname: *const u8, length: usize) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_UNLINKAT)]
-fn unlinkat(dirfd: FD, pathname: *const u8) -> KResult<()> {
+async fn unlinkat(dirfd: FD, pathname: User<u8>) -> KResult<()> {
     dentry_from(thread, dirfd, pathname, false)?.unlink()
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_UNLINK)]
-fn unlink(pathname: *const u8) -> KResult<()> {
+async fn unlink(pathname: User<u8>) -> KResult<()> {
     sys_unlinkat(thread, FD::AT_FDCWD, pathname)
 }
 
 #[eonix_macros::define_syscall(SYS_SYMLINKAT)]
-fn symlinkat(target: *const u8, dirfd: FD, linkpath: *const u8) -> KResult<()> {
+async fn symlinkat(target: User<u8>, dirfd: FD, linkpath: User<u8>) -> KResult<()> {
     let target = UserString::new(target)?;
     let dentry = dentry_from(thread, dirfd, linkpath, false)?;
 
@@ -298,28 +309,37 @@ fn symlinkat(target: *const u8, dirfd: FD, linkpath: *const u8) -> KResult<()> {
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_SYMLINK)]
-fn symlink(target: *const u8, linkpath: *const u8) -> KResult<()> {
+async fn symlink(target: User<u8>, linkpath: User<u8>) -> KResult<()> {
     sys_symlinkat(thread, target, FD::AT_FDCWD, linkpath)
 }
 
 #[eonix_macros::define_syscall(SYS_MKNODAT)]
-fn mknodat(dirfd: FD, pathname: *const u8, mode: u32, dev: u32) -> KResult<()> {
+async fn mknodat(dirfd: FD, pathname: User<u8>, mut mode: Mode, dev: u32) -> KResult<()> {
+    if !mode.is_blk() && !mode.is_chr() {
+        return Err(EINVAL);
+    }
+
     let dentry = dentry_from(thread, dirfd, pathname, true)?;
 
     let umask = *thread.fs_context.umask.lock();
-    let mode = mode & ((!umask & 0o777) | (S_IFBLK | S_IFCHR));
+    mode.mask_perm(!umask.non_format_bits());
 
     dentry.mknod(mode, dev)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_MKNOD)]
-fn mknod(pathname: *const u8, mode: u32, dev: u32) -> KResult<()> {
-    sys_mknodat(thread, FD::AT_FDCWD, pathname, mode, dev)
+async fn mknod(pathname: User<u8>, mode: u32, dev: u32) -> KResult<()> {
+    sys_mknodat(thread, FD::AT_FDCWD, pathname, mode, dev).await
 }
 
 #[eonix_macros::define_syscall(SYS_READLINKAT)]
-fn readlinkat(dirfd: FD, pathname: *const u8, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+async fn readlinkat(
+    dirfd: FD,
+    pathname: User<u8>,
+    buffer: UserMut<u8>,
+    bufsize: usize,
+) -> KResult<usize> {
     let dentry = dentry_from(thread, dirfd, pathname, false)?;
     let mut buffer = UserBuffer::new(buffer, bufsize)?;
 
@@ -328,34 +348,40 @@ fn readlinkat(dirfd: FD, pathname: *const u8, buffer: *mut u8, bufsize: usize) -
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_READLINK)]
-fn readlink(pathname: *const u8, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
-    sys_readlinkat(thread, FD::AT_FDCWD, pathname, buffer, bufsize)
+async fn readlink(pathname: User<u8>, buffer: UserMut<u8>, bufsize: usize) -> KResult<usize> {
+    sys_readlinkat(thread, FD::AT_FDCWD, pathname, buffer, bufsize).await
 }
 
-fn do_lseek(thread: &Thread, fd: FD, offset: u64, whence: u32) -> KResult<u64> {
+async fn do_lseek(thread: &Thread, fd: FD, offset: u64, whence: u32) -> KResult<u64> {
     let file = thread.files.get(fd).ok_or(EBADF)?;
 
     Ok(match whence {
-        SEEK_SET => file.seek(SeekOption::Set(offset as usize))?,
-        SEEK_CUR => file.seek(SeekOption::Current(offset as isize))?,
-        SEEK_END => file.seek(SeekOption::End(offset as isize))?,
+        SEEK_SET => file.seek(SeekOption::Set(offset as usize)).await?,
+        SEEK_CUR => file.seek(SeekOption::Current(offset as isize)).await?,
+        SEEK_END => file.seek(SeekOption::End(offset as isize)).await?,
         _ => return Err(EINVAL),
     } as u64)
 }
 
 #[cfg(not(target_arch = "x86_64"))]
 #[eonix_macros::define_syscall(SYS_LSEEK)]
-fn lseek(fd: FD, offset: u64, whence: u32) -> KResult<u64> {
-    do_lseek(thread, fd, offset, whence)
+async fn lseek(fd: FD, offset: u64, whence: u32) -> KResult<u64> {
+    do_lseek(thread, fd, offset, whence).await
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_LLSEEK)]
-fn llseek(fd: FD, offset_high: u32, offset_low: u32, result: *mut u64, whence: u32) -> KResult<()> {
-    let mut result = UserBuffer::new(result as *mut u8, core::mem::size_of::<u64>())?;
+async fn llseek(
+    fd: FD,
+    offset_high: u32,
+    offset_low: u32,
+    result: UserMut<u64>,
+    whence: u32,
+) -> KResult<()> {
+    let mut result = UserBuffer::new(result.cast(), core::mem::size_of::<u64>())?;
     let offset = ((offset_high as u64) << 32) | (offset_low as u64);
 
-    let new_offset = do_lseek(thread, fd, offset, whence)?;
+    let new_offset = do_lseek(thread, fd, offset, whence).await?;
 
     result.copy(&new_offset)?.ok_or(EFAULT)
 }
@@ -368,7 +394,7 @@ struct IoVec {
 }
 
 #[eonix_macros::define_syscall(SYS_READV)]
-fn readv(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
+async fn readv(fd: FD, iov_user: User<IoVec>, iovcnt: u32) -> KResult<usize> {
     let file = thread.files.get(fd).ok_or(EBADF)?;
 
     let mut iov_user = UserPointer::new(iov_user)?;
@@ -383,14 +409,16 @@ fn readv(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
             Ok(IoVec {
                 len: Long::ZERO, ..
             }) => None,
-            Ok(IoVec { base, len }) => Some(UserBuffer::new(base.addr() as *mut u8, len.get())),
+            Ok(IoVec { base, len }) => {
+                Some(UserBuffer::new(UserMut::with_addr(base.addr()), len.get()))
+            }
         })
         .collect::<KResult<Vec<_>>>()?;
 
     let mut tot = 0usize;
     for mut buffer in iov_buffers.into_iter() {
         // TODO!!!: `readv`
-        let nread = Task::block_on(file.read(&mut buffer, None))?;
+        let nread = file.read(&mut buffer, None).await?;
         tot += nread;
 
         if nread != buffer.total() {
@@ -402,7 +430,7 @@ fn readv(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
 }
 
 #[eonix_macros::define_syscall(SYS_WRITEV)]
-fn writev(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
+async fn writev(fd: FD, iov_user: User<IoVec>, iovcnt: u32) -> KResult<usize> {
     let file = thread.files.get(fd).ok_or(EBADF)?;
 
     let mut iov_user = UserPointer::new(iov_user)?;
@@ -418,7 +446,7 @@ fn writev(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
                 len: Long::ZERO, ..
             }) => None,
             Ok(IoVec { base, len }) => Some(
-                CheckedUserPointer::new(base.addr() as *mut u8, len.get())
+                CheckedUserPointer::new(User::with_addr(base.addr()), len.get())
                     .map(|ptr| ptr.into_stream()),
             ),
         })
@@ -426,7 +454,7 @@ fn writev(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
 
     let mut tot = 0usize;
     for mut stream in iov_streams.into_iter() {
-        let nread = Task::block_on(file.write(&mut stream, None))?;
+        let nread = file.write(&mut stream, None).await?;
         tot += nread;
 
         if nread == 0 || !stream.is_drained() {
@@ -438,7 +466,7 @@ fn writev(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
 }
 
 #[eonix_macros::define_syscall(SYS_FACCESSAT)]
-fn faccessat(dirfd: FD, pathname: *const u8, _mode: u32, flags: AtFlags) -> KResult<()> {
+async fn faccessat(dirfd: FD, pathname: User<u8>, _mode: u32, flags: AtFlags) -> KResult<()> {
     let dentry = if flags.at_empty_path() {
         let file = thread.files.get(dirfd).ok_or(EBADF)?;
         file.as_path().ok_or(EBADF)?.clone()
@@ -464,12 +492,12 @@ fn faccessat(dirfd: FD, pathname: *const u8, _mode: u32, flags: AtFlags) -> KRes
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_ACCESS)]
-fn access(pathname: *const u8, mode: u32) -> KResult<()> {
-    sys_faccessat(thread, FD::AT_FDCWD, pathname, mode, AtFlags::empty())
+async fn access(pathname: User<u8>, mode: u32) -> KResult<()> {
+    sys_faccessat(thread, FD::AT_FDCWD, pathname, mode, AtFlags::empty()).await
 }
 
 #[eonix_macros::define_syscall(SYS_SENDFILE64)]
-fn sendfile64(out_fd: FD, in_fd: FD, offset: *mut u8, count: usize) -> KResult<usize> {
+async fn sendfile64(out_fd: FD, in_fd: FD, offset: UserMut<u8>, count: usize) -> KResult<usize> {
     let in_file = thread.files.get(in_fd).ok_or(EBADF)?;
     let out_file = thread.files.get(out_fd).ok_or(EBADF)?;
 
@@ -477,18 +505,18 @@ fn sendfile64(out_fd: FD, in_fd: FD, offset: *mut u8, count: usize) -> KResult<u
         unimplemented!("sendfile64 with offset");
     }
 
-    Task::block_on(in_file.sendfile(&out_file, count))
+    in_file.sendfile(&out_file, count).await
 }
 
 #[eonix_macros::define_syscall(SYS_IOCTL)]
-fn ioctl(fd: FD, request: usize, arg3: usize) -> KResult<usize> {
+async fn ioctl(fd: FD, request: usize, arg3: usize) -> KResult<usize> {
     let file = thread.files.get(fd).ok_or(EBADF)?;
 
-    file.ioctl(request, arg3)
+    file.ioctl(request, arg3).await
 }
 
 #[eonix_macros::define_syscall(SYS_FCNTL64)]
-fn fcntl64(fd: FD, cmd: u32, arg: usize) -> KResult<usize> {
+async fn fcntl64(fd: FD, cmd: u32, arg: usize) -> KResult<usize> {
     thread.files.fcntl(fd, cmd, arg)
 }
 
@@ -500,7 +528,12 @@ struct UserPollFd {
     revents: u16,
 }
 
-fn do_poll(thread: &Thread, fds: *mut UserPollFd, nfds: u32, _timeout: u32) -> KResult<u32> {
+async fn do_poll(
+    thread: &Thread,
+    fds: UserMut<UserPollFd>,
+    nfds: u32,
+    _timeout: u32,
+) -> KResult<u32> {
     match nfds {
         0 => Ok(0),
         2.. => unimplemented!("Poll with {} fds", nfds),
@@ -513,7 +546,10 @@ fn do_poll(thread: &Thread, fds: *mut UserPollFd, nfds: u32, _timeout: u32) -> K
             let mut fd = fds.read()?;
 
             let file = thread.files.get(fd.fd).ok_or(EBADF)?;
-            fd.revents = Task::block_on(file.poll(PollEvent::from_bits_retain(fd.events)))?.bits();
+            fd.revents = file
+                .poll(PollEvent::from_bits_retain(fd.events))
+                .await?
+                .bits();
 
             fds.write(fd)?;
             Ok(1)
@@ -522,24 +558,24 @@ fn do_poll(thread: &Thread, fds: *mut UserPollFd, nfds: u32, _timeout: u32) -> K
 }
 
 #[eonix_macros::define_syscall(SYS_PPOLL)]
-fn ppoll(
-    fds: *mut UserPollFd,
+async fn ppoll(
+    fds: UserMut<UserPollFd>,
     nfds: u32,
-    _timeout_ptr: *const TimeSpec,
-    _sigmask: *const SigSet,
+    _timeout_ptr: User<TimeSpec>,
+    _sigmask: User<SigSet>,
 ) -> KResult<u32> {
     // TODO: Implement ppoll with signal mask and timeout
-    do_poll(thread, fds, nfds, 0)
+    do_poll(thread, fds, nfds, 0).await
 }
 
 #[eonix_macros::define_syscall(SYS_PSELECT6)]
-fn pselect6(
+async fn pselect6(
     nfds: u32,
-    _readfds: *mut FDSet,
-    _writefds: *mut FDSet,
-    _exceptfds: *mut FDSet,
-    timeout: *mut TimeSpec,
-    _sigmask: *const (),
+    _readfds: UserMut<FDSet>,
+    _writefds: UserMut<FDSet>,
+    _exceptfds: UserMut<FDSet>,
+    timeout: UserMut<TimeSpec>,
+    _sigmask: User<()>,
 ) -> KResult<usize> {
     // According to [pselect6(2)](https://linux.die.net/man/2/pselect6):
     // Some code calls select() with all three sets empty, nfds zero, and
@@ -550,11 +586,11 @@ fn pselect6(
     }
 
     let timeout = UserPointerMut::new(timeout)?;
-    
+
     // Read here to check for invalid pointers.
     let _timeout_value = timeout.read()?;
 
-    Task::block_on(sleep(Duration::from_millis(10)));
+    sleep(Duration::from_millis(10)).await;
 
     timeout.write(TimeSpec {
         tv_sec: 0,
@@ -566,12 +602,18 @@ fn pselect6(
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_POLL)]
-fn poll(fds: *mut UserPollFd, nfds: u32, timeout: u32) -> KResult<u32> {
-    do_poll(thread, fds, nfds, timeout)
+async fn poll(fds: UserMut<UserPollFd>, nfds: u32, timeout: u32) -> KResult<u32> {
+    do_poll(thread, fds, nfds, timeout).await
 }
 
 #[eonix_macros::define_syscall(SYS_FCHOWNAT)]
-fn fchownat(dirfd: FD, pathname: *const u8, uid: u32, gid: u32, flags: AtFlags) -> KResult<()> {
+async fn fchownat(
+    dirfd: FD,
+    pathname: User<u8>,
+    uid: u32,
+    gid: u32,
+    flags: AtFlags,
+) -> KResult<()> {
     let dentry = dentry_from(thread, dirfd, pathname, !flags.no_follow())?;
     if !dentry.is_valid() {
         return Err(ENOENT);
@@ -581,7 +623,7 @@ fn fchownat(dirfd: FD, pathname: *const u8, uid: u32, gid: u32, flags: AtFlags)
 }
 
 #[eonix_macros::define_syscall(SYS_FCHMODAT)]
-fn fchmodat(dirfd: FD, pathname: *const u8, mode: u32, flags: AtFlags) -> KResult<()> {
+async fn fchmodat(dirfd: FD, pathname: User<u8>, mode: Mode, flags: AtFlags) -> KResult<()> {
     let dentry = if flags.at_empty_path() {
         let file = thread.files.get(dirfd).ok_or(EBADF)?;
         file.as_path().ok_or(EBADF)?.clone()
@@ -597,15 +639,15 @@ fn fchmodat(dirfd: FD, pathname: *const u8, mode: u32, flags: AtFlags) -> KResul
 }
 
 #[eonix_macros::define_syscall(SYS_FCHMOD)]
-fn chmod(pathname: *const u8, mode: u32) -> KResult<()> {
-    sys_fchmodat(thread, FD::AT_FDCWD, pathname, mode, AtFlags::empty())
+async fn chmod(pathname: User<u8>, mode: Mode) -> KResult<()> {
+    sys_fchmodat(thread, FD::AT_FDCWD, pathname, mode, AtFlags::empty()).await
 }
 
 #[eonix_macros::define_syscall(SYS_UTIMENSAT)]
-fn utimensat(
+async fn utimensat(
     dirfd: FD,
-    pathname: *const u8,
-    times: *const TimeSpec,
+    pathname: User<u8>,
+    times: User<TimeSpec>,
     flags: AtFlags,
 ) -> KResult<()> {
     let dentry = if flags.at_empty_path() {
@@ -632,11 +674,11 @@ fn utimensat(
 }
 
 #[eonix_macros::define_syscall(SYS_RENAMEAT2)]
-fn renameat2(
+async fn renameat2(
     old_dirfd: FD,
-    old_pathname: *const u8,
+    old_pathname: User<u8>,
     new_dirfd: FD,
-    new_pathname: *const u8,
+    new_pathname: User<u8>,
     flags: u32,
 ) -> KResult<()> {
     let flags = RenameFlags::from_bits(flags).ok_or(EINVAL)?;
@@ -654,7 +696,7 @@ fn renameat2(
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_RENAME)]
-fn rename(old_pathname: *const u8, new_pathname: *const u8) -> KResult<()> {
+async fn rename(old_pathname: User<u8>, new_pathname: User<u8>) -> KResult<()> {
     sys_renameat2(
         thread,
         FD::AT_FDCWD,
@@ -663,6 +705,7 @@ fn rename(old_pathname: *const u8, new_pathname: *const u8) -> KResult<()> {
         new_pathname,
         0,
     )
+    .await
 }
 
 pub fn keep_alive() {}

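The file_rw.rs hunks above all follow one pattern: handlers that used to bridge into async file operations through `Task::block_on(...)` become `async fn`s that `.await` the operation directly, while raw user pointers become typed `User<T>`/`UserMut<T>` arguments. A minimal before/after sketch of the blocking-to-await half of that change, using stand-in types rather than the tree's real `KResult`, file table, or file APIs:

```rust
type KResult<T> = Result<T, u32>;
const EBADF: u32 = 9;

struct File;
impl File {
    // Stand-in for the real async write; completes immediately here.
    async fn write(&self, buf: &[u8]) -> KResult<usize> {
        Ok(buf.len())
    }
}

struct Files(Vec<Option<File>>);
impl Files {
    fn get(&self, fd: usize) -> Option<&File> {
        self.0.get(fd).and_then(|f| f.as_ref())
    }
}

// Before: a synchronous handler has to block the whole thread on the future.
//
//   fn write(files: &Files, fd: usize, buf: &[u8]) -> KResult<usize> {
//       block_on(files.get(fd).ok_or(EBADF)?.write(buf))
//   }

// After: the handler is itself a future, so the executor can suspend it at
// the `.await` point instead of spinning inside a blocking call.
async fn write(files: &Files, fd: usize, buf: &[u8]) -> KResult<usize> {
    files.get(fd).ok_or(EBADF)?.write(buf).await
}
```

With the handlers async end to end, the executor rather than the calling thread decides when to suspend, which is why the `eonix_runtime::task::Task` imports disappear from these syscall modules.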
+ 64 - 44
src/kernel/syscall/mm.rs

@@ -1,9 +1,10 @@
 use super::FromSyscallArg;
 use crate::fs::shm::{gen_shm_id, ShmFlags, IPC_PRIVATE, SHM_MANAGER};
-use crate::kernel::constants::{EBADF, EEXIST, EINVAL, ENOENT, ENOMEM};
+use crate::kernel::constants::{EBADF, EEXIST, EINVAL, ENOENT};
 use crate::kernel::mem::FileMapping;
 use crate::kernel::task::Thread;
 use crate::kernel::vfs::filearray::FD;
+use crate::kernel::vfs::inode::Mode;
 use crate::{
     kernel::{
         constants::{UserMmapFlags, UserMmapProtocol},
@@ -14,7 +15,6 @@ use crate::{
 use align_ext::AlignExt;
 use eonix_mm::address::{Addr as _, AddrOps as _, VAddr};
 use eonix_mm::paging::PAGE_SIZE;
-use eonix_runtime::task::Task;
 use posix_types::syscall_no::*;
 
 impl FromSyscallArg for UserMmapProtocol {
@@ -40,7 +40,7 @@ fn check_impl(condition: bool, err: u32) -> KResult<()> {
     }
 }
 
-fn do_mmap2(
+async fn do_mmap2(
     thread: &Thread,
     addr: usize,
     len: usize,
@@ -67,10 +67,10 @@ fn do_mmap2(
             Mapping::Anonymous
         } else {
             // The mode is unimportant here, since we are checking prot in mm_area.
-            let shared_area = Task::block_on(SHM_MANAGER.lock()).create_shared_area(
+            let shared_area = SHM_MANAGER.lock().await.create_shared_area(
                 len,
                 thread.process.pid,
-                0x777,
+                Mode::REG.perm(0o777),
             );
             Mapping::File(FileMapping::new(shared_area.area.clone(), 0, len))
         }
@@ -94,10 +94,14 @@ fn do_mmap2(
     // TODO!!!: If we are doing mmap's in 32-bit mode, we should check whether
     //          `addr` is above user reachable memory.
     let addr = if flags.contains(UserMmapFlags::MAP_FIXED) {
-        Task::block_on(mm_list.unmap(addr, len));
-        mm_list.mmap_fixed(addr, len, mapping, permission, is_shared)
+        mm_list.unmap(addr, len).await?;
+        mm_list
+            .mmap_fixed(addr, len, mapping, permission, is_shared)
+            .await
     } else {
-        mm_list.mmap_hint(addr, len, mapping, permission, is_shared)
+        mm_list
+            .mmap_hint(addr, len, mapping, permission, is_shared)
+            .await
     };
 
     addr.map(|addr| addr.addr())
@@ -105,7 +109,7 @@ fn do_mmap2(
 
 #[cfg(any(target_arch = "riscv64", target_arch = "loongarch64"))]
 #[eonix_macros::define_syscall(SYS_MMAP)]
-fn mmap(
+async fn mmap(
     addr: usize,
     len: usize,
     prot: UserMmapProtocol,
@@ -113,12 +117,12 @@ fn mmap(
     fd: FD,
     offset: usize,
 ) -> KResult<usize> {
-    do_mmap2(thread, addr, len, prot, flags, fd, offset)
+    do_mmap2(thread, addr, len, prot, flags, fd, offset).await
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_MMAP2)]
-fn mmap2(
+async fn mmap2(
     addr: usize,
     len: usize,
     prot: UserMmapProtocol,
@@ -126,33 +130,33 @@ fn mmap2(
     fd: FD,
     pgoffset: usize,
 ) -> KResult<usize> {
-    do_mmap2(thread, addr, len, prot, flags, fd, pgoffset)
+    do_mmap2(thread, addr, len, prot, flags, fd, pgoffset).await
 }
 
 #[eonix_macros::define_syscall(SYS_MUNMAP)]
-fn munmap(addr: usize, len: usize) -> KResult<usize> {
+async fn munmap(addr: usize, len: usize) -> KResult<()> {
     let addr = VAddr::from(addr);
     if !addr.is_page_aligned() || len == 0 {
         return Err(EINVAL);
     }
 
     let len = len.align_up(PAGE_SIZE);
-    Task::block_on(thread.process.mm_list.unmap(addr, len)).map(|_| 0)
+    thread.process.mm_list.unmap(addr, len).await
 }
 
 #[eonix_macros::define_syscall(SYS_BRK)]
-fn brk(addr: usize) -> KResult<usize> {
+async fn brk(addr: usize) -> KResult<usize> {
     let vaddr = if addr == 0 { None } else { Some(VAddr::from(addr)) };
-    Ok(thread.process.mm_list.set_break(vaddr).addr())
+    Ok(thread.process.mm_list.set_break(vaddr).await.addr())
 }
 
 #[eonix_macros::define_syscall(SYS_MADVISE)]
-fn madvise(_addr: usize, _len: usize, _advice: u32) -> KResult<()> {
+async fn madvise(_addr: usize, _len: usize, _advice: u32) -> KResult<()> {
     Ok(())
 }
 
 #[eonix_macros::define_syscall(SYS_MPROTECT)]
-fn mprotect(addr: usize, len: usize, prot: UserMmapProtocol) -> KResult<()> {
+async fn mprotect(addr: usize, len: usize, prot: UserMmapProtocol) -> KResult<()> {
     let addr = VAddr::from(addr);
     if !addr.is_page_aligned() || len == 0 {
         return Err(EINVAL);
@@ -160,25 +164,29 @@ fn mprotect(addr: usize, len: usize, prot: UserMmapProtocol) -> KResult<()> {
 
     let len = len.align_up(PAGE_SIZE);
 
-    Task::block_on(thread.process.mm_list.protect(
-        addr,
-        len,
-        Permission {
-            read: prot.contains(UserMmapProtocol::PROT_READ),
-            write: prot.contains(UserMmapProtocol::PROT_WRITE),
-            execute: prot.contains(UserMmapProtocol::PROT_EXEC),
-        },
-    ))
+    thread
+        .process
+        .mm_list
+        .protect(
+            addr,
+            len,
+            Permission {
+                read: prot.contains(UserMmapProtocol::PROT_READ),
+                write: prot.contains(UserMmapProtocol::PROT_WRITE),
+                execute: prot.contains(UserMmapProtocol::PROT_EXEC),
+            },
+        )
+        .await
 }
 
 #[eonix_macros::define_syscall(SYS_SHMGET)]
-fn shmget(key: usize, size: usize, shmflg: u32) -> KResult<u32> {
+async fn shmget(key: usize, size: usize, shmflg: u32) -> KResult<u32> {
     let size = size.align_up(PAGE_SIZE);
 
-    let mut shm_manager = Task::block_on(SHM_MANAGER.lock());
+    let mut shm_manager = SHM_MANAGER.lock().await;
     let shmid = gen_shm_id(key)?;
 
-    let mode = shmflg & 0o777;
+    let mode = Mode::REG.perm(shmflg);
     let shmflg = ShmFlags::from_bits_truncate(shmflg);
 
     if key == IPC_PRIVATE {
@@ -201,16 +209,17 @@ fn shmget(key: usize, size: usize, shmflg: u32) -> KResult<u32> {
         return Ok(shmid);
     }
 
-    return Err(ENOENT);
+    Err(ENOENT)
 }
 
 #[eonix_macros::define_syscall(SYS_SHMAT)]
-fn shmat(shmid: u32, addr: usize, shmflg: u32) -> KResult<usize> {
+async fn shmat(shmid: u32, addr: usize, shmflg: u32) -> KResult<usize> {
     let mm_list = &thread.process.mm_list;
-    let shm_manager = Task::block_on(SHM_MANAGER.lock());
+    let shm_manager = SHM_MANAGER.lock().await;
     let shm_area = shm_manager.get(shmid).ok_or(EINVAL)?;
 
-    let mode = shmflg & 0o777;
+    // Why is this not used?
+    let _mode = shmflg & 0o777;
     let shmflg = ShmFlags::from_bits_truncate(shmflg);
 
     let mut permission = Permission {
@@ -239,9 +248,13 @@ fn shmat(shmid: u32, addr: usize, shmflg: u32) -> KResult<usize> {
             return Err(EINVAL);
         }
         let addr = VAddr::from(addr.align_down(PAGE_SIZE));
-        mm_list.mmap_fixed(addr, size, mapping, permission, true)
+        mm_list
+            .mmap_fixed(addr, size, mapping, permission, true)
+            .await
     } else {
-        mm_list.mmap_hint(VAddr::NULL, size, mapping, permission, true)
+        mm_list
+            .mmap_hint(VAddr::NULL, size, mapping, permission, true)
+            .await
     }?;
 
     thread.process.shm_areas.lock().insert(addr, size);
@@ -250,22 +263,29 @@ fn shmat(shmid: u32, addr: usize, shmflg: u32) -> KResult<usize> {
 }
 
 #[eonix_macros::define_syscall(SYS_SHMDT)]
-fn shmdt(addr: usize) -> KResult<usize> {
+async fn shmdt(addr: usize) -> KResult<()> {
     let addr = VAddr::from(addr);
-    let mut shm_areas = thread.process.shm_areas.lock();
-    let size = *shm_areas.get(&addr).ok_or(EINVAL)?;
-    shm_areas.remove(&addr);
-    drop(shm_areas);
-    return Task::block_on(thread.process.mm_list.unmap(addr, size)).map(|_| 0);
+
+    let size = {
+        let mut shm_areas = thread.process.shm_areas.lock();
+        let size = *shm_areas.get(&addr).ok_or(EINVAL)?;
+        shm_areas.remove(&addr);
+
+        size
+    };
+
+    thread.process.mm_list.unmap(addr, size).await
 }
 
 #[eonix_macros::define_syscall(SYS_SHMCTL)]
-fn shmctl(shmid: u32, op: i32, shmid_ds: usize) -> KResult<usize> {
+async fn shmctl(_shmid: u32, _op: i32, _shmid_ds: usize) -> KResult<usize> {
+    // TODO
     Ok(0)
 }
 
 #[eonix_macros::define_syscall(SYS_MEMBARRIER)]
-fn membarrier(_cmd: usize, _flags: usize) -> KResult<()> {
+async fn membarrier(_cmd: usize, _flags: usize) -> KResult<()> {
+    // TODO
     Ok(())
 }
 

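Several hunks (openat and mkdirat/mknodat above, shmget here) replace open-coded expressions like `mode & !umask & 0o777` with `mode.mask_perm(!umask.non_format_bits())` on a `Mode` value. A small sketch of what that masking does, with a simplified stand-in `Mode` rather than the tree's `vfs::inode::Mode`:

```rust
// Simplified stand-in: permission bits live in the low 0o7777,
// file-format bits (regular file, block device, ...) above them.
#[derive(Clone, Copy, Debug, PartialEq)]
struct Mode(u32);

impl Mode {
    /// Permission bits only, without the file-format bits.
    fn non_format_bits(self) -> u32 {
        self.0 & 0o7777
    }

    /// Keep only the permission bits allowed by `allowed`; format bits survive.
    fn mask_perm(&mut self, allowed: u32) {
        let format = self.0 & !0o7777;
        self.0 = format | (self.0 & 0o7777 & allowed);
    }
}

fn main() {
    let umask = Mode(0o022);
    let mut mode = Mode(0o100666); // regular file, rw-rw-rw-
    mode.mask_perm(!umask.non_format_bits());
    assert_eq!(mode, Mode(0o100644)); // group/other write bits cleared
}
```

The point of moving this onto a type is that the format bits survive the umask untouched, which the old expressions had to special-case (see the `S_IFBLK | S_IFCHR` handling removed from mknodat above).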
+ 1 - 1
src/kernel/syscall/net.rs

@@ -3,7 +3,7 @@ use crate::prelude::*;
 use posix_types::syscall_no::*;
 
 #[eonix_macros::define_syscall(SYS_SOCKET)]
-fn socket(_domain: u32, _socket_type: u32, _protocol: u32) -> KResult<u32> {
+async fn socket(_domain: u32, _socket_type: u32, _protocol: u32) -> KResult<u32> {
     Err(EINVAL)
 }
 

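Throughout these syscall files, raw `*const u8`/`*mut u8` arguments become `User<T>`/`UserMut<T>`. A hedged sketch of what a wrapper of that shape might look like; the real types in the tree likely carry more (access checking, conversion via `FromSyscallArg`), and this only illustrates the `with_addr`/`null`/`cast` surface the diffs call:

```rust
use core::marker::PhantomData;

// Hypothetical read-only user-space pointer: an address tagged with the
// pointee type, so syscall signatures state what they expect to read.
struct User<T> {
    addr: usize,
    _marker: PhantomData<*const T>,
}

impl<T> User<T> {
    fn with_addr(addr: usize) -> Self {
        Self { addr, _marker: PhantomData }
    }

    fn null() -> Self {
        Self::with_addr(0)
    }

    fn is_null(&self) -> bool {
        self.addr == 0
    }

    /// Reinterpret the pointee type without changing the address,
    /// mirroring the `.cast()` calls in the hunks above.
    fn cast<U>(self) -> User<U> {
        User::with_addr(self.addr)
    }
}

fn main() {
    let p: User<u64> = User::with_addr(0x1000);
    let q: User<u8> = p.cast();
    assert_eq!(q.addr, 0x1000);
    assert!(User::<u8>::null().is_null());
}
```

A mutable `UserMut<T>` would be the same idea with `*mut T` in the marker, plus an `as_const()` conversion like the one used in the rt_sigprocmask hunk below.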
+ 169 - 183
src/kernel/syscall/procops.rs

@@ -7,27 +7,27 @@ use crate::kernel::constants::{
     ENOSYS, PR_GET_NAME, PR_SET_NAME, RLIMIT_STACK, SIG_BLOCK, SIG_SETMASK, SIG_UNBLOCK,
 };
 use crate::kernel::mem::PageBuffer;
+use crate::kernel::syscall::{User, UserMut};
 use crate::kernel::task::{
     do_clone, futex_wait, futex_wake, yield_now, FutexFlags, FutexOp, ProcessList, ProgramLoader,
     RobustListHead, SignalAction, Thread, WaitId, WaitType,
 };
 use crate::kernel::task::{parse_futexop, CloneArgs};
 use crate::kernel::timer::sleep;
-use crate::kernel::user::dataflow::UserString;
+use crate::kernel::user::UserString;
 use crate::kernel::user::{UserPointer, UserPointerMut};
+use crate::kernel::vfs::inode::Mode;
 use crate::kernel::vfs::{self, dentry::Dentry};
 use crate::path::Path;
-use crate::{kernel::user::dataflow::UserBuffer, prelude::*};
+use crate::{kernel::user::UserBuffer, prelude::*};
 use alloc::borrow::ToOwned;
 use alloc::ffi::CString;
 use bitflags::bitflags;
-use core::ptr::NonNull;
 use core::time::Duration;
 use eonix_hal::processor::UserTLS;
 use eonix_hal::traits::trap::RawTrapContext;
 use eonix_hal::trap::TrapContext;
-use eonix_mm::address::{Addr as _, VAddr};
-use eonix_runtime::task::Task;
+use eonix_mm::address::Addr as _;
 use eonix_sync::AsProof as _;
 use posix_types::ctypes::PtrT;
 use posix_types::signal::{SigAction, SigInfo, SigSet, Signal};
@@ -50,7 +50,7 @@ bitflags! {
 }
 
 #[eonix_macros::define_syscall(SYS_NANOSLEEP)]
-fn nanosleep(req: *const (u32, u32), rem: *mut (u32, u32)) -> KResult<usize> {
+async fn nanosleep(req: User<(u32, u32)>, rem: UserMut<(u32, u32)>) -> KResult<usize> {
     let req = UserPointer::new(req)?.read()?;
     let rem = if rem.is_null() {
         None
@@ -59,7 +59,7 @@ fn nanosleep(req: *const (u32, u32), rem: *mut (u32, u32)) -> KResult<usize> {
     };
 
     let duration = Duration::from_secs(req.0 as u64) + Duration::from_nanos(req.1 as u64);
-    Task::block_on(sleep(duration));
+    sleep(duration).await;
 
     if let Some(rem) = rem {
         rem.write((0, 0))?;
@@ -69,11 +69,11 @@ fn nanosleep(req: *const (u32, u32), rem: *mut (u32, u32)) -> KResult<usize> {
 }
 
 #[eonix_macros::define_syscall(SYS_CLOCK_NANOSLEEP)]
-fn clock_nanosleep(
+async fn clock_nanosleep(
     clock_id: u32,
-    flags: u32,
-    req: *const (u32, u32),
-    rem: *mut (u32, u32),
+    _flags: u32,
+    req: User<(u32, u32)>,
+    rem: UserMut<(u32, u32)>,
 ) -> KResult<usize> {
     if clock_id != CLOCK_REALTIME
         && clock_id != CLOCK_REALTIME_COARSE
@@ -90,7 +90,7 @@ fn clock_nanosleep(
     };
 
     let duration = Duration::from_secs(req.0 as u64) + Duration::from_nanos(req.1 as u64);
-    Task::block_on(sleep(duration));
+    sleep(duration).await;
 
     if let Some(rem) = rem {
         rem.write((0, 0))?;
@@ -100,16 +100,14 @@ fn clock_nanosleep(
 }
 
 #[eonix_macros::define_syscall(SYS_UMASK)]
-fn umask(mask: u32) -> KResult<u32> {
+async fn umask(mask: Mode) -> KResult<Mode> {
     let mut umask = thread.fs_context.umask.lock();
 
-    let old = *umask;
-    *umask = mask & 0o777;
-    Ok(old)
+    Ok(core::mem::replace(&mut *umask, mask.non_format()))
 }
 
 #[eonix_macros::define_syscall(SYS_GETCWD)]
-fn getcwd(buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+async fn getcwd(buffer: UserMut<u8>, bufsize: usize) -> KResult<usize> {
     let mut user_buffer = UserBuffer::new(buffer, bufsize)?;
     let mut buffer = PageBuffer::new();
 
@@ -122,7 +120,7 @@ fn getcwd(buffer: *mut u8, bufsize: usize) -> KResult<usize> {
 }
 
 #[eonix_macros::define_syscall(SYS_CHDIR)]
-fn chdir(path: *const u8) -> KResult<()> {
+async fn chdir(path: User<u8>) -> KResult<()> {
     let path = UserString::new(path)?;
     let path = Path::new(path.as_cstr().to_bytes())?;
 
@@ -140,7 +138,7 @@ fn chdir(path: *const u8) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_UMOUNT)]
-fn umount(source: *const u8) -> KResult<()> {
+async fn umount(source: User<u8>) -> KResult<()> {
     let source = UserString::new(source)?;
     if source.as_cstr().to_str().unwrap() == "./mnt" {
         return Ok(());
@@ -149,7 +147,7 @@ fn umount(source: *const u8) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_MOUNT)]
-fn mount(source: *const u8, target: *const u8, fstype: *const u8, flags: usize) -> KResult<()> {
+async fn mount(source: User<u8>, target: User<u8>, fstype: User<u8>, flags: usize) -> KResult<()> {
     let source = UserString::new(source)?;
     if source.as_cstr().to_str().unwrap() == "/dev/vda2" {
         return Ok(());
@@ -185,7 +183,7 @@ fn get_strings(mut ptr_strings: UserPointer<'_, PtrT>) -> KResult<Vec<CString>>
             break;
         }
 
-        let user_string = UserString::new(ptr.addr() as *const u8)?;
+        let user_string = UserString::new(User::with_addr(ptr.addr()))?;
         strings.push(user_string.as_cstr().to_owned());
         ptr_strings = ptr_strings.offset(1)?;
     }
@@ -194,7 +192,7 @@ fn get_strings(mut ptr_strings: UserPointer<'_, PtrT>) -> KResult<Vec<CString>>
 }
 
 #[eonix_macros::define_syscall(SYS_EXECVE)]
-fn execve(exec: *const u8, argv: *const PtrT, envp: *const PtrT) -> KResult<SyscallNoReturn> {
+async fn execve(exec: User<u8>, argv: User<PtrT>, envp: User<PtrT>) -> KResult<SyscallNoReturn> {
     let exec = UserString::new(exec)?;
     let exec = exec.as_cstr().to_owned();
 
@@ -208,11 +206,12 @@ fn execve(exec: *const u8, argv: *const PtrT, envp: *const PtrT) -> KResult<Sysc
 
     // TODO: When `execve` is called by one of the threads in a process, the other threads
     //       should be terminated and `execve` is performed in the thread group leader.
-    let load_info =
-        ProgramLoader::parse(&thread.fs_context, exec, dentry.clone(), argv, envp)?.load()?;
+    let load_info = ProgramLoader::parse(&thread.fs_context, exec, dentry.clone(), argv, envp)?
+        .load()
+        .await?;
 
     if let Some(robust_list) = thread.get_robust_list() {
-        let _ = Task::block_on(robust_list.wake_all());
+        let _ = robust_list.wake_all().await;
         thread.set_robust_list(None);
     }
 
@@ -221,7 +220,7 @@ fn execve(exec: *const u8, argv: *const PtrT, envp: *const PtrT) -> KResult<Sysc
         thread.process.mm_list.replace(Some(load_info.mm_list));
     }
 
-    thread.files.on_exec();
+    thread.files.on_exec().await;
     thread.signal_list.clear_non_ignore();
     thread.set_name(dentry.get_name());
 
@@ -237,37 +236,41 @@ fn execve(exec: *const u8, argv: *const PtrT, envp: *const PtrT) -> KResult<Sysc
 }
 
 #[eonix_macros::define_syscall(SYS_EXIT)]
-fn exit(status: u32) -> SyscallNoReturn {
+async fn exit(status: u32) -> SyscallNoReturn {
+    let mut procs = ProcessList::get().write().await;
+
     unsafe {
-        let mut procs = Task::block_on(ProcessList::get().write());
-        Task::block_on(procs.do_exit(&thread, WaitType::Exited(status), false));
+        procs
+            .do_exit(&thread, WaitType::Exited(status), false)
+            .await;
     }
 
     SyscallNoReturn
 }
 
 #[eonix_macros::define_syscall(SYS_EXIT_GROUP)]
-fn exit_group(status: u32) -> SyscallNoReturn {
+async fn exit_group(status: u32) -> SyscallNoReturn {
+    let mut procs = ProcessList::get().write().await;
+
     unsafe {
-        let mut procs = Task::block_on(ProcessList::get().write());
-        Task::block_on(procs.do_exit(&thread, WaitType::Exited(status), true));
+        procs.do_exit(&thread, WaitType::Exited(status), true).await;
     }
 
     SyscallNoReturn
 }
 
 enum WaitInfo {
-    SigInfo(NonNull<SigInfo>),
-    Status(NonNull<u32>),
+    SigInfo(UserMut<SigInfo>),
+    Status(UserMut<u32>),
     None,
 }
 
-fn do_waitid(
+async fn do_waitid(
     thread: &Thread,
     wait_id: WaitId,
     info: WaitInfo,
     options: u32,
-    rusage: *mut RUsage,
+    rusage: UserMut<RUsage>,
 ) -> KResult<u32> {
     if !rusage.is_null() {
         unimplemented!("waitid with rusage pointer");
@@ -278,12 +281,15 @@ fn do_waitid(
         Some(options) => options,
     };
 
-    let Some(wait_object) = Task::block_on(thread.process.wait(
-        wait_id,
-        options.contains(UserWaitOptions::WNOHANG),
-        options.contains(UserWaitOptions::WUNTRACED),
-        options.contains(UserWaitOptions::WCONTINUED),
-    ))?
+    let Some(wait_object) = thread
+        .process
+        .wait(
+            wait_id,
+            options.contains(UserWaitOptions::WNOHANG),
+            options.contains(UserWaitOptions::WUNTRACED),
+            options.contains(UserWaitOptions::WCONTINUED),
+        )
+        .await?
     else {
         return Ok(0);
     };
@@ -299,11 +305,11 @@ fn do_waitid(
             siginfo.si_status = status;
             siginfo.si_code = code;
 
-            UserPointerMut::new(siginfo_ptr.as_ptr())?.write(siginfo)?;
+            UserPointerMut::new(siginfo_ptr)?.write(siginfo)?;
             Ok(0)
         }
         WaitInfo::Status(status_ptr) => {
-            UserPointerMut::new(status_ptr.as_ptr())?.write(wait_object.code.to_wstatus())?;
+            UserPointerMut::new(status_ptr)?.write(wait_object.code.to_wstatus())?;
             Ok(wait_object.pid)
         }
         WaitInfo::None => Ok(wait_object.pid),
@@ -311,18 +317,16 @@ fn do_waitid(
 }
 
 #[eonix_macros::define_syscall(SYS_WAITID)]
-fn waitid(
+async fn waitid(
     id_type: u32,
     id: u32,
-    info: *mut SigInfo,
+    info: UserMut<SigInfo>,
     options: u32,
-    rusage: *mut RUsage,
+    rusage: UserMut<RUsage>,
 ) -> KResult<u32> {
     let wait_id = WaitId::from_type_and_id(id_type, id)?;
 
-    if let Some(info) = NonNull::new(info) {
-        do_waitid(thread, wait_id, WaitInfo::SigInfo(info), options, rusage)
-    } else {
+    if info.is_null() {
         /*
          * According to POSIX.1-2008, an application calling waitid() must
          * ensure that infop points to a siginfo_t structure (i.e., that it
@@ -333,34 +337,41 @@ fn waitid(
          */
         unimplemented!("waitid with null info pointer");
     }
+
+    do_waitid(thread, wait_id, WaitInfo::SigInfo(info), options, rusage).await
 }
 
 #[eonix_macros::define_syscall(SYS_WAIT4)]
-fn wait4(wait_id: i32, arg1: *mut u32, options: u32, rusage: *mut RUsage) -> KResult<u32> {
-    let waitinfo = if let Some(status) = NonNull::new(arg1) {
-        WaitInfo::Status(status)
-    } else {
+async fn wait4(
+    wait_id: i32,
+    arg1: UserMut<u32>,
+    options: u32,
+    rusage: UserMut<RUsage>,
+) -> KResult<u32> {
+    let waitinfo = if arg1.is_null() {
         WaitInfo::None
+    } else {
+        WaitInfo::Status(arg1)
     };
 
     let wait_id = WaitId::from_id(wait_id, thread);
 
-    do_waitid(thread, wait_id, waitinfo, options, rusage)
+    do_waitid(thread, wait_id, waitinfo, options, rusage).await
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_WAITPID)]
-fn waitpid(waitpid: i32, arg1: *mut u32, options: u32) -> KResult<u32> {
-    sys_wait4(thread, waitpid, arg1, options, core::ptr::null_mut())
+async fn waitpid(waitpid: i32, arg1: UserMut<u32>, options: u32) -> KResult<u32> {
+    sys_wait4(thread, waitpid, arg1, options, core::ptr::null_mut()).await
 }
 
 #[eonix_macros::define_syscall(SYS_SETSID)]
-fn setsid() -> KResult<u32> {
-    thread.process.setsid()
+async fn setsid() -> KResult<u32> {
+    thread.process.setsid().await
 }
 
 #[eonix_macros::define_syscall(SYS_SETPGID)]
-fn setpgid(pid: u32, pgid: i32) -> KResult<()> {
+async fn setpgid(pid: u32, pgid: i32) -> KResult<()> {
     let pid = if pid == 0 { thread.process.pid } else { pid };
 
     let pgid = match pgid {
@@ -369,15 +380,15 @@ fn setpgid(pid: u32, pgid: i32) -> KResult<()> {
         _ => return Err(EINVAL),
     };
 
-    thread.process.setpgid(pid, pgid)
+    thread.process.setpgid(pid, pgid).await
 }
 
 #[eonix_macros::define_syscall(SYS_GETSID)]
-fn getsid(pid: u32) -> KResult<u32> {
+async fn getsid(pid: u32) -> KResult<u32> {
     if pid == 0 {
         Ok(thread.process.session_rcu().sid)
     } else {
-        let procs = Task::block_on(ProcessList::get().read());
+        let procs = ProcessList::get().read().await;
         procs
             .try_find_process(pid)
             .map(|proc| proc.session(procs.prove()).sid)
@@ -386,11 +397,11 @@ fn getsid(pid: u32) -> KResult<u32> {
 }
 
 #[eonix_macros::define_syscall(SYS_GETPGID)]
-fn getpgid(pid: u32) -> KResult<u32> {
+async fn getpgid(pid: u32) -> KResult<u32> {
     if pid == 0 {
         Ok(thread.process.pgroup_rcu().pgid)
     } else {
-        let procs = Task::block_on(ProcessList::get().read());
+        let procs = ProcessList::get().read().await;
         procs
             .try_find_process(pid)
             .map(|proc| proc.pgroup(procs.prove()).pgid)
@@ -399,12 +410,12 @@ fn getpgid(pid: u32) -> KResult<u32> {
 }
 
 #[eonix_macros::define_syscall(SYS_GETPID)]
-fn getpid() -> KResult<u32> {
+async fn getpid() -> KResult<u32> {
     Ok(thread.process.pid)
 }
 
 #[eonix_macros::define_syscall(SYS_GETPPID)]
-fn getppid() -> KResult<u32> {
+async fn getppid() -> KResult<u32> {
     Ok(thread.process.parent_rcu().map_or(0, |x| x.pid))
 }
 
@@ -420,78 +431,61 @@ fn do_getuid(_thread: &Thread) -> KResult<u32> {
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_GETUID32)]
-fn getuid32() -> KResult<u32> {
+async fn getuid32() -> KResult<u32> {
     do_getuid(thread)
 }
 
 #[eonix_macros::define_syscall(SYS_GETUID)]
-fn getuid() -> KResult<u32> {
+async fn getuid() -> KResult<u32> {
     do_getuid(thread)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_GETEUID32)]
-fn geteuid32() -> KResult<u32> {
+async fn geteuid32() -> KResult<u32> {
     do_geteuid(thread)
 }
 
 #[eonix_macros::define_syscall(SYS_GETEUID)]
-fn geteuid() -> KResult<u32> {
+async fn geteuid() -> KResult<u32> {
     do_geteuid(thread)
 }
 
 #[eonix_macros::define_syscall(SYS_GETEGID)]
-fn getegid() -> KResult<u32> {
+async fn getegid() -> KResult<u32> {
     // All users are root for now.
     Ok(0)
 }
 
 #[eonix_macros::define_syscall(SYS_GETGID)]
-fn getgid() -> KResult<u32> {
-    sys_getegid(thread)
+async fn getgid() -> KResult<u32> {
+    sys_getegid(thread).await
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_GETGID32)]
-fn getgid32() -> KResult<u32> {
-    sys_getegid(thread)
-}
-
-#[eonix_macros::define_syscall(SYS_GETRANDOM)]
-fn getrandom(buf: *mut u8, buflen: usize, _flags: u32) -> isize {
-    if buf.is_null() || buflen == 0 {
-        return -14;
-    }
-
-    static mut SEED: u64 = 1;
-    unsafe {
-        for i in 0..buflen {
-            SEED = SEED.wrapping_mul(1103515245).wrapping_add(12345);
-            *buf.add(i) = (SEED >> 8) as u8;
-        }
-    }
-
-    buflen as isize
+async fn getgid32() -> KResult<u32> {
+    sys_getegid(thread).await
 }
 
 #[eonix_macros::define_syscall(SYS_SCHED_YIELD)]
-fn sched_yield() -> KResult<()> {
-    Task::block_on(yield_now());
+async fn sched_yield() -> KResult<()> {
+    yield_now().await;
     Ok(())
 }
 
 #[eonix_macros::define_syscall(SYS_SYNC)]
-fn sync() -> KResult<()> {
+async fn sync() -> KResult<()> {
     Ok(())
 }
 
 #[eonix_macros::define_syscall(SYS_FSYNC)]
-fn fsync() -> KResult<()> {
+async fn fsync() -> KResult<()> {
     Ok(())
 }
 
 #[eonix_macros::define_syscall(SYS_GETTID)]
-fn gettid() -> KResult<u32> {
+async fn gettid() -> KResult<u32> {
     Ok(thread.tid)
 }
 
@@ -531,7 +525,7 @@ pub fn parse_user_tls(arch_tls: usize) -> KResult<UserTLS> {
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_SET_THREAD_AREA)]
-fn set_thread_area(arch_tls: usize) -> KResult<()> {
+async fn set_thread_area(arch_tls: usize) -> KResult<()> {
     thread.set_user_tls(parse_user_tls(arch_tls)?)?;
 
     // SAFETY: Preemption is disabled on calling `load_thread_area32()`.
@@ -545,16 +539,16 @@ fn set_thread_area(arch_tls: usize) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_SET_TID_ADDRESS)]
-fn set_tid_address(tidptr: usize) -> KResult<u32> {
+async fn set_tid_address(tidptr: UserMut<u32>) -> KResult<u32> {
     thread.clear_child_tid(Some(tidptr));
     Ok(thread.tid)
 }
 
 #[eonix_macros::define_syscall(SYS_PRCTL)]
-fn prctl(option: u32, arg2: usize) -> KResult<()> {
+async fn prctl(option: u32, arg2: PtrT) -> KResult<()> {
     match option {
         PR_SET_NAME => {
-            let name = UserPointer::new(arg2 as *mut [u8; 16])?.read()?;
+            let name = UserPointer::<[u8; 16]>::new(User::with_addr(arg2.addr()))?.read()?;
             let len = name.iter().position(|&c| c == 0).unwrap_or(15);
             thread.set_name(name[..len].into());
             Ok(())
@@ -563,7 +557,7 @@ fn prctl(option: u32, arg2: usize) -> KResult<()> {
             let name = thread.get_name();
             let len = name.len().min(15);
             let name: [u8; 16] = core::array::from_fn(|i| if i < len { name[i] } else { 0 });
-            UserPointerMut::new(arg2 as *mut [u8; 16])?.write(name)?;
+            UserPointerMut::<[u8; 16]>::new(UserMut::with_addr(arg2.addr()))?.write(name)?;
             Ok(())
         }
         _ => Err(EINVAL),
@@ -571,8 +565,8 @@ fn prctl(option: u32, arg2: usize) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_KILL)]
-fn kill(pid: i32, sig: u32) -> KResult<()> {
-    let procs = Task::block_on(ProcessList::get().read());
+async fn kill(pid: i32, sig: u32) -> KResult<()> {
+    let procs = ProcessList::get().read().await;
     match pid {
         // Send signal to every process for which the calling process has
         // permission to send signals.
@@ -598,8 +592,10 @@ fn kill(pid: i32, sig: u32) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_TKILL)]
-fn tkill(tid: u32, sig: u32) -> KResult<()> {
-    Task::block_on(ProcessList::get().read())
+async fn tkill(tid: u32, sig: u32) -> KResult<()> {
+    ProcessList::get()
+        .read()
+        .await
         .try_find_thread(tid)
         .ok_or(ESRCH)?
         .raise(Signal::try_from_raw(sig)?);
@@ -607,8 +603,8 @@ fn tkill(tid: u32, sig: u32) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_TGKILL)]
-fn tgkill(tgid: u32, tid: u32, sig: u32) -> KResult<()> {
-    let procs = Task::block_on(ProcessList::get().read());
+async fn tgkill(tgid: u32, tid: u32, sig: u32) -> KResult<()> {
+    let procs = ProcessList::get().read().await;
 
     let thread_to_kill = procs.try_find_thread(tid).ok_or(ESRCH)?;
     if thread_to_kill.process.pid != tgid {
@@ -620,10 +616,10 @@ fn tgkill(tgid: u32, tid: u32, sig: u32) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_RT_SIGPROCMASK)]
-fn rt_sigprocmask(
+async fn rt_sigprocmask(
     how: u32,
-    set: *mut SigSet,
-    oldset: *mut SigSet,
+    set: UserMut<SigSet>,
+    oldset: UserMut<SigSet>,
     sigsetsize: usize,
 ) -> KResult<()> {
     if sigsetsize != size_of::<SigSet>() {
@@ -636,7 +632,7 @@ fn rt_sigprocmask(
     }
 
     let new_mask = if !set.is_null() {
-        UserPointer::new(set)?.read()?
+        UserPointer::new(set.as_const())?.read()?
     } else {
         return Ok(());
     };
@@ -658,27 +654,21 @@ struct TimeSpec32 {
     tv_nsec: i32,
 }
 
-impl TimeSpec32 {
-    fn to_duration(&self) -> Duration {
-        Duration::new(self.tv_sec as u64, self.tv_nsec as u32)
-    }
-}
-
 #[eonix_macros::define_syscall(SYS_RT_SIGTIMEDWAIT_TIME32)]
-fn rt_sigtimedwait_time32(
-    _uthese: *const SigSet,
-    _uinfo: *mut SigInfo,
-    _uts: *const TimeSpec32,
+async fn rt_sigtimedwait_time32(
+    _uthese: User<SigSet>,
+    _uinfo: UserMut<SigInfo>,
+    _uts: User<TimeSpec32>,
 ) -> KResult<i32> {
     // TODO
     Ok(0)
 }
 
 #[eonix_macros::define_syscall(SYS_RT_SIGACTION)]
-fn rt_sigaction(
+async fn rt_sigaction(
     signum: u32,
-    act: *const SigAction,
-    oldact: *mut SigAction,
+    act: User<SigAction>,
+    oldact: UserMut<SigAction>,
     sigsetsize: usize,
 ) -> KResult<()> {
     let signal = Signal::try_from_raw(signum)?;
@@ -707,11 +697,11 @@ fn rt_sigaction(
 }
 
 #[eonix_macros::define_syscall(SYS_PRLIMIT64)]
-fn prlimit64(
+async fn prlimit64(
     pid: u32,
     resource: u32,
-    new_limit: *const RLimit,
-    old_limit: *mut RLimit,
+    new_limit: User<RLimit>,
+    old_limit: UserMut<RLimit>,
 ) -> KResult<()> {
     if pid != 0 {
         return Err(ENOSYS);
@@ -743,13 +733,13 @@ fn prlimit64(
 }
 
 #[eonix_macros::define_syscall(SYS_GETRLIMIT)]
-fn getrlimit(resource: u32, rlimit: *mut RLimit) -> KResult<()> {
-    sys_prlimit64(thread, 0, resource, core::ptr::null(), rlimit)
+async fn getrlimit(resource: u32, rlimit: UserMut<RLimit>) -> KResult<()> {
+    sys_prlimit64(thread, 0, resource, User::null(), rlimit).await
 }
 
 #[eonix_macros::define_syscall(SYS_SETRLIMIT)]
-fn setrlimit(resource: u32, rlimit: *const RLimit) -> KResult<()> {
-    sys_prlimit64(thread, 0, resource, rlimit, core::ptr::null_mut())
+async fn setrlimit(resource: u32, rlimit: User<RLimit>) -> KResult<()> {
+    sys_prlimit64(thread, 0, resource, rlimit, UserMut::null()).await
 }
 
 #[repr(C)]
@@ -774,7 +764,7 @@ struct RUsage {
 }
 
 #[eonix_macros::define_syscall(SYS_GETRUSAGE)]
-fn getrusage(who: u32, rusage: *mut RUsage) -> KResult<()> {
+async fn getrusage(who: u32, rusage: UserMut<RUsage>) -> KResult<()> {
     if who != 0 {
         return Err(ENOSYS);
     }
@@ -804,52 +794,52 @@ fn getrusage(who: u32, rusage: *mut RUsage) -> KResult<()> {
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_VFORK)]
-fn vfork() -> KResult<u32> {
+async fn vfork() -> KResult<u32> {
     let clone_args = CloneArgs::for_vfork();
 
-    do_clone(thread, clone_args)
+    do_clone(thread, clone_args).await
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_FORK)]
-fn fork() -> KResult<u32> {
+async fn fork() -> KResult<u32> {
     let clone_args = CloneArgs::for_fork();
 
-    do_clone(thread, clone_args)
+    do_clone(thread, clone_args).await
 }
 
 // Some old platforms including x86_32, riscv and arm have the last two arguments
 // swapped, so we need to define two versions of `clone` syscall.
 #[cfg(not(target_arch = "loongarch64"))]
 #[eonix_macros::define_syscall(SYS_CLONE)]
-fn clone(
+async fn clone(
     clone_flags: usize,
     new_sp: usize,
-    parent_tidptr: usize,
+    parent_tidptr: UserMut<u32>,
     tls: usize,
-    child_tidptr: usize,
+    child_tidptr: UserMut<u32>,
 ) -> KResult<u32> {
     let clone_args = CloneArgs::for_clone(clone_flags, new_sp, child_tidptr, parent_tidptr, tls)?;
 
-    do_clone(thread, clone_args)
+    do_clone(thread, clone_args).await
 }
 
 #[cfg(target_arch = "loongarch64")]
 #[eonix_macros::define_syscall(SYS_CLONE)]
-fn clone(
+async fn clone(
     clone_flags: usize,
     new_sp: usize,
-    parent_tidptr: usize,
-    child_tidptr: usize,
+    parent_tidptr: UserMut<u32>,
+    child_tidptr: UserMut<u32>,
     tls: usize,
 ) -> KResult<u32> {
     let clone_args = CloneArgs::for_clone(clone_flags, new_sp, child_tidptr, parent_tidptr, tls)?;
 
-    do_clone(thread, clone_args)
+    do_clone(thread, clone_args).await
 }
 
 #[eonix_macros::define_syscall(SYS_FUTEX)]
-fn futex(
+async fn futex(
     uaddr: usize,
     op: u32,
     val: u32,
@@ -867,11 +857,11 @@ fn futex(
 
     match futex_op {
         FutexOp::FUTEX_WAIT => {
-            Task::block_on(futex_wait(uaddr, pid, val as u32, None))?;
+            futex_wait(uaddr, pid, val as u32, None).await?;
             return Ok(0);
         }
         FutexOp::FUTEX_WAKE => {
-            return Task::block_on(futex_wake(uaddr, pid, val as u32));
+            return futex_wake(uaddr, pid, val as u32).await;
         }
         FutexOp::FUTEX_REQUEUE => {
             todo!()
@@ -883,60 +873,56 @@ fn futex(
 }
 
 #[eonix_macros::define_syscall(SYS_SET_ROBUST_LIST)]
-fn set_robust_list(head: usize, len: usize) -> KResult<()> {
+async fn set_robust_list(head: User<RobustListHead>, len: usize) -> KResult<()> {
     if len != size_of::<RobustListHead>() {
         return Err(EINVAL);
     }
 
-    thread.set_robust_list(Some(VAddr::from(head)));
+    thread.set_robust_list(Some(head));
     Ok(())
 }
 
 #[eonix_macros::define_syscall(SYS_RT_SIGRETURN)]
-fn rt_sigreturn() -> KResult<SyscallNoReturn> {
-    thread
-        .signal_list
-        .restore(
-            &mut thread.trap_ctx.borrow(),
-            &mut thread.fpu_state.borrow(),
-            false,
-        )
-        .inspect_err(|err| {
-            println_warn!(
-                "`rt_sigreturn` failed in thread {} with error {err}!",
-                thread.tid
-            );
-            Task::block_on(thread.force_kill(Signal::SIGSEGV));
-        })?;
+async fn rt_sigreturn() -> KResult<SyscallNoReturn> {
+    if let Err(err) = thread.signal_list.restore(
+        &mut thread.trap_ctx.borrow(),
+        &mut thread.fpu_state.borrow(),
+        false,
+    ) {
+        println_warn!(
+            "`rt_sigreturn` failed in thread {} with error {err}!",
+            thread.tid
+        );
+        thread.force_kill(Signal::SIGSEGV).await;
+        return Err(err);
+    }
 
     Ok(SyscallNoReturn)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_SIGRETURN)]
-fn sigreturn() -> KResult<SyscallNoReturn> {
-    thread
-        .signal_list
-        .restore(
-            &mut thread.trap_ctx.borrow(),
-            &mut thread.fpu_state.borrow(),
-            true,
-        )
-        .inspect_err(|err| {
-            println_warn!(
-                "`sigreturn` failed in thread {} with error {err}!",
-                thread.tid
-            );
-            Task::block_on(thread.force_kill(Signal::SIGSEGV));
-        })?;
+async fn sigreturn() -> KResult<SyscallNoReturn> {
+    if let Err(err) = thread.signal_list.restore(
+        &mut thread.trap_ctx.borrow(),
+        &mut thread.fpu_state.borrow(),
+        true,
+    ) {
+        println_warn!(
+            "`sigreturn` failed in thread {} with error {err}!",
+            thread.tid
+        );
+        thread.force_kill(Signal::SIGSEGV).await;
+        return Err(err);
+    }
 
     Ok(SyscallNoReturn)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_ARCH_PRCTL)]
-fn arch_prctl(option: u32, addr: u32) -> KResult<u32> {
-    sys_arch_prctl(thread, option, addr)
+async fn arch_prctl(option: u32, addr: u32) -> KResult<u32> {
+    sys_arch_prctl(thread, option, addr).await
 }
 
 pub fn keep_alive() {}

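The rt_sigreturn/sigreturn hunks above swap the `.inspect_err(|err| { ... Task::block_on(...) ... })` combinator for an explicit `if let Err`. One reason that shape is forced once the cleanup itself is async: `.await` cannot appear inside a plain closure. A simplified sketch with stand-in `restore`/`force_kill`, not the kernel's real signatures:

```rust
async fn force_kill() {
    // Stand-in for the real cleanup that now has to be awaited
    // (the actual code raises SIGSEGV on the thread).
}

fn restore() -> Result<(), &'static str> {
    Err("bad signal frame")
}

// Does not compile: `.await` is only legal directly inside an async fn or
// async block, not inside the `inspect_err` closure.
//
//   async fn sigreturn_bad() -> Result<(), &'static str> {
//       restore().inspect_err(|_err| {
//           force_kill().await;
//       })
//   }

async fn sigreturn() -> Result<(), &'static str> {
    if let Err(err) = restore() {
        force_kill().await; // cleanup can be awaited here
        return Err(err);
    }
    Ok(())
}
```

Behaviour stays the same (warn, force-kill, propagate the error); the control flow is only flattened so the `.await` has somewhere legal to live.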
+ 9 - 8
src/kernel/syscall/sysinfo.rs

@@ -2,6 +2,7 @@ use crate::{
     io::Buffer as _,
     kernel::{
         constants::{CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_REALTIME_COARSE, EINTR, EINVAL},
+        syscall::UserMut,
         task::Thread,
         timer::{Instant, Ticks},
         user::{UserBuffer, UserPointerMut},
@@ -30,7 +31,7 @@ fn copy_cstr_to_array(cstr: &[u8], array: &mut [u8]) {
 }
 
 #[eonix_macros::define_syscall(SYS_NEWUNAME)]
-fn newuname(buffer: *mut NewUTSName) -> KResult<()> {
+async fn newuname(buffer: UserMut<NewUTSName>) -> KResult<()> {
     let buffer = UserPointerMut::new(buffer)?;
     let mut uname = NewUTSName {
         sysname: [0; 65],
@@ -62,7 +63,7 @@ fn newuname(buffer: *mut NewUTSName) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_GETTIMEOFDAY)]
-fn gettimeofday(timeval: *mut TimeVal, timezone: *mut ()) -> KResult<()> {
+async fn gettimeofday(timeval: UserMut<TimeVal>, timezone: UserMut<()>) -> KResult<()> {
     if !timezone.is_null() {
         return Err(EINVAL);
     }
@@ -81,7 +82,7 @@ fn gettimeofday(timeval: *mut TimeVal, timezone: *mut ()) -> KResult<()> {
     Ok(())
 }
 
-fn do_clock_gettime64(_thread: &Thread, clock_id: u32, timespec: *mut TimeSpec) -> KResult<()> {
+fn do_clock_gettime64(_thread: &Thread, clock_id: u32, timespec: UserMut<TimeSpec>) -> KResult<()> {
     let timespec = UserPointerMut::new(timespec)?;
 
     match clock_id {
@@ -106,13 +107,13 @@ fn do_clock_gettime64(_thread: &Thread, clock_id: u32, timespec: *mut TimeSpec)
 
 #[cfg(not(target_arch = "x86_64"))]
 #[eonix_macros::define_syscall(SYS_CLOCK_GETTIME)]
-fn clock_gettime(clock_id: u32, timespec: *mut TimeSpec) -> KResult<()> {
+async fn clock_gettime(clock_id: u32, timespec: UserMut<TimeSpec>) -> KResult<()> {
     do_clock_gettime64(thread, clock_id, timespec)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_CLOCK_GETTIME64)]
-fn clock_gettime64(clock_id: u32, timespec: *mut TimeSpec) -> KResult<()> {
+async fn clock_gettime64(clock_id: u32, timespec: UserMut<TimeSpec>) -> KResult<()> {
     do_clock_gettime64(thread, clock_id, timespec)
 }
 
@@ -135,7 +136,7 @@ struct Sysinfo {
 }
 
 #[eonix_macros::define_syscall(SYS_SYSINFO)]
-fn sysinfo(info: *mut Sysinfo) -> KResult<()> {
+async fn sysinfo(info: UserMut<Sysinfo>) -> KResult<()> {
     let info = UserPointerMut::new(info)?;
     info.write(Sysinfo {
         uptime: Ticks::since_boot().as_secs() as u32,
@@ -164,7 +165,7 @@ struct TMS {
 }
 
 #[eonix_macros::define_syscall(SYS_TIMES)]
-fn times(tms: *mut TMS) -> KResult<()> {
+async fn times(tms: UserMut<TMS>) -> KResult<()> {
     let tms = UserPointerMut::new(tms)?;
     tms.write(TMS {
         tms_utime: 0,
@@ -175,7 +176,7 @@ fn times(tms: *mut TMS) -> KResult<()> {
 }
 
 #[eonix_macros::define_syscall(SYS_GETRANDOM)]
-fn get_random(buf: *mut u8, len: usize, flags: u32) -> KResult<usize> {
+async fn get_random(buf: UserMut<u8>, len: usize, flags: u32) -> KResult<usize> {
     if flags != 0 {
         return Err(EINVAL);
     }

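The task.rs hunk below adds a `block_on` that simply polls the future in a loop with a no-op waker, together with a `stackful` helper for running such work preemptively. A minimal standalone sketch of that polling loop (same shape as the added code, not its exact API):

```rust
use core::future::Future;
use core::pin::pin;
use core::task::{Context, Poll, Waker};

// Poll the future to completion on the current thread. Nothing ever yields,
// so this must not be called from a context the future itself depends on to
// make progress (the deadlock the new doc comment warns about).
fn block_on<F: Future>(future: F) -> F::Output {
    let mut future = pin!(future);
    let waker = Waker::noop();
    let mut cx = Context::from_waker(waker);

    loop {
        if let Poll::Ready(output) = future.as_mut().poll(&mut cx) {
            return output;
        }
    }
}

fn main() {
    // An already-ready future completes on the first poll.
    assert_eq!(block_on(async { 21 * 2 }), 42);
}
```

Because the loop never yields the CPU, the accompanying doc comment steers callers toward `stackful` or `RUNTIME.spawn` whenever the work is long-running or itself async.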
+ 209 - 1
src/kernel/task.rs

@@ -18,4 +18,212 @@ pub use process_group::ProcessGroup;
 pub use process_list::ProcessList;
 pub use session::Session;
 pub use signal::SignalAction;
-pub use thread::{new_thread_runnable, yield_now, Thread, ThreadBuilder};
+pub use thread::{yield_now, Thread, ThreadAlloc, ThreadBuilder};
+
+fn do_block_on<F>(mut future: core::pin::Pin<&mut F>) -> F::Output
+where
+    F: core::future::Future,
+{
+    let waker = core::task::Waker::noop();
+    let mut cx = core::task::Context::from_waker(&waker);
+
+    loop {
+        match future.as_mut().poll(&mut cx) {
+            core::task::Poll::Ready(output) => return output,
+            core::task::Poll::Pending => {}
+        }
+    }
+}
+
+/// Constantly poll the given future until it is ready, blocking the current thread.
+///
+/// # Warning
+/// This function will block the current thread and should not be used in async
+/// contexts as it might cause infinite blocking or deadlocks. The following is
+/// a bad example:
+///
+/// ```ignore
+/// block_on(async {
+///     // This will block the current thread forever.
+///     loop {
+///         println_debug!("This will never end!");
+///     }
+/// });
+///
+/// // The code below will never be reached.
+/// println_debug!("You'll never see this message!");
+/// ```
+///
+/// Use [`stackful`] instead to run async (or computational) code in a separate
+/// stackful (and preemptive) context, or use `RUNTIME.spawn` to run async code
+/// on the runtime's executor.
+pub fn block_on<F>(future: F) -> F::Output
+where
+    F: core::future::Future,
+{
+    do_block_on(core::pin::pin!(future))
+}
+
+/// Run the given future in a stackful context, allowing it to be preempted by
+/// timer interrupts.
+///
+/// ```ignore
+/// RUNTIME.spawn(stackful(async {
+///     // Some simulated computation heavy task.
+///     loop {
+///         println_debug!("Hello from stackful future!");
+///     }
+/// }));
+/// ```
+pub async fn stackful<F>(mut future: F) -> F::Output
+where
+    F: core::future::Future,
+{
+    use crate::kernel::{
+        interrupt::{default_fault_handler, default_irq_handler},
+        timer::{should_reschedule, timer_interrupt},
+    };
+    use alloc::sync::Arc;
+    use alloc::task::Wake;
+    use core::cell::UnsafeCell;
+    use core::future::Future;
+    use core::pin::Pin;
+    use core::ptr::NonNull;
+    use core::sync::atomic::AtomicBool;
+    use core::sync::atomic::Ordering;
+    use core::task::Context;
+    use core::task::Poll;
+    use core::task::Waker;
+    use eonix_hal::traits::trap::RawTrapContext;
+    use eonix_hal::traits::trap::TrapReturn;
+    use eonix_hal::traits::trap::TrapType;
+    use eonix_hal::trap::TrapContext;
+    use eonix_preempt::assert_preempt_enabled;
+    use eonix_runtime::executor::Stack;
+    use eonix_runtime::task::Task;
+    use thread::wait_for_wakeups;
+
+    let stack = KernelStack::new();
+
+    fn execute<F>(mut future: Pin<&mut F>, output_ptr: NonNull<Option<F::Output>>) -> !
+    where
+        F: Future,
+    {
+        struct WakeSaver {
+            task: Arc<Task>,
+            woken: AtomicBool,
+        }
+
+        impl Wake for WakeSaver {
+            fn wake_by_ref(self: &Arc<Self>) {
+                // SAFETY: If we read true below in the loop, we must have been
+                //         woken up and acquired our waker's work by the runtime.
+                self.woken.store(true, Ordering::Relaxed);
+                self.task.wake_by_ref();
+            }
+
+            fn wake(self: Arc<Self>) {
+                self.wake_by_ref();
+            }
+        }
+
+        let wake_saver = Arc::new(WakeSaver {
+            task: Task::current().clone(),
+            woken: AtomicBool::new(false),
+        });
+        let waker = Waker::from(wake_saver.clone());
+        let mut cx = Context::from_waker(&waker);
+
+        let output = loop {
+            match future.as_mut().poll(&mut cx) {
+                Poll::Ready(output) => break output,
+                Poll::Pending => {
+                    assert_preempt_enabled!("Blocking in stackful futures is not allowed.");
+
+                    if Task::current().is_ready() {
+                        continue;
+                    }
+
+                    // SAFETY: The runtime must have ensured that we can see the
+                    //         work done by the waker.
+                    if wake_saver.woken.swap(false, Ordering::Relaxed) {
+                        continue;
+                    }
+
+                    unsafe {
+                        #[cfg(target_arch = "riscv64")]
+                        core::arch::asm!("ebreak");
+
+                        #[cfg(target_arch = "loongarch64")]
+                        core::arch::asm!("break 1");
+                    }
+                }
+            }
+        };
+
+        drop(cx);
+        drop(waker);
+        drop(wake_saver);
+
+        unsafe {
+            output_ptr.write(Some(output));
+        }
+
+        unsafe {
+            #[cfg(target_arch = "riscv64")]
+            core::arch::asm!("ebreak");
+
+            #[cfg(target_arch = "loongarch64")]
+            core::arch::asm!("break 1");
+        }
+
+        unreachable!()
+    }
+
+    let sp = stack.get_bottom();
+    let mut output = UnsafeCell::new(None);
+
+    let mut trap_ctx = TrapContext::new();
+
+    trap_ctx.set_user_mode(false);
+    trap_ctx.set_interrupt_enabled(true);
+    let _ = trap_ctx.set_user_call_frame(
+        execute::<F> as usize,
+        Some(sp.addr().get()),
+        None,
+        &[(&raw mut future) as usize, output.get() as usize],
+        |_, _| Ok::<(), u32>(()),
+    );
+
+    loop {
+        unsafe {
+            trap_ctx.trap_return();
+        }
+
+        match trap_ctx.trap_type() {
+            TrapType::Syscall { .. } => {}
+            TrapType::Fault(fault) => default_fault_handler(fault, &mut trap_ctx),
+            TrapType::Irq { callback } => callback(default_irq_handler),
+            TrapType::Timer { callback } => {
+                callback(timer_interrupt);
+
+                if eonix_preempt::count() == 0 && should_reschedule() {
+                    yield_now().await;
+                }
+            }
+            TrapType::Breakpoint => {
+                if let Some(output) = output.get_mut().take() {
+                    break output;
+                } else {
+                    wait_for_wakeups().await;
+                }
+
+                #[cfg(target_arch = "riscv64")]
+                trap_ctx.set_program_counter(trap_ctx.get_program_counter() + 2);
+
+                #[cfg(target_arch = "loongarch64")]
+                trap_ctx.set_program_counter(trap_ctx.get_program_counter() + 4);
+            }
+        }
+    }
+}

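Taken together: `block_on` busy-polls a future with a no-op waker and is only meant for non-async contexts, while `stackful` moves the future onto its own `KernelStack` behind a kernel-mode trap context, letting timer interrupts preempt it and using a breakpoint trap to hop back into the async caller when the future parks or completes. A short usage sketch, reusing only names that already appear in the doc comments above (`RUNTIME`, `println_debug!`):

```rust
// Sketch only: how the two helpers are intended to be combined.
fn spawn_background_work() {
    // Cheap, already-ready futures can be driven synchronously; `block_on`
    // just polls in a loop with a no-op waker until it sees `Poll::Ready`.
    let answer = block_on(async { 40 + 2 });
    assert_eq!(answer, 42);

    // CPU-heavy or long-running work goes through `stackful`, so the timer
    // interrupt can preempt it instead of it starving the executor.
    RUNTIME.spawn(stackful(async {
        loop {
            println_debug!("Hello from stackful future!");
        }
    }));
}
```
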
+ 13 - 15
src/kernel/task/clone.rs

@@ -1,10 +1,7 @@
 use crate::{
     kernel::{
-        syscall::procops::parse_user_tls,
-        task::{
-            alloc_pid, new_thread_runnable, KernelStack, ProcessBuilder, ProcessList, Thread,
-            ThreadBuilder,
-        },
+        syscall::{procops::parse_user_tls, UserMut},
+        task::{alloc_pid, ProcessBuilder, ProcessList, Thread, ThreadBuilder},
         user::UserPointerMut,
     },
     KResult,
@@ -12,7 +9,7 @@ use crate::{
 use bitflags::bitflags;
 use core::num::NonZero;
 use eonix_hal::processor::UserTLS;
-use eonix_runtime::{scheduler::Scheduler, task::Task};
+use eonix_runtime::scheduler::RUNTIME;
 use eonix_sync::AsProof;
 use posix_types::signal::Signal;
 
@@ -51,9 +48,9 @@ pub struct CloneArgs {
     pub flags: CloneFlags,
     pub sp: Option<NonZero<usize>>, // Stack pointer for the new thread.
     pub exit_signal: Option<Signal>, // Signal to send to the parent on exit.
-    pub set_tid_ptr: Option<usize>, // Pointer to set child TID in user space.
-    pub clear_tid_ptr: Option<usize>, // Pointer to clear child TID in user space.
-    pub parent_tid_ptr: Option<usize>, // Pointer to parent TID in user space.
+    pub set_tid_ptr: Option<UserMut<u32>>, // Pointer to set child TID in user space.
+    pub clear_tid_ptr: Option<UserMut<u32>>, // Pointer to clear child TID in user space.
+    pub parent_tid_ptr: Option<UserMut<u32>>, // Pointer to parent TID in user space.
     pub tls: Option<UserTLS>,       // Pointer to TLS information.
 }
 
@@ -63,8 +60,8 @@ impl CloneArgs {
     pub fn for_clone(
         flags: usize,
         sp: usize,
-        child_tid_ptr: usize,
-        parent_tid_ptr: usize,
+        child_tid_ptr: UserMut<u32>,
+        parent_tid_ptr: UserMut<u32>,
         tls: usize,
     ) -> KResult<Self> {
         let clone_flags = CloneFlags::from_bits_truncate(flags & !Self::MASK);
@@ -133,8 +130,8 @@ impl CloneArgs {
     }
 }
 
-pub fn do_clone(thread: &Thread, clone_args: CloneArgs) -> KResult<u32> {
-    let mut procs = Task::block_on(ProcessList::get().write());
+pub async fn do_clone(thread: &Thread, clone_args: CloneArgs) -> KResult<u32> {
+    let mut procs = ProcessList::get().write().await;
 
     let thread_builder = ThreadBuilder::new().clone_from(&thread, &clone_args)?;
     let current_process = thread.process.clone();
@@ -154,6 +151,7 @@ pub fn do_clone(thread: &Thread, clone_args: CloneArgs) -> KResult<u32> {
 
         let (new_thread, _) = ProcessBuilder::new()
             .clone_from(current_process, &clone_args)
+            .await
             .pid(new_pid)
             .pgroup(current_pgroup)
             .session(current_session)
@@ -163,10 +161,10 @@ pub fn do_clone(thread: &Thread, clone_args: CloneArgs) -> KResult<u32> {
     };
 
     if let Some(parent_tid_ptr) = clone_args.parent_tid_ptr {
-        UserPointerMut::new(parent_tid_ptr as *mut u32)?.write(new_pid)?
+        UserPointerMut::new(parent_tid_ptr)?.write(new_pid)?
     }
 
-    Scheduler::get().spawn::<KernelStack, _>(new_thread_runnable(new_thread));
+    RUNTIME.spawn(new_thread.run());
 
     Ok(new_pid)
 }

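With `do_clone` now async and `CloneArgs` carrying typed `UserMut<u32>` TID pointers, the syscall layer simply awaits it. The real handler lives in src/kernel/syscall/procops.rs and is not part of this hunk; the sketch below only illustrates the calling convention, with the argument order assumed and `thread` being the implicit parameter that `define_syscall` provides (as in the handlers shown earlier).

```rust
// Illustrative only; the actual SYS_CLONE handler may order the
// architecture-specific arguments differently.
#[eonix_macros::define_syscall(SYS_CLONE)]
async fn clone(
    flags: usize,
    sp: usize,
    parent_tid_ptr: UserMut<u32>,
    tls: usize,
    child_tid_ptr: UserMut<u32>,
) -> KResult<u32> {
    let clone_args = CloneArgs::for_clone(flags, sp, child_tid_ptr, parent_tid_ptr, tls)?;
    do_clone(thread, clone_args).await
}
```
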
+ 7 - 6
src/kernel/task/futex.rs

@@ -9,6 +9,7 @@ use intrusive_collections::{intrusive_adapter, KeyAdapter, RBTree, RBTreeAtomicL
 use crate::{
     kernel::{
         constants::{EAGAIN, EINVAL},
+        syscall::User,
         user::UserPointer,
     },
     prelude::KResult,
@@ -174,7 +175,7 @@ pub async fn futex_wait(
         let (_, futex_bucket_ref) = FUTEX_TABLE.get_bucket(&futex_key);
         let mut futex_bucket = futex_bucket_ref.lock().await;
 
-        let val = UserPointer::new(uaddr as *const u32)?.read()?;
+        let val = UserPointer::new(User::<u32>::with_addr(uaddr))?.read()?;
 
         if val != expected_val {
             return Err(EAGAIN);
@@ -238,20 +239,20 @@ async fn futex_requeue(
     pid: Option<u32>,
     wake_count: u32,
     requeue_uaddr: usize,
-    requeue_count: u32,
+    _requeue_count: u32,
 ) -> KResult<usize> {
     let futex_key = FutexKey::new(uaddr, pid);
     let futex_requeue_key = FutexKey::new(requeue_uaddr, pid);
 
-    let (bucket_idx0, bucket_ref0) = FUTEX_TABLE.get_bucket(&futex_key);
-    let (bucket_idx1, bucket_ref1) = FUTEX_TABLE.get_bucket(&futex_requeue_key);
+    let (bucket_idx0, _bucket_ref0) = FUTEX_TABLE.get_bucket(&futex_key);
+    let (bucket_idx1, _bucket_ref1) = FUTEX_TABLE.get_bucket(&futex_requeue_key);
 
     if bucket_idx0 == bucket_idx1 {
         // If the keys are the same, we can just wake up the waiters.
         return futex_wake(uaddr, pid, wake_count).await;
     }
 
-    let (futex_bucket, futex_requeue_bucket) =
+    let (_futex_bucket, _futex_requeue_bucket) =
         double_lock_bucket(futex_key, futex_requeue_key).await;
 
     todo!()
@@ -299,7 +300,7 @@ impl RobustListHead {
             futex_wake(futex_addr, None, usize::MAX as u32).await?;
 
             // Move to the next entry in the robust list.
-            let robust_list = UserPointer::new(entry_ptr as *const RobustList)?.read()?;
+            let robust_list = UserPointer::new(User::<RobustList>::with_addr(entry_ptr))?.read()?;
 
             entry_ptr = robust_list.next;
 

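The futex paths keep passing the futex word's address around as a plain `usize` (it doubles as the hash key) and only wrap it into `User<u32>` at the point of the actual read. A minimal sketch of that pattern, using the same `User::with_addr` constructor as `futex_wait` above:

```rust
// Sketch: read the current value of the futex word at user address `uaddr`,
// exactly as futex_wait does before comparing it against `expected_val`.
fn read_futex_word(uaddr: usize) -> KResult<u32> {
    UserPointer::new(User::<u32>::with_addr(uaddr))?.read()
}
```
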
+ 103 - 86
src/kernel/task/loader/elf.rs

@@ -215,20 +215,20 @@ impl<E: ElfArch> Elf<E> {
         })
     }
 
-    fn load(&self, args: Vec<CString>, envs: Vec<CString>) -> KResult<LoadInfo> {
+    async fn load(&self, args: Vec<CString>, envs: Vec<CString>) -> KResult<LoadInfo> {
         let mm_list = MMList::new();
 
         // Load Segments
-        let (elf_base, data_segment_end) = self.load_segments(&mm_list)?;
+        let (elf_base, data_segment_end) = self.load_segments(&mm_list).await?;
 
         // Load ldso (if any)
-        let ldso_load_info = self.load_ldso(&mm_list)?;
+        let ldso_load_info = self.load_ldso(&mm_list).await?;
 
         // Load vdso
-        self.load_vdso(&mm_list)?;
+        self.load_vdso(&mm_list).await?;
 
         // Heap
-        mm_list.register_break(data_segment_end + 0x10000);
+        mm_list.register_break(data_segment_end + 0x10000).await;
 
         let aux_vec = self.init_aux_vec(
             elf_base,
@@ -238,7 +238,9 @@ impl<E: ElfArch> Elf<E> {
         )?;
 
         // Map stack
-        let sp = self.create_and_init_stack(&mm_list, args, envs, aux_vec)?;
+        let sp = self
+            .create_and_init_stack(&mm_list, args, envs, aux_vec)
+            .await?;
 
         let entry_ip = if let Some(ldso_load_info) = ldso_load_info {
             // Normal shared object(DYN)
@@ -258,26 +260,30 @@ impl<E: ElfArch> Elf<E> {
         })
     }
 
-    fn create_and_init_stack(
+    async fn create_and_init_stack(
         &self,
         mm_list: &MMList,
         args: Vec<CString>,
         envs: Vec<CString>,
         aux_vec: AuxVec<E::Ea>,
     ) -> KResult<VAddr> {
-        mm_list.mmap_fixed(
-            VAddr::from(E::STACK_BASE_ADDR - INIT_STACK_SIZE),
-            INIT_STACK_SIZE,
-            Mapping::Anonymous,
-            Permission {
-                read: true,
-                write: true,
-                execute: false,
-            },
-            false,
-        )?;
+        mm_list
+            .mmap_fixed(
+                VAddr::from(E::STACK_BASE_ADDR - INIT_STACK_SIZE),
+                INIT_STACK_SIZE,
+                Mapping::Anonymous,
+                Permission {
+                    read: true,
+                    write: true,
+                    execute: false,
+                },
+                false,
+            )
+            .await?;
 
-        StackInitializer::new(&mm_list, E::STACK_BASE_ADDR, args, envs, aux_vec).init()
+        StackInitializer::new(&mm_list, E::STACK_BASE_ADDR, args, envs, aux_vec)
+            .init()
+            .await
     }
 
     fn init_aux_vec(&self, elf_base: VAddr, ldso_base: Option<VAddr>) -> KResult<AuxVec<E::Ea>> {
@@ -309,7 +315,7 @@ impl<E: ElfArch> Elf<E> {
         Ok(aux_vec)
     }
 
-    fn load_segments(&self, mm_list: &MMList) -> KResult<(VAddr, VAddr)> {
+    async fn load_segments(&self, mm_list: &MMList) -> KResult<(VAddr, VAddr)> {
         let base: VAddr = if self.is_shared_object() { E::DYN_BASE_ADDR } else { 0 }.into();
 
         let mut segments_end = VAddr::NULL;
@@ -318,7 +324,7 @@ impl<E: ElfArch> Elf<E> {
             let type_ = program_header.type_().map_err(|_| ENOEXEC)?;
 
             if type_ == program::Type::Load {
-                let segment_end = self.load_segment(program_header, mm_list, base)?;
+                let segment_end = self.load_segment(program_header, mm_list, base).await?;
 
                 if segment_end > segments_end {
                     segments_end = segment_end;
@@ -329,7 +335,7 @@ impl<E: ElfArch> Elf<E> {
         Ok((base, segments_end))
     }
 
-    fn load_segment(
+    async fn load_segment(
         &self,
         program_header: &E::Ph,
         mm_list: &MMList,
@@ -353,33 +359,37 @@ impl<E: ElfArch> Elf<E> {
         if file_len != 0 {
             let real_file_length = load_vaddr_end - vmap_start;
 
-            mm_list.mmap_fixed(
-                vmap_start,
-                file_len,
-                Mapping::File(FileMapping::new(
-                    self.file.get_inode()?,
-                    file_offset,
-                    real_file_length,
-                )),
-                permission,
-                false,
-            )?;
+            mm_list
+                .mmap_fixed(
+                    vmap_start,
+                    file_len,
+                    Mapping::File(FileMapping::new(
+                        self.file.get_inode()?,
+                        file_offset,
+                        real_file_length,
+                    )),
+                    permission,
+                    false,
+                )
+                .await?;
         }
 
         if vmem_len > file_len {
-            mm_list.mmap_fixed(
-                vmap_start + file_len,
-                vmem_len - file_len,
-                Mapping::Anonymous,
-                permission,
-                false,
-            )?;
+            mm_list
+                .mmap_fixed(
+                    vmap_start + file_len,
+                    vmem_len - file_len,
+                    Mapping::Anonymous,
+                    permission,
+                    false,
+                )
+                .await?;
         }
 
         Ok(vmap_start + vmem_len)
     }
 
-    fn load_ldso(&self, mm_list: &MMList) -> KResult<Option<LdsoLoadInfo>> {
+    async fn load_ldso(&self, mm_list: &MMList) -> KResult<Option<LdsoLoadInfo>> {
         let ldso_path = self.ldso_path()?;
 
         if let Some(ldso_path) = ldso_path {
@@ -393,7 +403,7 @@ impl<E: ElfArch> Elf<E> {
                 let type_ = program_header.type_().map_err(|_| ENOEXEC)?;
 
                 if type_ == program::Type::Load {
-                    ldso_elf.load_segment(program_header, mm_list, base)?;
+                    ldso_elf.load_segment(program_header, mm_list, base).await?;
                 }
             }
 
@@ -406,8 +416,8 @@ impl<E: ElfArch> Elf<E> {
         Ok(None)
     }
 
-    fn load_vdso(&self, mm_list: &MMList) -> KResult<()> {
-        mm_list.map_vdso()
+    async fn load_vdso(&self, mm_list: &MMList) -> KResult<()> {
+        mm_list.map_vdso().await
     }
 
     fn ldso_path(&self) -> KResult<Option<String>> {
@@ -449,10 +459,10 @@ impl ELF {
         }
     }
 
-    pub fn load(&self, args: Vec<CString>, envs: Vec<CString>) -> KResult<LoadInfo> {
+    pub async fn load(&self, args: Vec<CString>, envs: Vec<CString>) -> KResult<LoadInfo> {
         match &self {
-            ELF::Elf32(elf32) => elf32.load(args, envs),
-            ELF::Elf64(elf64) => elf64.load(args, envs),
+            ELF::Elf32(elf32) => elf32.load(args, envs).await,
+            ELF::Elf64(elf64) => elf64.load(args, envs).await,
         }
     }
 }
@@ -483,21 +493,21 @@ impl<'a, T: ElfAddr + Clone + Copy> StackInitializer<'a, T> {
     }
 
     // return sp after stack init
-    fn init(mut self) -> KResult<VAddr> {
-        let env_pointers = self.push_envs()?;
-        let arg_pointers = self.push_args()?;
+    async fn init(mut self) -> KResult<VAddr> {
+        let env_pointers = self.push_envs().await?;
+        let arg_pointers = self.push_args().await?;
 
         self.stack_alignment();
-        self.push_aux_vec()?;
-        self.push_pointers(env_pointers)?;
-        self.push_pointers(arg_pointers)?;
-        self.push_argc(T::from_usize(self.args.len()))?;
+        self.push_aux_vec().await?;
+        self.push_pointers(env_pointers).await?;
+        self.push_pointers(arg_pointers).await?;
+        self.push_argc(T::from_usize(self.args.len())).await?;
 
         assert_eq!(self.sp.align_down(16), self.sp);
         Ok(VAddr::from(self.sp))
     }
 
-    fn push_envs(&mut self) -> KResult<Vec<T>> {
+    async fn push_envs(&mut self) -> KResult<Vec<T>> {
         let mut addrs = Vec::with_capacity(self.envs.len());
         for string in self.envs.iter().rev() {
             let len = string.as_bytes_with_nul().len();
@@ -505,14 +515,15 @@ impl<'a, T: ElfAddr + Clone + Copy> StackInitializer<'a, T> {
             self.mm_list
                 .access_mut(VAddr::from(self.sp), len, |offset, data| {
                     data.copy_from_slice(&string.as_bytes_with_nul()[offset..offset + data.len()])
-                })?;
+                })
+                .await?;
             addrs.push(T::from_usize(self.sp));
         }
         addrs.reverse();
         Ok(addrs)
     }
 
-    fn push_args(&mut self) -> KResult<Vec<T>> {
+    async fn push_args(&mut self) -> KResult<Vec<T>> {
         let mut addrs = Vec::with_capacity(self.args.len());
         for string in self.args.iter().rev() {
             let len = string.as_bytes_with_nul().len();
@@ -520,7 +531,8 @@ impl<'a, T: ElfAddr + Clone + Copy> StackInitializer<'a, T> {
             self.mm_list
                 .access_mut(VAddr::from(self.sp), len, |offset, data| {
                     data.copy_from_slice(&string.as_bytes_with_nul()[offset..offset + data.len()])
-                })?;
+                })
+                .await?;
             addrs.push(T::from_usize(self.sp));
         }
         addrs.reverse();
@@ -538,27 +550,29 @@ impl<'a, T: ElfAddr + Clone + Copy> StackInitializer<'a, T> {
         self.sp = align_sp + all_size;
     }
 
-    fn push_pointers(&mut self, mut pointers: Vec<T>) -> KResult<()> {
+    async fn push_pointers(&mut self, mut pointers: Vec<T>) -> KResult<()> {
         pointers.push(T::from_usize(0));
         self.sp -= pointers.len() * size_of::<T>();
 
-        self.mm_list.access_mut(
-            VAddr::from(self.sp),
-            pointers.len() * size_of::<T>(),
-            |offset, data| {
-                data.copy_from_slice(unsafe {
-                    core::slice::from_raw_parts(
-                        pointers.as_ptr().byte_add(offset) as *const u8,
-                        data.len(),
-                    )
-                })
-            },
-        )?;
+        self.mm_list
+            .access_mut(
+                VAddr::from(self.sp),
+                pointers.len() * size_of::<T>(),
+                |offset, data| {
+                    data.copy_from_slice(unsafe {
+                        core::slice::from_raw_parts(
+                            pointers.as_ptr().byte_add(offset) as *const u8,
+                            data.len(),
+                        )
+                    })
+                },
+            )
+            .await?;
 
         Ok(())
     }
 
-    fn push_argc(&mut self, val: T) -> KResult<()> {
+    async fn push_argc(&mut self, val: T) -> KResult<()> {
         self.sp -= size_of::<T>();
 
         self.mm_list
@@ -566,12 +580,13 @@ impl<'a, T: ElfAddr + Clone + Copy> StackInitializer<'a, T> {
                 data.copy_from_slice(unsafe {
                     core::slice::from_raw_parts(&val as *const _ as *const u8, data.len())
                 })
-            })?;
+            })
+            .await?;
 
         Ok(())
     }
 
-    fn push_aux_vec(&mut self) -> KResult<()> {
+    async fn push_aux_vec(&mut self) -> KResult<()> {
         let mut longs: Vec<T> = vec![];
 
         // Write Auxiliary vectors
@@ -593,18 +608,20 @@ impl<'a, T: ElfAddr + Clone + Copy> StackInitializer<'a, T> {
 
         self.sp -= longs.len() * size_of::<T>();
 
-        self.mm_list.access_mut(
-            VAddr::from(self.sp),
-            longs.len() * size_of::<T>(),
-            |offset, data| {
-                data.copy_from_slice(unsafe {
-                    core::slice::from_raw_parts(
-                        longs.as_ptr().byte_add(offset) as *const u8,
-                        data.len(),
-                    )
-                })
-            },
-        )?;
+        self.mm_list
+            .access_mut(
+                VAddr::from(self.sp),
+                longs.len() * size_of::<T>(),
+                |offset, data| {
+                    data.copy_from_slice(unsafe {
+                        core::slice::from_raw_parts(
+                            longs.as_ptr().byte_add(offset) as *const u8,
+                            data.len(),
+                        )
+                    })
+                },
+            )
+            .await?;
 
         Ok(())
     }

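Every stack-initializer write above now funnels through the awaited `access_mut(vaddr, len, |offset, chunk| ...)` callback on `MMList`. A hedged helper showing the same pattern in isolation; only the `access_mut` call shape is taken from the hunk, the helper itself is illustrative.

```rust
// Sketch: copy `bytes` into user memory at `vaddr`, chunk by chunk, the same
// way the ELF stack initializer pushes argv/envp/auxv data above.
async fn copy_to_user(mm_list: &MMList, vaddr: VAddr, bytes: &[u8]) -> KResult<()> {
    mm_list
        .access_mut(vaddr, bytes.len(), |offset, chunk| {
            chunk.copy_from_slice(&bytes[offset..offset + chunk.len()])
        })
        .await?;
    Ok(())
}
```
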
+ 2 - 2
src/kernel/task/loader/mod.rs

@@ -106,9 +106,9 @@ impl ProgramLoader {
         })
     }
 
-    pub fn load(self) -> KResult<LoadInfo> {
+    pub async fn load(self) -> KResult<LoadInfo> {
         match self.object {
-            Object::ELF(elf) => elf.load(self.args, self.envs),
+            Object::ELF(elf) => elf.load(self.args, self.envs).await,
         }
     }
 }

+ 45 - 40
src/kernel/task/process.rs

@@ -4,10 +4,11 @@ use super::{
 };
 use crate::kernel::constants::{ECHILD, EINTR, EINVAL, EPERM, ESRCH};
 use crate::kernel::task::{CloneArgs, CloneFlags};
+use crate::rcu::call_rcu;
 use crate::{
     kernel::mem::MMList,
     prelude::*,
-    rcu::{rcu_sync, RCUPointer, RCUReadGuard},
+    rcu::{RCUPointer, RCUReadGuard},
     sync::CondVar,
 };
 use alloc::{
@@ -16,7 +17,6 @@ use alloc::{
 };
 use core::sync::atomic::{AtomicU32, Ordering};
 use eonix_mm::address::VAddr;
-use eonix_runtime::task::Task;
 use eonix_sync::{
     AsProof as _, AsProofMut as _, Locked, Proof, ProofMut, RwLockReadGuard, SpinGuard,
     UnlockableGuard as _, UnlockedGuard as _,
@@ -108,6 +108,7 @@ pub struct DrainExited<'waitlist> {
     wait_procs: SpinGuard<'waitlist, VecDeque<WaitObject>>,
 }
 
+#[derive(Debug, Clone, Copy)]
 pub enum WaitId {
     Any,
     Pid(u32),
@@ -120,23 +121,17 @@ impl WaitId {
             P_ALL => Ok(WaitId::Any),
             P_PID => Ok(WaitId::Pid(id)),
             P_PGID => Ok(WaitId::Pgid(id)),
-            P_PIDFD => {
-                panic!("PDIFD type is unsupported")
-            }
+            P_PIDFD => panic!("P_PIDFD type is not supported"),
             _ => Err(EINVAL),
         }
     }
 
     pub fn from_id(id: i32, thread: &Thread) -> Self {
-        if id < -1 {
-            WaitId::Pgid((-id).cast_unsigned())
-        } else if id == -1 {
-            WaitId::Any
-        } else if id == 0 {
-            let procs = Task::block_on(ProcessList::get().read());
-            WaitId::Pgid(thread.process.pgroup(procs.prove()).pgid)
-        } else {
-            WaitId::Pid(id.cast_unsigned())
+        match id {
+            ..-1 => WaitId::Pgid((-id).cast_unsigned()),
+            -1 => WaitId::Any,
+            0 => WaitId::Pgid(thread.process.pgroup_rcu().pgid),
+            _ => WaitId::Pid(id.cast_unsigned()),
         }
     }
 }
@@ -205,11 +200,11 @@ impl ProcessBuilder {
         }
     }
 
-    pub fn clone_from(mut self, process: Arc<Process>, clone_args: &CloneArgs) -> Self {
+    pub async fn clone_from(mut self, process: Arc<Process>, clone_args: &CloneArgs) -> Self {
         let mm_list = if clone_args.flags.contains(CloneFlags::CLONE_VM) {
-            Task::block_on(process.mm_list.new_shared())
+            process.mm_list.new_shared().await
         } else {
-            Task::block_on(process.mm_list.new_cloned())
+            process.mm_list.new_cloned().await
         };
 
         if let Some(exit_signal) = clone_args.exit_signal {
@@ -350,8 +345,18 @@ impl Process {
         trace_continue: bool,
     ) -> KResult<Option<WaitObject>> {
         let wait_object = {
-            let mut waits = self.wait_list.entry(wait_id, trace_stop, trace_continue);
+            let mut unlocked_waits = None;
+
             loop {
+                let mut waits = match unlocked_waits {
+                    Some(wait) => wait.await?,
+                    None => {
+                        self.wait_list
+                            .entry(wait_id, trace_stop, trace_continue)
+                            .await
+                    }
+                };
+
                 if let Some(object) = waits.get() {
                     break object;
                 }
@@ -369,7 +374,7 @@ impl Process {
                     return Ok(None);
                 }
 
-                waits = waits.wait(no_block).await?;
+                unlocked_waits = Some(waits.wait(no_block));
             }
         };
 
@@ -377,7 +382,7 @@ impl Process {
             Ok(Some(wait_object))
         } else {
             let mut procs = ProcessList::get().write().await;
-            procs.remove_process(wait_object.pid);
+            procs.remove_process(wait_object.pid).await;
             assert!(self
                 .inner
                 .access_mut(procs.prove_mut())
@@ -390,8 +395,8 @@ impl Process {
     }
 
     /// Create a new session for the process.
-    pub fn setsid(self: &Arc<Self>) -> KResult<u32> {
-        let mut process_list = Task::block_on(ProcessList::get().write());
+    pub async fn setsid(self: &Arc<Self>) -> KResult<u32> {
+        let mut process_list = ProcessList::get().write().await;
         // If there exists a session that has the same sid as our pid, we can't create a new
         // session. The standard says that we should create a new process group and be the
         // only process in the new process group and session.
@@ -404,12 +409,14 @@ impl Process {
             .session(session.clone())
             .build(&mut process_list);
 
-        {
-            let _old_session = unsafe { self.session.swap(Some(session.clone())) }.unwrap();
-            let old_pgroup = unsafe { self.pgroup.swap(Some(pgroup.clone())) }.unwrap();
-            old_pgroup.remove_member(self.pid, process_list.prove_mut());
-            Task::block_on(rcu_sync());
-        }
+        let old_session = unsafe { self.session.swap(Some(session.clone())) }.unwrap();
+        let old_pgroup = unsafe { self.pgroup.swap(Some(pgroup.clone())) }.unwrap();
+        old_pgroup.remove_member(self.pid, process_list.prove_mut());
+
+        call_rcu(move || {
+            drop(old_session);
+            drop(old_pgroup);
+        });
 
         Ok(pgroup.pgid)
     }
@@ -455,10 +462,9 @@ impl Process {
         };
 
         pgroup.remove_member(self.pid, procs.prove_mut());
-        {
-            let _old_pgroup = unsafe { self.pgroup.swap(Some(new_pgroup)) }.unwrap();
-            Task::block_on(rcu_sync());
-        }
+
+        let old_pgroup = unsafe { self.pgroup.swap(Some(new_pgroup)) }.unwrap();
+        call_rcu(move || drop(old_pgroup));
 
         Ok(())
     }
@@ -467,8 +473,8 @@ impl Process {
     ///
     /// This function should be called on the process that issued the syscall in order to do
     /// permission checks.
-    pub fn setpgid(self: &Arc<Self>, pid: u32, pgid: u32) -> KResult<()> {
-        let mut procs = Task::block_on(ProcessList::get().write());
+    pub async fn setpgid(self: &Arc<Self>, pid: u32, pgid: u32) -> KResult<()> {
+        let mut procs = ProcessList::get().write().await;
         // We may set pgid of either the calling process or a child process.
         if pid == self.pid {
             self.do_setpgid(pgid, &mut procs)
@@ -572,9 +578,9 @@ impl WaitList {
     /// # Safety
     /// Locks `ProcessList` and `WaitList` at the same time. When `wait` is called,
     /// releases the lock on `ProcessList` and `WaitList` and waits on `cv_wait_procs`.
-    pub fn entry(&self, wait_id: WaitId, want_stop: bool, want_continue: bool) -> Entry {
+    pub async fn entry(&self, wait_id: WaitId, want_stop: bool, want_continue: bool) -> Entry {
         Entry {
-            process_list: Task::block_on(ProcessList::get().read()),
+            process_list: ProcessList::get().read().await,
             wait_procs: self.wait_procs.lock(),
             cv: &self.cv_wait_procs,
             want_stop,
@@ -603,9 +609,8 @@ impl Entry<'_, '_, '_> {
                 WaitId::Any => true,
                 WaitId::Pid(pid) => item.pid == pid,
                 WaitId::Pgid(pgid) => {
-                    let procs = Task::block_on(ProcessList::get().read());
-                    if let Some(process) = procs.try_find_process(item.pid) {
-                        return process.pgroup(procs.prove()).pgid == pgid;
+                    if let Some(process) = self.process_list.try_find_process(item.pid) {
+                        return process.pgroup(self.process_list.prove()).pgid == pgid;
                     }
                     false
                 }
@@ -619,7 +624,7 @@ impl Entry<'_, '_, '_> {
         }
     }
 
-    pub fn wait(self, no_block: bool) -> impl core::future::Future<Output = KResult<Self>> {
+    pub fn wait(self, no_block: bool) -> impl core::future::Future<Output = KResult<Self>> + Send {
         let wait_procs = self.wait_procs.unlock();
 
         async move {

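`setsid` and `do_setpgid` above stop waiting synchronously on `rcu_sync()` and instead hand the old session and process-group `Arc`s to `call_rcu`, which drops them once the grace period has passed. A minimal sketch of that deferred-reclamation pattern, assuming `call_rcu` accepts any sendable `FnOnce` callback as the two call sites above suggest:

```rust
use crate::rcu::call_rcu;

// Sketch: retire an object that RCU readers may still be traversing; it gets
// dropped only after every pre-existing read-side critical section has ended.
fn retire_after_grace_period<T: Send + 'static>(old: T) {
    call_rcu(move || drop(old));
}
```
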
+ 8 - 11
src/kernel/task/process_list.rs

@@ -9,7 +9,7 @@ use alloc::{
     collections::btree_map::BTreeMap,
     sync::{Arc, Weak},
 };
-use eonix_runtime::task::Task;
+use eonix_mm::address::Addr;
 use eonix_sync::{AsProof as _, AsProofMut as _, RwLock};
 
 pub struct ProcessList {
@@ -54,7 +54,7 @@ impl ProcessList {
         self.threads.insert(thread.tid, thread.clone());
     }
 
-    pub fn remove_process(&mut self, pid: u32) {
+    pub async fn remove_process(&mut self, pid: u32) {
         // Thread group leader has the same tid as the pid.
         if let Some(thread) = self.threads.remove(&pid) {
             self.processes.remove(&pid);
@@ -64,7 +64,7 @@ impl ProcessList {
             let pgroup = unsafe { thread.process.pgroup.swap(None) }.unwrap();
             let _parent = unsafe { thread.process.parent.swap(None) }.unwrap();
             pgroup.remove_member(pid, self.prove_mut());
-            Task::block_on(rcu_sync());
+            rcu_sync().await;
 
             if Arc::strong_count(&pgroup) == 1 {
                 self.pgroups.remove(&pgroup.pgid);
@@ -135,11 +135,9 @@ impl ProcessList {
         }
 
         if let Some(clear_ctid) = thread.get_clear_ctid() {
-            let _ = UserPointerMut::new(clear_ctid as *mut u32)
-                .unwrap()
-                .write(0u32);
+            let _ = UserPointerMut::new(clear_ctid).unwrap().write(0u32);
 
-            let _ = futex_wake(clear_ctid, None, 1).await;
+            let _ = futex_wake(clear_ctid.addr(), None, 1).await;
         }
 
         if let Some(robust_list) = thread.get_robust_list() {
@@ -150,14 +148,13 @@ impl ProcessList {
         if thread.tid == process.pid {
             assert_eq!(thread.tid, process.pid);
 
-            thread.files.close_all();
+            thread.files.close_all().await;
 
             // If we are the session leader, we should drop the control terminal.
             if process.session(self.prove()).sid == process.pid {
-                if let Some(terminal) =
-                    Task::block_on(process.session(self.prove()).drop_control_terminal())
+                if let Some(terminal) = process.session(self.prove()).drop_control_terminal().await
                 {
-                    terminal.drop_session();
+                    terminal.drop_session().await;
                 }
             }
 

+ 2 - 2
src/kernel/task/session.rs

@@ -87,14 +87,14 @@ impl Session {
     ) -> KResult<()> {
         let mut job_control = self.job_control.write().await;
         if let Some(_) = job_control.control_terminal.as_ref() {
-            if let Some(session) = terminal.session().as_ref() {
+            if let Some(session) = terminal.session().await.as_ref() {
                 if session.sid == self.sid {
                     return Ok(());
                 }
             }
             return Err(EPERM);
         }
-        terminal.set_session(self, forced)?;
+        terminal.set_session(self, forced).await?;
         job_control.control_terminal = Some(terminal.clone());
         job_control.foreground = Arc::downgrade(&Thread::current().process.pgroup(procs));
         Ok(())

+ 8 - 11
src/kernel/task/signal.rs

@@ -9,7 +9,7 @@ use core::{cmp::Reverse, task::Waker};
 use eonix_hal::fpu::FpuState;
 use eonix_hal::traits::trap::RawTrapContext;
 use eonix_hal::trap::TrapContext;
-use eonix_runtime::task::Task;
+use eonix_runtime::scheduler::Runtime;
 use eonix_sync::AsProof as _;
 use intrusive_collections::UnsafeRef;
 use posix_types::signal::{SigSet, Signal};
@@ -226,15 +226,12 @@ impl SignalList {
 
                     // `SIGSTOP` can only be waken up by `SIGCONT` or `SIGKILL`.
                     // SAFETY: Preempt disabled above.
-                    {
+                    Runtime::block_till_woken(|waker| {
                         let mut inner = self.inner.lock();
-                        let waker = Waker::from(Task::current().clone());
-
-                        let old_waker = inner.stop_waker.replace(waker);
+                        let old_waker = inner.stop_waker.replace(waker.clone());
                         assert!(old_waker.is_none(), "We should not have a waker here");
-                    }
-
-                    Task::park_preempt_disabled();
+                    })
+                    .await;
 
                     if let Some(parent) = thread.process.parent.load() {
                         parent.notify(
@@ -296,15 +293,15 @@ impl SignalList {
         let old_fpu_state_vaddr = old_trap_ctx_vaddr + size_of::<TrapContext>();
         let old_mask_vaddr = old_fpu_state_vaddr + size_of::<FpuState>();
 
-        *trap_ctx = UserPointer::<TrapContext>::new_vaddr(old_trap_ctx_vaddr)?.read()?;
+        *trap_ctx = UserPointer::<TrapContext>::with_addr(old_trap_ctx_vaddr)?.read()?;
 
         // Make sure that at least we won't crash the kernel.
         if !trap_ctx.is_user_mode() || !trap_ctx.is_interrupt_enabled() {
             return Err(EFAULT)?;
         }
 
-        *fpu_state = UserPointer::<FpuState>::new_vaddr(old_fpu_state_vaddr)?.read()?;
-        self.inner.lock().mask = UserPointer::<SigSet>::new_vaddr(old_mask_vaddr)?.read()?;
+        *fpu_state = UserPointer::<FpuState>::with_addr(old_fpu_state_vaddr)?.read()?;
+        self.inner.lock().mask = UserPointer::<SigSet>::with_addr(old_mask_vaddr)?.read()?;
 
         Ok(())
     }

+ 3 - 2
src/kernel/task/signal/signal_action.rs

@@ -3,6 +3,7 @@ use crate::{
     io::BufferFill as _,
     kernel::{
         constants::{EFAULT, EINVAL},
+        syscall::UserMut,
         user::UserBuffer,
     },
 };
@@ -152,7 +153,7 @@ impl SignalAction {
         let saved_data_addr = (current_sp - SAVED_DATA_SIZE).floor_to(16);
 
         let mut saved_data_buffer =
-            UserBuffer::new(saved_data_addr.addr() as *mut u8, SAVED_DATA_SIZE)?;
+            UserBuffer::new(UserMut::new(saved_data_addr), SAVED_DATA_SIZE)?;
 
         saved_data_buffer.copy(trap_ctx)?.ok_or(EFAULT)?;
         saved_data_buffer.copy(fpu_state)?.ok_or(EFAULT)?;
@@ -200,7 +201,7 @@ impl SignalAction {
             Some(return_address),
             &[Long::new_val(signal.into_raw() as _).get()],
             |vaddr, data| -> Result<(), u32> {
-                let mut buffer = UserBuffer::new(vaddr.addr() as *mut u8, data.len())?;
+                let mut buffer = UserBuffer::new(UserMut::new(vaddr), data.len())?;
                 for ch in data.iter() {
                     buffer.copy(&ch)?.ok_or(EFAULT)?;
                 }

+ 86 - 95
src/kernel/task/thread.rs

@@ -1,11 +1,11 @@
 use super::{
     signal::{RaiseResult, SignalList},
-    Process, ProcessList, WaitType,
+    stackful, Process, ProcessList, WaitType,
 };
 use crate::{
     kernel::{
         interrupt::default_irq_handler,
-        syscall::{syscall_handlers, SyscallHandler},
+        syscall::{syscall_handlers, SyscallHandler, User, UserMut},
         task::{clone::CloneArgs, futex::RobustListHead, CloneFlags},
         timer::{should_reschedule, timer_interrupt},
         user::{UserPointer, UserPointerMut},
@@ -13,14 +13,14 @@ use crate::{
     },
     prelude::*,
 };
-use alloc::sync::Arc;
+use alloc::{alloc::Allocator, sync::Arc};
 use atomic_unique_refcell::AtomicUniqueRefCell;
 use core::{
-    future::Future,
+    future::{poll_fn, Future},
     pin::Pin,
     ptr::NonNull,
     sync::atomic::{AtomicBool, Ordering},
-    task::{Context, Poll, Waker},
+    task::{Context, Poll},
 };
 use eonix_hal::{
     fpu::FpuState,
@@ -28,23 +28,21 @@ use eonix_hal::{
     traits::{
         fault::Fault,
         fpu::RawFpuState as _,
-        trap::{IrqState as _, RawTrapContext, TrapReturn, TrapType},
+        trap::{RawTrapContext, TrapReturn, TrapType},
     },
-    trap::{disable_irqs_save, TrapContext},
+    trap::TrapContext,
 };
 use eonix_mm::address::{Addr as _, VAddr};
-use eonix_runtime::run::{Contexted, Run, RunState};
 use eonix_sync::AsProofMut as _;
 use pointers::BorrowedArc;
 use posix_types::signal::Signal;
+use stalloc::UnsafeStalloc;
 
 #[eonix_percpu::define_percpu]
 static CURRENT_THREAD: Option<NonNull<Thread>> = None;
 
-pub struct ThreadRunnable<F: Future> {
-    thread: Arc<Thread>,
-    future: F,
-}
+#[derive(Clone, Copy)]
+pub struct ThreadAlloc<'a>(pub &'a UnsafeStalloc<1023, 32>);
 
 pub struct ThreadBuilder {
     tid: Option<u32>,
@@ -54,8 +52,8 @@ pub struct ThreadBuilder {
     fs_context: Option<Arc<FsContext>>,
     signal_list: Option<SignalList>,
     tls: Option<UserTLS>,
-    set_child_tid: Option<usize>,
-    clear_child_tid: Option<usize>,
+    set_child_tid: Option<UserMut<u32>>,
+    clear_child_tid: Option<UserMut<u32>>,
 
     trap_ctx: Option<TrapContext>,
     fpu_state: Option<FpuState>,
@@ -71,11 +69,11 @@ struct ThreadInner {
 
     /// User pointer
     /// Store child thread's tid when child thread returns to user space.
-    set_child_tid: Option<usize>,
+    set_child_tid: Option<UserMut<u32>>,
 
-    clear_child_tid: Option<usize>,
+    clear_child_tid: Option<UserMut<u32>>,
 
-    robust_list_address: Option<VAddr>,
+    robust_list_address: Option<User<RobustListHead>>,
 }
 
 pub struct Thread {
@@ -147,12 +145,12 @@ impl ThreadBuilder {
         self
     }
 
-    pub fn set_child_tid(mut self, set_child_tid: Option<usize>) -> Self {
+    pub fn set_child_tid(mut self, set_child_tid: Option<UserMut<u32>>) -> Self {
         self.set_child_tid = set_child_tid;
         self
     }
 
-    pub fn clear_child_tid(mut self, clear_child_tid: Option<usize>) -> Self {
+    pub fn clear_child_tid(mut self, clear_child_tid: Option<UserMut<u32>>) -> Self {
         self.clear_child_tid = clear_child_tid;
         self
     }
@@ -291,13 +289,13 @@ impl Thread {
         Ok(())
     }
 
-    pub fn set_robust_list(&self, robust_list_address: Option<VAddr>) {
+    pub fn set_robust_list(&self, robust_list_address: Option<User<RobustListHead>>) {
         self.inner.lock().robust_list_address = robust_list_address;
     }
 
     pub fn get_robust_list(&self) -> Option<RobustListHead> {
         let addr = self.inner.lock().robust_list_address?;
-        let user_pointer = UserPointer::new(addr.addr() as *const RobustListHead).ok()?;
+        let user_pointer = UserPointer::new(addr).ok()?;
 
         user_pointer.read().ok()
     }
@@ -310,25 +308,30 @@ impl Thread {
         self.inner.lock().name.clone()
     }
 
-    pub fn clear_child_tid(&self, clear_child_tid: Option<usize>) {
+    pub fn clear_child_tid(&self, clear_child_tid: Option<UserMut<u32>>) {
         self.inner.lock().clear_child_tid = clear_child_tid;
     }
 
-    pub fn get_set_ctid(&self) -> Option<usize> {
+    pub fn get_set_ctid(&self) -> Option<UserMut<u32>> {
         self.inner.lock().set_child_tid
     }
 
-    pub fn get_clear_ctid(&self) -> Option<usize> {
+    pub fn get_clear_ctid(&self) -> Option<UserMut<u32>> {
         self.inner.lock().clear_child_tid
     }
 
-    pub fn handle_syscall(&self, no: usize, args: [usize; 6]) -> Option<usize> {
+    pub async fn handle_syscall(
+        &self,
+        thd_alloc: ThreadAlloc<'_>,
+        no: usize,
+        args: [usize; 6],
+    ) -> Option<usize> {
         match syscall_handlers().get(no) {
             Some(Some(SyscallHandler {
                 handler,
                 name: _name,
                 ..
-            })) => handler(self, args),
+            })) => handler(self, thd_alloc, args).await,
             _ => {
                 println_warn!("Syscall {no}({no:#x}) isn't implemented.");
                 self.raise(Signal::SIGSYS);
@@ -353,12 +356,18 @@ impl Thread {
 
     async fn real_run(&self) {
         if let Some(set_ctid) = self.get_set_ctid() {
-            UserPointerMut::new(set_ctid as *mut u32)
+            UserPointerMut::new(set_ctid)
                 .expect("set_child_tid pointer is invalid")
                 .write(self.tid)
                 .expect("set_child_tid write failed");
         }
 
+        let stack_alloc = unsafe {
+            // SAFETY: The allocator will only be used within the context of this thread.
+            UnsafeStalloc::new()
+        };
+        let thd_alloc = ThreadAlloc(&stack_alloc);
+
         while !self.is_dead() {
             if self.signal_list.has_pending_signal() {
                 self.signal_list
@@ -397,6 +406,7 @@ impl Thread {
                     self.signal_list.raise(Signal::SIGILL);
                 }
                 TrapType::Fault(Fault::Unknown(_)) => unimplemented!("Unhandled fault"),
+                TrapType::Breakpoint => unimplemented!("Breakpoint in user space"),
                 TrapType::Irq { callback } => callback(default_irq_handler),
                 TrapType::Timer { callback } => {
                     callback(timer_interrupt);
@@ -406,7 +416,7 @@ impl Thread {
                     }
                 }
                 TrapType::Syscall { no, args } => {
-                    if let Some(retval) = self.handle_syscall(no, args) {
+                    if let Some(retval) = self.handle_syscall(thd_alloc, no, args).await {
                         let mut trap_ctx = self.trap_ctx.borrow();
                         trap_ctx.set_user_return_value(retval);
 
@@ -421,28 +431,52 @@ impl Thread {
         }
     }
 
-    pub async fn run(self: Arc<Thread>) {
-        struct ContextedRun<'a, F: Future>(F, &'a Thread);
+    async fn contexted<F>(&self, future: F) -> F::Output
+    where
+        F: Future,
+    {
+        let mut future = core::pin::pin!(future);
 
-        impl<F: Future> Future for ContextedRun<'_, F> {
-            type Output = F::Output;
+        core::future::poll_fn(|cx| {
+            self.process.mm_list.activate();
 
-            fn poll(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Self::Output> {
-                let irq_state = disable_irqs_save();
-                let (future, _) = unsafe {
-                    // SAFETY: We construct a pinned future and `&Thread` is `Unpin`.
-                    let me = self.as_mut().get_unchecked_mut();
-                    (Pin::new_unchecked(&mut me.0), me.1)
-                };
+            CURRENT_THREAD.set(NonNull::new(&raw const *self as *mut _));
+
+            unsafe {
+                eonix_preempt::disable();
 
-                let retval = future.poll(ctx);
+                // SAFETY: Preemption is disabled.
+                self.load_thread_area32();
 
-                irq_state.restore();
-                retval
+                eonix_preempt::enable();
             }
-        }
 
-        ContextedRun(self.real_run(), &self).await
+            let result = future.as_mut().poll(cx);
+
+            self.process.mm_list.deactivate();
+
+            CURRENT_THREAD.set(None);
+
+            result
+        })
+        .await
+    }
+
+    pub fn run(self: Arc<Thread>) -> impl Future<Output = ()> + Send + 'static {
+        async move { self.contexted(stackful(self.real_run())).await }
+    }
+}
+
+unsafe impl Allocator for ThreadAlloc<'_> {
+    fn allocate(
+        &self,
+        layout: core::alloc::Layout,
+    ) -> Result<NonNull<[u8]>, alloc::alloc::AllocError> {
+        self.0.allocate(layout)
+    }
+
+    unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: core::alloc::Layout) {
+        self.0.deallocate(ptr, layout);
     }
 }
 
@@ -468,56 +502,13 @@ pub async fn yield_now() {
     Yield { yielded: false }.await;
 }
 
-pub fn new_thread_runnable(
-    thread: Arc<Thread>,
-) -> ThreadRunnable<impl Future<Output = impl Send + 'static> + Send + 'static> {
-    ThreadRunnable {
-        thread: thread.clone(),
-        future: thread.run(),
-    }
-}
-
-impl<F: Future> Contexted for ThreadRunnable<F> {
-    fn load_running_context(&self) {
-        self.thread.process.mm_list.activate();
-
-        let raw_ptr: *const Thread = &raw const *self.thread;
-        CURRENT_THREAD.set(NonNull::new(raw_ptr as *mut _));
-
-        unsafe {
-            // SAFETY: Preemption is disabled.
-            self.thread.load_thread_area32();
-        }
-
-        unsafe {
-            let trap_ctx_ptr: *const TrapContext = &raw const *self.thread.trap_ctx.borrow();
-            // SAFETY:
-            CPU::local()
-                .as_mut()
-                .load_interrupt_stack(trap_ctx_ptr as u64);
-        }
-    }
-
-    fn restore_running_context(&self) {
-        self.thread.process.mm_list.deactivate();
-
-        CURRENT_THREAD.set(None);
-    }
-}
-
-impl<F: Future> Run for ThreadRunnable<F> {
-    type Output = F::Output;
-
-    fn run(mut self: Pin<&mut Self>, waker: &Waker) -> RunState<Self::Output> {
-        let mut ctx = Context::from_waker(waker);
-
-        match unsafe {
-            self.as_mut()
-                .map_unchecked_mut(|me| &mut me.future)
-                .poll(&mut ctx)
-        } {
-            Poll::Ready(output) => RunState::Finished(output),
-            Poll::Pending => RunState::Running,
+pub fn wait_for_wakeups() -> impl Future<Output = ()> {
+    let mut waited = false;
+    poll_fn(move |_| match waited {
+        true => Poll::Ready(()),
+        false => {
+            waited = true;
+            Poll::Pending
         }
-    }
+    })
 }

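The `Allocator` impl above lets syscall handlers place short-lived buffers in the per-thread `UnsafeStalloc` arena through the `thd_alloc` value that `handle_syscall` now threads through. A hedged sketch of what such a use could look like; it leans on the unstable `allocator_api` feature that the `Allocator` impl already requires, and the helper itself is illustrative rather than taken from the tree.

```rust
// Sketch: a temporary, syscall-local Vec backed by the thread's bump
// allocator instead of the global heap; it is freed when the handler returns.
fn scratch_fds<'a>(thd_alloc: ThreadAlloc<'a>, n: usize) -> alloc::vec::Vec<u32, ThreadAlloc<'a>> {
    let mut fds = alloc::vec::Vec::new_in(thd_alloc);
    fds.extend((0..n).map(|i| i as u32));
    fds
}
```
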
+ 16 - 17
src/kernel/terminal.rs

@@ -10,7 +10,6 @@ use alloc::{
 };
 use bitflags::bitflags;
 use eonix_log::ConsoleWrite;
-use eonix_runtime::task::Task;
 use eonix_sync::{AsProof as _, Mutex};
 use posix_types::signal::Signal;
 
@@ -447,18 +446,18 @@ impl Terminal {
         }
     }
 
-    fn signal(&self, inner: &mut TerminalInner, signal: Signal) {
+    async fn signal(&self, inner: &mut TerminalInner, signal: Signal) {
         if let Some(session) = inner.session.upgrade() {
-            Task::block_on(session.raise_foreground(signal));
+            session.raise_foreground(signal).await;
         }
         if !inner.termio.noflsh() {
             self.clear_read_buffer(inner);
         }
     }
 
-    fn echo_and_signal(&self, inner: &mut TerminalInner, ch: u8, signal: Signal) {
+    async fn echo_and_signal(&self, inner: &mut TerminalInner, ch: u8, signal: Signal) {
         self.echo_char(inner, ch);
-        self.signal(inner, signal);
+        self.signal(inner, signal).await;
     }
 
     fn do_commit_char(&self, inner: &mut TerminalInner, ch: u8) {
@@ -482,13 +481,13 @@ impl Terminal {
             match ch {
                 0xff => {}
                 ch if ch == inner.termio.vintr() => {
-                    return self.echo_and_signal(&mut inner, ch, Signal::SIGINT)
+                    return self.echo_and_signal(&mut inner, ch, Signal::SIGINT).await
                 }
                 ch if ch == inner.termio.vquit() => {
-                    return self.echo_and_signal(&mut inner, ch, Signal::SIGQUIT)
+                    return self.echo_and_signal(&mut inner, ch, Signal::SIGQUIT).await
                 }
                 ch if ch == inner.termio.vsusp() => {
-                    return self.echo_and_signal(&mut inner, ch, Signal::SIGTSTP)
+                    return self.echo_and_signal(&mut inner, ch, Signal::SIGTSTP).await
                 }
                 _ => {}
             }
@@ -623,12 +622,12 @@ impl Terminal {
                 ptr.write(window_size)
             }
             TerminalIORequest::GetTermios(ptr) => {
-                let termios = Task::block_on(self.inner.lock()).termio.get_user();
+                let termios = self.inner.lock().await.termio.get_user();
                 ptr.write(termios)
             }
             TerminalIORequest::SetTermios(ptr) => {
                 let user_termios = ptr.read()?;
-                let mut inner = Task::block_on(self.inner.lock());
+                let mut inner = self.inner.lock().await;
 
                 // TODO: We ignore unknown bits for now.
                 inner.termio.iflag = TermioIFlags::from_bits_truncate(user_termios.iflag as u16);
@@ -644,13 +643,13 @@ impl Terminal {
     }
 
     /// Assign the `session` to this terminal. Drop the previous session if `forced` is true.
-    pub fn set_session(&self, session: &Arc<Session>, forced: bool) -> KResult<()> {
-        let mut inner = Task::block_on(self.inner.lock());
+    pub async fn set_session(&self, session: &Arc<Session>, forced: bool) -> KResult<()> {
+        let mut inner = self.inner.lock().await;
         if let Some(session) = inner.session.upgrade() {
             if !forced {
                 Err(EPERM)
             } else {
-                Task::block_on(session.drop_control_terminal());
+                session.drop_control_terminal().await;
                 inner.session = Arc::downgrade(&session);
                 Ok(())
             }
@@ -661,12 +660,12 @@ impl Terminal {
         }
     }
 
-    pub fn drop_session(&self) {
-        Task::block_on(self.inner.lock()).session = Weak::new();
+    pub async fn drop_session(&self) {
+        self.inner.lock().await.session = Weak::new();
     }
 
-    pub fn session(&self) -> Option<Arc<Session>> {
-        Task::block_on(self.inner.lock()).session.upgrade()
+    pub async fn session(&self) -> Option<Arc<Session>> {
+        self.inner.lock().await.session.upgrade()
     }
 }
 

+ 2 - 6
src/kernel/user.rs

@@ -1,7 +1,3 @@
-pub mod dataflow;
+mod dataflow;
 
-#[allow(unused_imports)]
-pub use dataflow::{UserBuffer, UserString};
-
-pub type UserPointer<'a, T> = dataflow::UserPointer<'a, T, true>;
-pub type UserPointerMut<'a, T> = dataflow::UserPointer<'a, T, false>;
+pub use dataflow::{CheckedUserPointer, UserBuffer, UserPointer, UserPointerMut, UserString};

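With the old const-generic alias split into two concrete types, reads go through `UserPointer` and writes through `UserPointerMut`. A minimal sketch of the resulting call sites, using only the constructors and methods visible in the reworked dataflow.rs below:

```rust
// Sketch: copy one u32 from a read-only user pointer to a writable one.
fn copy_u32(src: User<u32>, dst: UserMut<u32>) -> KResult<()> {
    let value = UserPointer::new(src)?.read()?;
    UserPointerMut::new(dst)?.write(value)
}
```
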
+ 70 - 45
src/kernel/user/dataflow.rs

@@ -1,17 +1,20 @@
+use crate::{
+    io::{Buffer, FillResult},
+    prelude::*,
+};
 use crate::{
     io::{IntoStream, Stream},
-    kernel::constants::{EFAULT, EINVAL},
+    kernel::{
+        constants::{EFAULT, EINVAL},
+        syscall::{User, UserMut},
+    },
 };
 use core::{arch::asm, ffi::CStr, marker::PhantomData};
+use eonix_mm::address::Addr;
 use eonix_preempt::assert_preempt_enabled;
 
-use crate::{
-    io::{Buffer, FillResult},
-    prelude::*,
-};
-
 pub struct CheckedUserPointer<'a> {
-    ptr: *const u8,
+    ptr: User<u8>,
     len: usize,
     _phantom: PhantomData<&'a ()>,
 }
@@ -27,7 +30,12 @@ pub struct UserString<'a> {
     len: usize,
 }
 
-pub struct UserPointer<'a, T: Copy, const CONST: bool> {
+pub struct UserPointer<'a, T: Copy> {
+    pointer: CheckedUserPointer<'a>,
+    _phantom: PhantomData<T>,
+}
+
+pub struct UserPointerMut<'a, T: Copy> {
     pointer: CheckedUserPointer<'a>,
     _phantom: PhantomData<T>,
 }
@@ -37,9 +45,9 @@ pub struct UserStream<'a> {
     cur: usize,
 }
 
-impl<T: Copy, const CONST: bool> UserPointer<'_, T, CONST> {
-    pub fn new(ptr: *const T) -> KResult<Self> {
-        let pointer = CheckedUserPointer::new(ptr as *const u8, core::mem::size_of::<T>())?;
+impl<T: Copy> UserPointer<'_, T> {
+    pub fn new(ptr: User<T>) -> KResult<Self> {
+        let pointer = CheckedUserPointer::new(ptr.cast(), core::mem::size_of::<T>())?;
 
         Ok(Self {
             pointer,
@@ -47,8 +55,8 @@ impl<T: Copy, const CONST: bool> UserPointer<'_, T, CONST> {
         })
     }
 
-    pub fn new_vaddr(vaddr: usize) -> KResult<Self> {
-        Self::new(vaddr as *mut T)
+    pub fn with_addr(vaddr: usize) -> KResult<Self> {
+        Self::new(User::with_addr(vaddr))
     }
 
     /// # Might Sleep
@@ -60,22 +68,48 @@ impl<T: Copy, const CONST: bool> UserPointer<'_, T, CONST> {
     }
 
     pub fn offset(&self, offset: isize) -> KResult<Self> {
-        let new_vaddr = self.pointer.ptr as isize + offset * size_of::<T>() as isize;
-        Self::new_vaddr(new_vaddr as usize)
+        let new_ptr = self.pointer.ptr.offset(offset * size_of::<T>() as isize);
+        Self::new(new_ptr.cast())
     }
 }
 
-impl<'a, T: Copy> UserPointer<'a, T, false> {
+impl<'a, T: Copy> UserPointerMut<'a, T> {
+    pub fn new(ptr: UserMut<T>) -> KResult<Self> {
+        let pointer = CheckedUserPointer::new(ptr.cast().as_const(), core::mem::size_of::<T>())?;
+
+        Ok(Self {
+            pointer,
+            _phantom: PhantomData,
+        })
+    }
+
+    pub fn with_addr(vaddr: usize) -> KResult<Self> {
+        Self::new(UserMut::with_addr(vaddr))
+    }
+
+    /// # Might Sleep
+    pub fn read(&self) -> KResult<T> {
+        let mut value = core::mem::MaybeUninit::<T>::uninit();
+        self.pointer
+            .read(value.as_mut_ptr() as *mut (), core::mem::size_of::<T>())?;
+        Ok(unsafe { value.assume_init() })
+    }
+
+    pub fn offset(&self, offset: isize) -> KResult<Self> {
+        let new_ptr = self.pointer.ptr.offset(offset * size_of::<T>() as isize);
+        Self::new(unsafe { new_ptr.cast().as_mut() })
+    }
+
     pub fn write(&self, value: T) -> KResult<()> {
         self.pointer
-            .write(&value as *const T as *mut (), core::mem::size_of::<T>())
+            .write(&raw const value as *mut (), core::mem::size_of::<T>())
     }
 }
 
 impl CheckedUserPointer<'_> {
-    pub fn new(ptr: *const u8, len: usize) -> KResult<Self> {
+    pub fn new(ptr: User<u8>, len: usize) -> KResult<Self> {
         const USER_MAX_ADDR: usize = 0x7ff_fff_fff_fff;
-        let end = (ptr as usize).checked_add(len);
+        let end = ptr.addr().checked_add(len);
         if ptr.is_null() || end.ok_or(EFAULT)? > USER_MAX_ADDR {
             Err(EFAULT)
         } else {
@@ -89,19 +123,10 @@ impl CheckedUserPointer<'_> {
 
     pub fn forward(&mut self, offset: usize) {
         assert!(offset <= self.len);
-        self.ptr = self.ptr.wrapping_offset(offset as isize);
+        self.ptr = self.ptr.offset(offset as isize);
         self.len -= offset;
     }
 
-    pub fn get_const<T>(&self) -> *const T {
-        self.ptr as *const T
-    }
-
-    pub fn as_slice(&self) -> &[u8] {
-        // SAFETY: the pointer's validity is checked in `new`
-        unsafe { core::slice::from_raw_parts(self.ptr, self.len) }
-    }
-
     /// # Might Sleep
     pub fn read(&self, buffer: *mut (), total: usize) -> KResult<()> {
         assert_preempt_enabled!("UserPointer::read");
@@ -126,7 +151,7 @@ impl CheckedUserPointer<'_> {
                 ".quad 0x3",     // type: load
                 ".popsection",
                 inout("rcx") total => error_bytes,
-                inout("rsi") self.ptr => _,
+                inout("rsi") self.ptr.addr() => _,
                 inout("rdi") buffer => _,
             );
 
@@ -148,7 +173,7 @@ impl CheckedUserPointer<'_> {
                 ".8byte 0x3",     // type: load
                 ".popsection",
                 inout("a0") total => error_bytes,
-                inout("a1") self.ptr => _,
+                inout("a1") self.ptr.addr() => _,
                 inout("a2") buffer => _,
                 out("t0") _,
             );
@@ -171,7 +196,7 @@ impl CheckedUserPointer<'_> {
                 ".8byte 0x3",     // type: load
                 ".popsection",
                 inout("$a0") total => error_bytes,
-                inout("$a1") self.ptr => _,
+                inout("$a1") self.ptr.addr() => _,
                 inout("$a2") buffer => _,
                 out("$t0") _,
             );
@@ -210,7 +235,7 @@ impl CheckedUserPointer<'_> {
                 ".popsection",
                 inout("rcx") total => error_bytes,
                 inout("rsi") data => _,
-                inout("rdi") self.ptr => _,
+                inout("rdi") self.ptr.addr() => _,
             );
 
             #[cfg(target_arch = "riscv64")]
@@ -232,7 +257,7 @@ impl CheckedUserPointer<'_> {
                 ".popsection",
                 inout("a0") total => error_bytes,
                 inout("a1") data => _,
-                inout("a2") self.ptr => _,
+                inout("a2") self.ptr.addr() => _,
                 out("t0") _,
             );
 
@@ -255,7 +280,7 @@ impl CheckedUserPointer<'_> {
                 ".popsection",
                 inout("$a0") total => error_bytes,
                 inout("$a1") data => _,
-                inout("$a2") self.ptr => _,
+                inout("$a2") self.ptr.addr() => _,
                 out("$t0") _,
             );
         };
@@ -293,7 +318,7 @@ impl CheckedUserPointer<'_> {
                 ".popsection",
                 in("rax") 0,
                 inout("rcx") self.len => error_bytes,
-                inout("rdi") self.ptr => _,
+                inout("rdi") self.ptr.addr() => _,
                 options(att_syntax)
             );
 
@@ -313,7 +338,7 @@ impl CheckedUserPointer<'_> {
                 ".8byte 0x1", // type: store
                 ".popsection",
                 inout("a0") self.len => error_bytes,
-                inout("a1") self.ptr => _,
+                inout("a1") self.ptr.addr() => _,
             );
 
             #[cfg(target_arch = "loongarch64")]
@@ -332,7 +357,7 @@ impl CheckedUserPointer<'_> {
                 ".8byte 0x1", // type: store
                 ".popsection",
                 inout("$a0") self.len => error_bytes,
-                inout("$a1") self.ptr => _,
+                inout("$a1") self.ptr.addr() => _,
             );
         };
 
@@ -345,8 +370,8 @@ impl CheckedUserPointer<'_> {
 }
 
 impl UserBuffer<'_> {
-    pub fn new(ptr: *mut u8, size: usize) -> KResult<Self> {
-        let ptr = CheckedUserPointer::new(ptr, size)?;
+    pub fn new(ptr: UserMut<u8>, size: usize) -> KResult<Self> {
+        let ptr = CheckedUserPointer::new(ptr.as_const(), size)?;
 
         Ok(Self { ptr, size, cur: 0 })
     }
@@ -388,7 +413,7 @@ impl<'lt> Buffer for UserBuffer<'lt> {
 
 impl<'lt> UserString<'lt> {
     /// # Might Sleep
-    pub fn new(ptr: *const u8) -> KResult<Self> {
+    pub fn new(ptr: User<u8>) -> KResult<Self> {
         assert_preempt_enabled!("UserString::new");
 
         const MAX_LEN: usize = 4096;
@@ -416,7 +441,7 @@ impl<'lt> UserString<'lt> {
                 ".popsection",
                 out("al") _,
                 inout("rcx") MAX_LEN => result,
-                ptr = inout(reg) ptr.ptr => _,
+                ptr = inout(reg) ptr.ptr.addr() => _,
                 options(att_syntax),
             );
 
@@ -439,7 +464,7 @@ impl<'lt> UserString<'lt> {
                 ".popsection",
                 out("t0") _,
                 inout("a0") MAX_LEN => result,
-                inout("a1") ptr.ptr => _,
+                inout("a1") ptr.ptr.addr() => _,
             );
 
             #[cfg(target_arch = "loongarch64")]
@@ -461,7 +486,7 @@ impl<'lt> UserString<'lt> {
                 ".popsection",
                 out("$t0") _,
                 inout("$a0") MAX_LEN => result,
-                inout("$a1") ptr.ptr => _,
+                inout("$a1") ptr.ptr.addr() => _,
             );
         };
 
@@ -478,7 +503,7 @@ impl<'lt> UserString<'lt> {
     pub fn as_cstr(&self) -> &'lt CStr {
         unsafe {
             CStr::from_bytes_with_nul_unchecked(core::slice::from_raw_parts(
-                self.ptr.get_const(),
+                self.ptr.ptr.addr() as *const u8,
                 self.len + 1,
             ))
         }
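
A minimal usage sketch of the reworked typed user-pointer API above (the addresses and the u32 payload are illustrative; only with_addr/read/write/offset come from this diff):

    fn copy_user_u32(src_vaddr: usize, dst_vaddr: usize) -> KResult<()> {
        // Read-only view of a user-space u32; construction range-checks the pointer.
        let src = UserPointer::<u32>::with_addr(src_vaddr)?;
        let value = src.read()?; // # Might Sleep

        // Writable view of the destination; offset(1) re-validates the shifted range.
        let dst = UserPointerMut::<u32>::with_addr(dst_vaddr)?;
        dst.write(value)?;
        dst.offset(1)?.write(value)?;

        Ok(())
    }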

+ 12 - 12
src/kernel/vfs/dentry.rs

@@ -2,7 +2,7 @@ pub mod dcache;
 
 use super::{
     inode::{Ino, Inode, Mode, RenameData, WriteOffset},
-    s_isblk, s_ischr, s_isdir, s_isreg, DevId, FsContext,
+    DevId, FsContext,
 };
 use crate::{
     hash::KernelHasher,
@@ -250,7 +250,7 @@ impl Dentry {
             }
 
             let parent = self.parent().get_inode()?;
-            parent.creat(self, mode as u32)
+            parent.creat(self, mode)
         }
     }
 }
@@ -409,14 +409,14 @@ impl Dentry {
         let inode = self.get_inode()?;
 
         // Safety: Changing mode alone will have no effect on the file's contents
-        match inode.mode.load(Ordering::Relaxed) {
-            mode if s_isdir(mode) => Err(EISDIR),
-            mode if s_isreg(mode) => inode.read(buffer, offset),
-            mode if s_isblk(mode) => {
+        match inode.mode.load().format() {
+            Mode::DIR => Err(EISDIR),
+            Mode::REG => inode.read(buffer, offset),
+            Mode::BLK => {
                 let device = BlockDevice::get(inode.devid()?)?;
                 Ok(device.read_some(offset, buffer)?.allow_partial())
             }
-            mode if s_ischr(mode) => {
+            Mode::CHR => {
                 let device = CharDevice::get(inode.devid()?).ok_or(EPERM)?;
                 device.read(buffer)
             }
@@ -427,11 +427,11 @@ impl Dentry {
     pub fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
         let inode = self.get_inode()?;
         // Safety: Changing mode alone will have no effect on the file's contents
-        match inode.mode.load(Ordering::Relaxed) {
-            mode if s_isdir(mode) => Err(EISDIR),
-            mode if s_isreg(mode) => inode.write(stream, offset),
-            mode if s_isblk(mode) => Err(EINVAL), // TODO
-            mode if s_ischr(mode) => CharDevice::get(inode.devid()?).ok_or(EPERM)?.write(stream),
+        match inode.mode.load().format() {
+            Mode::DIR => Err(EISDIR),
+            Mode::REG => inode.write(stream, offset),
+            Mode::BLK => Err(EINVAL), // TODO
+            Mode::CHR => CharDevice::get(inode.devid()?).ok_or(EPERM)?.write(stream),
             _ => Err(EINVAL),
         }
     }

+ 6 - 6
src/kernel/vfs/dentry/dcache.rs

@@ -1,14 +1,14 @@
 use super::{Dentry, Inode};
 use crate::kernel::constants::ENOENT;
+use crate::kernel::task::block_on;
+use crate::kernel::vfs::inode::Mode;
 use crate::rcu::RCUPointer;
 use crate::{
-    kernel::vfs::{s_isdir, s_islnk},
     prelude::*,
     rcu::{RCUIterator, RCUList},
 };
 use alloc::sync::Arc;
 use core::sync::atomic::Ordering;
-use eonix_runtime::task::Task;
 use eonix_sync::Mutex;
 
 const DCACHE_HASH_BITS: u32 = 8;
@@ -42,7 +42,7 @@ pub fn d_find_fast(dentry: &Dentry) -> Option<Arc<Dentry>> {
 ///
 /// Silently fail without any side effects
 pub fn d_try_revalidate(dentry: &Arc<Dentry>) {
-    let _lock = Task::block_on(D_EXCHANGE_LOCK.lock());
+    let _lock = block_on(D_EXCHANGE_LOCK.lock());
 
     (|| -> KResult<()> {
         let parent = dentry.parent().get_inode()?;
@@ -57,9 +57,9 @@ pub fn d_try_revalidate(dentry: &Arc<Dentry>) {
 ///
 /// Dentry flags will be determined by the inode's mode.
 pub fn d_save(dentry: &Arc<Dentry>, inode: Arc<dyn Inode>) -> KResult<()> {
-    match inode.mode.load(Ordering::Acquire) {
-        mode if s_isdir(mode) => dentry.save_dir(inode),
-        mode if s_islnk(mode) => dentry.save_symlink(inode),
+    match inode.mode.load().format() {
+        Mode::DIR => dentry.save_dir(inode),
+        Mode::LNK => dentry.save_symlink(inode),
         _ => dentry.save_reg(inode),
     }
 }
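
The mode-matching hunks above rely on the new Mode::format() helper from posix_types; a hedged sketch of the assumption behind the rewrite (the real definition is not part of this section):

    // format() is assumed to mask the mode down to its S_IFMT file-type bits,
    // so the old bit-test guards map onto plain patterns:
    //   s_isdir(mode)  =>  mode.format() == Mode::DIR
    //   s_islnk(mode)  =>  mode.format() == Mode::LNK
    impl Mode {
        pub const fn format(self) -> Mode {
            Mode(self.0 & S_IFMT)
        }
    }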

+ 0 - 636
src/kernel/vfs/file.rs

@@ -1,636 +0,0 @@
-use super::{
-    dentry::Dentry,
-    inode::{Mode, WriteOffset},
-    s_isblk, s_isreg,
-};
-use crate::{
-    io::{Buffer, BufferFill, ByteBuffer, Chunks, IntoStream},
-    kernel::{
-        constants::{TCGETS, TCSETS, TIOCGPGRP, TIOCGWINSZ, TIOCSPGRP},
-        mem::{paging::Page, AsMemoryBlock as _},
-        task::Thread,
-        terminal::{Terminal, TerminalIORequest},
-        user::{UserPointer, UserPointerMut},
-        vfs::inode::Inode,
-        CharDevice,
-    },
-    prelude::*,
-    sync::CondVar,
-};
-use crate::{
-    io::{Stream, StreamRead},
-    kernel::constants::{
-        EBADF, EFAULT, EINTR, EINVAL, ENOTDIR, ENOTTY, EOVERFLOW, EPIPE, ESPIPE, S_IFMT,
-    },
-};
-use alloc::{collections::vec_deque::VecDeque, sync::Arc};
-use bitflags::bitflags;
-use core::{
-    ops::{ControlFlow, Deref},
-    sync::atomic::{AtomicU32, Ordering},
-};
-use eonix_runtime::task::Task;
-use eonix_sync::Mutex;
-use posix_types::{open::OpenFlags, signal::Signal, stat::StatX};
-
-pub struct InodeFile {
-    read: bool,
-    write: bool,
-    append: bool,
-    /// Only a few modes those won't possibly change are cached here to speed up file operations.
-    /// Specifically, `S_IFMT` masked bits.
-    mode: Mode,
-    cursor: Mutex<usize>,
-    dentry: Arc<Dentry>,
-}
-
-pub struct PipeInner {
-    buffer: VecDeque<u8>,
-    read_closed: bool,
-    write_closed: bool,
-}
-
-pub struct Pipe {
-    inner: Mutex<PipeInner>,
-    cv_read: CondVar,
-    cv_write: CondVar,
-}
-
-pub struct PipeReadEnd {
-    pipe: Arc<Pipe>,
-}
-
-pub struct PipeWriteEnd {
-    pipe: Arc<Pipe>,
-}
-
-pub struct TerminalFile {
-    terminal: Arc<Terminal>,
-}
-
-// TODO: We should use `File` as the base type, instead of `Arc<File>`
-//       If we need shared states, like for `InodeFile`, the files themselves should
-//       have their own shared semantics. All `File` variants will just keep the
-//       `Clone` semantics.
-//
-//       e.g. The `CharDevice` itself is stateless.
-pub enum FileType {
-    Inode(InodeFile),
-    PipeRead(PipeReadEnd),
-    PipeWrite(PipeWriteEnd),
-    TTY(TerminalFile),
-    CharDev(Arc<CharDevice>),
-}
-
-pub struct File {
-    flags: AtomicU32,
-    file_type: FileType,
-}
-
-impl File {
-    pub fn get_inode(&self) -> KResult<Option<Arc<dyn Inode>>> {
-        match &self.file_type {
-            FileType::Inode(inode_file) => Ok(Some(inode_file.dentry.get_inode()?)),
-            _ => Ok(None),
-        }
-    }
-}
-
-pub enum SeekOption {
-    Set(usize),
-    Current(isize),
-    End(isize),
-}
-
-bitflags! {
-    pub struct PollEvent: u16 {
-        const Readable = 0x0001;
-        const Writable = 0x0002;
-    }
-}
-
-impl Drop for PipeReadEnd {
-    fn drop(&mut self) {
-        self.pipe.close_read();
-    }
-}
-
-impl Drop for PipeWriteEnd {
-    fn drop(&mut self) {
-        self.pipe.close_write();
-    }
-}
-
-fn send_sigpipe_to_current() {
-    let current = Thread::current();
-    current.raise(Signal::SIGPIPE);
-}
-
-impl Pipe {
-    const PIPE_SIZE: usize = 4096;
-
-    /// # Return
-    /// `(read_end, write_end)`
-    pub fn new(flags: OpenFlags) -> (Arc<File>, Arc<File>) {
-        let pipe = Arc::new(Self {
-            inner: Mutex::new(PipeInner {
-                buffer: VecDeque::with_capacity(Self::PIPE_SIZE),
-                read_closed: false,
-                write_closed: false,
-            }),
-            cv_read: CondVar::new(),
-            cv_write: CondVar::new(),
-        });
-
-        let read_flags = flags.difference(OpenFlags::O_WRONLY | OpenFlags::O_RDWR);
-        let mut write_flags = read_flags;
-        write_flags.insert(OpenFlags::O_WRONLY);
-
-        (
-            Arc::new(File {
-                flags: AtomicU32::new(read_flags.bits()),
-                file_type: FileType::PipeRead(PipeReadEnd { pipe: pipe.clone() }),
-            }),
-            Arc::new(File {
-                flags: AtomicU32::new(write_flags.bits()),
-                file_type: FileType::PipeWrite(PipeWriteEnd { pipe }),
-            }),
-        )
-    }
-
-    fn close_read(&self) {
-        let mut inner = Task::block_on(self.inner.lock());
-        if inner.read_closed {
-            return;
-        }
-
-        inner.read_closed = true;
-        self.cv_write.notify_all();
-    }
-
-    fn close_write(&self) {
-        let mut inner = Task::block_on(self.inner.lock());
-        if inner.write_closed {
-            return;
-        }
-
-        inner.write_closed = true;
-        self.cv_read.notify_all();
-    }
-
-    async fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
-        if !event.contains(PollEvent::Readable) {
-            unimplemented!("Poll event not supported.");
-        }
-
-        let mut inner = self.inner.lock().await;
-        while inner.buffer.is_empty() && !inner.write_closed {
-            inner = self.cv_read.wait(inner).await;
-        }
-
-        if Thread::current().signal_list.has_pending_signal() {
-            return Err(EINTR);
-        }
-
-        let mut retval = PollEvent::empty();
-        if inner.write_closed {
-            retval |= PollEvent::Writable;
-        }
-
-        if !inner.buffer.is_empty() {
-            retval |= PollEvent::Readable;
-        }
-
-        Ok(retval)
-    }
-
-    async fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
-        let mut inner = self.inner.lock().await;
-
-        while !inner.write_closed && inner.buffer.is_empty() {
-            inner = self.cv_read.wait(inner).await;
-            if Thread::current().signal_list.has_pending_signal() {
-                return Err(EINTR);
-            }
-        }
-
-        let (data1, data2) = inner.buffer.as_slices();
-        let nread = buffer.fill(data1)?.allow_partial() + buffer.fill(data2)?.allow_partial();
-        inner.buffer.drain(..nread);
-
-        self.cv_write.notify_all();
-        Ok(nread)
-    }
-
-    async fn write_atomic(&self, data: &[u8]) -> KResult<usize> {
-        let mut inner = self.inner.lock().await;
-
-        if inner.read_closed {
-            send_sigpipe_to_current();
-            return Err(EPIPE);
-        }
-
-        while inner.buffer.len() + data.len() > Self::PIPE_SIZE {
-            inner = self.cv_write.wait(inner).await;
-            if Thread::current().signal_list.has_pending_signal() {
-                return Err(EINTR);
-            }
-
-            if inner.read_closed {
-                send_sigpipe_to_current();
-                return Err(EPIPE);
-            }
-        }
-
-        inner.buffer.extend(data);
-
-        self.cv_read.notify_all();
-        return Ok(data.len());
-    }
-
-    async fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
-        let mut buffer = [0; Self::PIPE_SIZE];
-        let mut total = 0;
-        while let Some(data) = stream.poll_data(&mut buffer)? {
-            let nwrote = self.write_atomic(data).await?;
-            total += nwrote;
-            if nwrote != data.len() {
-                break;
-            }
-        }
-        Ok(total)
-    }
-}
-
-#[derive(Copy, Clone, Debug)]
-#[repr(C, packed)]
-struct UserDirent64 {
-    /// Inode number
-    d_ino: u64,
-    /// Implementation defined. We ignore it
-    d_off: u64,
-    /// Length of this record
-    d_reclen: u16,
-    /// File type. Set to 0
-    d_type: u8,
-    /// Filename with a padding '\0'
-    d_name: [u8; 0],
-}
-
-/// File type is at offset `d_reclen - 1`. Set it to 0
-#[derive(Copy, Clone, Debug)]
-#[repr(C, packed)]
-struct UserDirent {
-    /// Inode number
-    d_ino: u32,
-    /// Implementation defined. We ignore it
-    d_off: u32,
-    /// Length of this record
-    d_reclen: u16,
-    /// Filename with a padding '\0'
-    d_name: [u8; 0],
-}
-
-impl InodeFile {
-    pub fn new(dentry: Arc<Dentry>, flags: OpenFlags) -> Arc<File> {
-        // SAFETY: `dentry` used to create `InodeFile` is valid.
-        // SAFETY: `mode` should never change with respect to the `S_IFMT` fields.
-        let cached_mode = dentry
-            .get_inode()
-            .expect("`dentry` is invalid")
-            .mode
-            .load(Ordering::Relaxed)
-            & S_IFMT;
-
-        let (read, write, append) = flags.as_rwa();
-
-        Arc::new(File {
-            flags: AtomicU32::new(flags.bits()),
-            file_type: FileType::Inode(InodeFile {
-                dentry,
-                read,
-                write,
-                append,
-                mode: cached_mode,
-                cursor: Mutex::new(0),
-            }),
-        })
-    }
-
-    fn seek(&self, option: SeekOption) -> KResult<usize> {
-        let mut cursor = Task::block_on(self.cursor.lock());
-
-        let new_cursor = match option {
-            SeekOption::Current(off) => cursor.checked_add_signed(off).ok_or(EOVERFLOW)?,
-            SeekOption::Set(n) => n,
-            SeekOption::End(off) => {
-                let inode = self.dentry.get_inode()?;
-                let size = inode.size.load(Ordering::Relaxed) as usize;
-                size.checked_add_signed(off).ok_or(EOVERFLOW)?
-            }
-        };
-
-        *cursor = new_cursor;
-        Ok(new_cursor)
-    }
-
-    fn write(&self, stream: &mut dyn Stream, offset: Option<usize>) -> KResult<usize> {
-        if !self.write {
-            return Err(EBADF);
-        }
-
-        let mut cursor = Task::block_on(self.cursor.lock());
-
-        if self.append {
-            let nwrote = self.dentry.write(stream, WriteOffset::End(&mut cursor))?;
-
-            Ok(nwrote)
-        } else {
-            let nwrote = if let Some(offset) = offset {
-                self.dentry.write(stream, WriteOffset::Position(offset))?
-            } else {
-                let nwrote = self.dentry.write(stream, WriteOffset::Position(*cursor))?;
-                *cursor += nwrote;
-                nwrote
-            };
-
-            Ok(nwrote)
-        }
-    }
-
-    fn read(&self, buffer: &mut dyn Buffer, offset: Option<usize>) -> KResult<usize> {
-        if !self.read {
-            return Err(EBADF);
-        }
-
-        let nread = if let Some(offset) = offset {
-            let nread = self.dentry.read(buffer, offset)?;
-            nread
-        } else {
-            let mut cursor = Task::block_on(self.cursor.lock());
-
-            let nread = self.dentry.read(buffer, *cursor)?;
-
-            *cursor += nread;
-            nread
-        };
-
-        Ok(nread)
-    }
-
-    fn getdents64(&self, buffer: &mut dyn Buffer) -> KResult<()> {
-        let mut cursor = Task::block_on(self.cursor.lock());
-
-        let nread = self.dentry.readdir(*cursor, |filename, ino| {
-            // Filename length + 1 for padding '\0'
-            let real_record_len = core::mem::size_of::<UserDirent64>() + filename.len() + 1;
-
-            if buffer.available() < real_record_len {
-                return Ok(ControlFlow::Break(()));
-            }
-
-            let record = UserDirent64 {
-                d_ino: ino,
-                d_off: 0,
-                d_reclen: real_record_len as u16,
-                d_type: 0,
-                d_name: [0; 0],
-            };
-
-            buffer.copy(&record)?.ok_or(EFAULT)?;
-            buffer.fill(filename)?.ok_or(EFAULT)?;
-            buffer.fill(&[0])?.ok_or(EFAULT)?;
-
-            Ok(ControlFlow::Continue(()))
-        })?;
-
-        *cursor += nread;
-        Ok(())
-    }
-
-    fn getdents(&self, buffer: &mut dyn Buffer) -> KResult<()> {
-        let mut cursor = Task::block_on(self.cursor.lock());
-
-        let nread = self.dentry.readdir(*cursor, |filename, ino| {
-            // + 1 for filename length padding '\0', + 1 for d_type.
-            let real_record_len = core::mem::size_of::<UserDirent>() + filename.len() + 2;
-
-            if buffer.available() < real_record_len {
-                return Ok(ControlFlow::Break(()));
-            }
-
-            let record = UserDirent {
-                d_ino: ino as u32,
-                d_off: 0,
-                d_reclen: real_record_len as u16,
-                d_name: [0; 0],
-            };
-
-            buffer.copy(&record)?.ok_or(EFAULT)?;
-            buffer.fill(filename)?.ok_or(EFAULT)?;
-            buffer.fill(&[0, 0])?.ok_or(EFAULT)?;
-
-            Ok(ControlFlow::Continue(()))
-        })?;
-
-        *cursor += nread;
-        Ok(())
-    }
-}
-
-impl TerminalFile {
-    pub fn new(tty: Arc<Terminal>, flags: OpenFlags) -> Arc<File> {
-        Arc::new(File {
-            flags: AtomicU32::new(flags.bits()),
-            file_type: FileType::TTY(TerminalFile { terminal: tty }),
-        })
-    }
-
-    async fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
-        self.terminal.read(buffer).await
-    }
-
-    fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
-        stream.read_till_end(&mut [0; 128], |data| {
-            self.terminal.write(data);
-            Ok(())
-        })
-    }
-
-    async fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
-        if !event.contains(PollEvent::Readable) {
-            unimplemented!("Poll event not supported.")
-        }
-
-        self.terminal.poll_in().await.map(|_| PollEvent::Readable)
-    }
-
-    fn ioctl(&self, request: usize, arg3: usize) -> KResult<()> {
-        Task::block_on(self.terminal.ioctl(match request as u32 {
-            TCGETS => TerminalIORequest::GetTermios(UserPointerMut::new_vaddr(arg3)?),
-            TCSETS => TerminalIORequest::SetTermios(UserPointer::new_vaddr(arg3)?),
-            TIOCGPGRP => TerminalIORequest::GetProcessGroup(UserPointerMut::new_vaddr(arg3)?),
-            TIOCSPGRP => TerminalIORequest::SetProcessGroup(UserPointer::new_vaddr(arg3)?),
-            TIOCGWINSZ => TerminalIORequest::GetWindowSize(UserPointerMut::new_vaddr(arg3)?),
-            _ => return Err(EINVAL),
-        }))
-    }
-}
-
-impl FileType {
-    pub async fn read(&self, buffer: &mut dyn Buffer, offset: Option<usize>) -> KResult<usize> {
-        match self {
-            FileType::Inode(inode) => inode.read(buffer, offset),
-            FileType::PipeRead(pipe) => pipe.pipe.read(buffer).await,
-            FileType::TTY(tty) => tty.read(buffer).await,
-            FileType::CharDev(device) => device.read(buffer),
-            _ => Err(EBADF),
-        }
-    }
-
-    // TODO
-    // /// Read from the file into the given buffers.
-    // ///
-    // /// Reads are atomic, not intermingled with other reads or writes.
-    // pub fn readv<'r, 'i, I: Iterator<Item = &'i mut dyn Buffer>>(
-    //     &'r self,
-    //     buffers: I,
-    // ) -> KResult<usize> {
-    //     match self {
-    //         File::Inode(inode) => inode.readv(buffers),
-    //         File::PipeRead(pipe) => pipe.pipe.readv(buffers),
-    //         _ => Err(EBADF),
-    //     }
-    // }
-
-    pub async fn write(&self, stream: &mut dyn Stream, offset: Option<usize>) -> KResult<usize> {
-        match self {
-            FileType::Inode(inode) => inode.write(stream, offset),
-            FileType::PipeWrite(pipe) => pipe.pipe.write(stream).await,
-            FileType::TTY(tty) => tty.write(stream),
-            FileType::CharDev(device) => device.write(stream),
-            _ => Err(EBADF),
-        }
-    }
-
-    pub fn seek(&self, option: SeekOption) -> KResult<usize> {
-        match self {
-            FileType::Inode(inode) => inode.seek(option),
-            _ => Err(ESPIPE),
-        }
-    }
-
-    pub fn getdents(&self, buffer: &mut dyn Buffer) -> KResult<()> {
-        match self {
-            FileType::Inode(inode) => inode.getdents(buffer),
-            _ => Err(ENOTDIR),
-        }
-    }
-
-    pub fn getdents64(&self, buffer: &mut dyn Buffer) -> KResult<()> {
-        match self {
-            FileType::Inode(inode) => inode.getdents64(buffer),
-            _ => Err(ENOTDIR),
-        }
-    }
-
-    pub async fn sendfile(&self, dest_file: &Self, count: usize) -> KResult<usize> {
-        let buffer_page = Page::alloc();
-        // SAFETY: We are the only owner of the page.
-        let buffer = unsafe { buffer_page.as_memblk().as_bytes_mut() };
-
-        match self {
-            FileType::Inode(file) if s_isblk(file.mode) || s_isreg(file.mode) => (),
-            _ => return Err(EINVAL),
-        }
-
-        let mut nsent = 0;
-        for (cur, len) in Chunks::new(0, count, buffer.len()) {
-            if Thread::current().signal_list.has_pending_signal() {
-                return if cur == 0 { Err(EINTR) } else { Ok(cur) };
-            }
-            let nread = self
-                .read(&mut ByteBuffer::new(&mut buffer[..len]), None)
-                .await?;
-            if nread == 0 {
-                break;
-            }
-
-            let nwrote = dest_file
-                .write(&mut buffer[..nread].into_stream(), None)
-                .await?;
-            nsent += nwrote;
-
-            if nwrote != len {
-                break;
-            }
-        }
-
-        Ok(nsent)
-    }
-
-    pub fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
-        match self {
-            FileType::TTY(tty) => tty.ioctl(request, arg3).map(|_| 0),
-            _ => Err(ENOTTY),
-        }
-    }
-
-    pub async fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
-        match self {
-            FileType::Inode(_) => Ok(event),
-            FileType::TTY(tty) => tty.poll(event).await,
-            FileType::PipeRead(PipeReadEnd { pipe })
-            | FileType::PipeWrite(PipeWriteEnd { pipe }) => pipe.poll(event).await,
-            _ => unimplemented!("Poll event not supported."),
-        }
-    }
-
-    pub fn statx(&self, buffer: &mut StatX, mask: u32) -> KResult<()> {
-        match self {
-            FileType::Inode(inode) => inode.dentry.statx(buffer, mask),
-            _ => Err(EBADF),
-        }
-    }
-
-    pub fn as_path(&self) -> Option<&Arc<Dentry>> {
-        match self {
-            FileType::Inode(inode_file) => Some(&inode_file.dentry),
-            _ => None,
-        }
-    }
-}
-
-impl File {
-    pub fn new(flags: OpenFlags, file_type: FileType) -> Arc<Self> {
-        Arc::new(Self {
-            flags: AtomicU32::new(flags.bits()),
-            file_type,
-        })
-    }
-
-    pub fn get_flags(&self) -> OpenFlags {
-        OpenFlags::from_bits_retain(self.flags.load(Ordering::Relaxed))
-    }
-
-    pub fn set_flags(&self, flags: OpenFlags) {
-        let flags = flags.difference(
-            OpenFlags::O_WRONLY
-                | OpenFlags::O_RDWR
-                | OpenFlags::O_CREAT
-                | OpenFlags::O_TRUNC
-                | OpenFlags::O_EXCL,
-            // | OpenFlags::O_NOCTTY,
-        );
-
-        self.flags.store(flags.bits(), Ordering::Relaxed);
-    }
-}
-
-impl Deref for File {
-    type Target = FileType;
-
-    fn deref(&self) -> &Self::Target {
-        &self.file_type
-    }
-}

+ 223 - 0
src/kernel/vfs/file/inode_file.rs

@@ -0,0 +1,223 @@
+use super::{File, FileType, SeekOption};
+use crate::{
+    io::{Buffer, BufferFill, Stream},
+    kernel::{
+        constants::{EBADF, EFAULT, ENOTDIR, EOVERFLOW, ESPIPE},
+        vfs::{
+            dentry::Dentry,
+            inode::{Inode, Mode, WriteOffset},
+        },
+    },
+    prelude::KResult,
+};
+use alloc::sync::Arc;
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use eonix_sync::Mutex;
+use posix_types::{
+    getdent::{UserDirent, UserDirent64},
+    open::OpenFlags,
+    stat::StatX,
+};
+
+pub struct InodeFile {
+    pub r: bool,
+    pub w: bool,
+    pub a: bool,
+    /// Only the few mode bits that cannot possibly change are cached here to speed up
+    /// file operations. Specifically, the `S_IFMT`-masked bits.

+    pub mode: Mode,
+    cursor: Mutex<usize>,
+    dentry: Arc<Dentry>,
+}
+
+impl InodeFile {
+    pub fn new(dentry: Arc<Dentry>, flags: OpenFlags) -> File {
+        // SAFETY: `dentry` used to create `InodeFile` is valid.
+        // SAFETY: `mode` should never change with respect to the `S_IFMT` fields.
+        let cached_mode = dentry
+            .get_inode()
+            .expect("`dentry` is invalid")
+            .mode
+            .load()
+            .format();
+
+        let (r, w, a) = flags.as_rwa();
+
+        File::new(
+            flags,
+            FileType::Inode(InodeFile {
+                dentry,
+                r,
+                w,
+                a,
+                mode: cached_mode,
+                cursor: Mutex::new(0),
+            }),
+        )
+    }
+
+    pub fn sendfile_check(&self) -> KResult<()> {
+        match self.mode {
+            Mode::REG | Mode::BLK => Ok(()),
+            _ => Err(EBADF),
+        }
+    }
+
+    pub async fn write(&self, stream: &mut dyn Stream, offset: Option<usize>) -> KResult<usize> {
+        if !self.w {
+            return Err(EBADF);
+        }
+
+        let mut cursor = self.cursor.lock().await;
+
+        if self.a {
+            let nwrote = self.dentry.write(stream, WriteOffset::End(&mut cursor))?;
+
+            Ok(nwrote)
+        } else {
+            let nwrote = if let Some(offset) = offset {
+                self.dentry.write(stream, WriteOffset::Position(offset))?
+            } else {
+                let nwrote = self.dentry.write(stream, WriteOffset::Position(*cursor))?;
+                *cursor += nwrote;
+                nwrote
+            };
+
+            Ok(nwrote)
+        }
+    }
+
+    pub async fn read(&self, buffer: &mut dyn Buffer, offset: Option<usize>) -> KResult<usize> {
+        if !self.r {
+            return Err(EBADF);
+        }
+
+        let nread = if let Some(offset) = offset {
+            let nread = self.dentry.read(buffer, offset)?;
+            nread
+        } else {
+            let mut cursor = self.cursor.lock().await;
+
+            let nread = self.dentry.read(buffer, *cursor)?;
+
+            *cursor += nread;
+            nread
+        };
+
+        Ok(nread)
+    }
+}
+
+impl File {
+    pub fn get_inode(&self) -> KResult<Option<Arc<dyn Inode>>> {
+        if let FileType::Inode(inode_file) = &**self {
+            Ok(Some(inode_file.dentry.get_inode()?))
+        } else {
+            Ok(None)
+        }
+    }
+
+    pub async fn getdents(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        let FileType::Inode(inode_file) = &**self else {
+            return Err(ENOTDIR);
+        };
+
+        let mut cursor = inode_file.cursor.lock().await;
+
+        let nread = inode_file.dentry.readdir(*cursor, |filename, ino| {
+            // + 1 for filename length padding '\0', + 1 for d_type.
+            let real_record_len = core::mem::size_of::<UserDirent>() + filename.len() + 2;
+
+            if buffer.available() < real_record_len {
+                return Ok(ControlFlow::Break(()));
+            }
+
+            let record = UserDirent {
+                d_ino: ino as u32,
+                d_off: 0,
+                d_reclen: real_record_len as u16,
+                d_name: [0; 0],
+            };
+
+            buffer.copy(&record)?.ok_or(EFAULT)?;
+            buffer.fill(filename)?.ok_or(EFAULT)?;
+            buffer.fill(&[0, 0])?.ok_or(EFAULT)?;
+
+            Ok(ControlFlow::Continue(()))
+        })?;
+
+        *cursor += nread;
+        Ok(())
+    }
+
+    pub async fn getdents64(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        let FileType::Inode(inode_file) = &**self else {
+            return Err(ENOTDIR);
+        };
+
+        let mut cursor = inode_file.cursor.lock().await;
+
+        let nread = inode_file.dentry.readdir(*cursor, |filename, ino| {
+            // Filename length + 1 for padding '\0'
+            let real_record_len = core::mem::size_of::<UserDirent64>() + filename.len() + 1;
+
+            if buffer.available() < real_record_len {
+                return Ok(ControlFlow::Break(()));
+            }
+
+            let record = UserDirent64 {
+                d_ino: ino,
+                d_off: 0,
+                d_reclen: real_record_len as u16,
+                d_type: 0,
+                d_name: [0; 0],
+            };
+
+            buffer.copy(&record)?.ok_or(EFAULT)?;
+            buffer.fill(filename)?.ok_or(EFAULT)?;
+            buffer.fill(&[0])?.ok_or(EFAULT)?;
+
+            Ok(ControlFlow::Continue(()))
+        })?;
+
+        *cursor += nread;
+        Ok(())
+    }
+
+    pub async fn seek(&self, option: SeekOption) -> KResult<usize> {
+        let FileType::Inode(inode_file) = &**self else {
+            return Err(ESPIPE);
+        };
+
+        let mut cursor = inode_file.cursor.lock().await;
+
+        let new_cursor = match option {
+            SeekOption::Current(off) => cursor.checked_add_signed(off).ok_or(EOVERFLOW)?,
+            SeekOption::Set(n) => n,
+            SeekOption::End(off) => {
+                let inode = inode_file.dentry.get_inode()?;
+                let size = inode.size.load(Ordering::Relaxed) as usize;
+                size.checked_add_signed(off).ok_or(EOVERFLOW)?
+            }
+        };
+
+        *cursor = new_cursor;
+        Ok(new_cursor)
+    }
+
+    pub fn statx(&self, buffer: &mut StatX, mask: u32) -> KResult<()> {
+        if let FileType::Inode(inode) = &**self {
+            inode.dentry.statx(buffer, mask)
+        } else {
+            Err(EBADF)
+        }
+    }
+
+    pub fn as_path(&self) -> Option<&Arc<Dentry>> {
+        if let FileType::Inode(inode_file) = &**self {
+            Some(&inode_file.dentry)
+        } else {
+            None
+        }
+    }
+}
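
A worked example of the record-length arithmetic in getdents/getdents64 above, assuming the packed dirent layouts moved into posix_types::getdent match the structs removed from file.rs:

    For the filename "hello" (5 bytes):
        UserDirent   header = 4 (d_ino) + 4 (d_off) + 2 (d_reclen)              = 10 bytes
        UserDirent64 header = 8 (d_ino) + 8 (d_off) + 2 (d_reclen) + 1 (d_type) = 19 bytes

        getdents:   real_record_len = 10 + 5 + 2 = 17   (+1 for the '\0', +1 for the trailing d_type byte)
        getdents64: real_record_len = 19 + 5 + 1 = 25   (+1 for the '\0' terminator)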

+ 232 - 0
src/kernel/vfs/file/mod.rs

@@ -0,0 +1,232 @@
+mod inode_file;
+mod pipe;
+mod terminal_file;
+
+use crate::{
+    io::{Buffer, ByteBuffer, Chunks, IntoStream, Stream},
+    kernel::{
+        constants::{EBADF, EINTR, EINVAL, ENOTTY},
+        mem::{AsMemoryBlock, Page},
+        task::Thread,
+        CharDevice,
+    },
+    prelude::KResult,
+};
+use alloc::sync::Arc;
+use bitflags::bitflags;
+use core::{
+    ops::Deref,
+    sync::atomic::{AtomicI32, AtomicU32, Ordering},
+};
+use pipe::{PipeReadEnd, PipeWriteEnd};
+use posix_types::open::OpenFlags;
+
+pub use inode_file::InodeFile;
+pub use pipe::Pipe;
+pub use terminal_file::TerminalFile;
+
+pub enum FileType {
+    Inode(InodeFile),
+    PipeRead(PipeReadEnd),
+    PipeWrite(PipeWriteEnd),
+    Terminal(TerminalFile),
+    CharDev(Arc<CharDevice>),
+}
+
+struct FileData {
+    flags: AtomicU32,
+    open_count: AtomicI32,
+    file_type: FileType,
+}
+
+#[derive(Clone)]
+pub struct File(Arc<FileData>);
+
+pub enum SeekOption {
+    Set(usize),
+    Current(isize),
+    End(isize),
+}
+
+bitflags! {
+    pub struct PollEvent: u16 {
+        const Readable = 0x0001;
+        const Writable = 0x0002;
+    }
+}
+
+impl FileType {
+    pub async fn read(&self, buffer: &mut dyn Buffer, offset: Option<usize>) -> KResult<usize> {
+        match self {
+            FileType::Inode(inode) => inode.read(buffer, offset).await,
+            FileType::PipeRead(pipe) => pipe.read(buffer).await,
+            FileType::Terminal(tty) => tty.read(buffer).await,
+            FileType::CharDev(device) => device.read(buffer),
+            _ => Err(EBADF),
+        }
+    }
+
+    // TODO
+    // /// Read from the file into the given buffers.
+    // ///
+    // /// Reads are atomic, not intermingled with other reads or writes.
+    // pub fn readv<'r, 'i, I: Iterator<Item = &'i mut dyn Buffer>>(
+    //     &'r self,
+    //     buffers: I,
+    // ) -> KResult<usize> {
+    //     match self {
+    //         File::Inode(inode) => inode.readv(buffers),
+    //         File::PipeRead(pipe) => pipe.pipe.readv(buffers),
+    //         _ => Err(EBADF),
+    //     }
+    // }
+
+    pub async fn write(&self, stream: &mut dyn Stream, offset: Option<usize>) -> KResult<usize> {
+        match self {
+            FileType::Inode(inode) => inode.write(stream, offset).await,
+            FileType::PipeWrite(pipe) => pipe.write(stream).await,
+            FileType::Terminal(tty) => tty.write(stream),
+            FileType::CharDev(device) => device.write(stream),
+            _ => Err(EBADF),
+        }
+    }
+
+    fn sendfile_check(&self) -> KResult<()> {
+        match self {
+            FileType::Inode(file) => file.sendfile_check(),
+            _ => Err(EINVAL),
+        }
+    }
+
+    pub async fn sendfile(&self, dest_file: &Self, count: usize) -> KResult<usize> {
+        let buffer_page = Page::alloc();
+        // SAFETY: We are the only owner of the page.
+        let buffer = unsafe { buffer_page.as_memblk().as_bytes_mut() };
+
+        self.sendfile_check()?;
+
+        let mut nsent = 0;
+        for (cur, len) in Chunks::new(0, count, buffer.len()) {
+            if Thread::current().signal_list.has_pending_signal() {
+                return if cur == 0 { Err(EINTR) } else { Ok(cur) };
+            }
+            let nread = self
+                .read(&mut ByteBuffer::new(&mut buffer[..len]), None)
+                .await?;
+            if nread == 0 {
+                break;
+            }
+
+            let nwrote = dest_file
+                .write(&mut buffer[..nread].into_stream(), None)
+                .await?;
+            nsent += nwrote;
+
+            if nwrote != len {
+                break;
+            }
+        }
+
+        Ok(nsent)
+    }
+
+    pub async fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
+        match self {
+            FileType::Terminal(tty) => tty.ioctl(request, arg3).await.map(|_| 0),
+            _ => Err(ENOTTY),
+        }
+    }
+
+    pub async fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
+        match self {
+            FileType::Inode(_) => Ok(event),
+            FileType::Terminal(tty) => tty.poll(event).await,
+            FileType::PipeRead(pipe) => pipe.poll(event).await,
+            FileType::PipeWrite(pipe) => pipe.poll(event).await,
+            _ => unimplemented!("Poll event not supported."),
+        }
+    }
+}
+
+impl File {
+    pub fn new(flags: OpenFlags, file_type: FileType) -> Self {
+        Self(Arc::new(FileData {
+            flags: AtomicU32::new(flags.bits()),
+            open_count: AtomicI32::new(1),
+            file_type,
+        }))
+    }
+
+    pub fn get_flags(&self) -> OpenFlags {
+        OpenFlags::from_bits_retain(self.0.flags.load(Ordering::Relaxed))
+    }
+
+    pub fn set_flags(&self, flags: OpenFlags) {
+        let flags = flags.difference(
+            OpenFlags::O_WRONLY
+                | OpenFlags::O_RDWR
+                | OpenFlags::O_CREAT
+                | OpenFlags::O_TRUNC
+                | OpenFlags::O_EXCL,
+            // | OpenFlags::O_NOCTTY,
+        );
+
+        self.0.flags.store(flags.bits(), Ordering::Relaxed);
+    }
+
+    /// Duplicate the file descriptor in order to store it in some [FileArray].
+    ///
+    /// The [`File`]s stored in [FileArray]s hold an "open count", which is used
+    /// to track how many references to the file are currently open.
+    ///
+    /// # Panics
+    /// Every [`File`] stored in a [FileArray] MUST be obtained by calling this
+    /// method. Otherwise, when the last reference to the file is dropped, the
+    /// open-count assertion in `FileData`'s `Drop` impl will fire. ;)
+    ///
+    /// [FileArray]: crate::kernel::vfs::filearray::FileArray
+    pub fn dup(&self) -> Self {
+        self.0.open_count.fetch_add(1, Ordering::Relaxed);
+        Self(self.0.clone())
+    }
+
+    /// Close the file descriptor, decrementing the open count.
+    pub async fn close(self) {
+        // Since Rust has no async drop, we have to do this manually...
+        //
+        // Users of files can clone and drop them freely, but references held
+        // by file arrays must be released by calling this function (in order
+        // to await the async close operation of the inner FileType).
+        match self.0.open_count.fetch_sub(1, Ordering::Relaxed) {
+            ..1 => panic!("File open count underflow."),
+            1 => {}
+            _ => return,
+        }
+
+        match &self.0.file_type {
+            FileType::PipeRead(pipe) => pipe.close().await,
+            FileType::PipeWrite(pipe) => pipe.close().await,
+            _ => {}
+        }
+    }
+}
+
+impl Drop for FileData {
+    fn drop(&mut self) {
+        // If you're "lucky" enough to see this, it means that you've violated
+        // the file reference counting rules. Check File::close() for details. ;)
+        assert_eq!(
+            self.open_count.load(Ordering::Relaxed),
+            0,
+            "File dropped with open count 0, check the comments for details."
+        );
+    }
+}
+
+impl Deref for File {
+    type Target = FileType;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0.file_type
+    }
+}
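
A minimal sketch of the open-count discipline that File::dup()/File::close() establish above (the helper names are hypothetical; only dup/close come from this file):

    fn install_into_array(file: &File) -> File {
        // Every copy placed into a FileArray must come from dup(), which
        // bumps the shared open count.
        file.dup()
    }

    async fn remove_from_array(file: File) {
        // ...and must be released with close(). The last close() awaits the
        // async teardown of the inner FileType (e.g. closing a pipe end);
        // dropping the final reference without close() trips the assertion
        // in FileData::drop.
        file.close().await;
    }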

+ 211 - 0
src/kernel/vfs/file/pipe.rs

@@ -0,0 +1,211 @@
+use super::{File, FileType, PollEvent};
+use crate::{
+    io::{Buffer, Stream},
+    kernel::{
+        constants::{EINTR, EPIPE},
+        task::Thread,
+    },
+    prelude::KResult,
+    sync::CondVar,
+};
+use alloc::{collections::vec_deque::VecDeque, sync::Arc};
+use eonix_sync::Mutex;
+use posix_types::{open::OpenFlags, signal::Signal};
+
+struct PipeInner {
+    buffer: VecDeque<u8>,
+    read_closed: bool,
+    write_closed: bool,
+}
+
+pub struct Pipe {
+    inner: Mutex<PipeInner>,
+    cv_read: CondVar,
+    cv_write: CondVar,
+}
+
+pub struct PipeReadEnd {
+    pipe: Arc<Pipe>,
+}
+
+pub struct PipeWriteEnd {
+    pipe: Arc<Pipe>,
+}
+
+fn send_sigpipe_to_current() {
+    let current = Thread::current();
+    current.raise(Signal::SIGPIPE);
+}
+
+impl Pipe {
+    const PIPE_SIZE: usize = 4096;
+
+    /// # Return
+    /// `(read_end, write_end)`
+    pub fn new(flags: OpenFlags) -> (File, File) {
+        let pipe = Arc::new(Self {
+            inner: Mutex::new(PipeInner {
+                buffer: VecDeque::with_capacity(Self::PIPE_SIZE),
+                read_closed: false,
+                write_closed: false,
+            }),
+            cv_read: CondVar::new(),
+            cv_write: CondVar::new(),
+        });
+
+        let read_flags = flags.difference(OpenFlags::O_WRONLY | OpenFlags::O_RDWR);
+        let mut write_flags = read_flags;
+        write_flags.insert(OpenFlags::O_WRONLY);
+
+        let read_pipe = pipe.clone();
+        let write_pipe = pipe;
+
+        (
+            File::new(
+                read_flags,
+                FileType::PipeRead(PipeReadEnd { pipe: read_pipe }),
+            ),
+            File::new(
+                write_flags,
+                FileType::PipeWrite(PipeWriteEnd { pipe: write_pipe }),
+            ),
+        )
+    }
+
+    pub async fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
+        if !event.contains(PollEvent::Readable) {
+            unimplemented!("Poll event not supported.");
+        }
+
+        let mut inner = self.inner.lock().await;
+        while inner.buffer.is_empty() && !inner.write_closed {
+            inner = self.cv_read.wait(inner).await;
+        }
+
+        if Thread::current().signal_list.has_pending_signal() {
+            return Err(EINTR);
+        }
+
+        let mut retval = PollEvent::empty();
+        if inner.write_closed {
+            retval |= PollEvent::Writable;
+        }
+
+        if !inner.buffer.is_empty() {
+            retval |= PollEvent::Readable;
+        }
+
+        Ok(retval)
+    }
+
+    pub async fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        let mut inner = self.inner.lock().await;
+
+        while !inner.write_closed && inner.buffer.is_empty() {
+            inner = self.cv_read.wait(inner).await;
+            if Thread::current().signal_list.has_pending_signal() {
+                return Err(EINTR);
+            }
+        }
+
+        let (data1, data2) = inner.buffer.as_slices();
+        let nread = buffer.fill(data1)?.allow_partial() + buffer.fill(data2)?.allow_partial();
+        inner.buffer.drain(..nread);
+
+        self.cv_write.notify_all();
+        Ok(nread)
+    }
+
+    async fn write_atomic(&self, data: &[u8]) -> KResult<usize> {
+        let mut inner = self.inner.lock().await;
+
+        if inner.read_closed {
+            send_sigpipe_to_current();
+            return Err(EPIPE);
+        }
+
+        while inner.buffer.len() + data.len() > Self::PIPE_SIZE {
+            inner = self.cv_write.wait(inner).await;
+            if Thread::current().signal_list.has_pending_signal() {
+                return Err(EINTR);
+            }
+
+            if inner.read_closed {
+                send_sigpipe_to_current();
+                return Err(EPIPE);
+            }
+        }
+
+        inner.buffer.extend(data);
+
+        self.cv_read.notify_all();
+        return Ok(data.len());
+    }
+
+    pub async fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
+        let mut buffer = [0; Self::PIPE_SIZE];
+        let mut total = 0;
+        while let Some(data) = stream.poll_data(&mut buffer)? {
+            let nwrote = self.write_atomic(data).await?;
+            total += nwrote;
+            if nwrote != data.len() {
+                break;
+            }
+        }
+        Ok(total)
+    }
+}
+
+impl PipeReadEnd {
+    pub async fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        self.pipe.read(buffer).await
+    }
+
+    pub async fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
+        self.pipe.poll(event).await
+    }
+
+    pub async fn close(&self) {
+        let mut inner = self.pipe.inner.lock().await;
+        if inner.read_closed {
+            return;
+        }
+
+        inner.read_closed = true;
+        self.pipe.cv_write.notify_all();
+    }
+}
+
+impl PipeWriteEnd {
+    pub async fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
+        self.pipe.write(stream).await
+    }
+
+    pub async fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
+        self.pipe.poll(event).await
+    }
+
+    pub async fn close(&self) {
+        let mut inner = self.pipe.inner.lock().await;
+        if inner.write_closed {
+            return;
+        }
+
+        inner.write_closed = true;
+        self.pipe.cv_read.notify_all();
+    }
+}
+
+impl Drop for Pipe {
+    fn drop(&mut self) {
+        debug_assert!(
+            self.inner.get_mut().read_closed,
+            "Pipe read end should be closed before dropping (check File::close())."
+        );
+
+        debug_assert!(
+            self.inner.get_mut().write_closed,
+            "Pipe write end should be closed before dropping (check File::close())."
+        );
+    }
+}
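
A minimal usage sketch of the new pipe ownership rules (the payload and flags are illustrative; ByteBuffer and into_stream are the same helpers sendfile uses in mod.rs):

    async fn pipe_roundtrip() -> KResult<()> {
        let (read_end, write_end) = Pipe::new(OpenFlags::empty());

        // Writes of up to PIPE_SIZE bytes are inserted atomically with
        // respect to concurrent readers.
        let mut data = *b"ping";
        write_end.write(&mut data[..].into_stream(), None).await?;

        let mut buf = [0u8; 16];
        let n = read_end.read(&mut ByteBuffer::new(&mut buf), None).await?;
        assert_eq!(&buf[..n], b"ping");

        // Both ends must go through close() so the debug assertions in
        // Pipe's Drop impl hold.
        write_end.close().await;
        read_end.close().await;
        Ok(())
    }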

+ 55 - 0
src/kernel/vfs/file/terminal_file.rs

@@ -0,0 +1,55 @@
+use super::{File, FileType, PollEvent};
+use crate::{
+    io::{Buffer, Stream, StreamRead},
+    kernel::{
+        constants::{EINVAL, TCGETS, TCSETS, TIOCGPGRP, TIOCGWINSZ, TIOCSPGRP},
+        terminal::TerminalIORequest,
+        user::{UserPointer, UserPointerMut},
+        Terminal,
+    },
+    prelude::KResult,
+};
+use alloc::sync::Arc;
+use posix_types::open::OpenFlags;
+
+pub struct TerminalFile {
+    terminal: Arc<Terminal>,
+}
+
+impl TerminalFile {
+    pub fn new(tty: Arc<Terminal>, flags: OpenFlags) -> File {
+        File::new(flags, FileType::Terminal(TerminalFile { terminal: tty }))
+    }
+
+    pub async fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        self.terminal.read(buffer).await
+    }
+
+    pub fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
+        stream.read_till_end(&mut [0; 128], |data| {
+            self.terminal.write(data);
+            Ok(())
+        })
+    }
+
+    pub async fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
+        if !event.contains(PollEvent::Readable) {
+            unimplemented!("Poll event not supported.")
+        }
+
+        self.terminal.poll_in().await.map(|_| PollEvent::Readable)
+    }
+
+    pub async fn ioctl(&self, request: usize, arg3: usize) -> KResult<()> {
+        self.terminal
+            .ioctl(match request as u32 {
+                TCGETS => TerminalIORequest::GetTermios(UserPointerMut::with_addr(arg3)?),
+                TCSETS => TerminalIORequest::SetTermios(UserPointer::with_addr(arg3)?),
+                TIOCGPGRP => TerminalIORequest::GetProcessGroup(UserPointerMut::with_addr(arg3)?),
+                TIOCSPGRP => TerminalIORequest::SetProcessGroup(UserPointer::with_addr(arg3)?),
+                TIOCGWINSZ => TerminalIORequest::GetWindowSize(UserPointerMut::with_addr(arg3)?),
+                _ => return Err(EINVAL),
+            })
+            .await
+    }
+}

+ 268 - 130
src/kernel/vfs/filearray.rs

@@ -1,7 +1,7 @@
 use super::{
-    file::{File, InodeFile, TerminalFile},
+    file::{File, InodeFile, Pipe},
     inode::Mode,
-    s_ischr, Spin,
+    Spin, TerminalFile,
 };
 use crate::kernel::{
     constants::{
@@ -10,38 +10,51 @@ use crate::kernel::{
     syscall::{FromSyscallArg, SyscallRetVal},
 };
 use crate::{
-    kernel::{
-        console::get_console,
-        constants::ENXIO,
-        vfs::{dentry::Dentry, file::Pipe, s_isdir, s_isreg},
-        CharDevice,
-    },
+    kernel::{console::get_console, constants::ENXIO, vfs::dentry::Dentry, CharDevice},
     prelude::*,
 };
-use alloc::{
-    collections::btree_map::{BTreeMap, Entry},
-    sync::Arc,
+use alloc::sync::Arc;
+use intrusive_collections::{
+    intrusive_adapter, rbtree::Entry, Bound, KeyAdapter, RBTree, RBTreeAtomicLink,
 };
-use core::sync::atomic::Ordering;
 use itertools::{
     FoldWhile::{Continue, Done},
     Itertools,
 };
 use posix_types::open::{FDFlags, OpenFlags};
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub struct FD(u32);
 
 #[derive(Clone)]
 struct OpenFile {
+    fd: FD,
     flags: FDFlags,
-    file: Arc<File>,
+    file: File,
+
+    link: RBTreeAtomicLink,
+}
+
+intrusive_adapter!(
+    OpenFileAdapter = Box<OpenFile>: OpenFile { link: RBTreeAtomicLink }
+);
+
+impl<'a> KeyAdapter<'a> for OpenFileAdapter {
+    type Key = FD;
+
+    fn get_key(&self, value: &'a OpenFile) -> Self::Key {
+        value.fd
+    }
 }
 
 #[derive(Clone)]
+struct FDAllocator {
+    min_avail: FD,
+}
+
 struct FileArrayInner {
-    files: BTreeMap<FD, OpenFile>,
-    fd_min_avail: FD,
+    files: RBTree<OpenFileAdapter>,
+    fd_alloc: FDAllocator,
 }
 
 pub struct FileArray {
@@ -49,109 +62,202 @@ pub struct FileArray {
 }
 
 impl OpenFile {
+    fn new(fd: FD, flags: FDFlags, file: File) -> Box<Self> {
+        Box::new(Self {
+            fd,
+            flags,
+            file,
+            link: RBTreeAtomicLink::new(),
+        })
+    }
+
     pub fn close_on_exec(&self) -> bool {
         self.flags.contains(FDFlags::FD_CLOEXEC)
     }
 }
 
+impl FDAllocator {
+    const fn new() -> Self {
+        Self { min_avail: FD(0) }
+    }
+
+    fn reinit(&mut self) {
+        self.min_avail = FD(0);
+    }
+
+    fn find_available(&mut self, from: FD, files: &RBTree<OpenFileAdapter>) -> FD {
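+        // Walk the occupied fds in ascending order starting at `from`; while
+        // they are contiguous with the candidate, keep advancing it. The first
+        // gap (or the end of the tree) is the lowest free fd >= `from`.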
+        files
+            .range(Bound::Included(&from), Bound::Unbounded)
+            .fold_while(from, |current, OpenFile { fd, .. }| {
+                if current == *fd {
+                    Continue(FD(current.0 + 1))
+                } else {
+                    Done(current)
+                }
+            })
+            .into_inner()
+    }
+
+    /// Allocate a new file descriptor starting from `from`.
+    ///
+    /// The returned file descriptor should be used (inserted) immediately: when the
+    /// allocation starts at the cached `min_avail`, the allocator advances `min_avail`
+    /// on the assumption that the returned fd is now occupied.
+    fn allocate_fd(&mut self, from: FD, files: &RBTree<OpenFileAdapter>) -> FD {
+        let from = FD::max(from, self.min_avail);
+
+        if from == self.min_avail {
+            let next_min_avail = self.find_available(FD(from.0 + 1), files);
+            let allocated = self.min_avail;
+            self.min_avail = next_min_avail;
+            allocated
+        } else {
+            self.find_available(from, files)
+        }
+    }
+
+    fn release_fd(&mut self, fd: FD) {
+        if fd < self.min_avail {
+            self.min_avail = fd;
+        }
+    }
+
+    fn next_fd(&mut self, files: &RBTree<OpenFileAdapter>) -> FD {
+        self.allocate_fd(self.min_avail, files)
+    }
+}
+
 impl FileArray {
     pub fn new() -> Arc<Self> {
         Arc::new(FileArray {
             inner: Spin::new(FileArrayInner {
-                files: BTreeMap::new(),
-                fd_min_avail: FD(0),
+                files: RBTree::new(OpenFileAdapter::new()),
+                fd_alloc: FDAllocator::new(),
             }),
         })
     }
 
-    #[allow(dead_code)]
     pub fn new_shared(other: &Arc<Self>) -> Arc<Self> {
         other.clone()
     }
 
     pub fn new_cloned(other: &Self) -> Arc<Self> {
         Arc::new(Self {
-            inner: Spin::new(other.inner.lock().clone()),
+            inner: Spin::new({
+                let (new_files, new_fd_alloc) = {
+                    let mut new_files = RBTree::new(OpenFileAdapter::new());
+                    let other_inner = other.inner.lock();
+
+                    for file in other_inner.files.iter() {
+                        let new_file = OpenFile::new(file.fd, file.flags, file.file.dup());
+                        new_files.insert(new_file);
+                    }
+                    (new_files, other_inner.fd_alloc.clone())
+                };
+
+                FileArrayInner {
+                    files: new_files,
+                    fd_alloc: new_fd_alloc,
+                }
+            }),
         })
     }
 
     /// Acquires the file array lock.
-    pub fn get(&self, fd: FD) -> Option<Arc<File>> {
+    pub fn get(&self, fd: FD) -> Option<File> {
         self.inner.lock().get(fd)
     }
 
-    pub fn close_all(&self) {
-        let _old_files = {
+    pub async fn close_all(&self) {
+        let old_files = {
             let mut inner = self.inner.lock();
-            inner.fd_min_avail = FD(0);
-            core::mem::take(&mut inner.files)
+            inner.fd_alloc.reinit();
+            inner.files.take()
         };
+
+        for file in old_files.into_iter() {
+            file.file.close().await;
+        }
     }
 
-    pub fn close(&self, fd: FD) -> KResult<()> {
-        let _file = {
+    pub async fn close(&self, fd: FD) -> KResult<()> {
+        let file = {
             let mut inner = self.inner.lock();
-            let file = inner.files.remove(&fd).ok_or(EBADF)?;
-            inner.release_fd(fd);
-            file
+            let file = inner.files.find_mut(&fd).remove().ok_or(EBADF)?;
+            inner.fd_alloc.release_fd(file.fd);
+            file.file
         };
+
+        file.close().await;
         Ok(())
     }
 
-    pub fn on_exec(&self) -> () {
-        let mut inner = self.inner.lock();
+    pub async fn on_exec(&self) {
+        let files_to_close = {
+            let mut inner = self.inner.lock();
+            let (files, fd_alloc) = inner.split_borrow();
 
-        // TODO: This is not efficient. We should avoid cloning.
-        let fds_to_close = inner
-            .files
-            .iter()
-            .filter(|(_, ofile)| ofile.close_on_exec())
-            .map(|(&fd, _)| fd)
-            .collect::<Vec<_>>();
+            files.pick(|ofile| {
+                if ofile.close_on_exec() {
+                    fd_alloc.release_fd(ofile.fd);
+                    true
+                } else {
+                    false
+                }
+            })
+        };
 
-        inner.files.retain(|_, ofile| !ofile.close_on_exec());
-        fds_to_close.into_iter().for_each(|fd| inner.release_fd(fd));
+        for open_file in files_to_close.into_iter() {
+            open_file.file.close().await;
+        }
     }
-}
 
-impl FileArray {
     pub fn dup(&self, old_fd: FD) -> KResult<FD> {
         let mut inner = self.inner.lock();
-        let old_file = inner.files.get(&old_fd).ok_or(EBADF)?;
+        let (files, fd_alloc) = inner.split_borrow();
+
+        let old_file = files.get_fd(old_fd).ok_or(EBADF)?;
 
-        let new_file_data = old_file.file.clone();
+        let new_file_data = old_file.file.dup();
         let new_file_flags = old_file.flags;
-        let new_fd = inner.next_fd();
+        let new_fd = fd_alloc.next_fd(files);
 
         inner.do_insert(new_fd, new_file_flags, new_file_data);
 
         Ok(new_fd)
     }
 
-    pub fn dup_to(&self, old_fd: FD, new_fd: FD, flags: OpenFlags) -> KResult<FD> {
-        let fdflags = flags.as_fd_flags();
-
+    /// Duplicates the file onto `new_fd`, returning the displaced open file
+    /// description, if any, so the caller can close it after releasing the lock.
+    fn dup_to_no_close(&self, old_fd: FD, new_fd: FD, fd_flags: FDFlags) -> KResult<Option<File>> {
         let mut inner = self.inner.lock();
-        let old_file = inner.files.get(&old_fd).ok_or(EBADF)?;
+        let (files, fd_alloc) = inner.split_borrow();
+
+        let old_file = files.get_fd(old_fd).ok_or(EBADF)?;
+        let new_file_data = old_file.file.dup();
 
-        let new_file_data = old_file.file.clone();
+        match files.entry(&new_fd) {
+            Entry::Vacant(_) => {
+                assert_eq!(new_fd, fd_alloc.allocate_fd(new_fd, files));
+                inner.do_insert(new_fd, fd_flags, new_file_data);
 
-        match inner.files.entry(new_fd) {
-            Entry::Vacant(_) => {}
-            Entry::Occupied(entry) => {
-                let new_file = entry.into_mut();
-                let mut file_swap = new_file_data;
+                Ok(None)
+            }
+            Entry::Occupied(mut entry) => {
+                let mut file = entry.remove().unwrap();
+                file.flags = fd_flags;
+                let old_file = core::mem::replace(&mut file.file, new_file_data);
 
-                new_file.flags = fdflags;
-                core::mem::swap(&mut file_swap, &mut new_file.file);
+                entry.insert(file);
 
-                drop(inner);
-                return Ok(new_fd);
+                Ok(Some(old_file))
             }
         }
+    }
 
-        assert_eq!(new_fd, inner.allocate_fd(new_fd));
-        inner.do_insert(new_fd, fdflags, new_file_data);
+    pub async fn dup_to(&self, old_fd: FD, new_fd: FD, flags: OpenFlags) -> KResult<FD> {
+        if let Some(old_file) = self.dup_to_no_close(old_fd, new_fd, flags.as_fd_flags())? {
+            old_file.close().await;
+        }
 
         Ok(new_fd)
     }
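
dup_to is split in two so that the spin lock is never held across an await: dup_to_no_close swaps the entry under the lock and hands the displaced file back, and only then is close() awaited. A rough illustration of that "replace under the lock, finish the slow part outside" shape with a plain std Mutex (the Table type and String payload are stand-ins for this sketch):

    use std::collections::HashMap;
    use std::sync::Mutex;

    struct Table {
        slots: Mutex<HashMap<u32, String>>,
    }

    impl Table {
        /// Swap the entry while holding the lock; return whatever was displaced.
        fn replace_no_close(&self, fd: u32, new: String) -> Option<String> {
            self.slots.lock().unwrap().insert(fd, new)
        }

        fn replace(&self, fd: u32, new: String) {
            let displaced = self.replace_no_close(fd, new);
            // The guard has already been dropped here, so the slow cleanup
            // (standing in for `old_file.close().await`) is safe.
            drop(displaced);
        }
    }

    fn main() {
        let table = Table { slots: Mutex::new(HashMap::new()) };
        table.replace(3, "first".into());
        table.replace(3, "second".into());
        assert_eq!(table.slots.lock().unwrap()[&3], "second");
    }
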
@@ -160,9 +266,10 @@ impl FileArray {
     /// `(read_fd, write_fd)`
     pub fn pipe(&self, flags: OpenFlags) -> KResult<(FD, FD)> {
         let mut inner = self.inner.lock();
+        let (files, fd_alloc) = inner.split_borrow();
 
-        let read_fd = inner.next_fd();
-        let write_fd = inner.next_fd();
+        let read_fd = fd_alloc.next_fd(files);
+        let write_fd = fd_alloc.next_fd(files);
 
         let fdflag = flags.as_fd_flags();
 
@@ -179,23 +286,20 @@ impl FileArray {
         let fdflag = flags.as_fd_flags();
 
         let inode = dentry.get_inode()?;
-        let filemode = inode.mode.load(Ordering::Relaxed);
+        let file_format = inode.mode.load().format();
 
-        if flags.directory() {
-            if !s_isdir(filemode) {
-                return Err(ENOTDIR);
-            }
-        } else {
-            if s_isdir(filemode) && flags.write() {
-                return Err(EISDIR);
-            }
+        match (flags.directory(), file_format, flags.write()) {
+            (true, Mode::DIR, _) => {}
+            (true, _, _) => return Err(ENOTDIR),
+            (false, Mode::DIR, true) => return Err(EISDIR),
+            _ => {}
         }
 
-        if flags.truncate() && flags.write() && s_isreg(filemode) {
+        if flags.truncate() && flags.write() && file_format.is_reg() {
             inode.truncate(0)?;
         }
 
-        let file = if s_ischr(filemode) {
+        let file = if file_format.is_chr() {
             let device = CharDevice::get(inode.devid()?).ok_or(ENXIO)?;
             device.open(flags)?
         } else {
@@ -203,7 +307,8 @@ impl FileArray {
         };
 
         let mut inner = self.inner.lock();
-        let fd = inner.next_fd();
+        let (files, fd_alloc) = inner.split_borrow();
+        let fd = fd_alloc.next_fd(files);
         inner.do_insert(fd, fdflag, file);
 
         Ok(fd)
@@ -211,43 +316,59 @@ impl FileArray {
 
     pub fn fcntl(&self, fd: FD, cmd: u32, arg: usize) -> KResult<usize> {
         let mut inner = self.inner.lock();
-        let ofile = inner.files.get_mut(&fd).ok_or(EBADF)?;
+        let (files, fd_alloc) = inner.split_borrow();
+
+        let mut cursor = files.find_mut(&fd);
 
-        match cmd {
+        let ret = match cmd {
             F_DUPFD | F_DUPFD_CLOEXEC => {
+                let ofile = cursor.get().ok_or(EBADF)?;
+
                 let cloexec = cmd == F_DUPFD_CLOEXEC || ofile.flags.close_on_exec();
                 let flags = cloexec
                     .then_some(FDFlags::FD_CLOEXEC)
                     .unwrap_or(FDFlags::empty());
 
-                let new_file_data = ofile.file.clone();
-                let new_fd = inner.allocate_fd(FD(arg as u32));
+                let new_file_data = ofile.file.dup();
+                let new_fd = fd_alloc.allocate_fd(FD(arg as u32), files);
 
                 inner.do_insert(new_fd, flags, new_file_data);
 
-                Ok(new_fd.0 as usize)
+                new_fd.0 as usize
             }
-            F_GETFD => Ok(ofile.flags.bits() as usize),
+            F_GETFD => cursor.get().ok_or(EBADF)?.flags.bits() as usize,
             F_SETFD => {
+                let mut ofile = cursor.remove().ok_or(EBADF)?;
                 ofile.flags = FDFlags::from_bits_truncate(arg as u32);
-                Ok(0)
+                cursor.insert(ofile);
+                0
             }
-            F_GETFL => Ok(ofile.file.get_flags().bits() as usize),
+            F_GETFL => cursor.get().ok_or(EBADF)?.file.get_flags().bits() as usize,
             F_SETFL => {
-                ofile
+                cursor
+                    .get()
+                    .ok_or(EBADF)?
                     .file
                     .set_flags(OpenFlags::from_bits_retain(arg as u32));
 
-                Ok(0)
+                0
             }
             _ => unimplemented!("fcntl: cmd={}", cmd),
-        }
+        };
+
+        Ok(ret)
     }
 
     /// Only used for init process.
     pub fn open_console(&self) {
         let mut inner = self.inner.lock();
-        let (stdin, stdout, stderr) = (inner.next_fd(), inner.next_fd(), inner.next_fd());
+        let (files, fd_alloc) = inner.split_borrow();
+
+        let (stdin, stdout, stderr) = (
+            fd_alloc.next_fd(files),
+            fd_alloc.next_fd(files),
+            fd_alloc.next_fd(files),
+        );
         let console_terminal = get_console().expect("No console terminal");
 
         inner.do_insert(
@@ -269,53 +390,25 @@ impl FileArray {
 }
 
 impl FileArrayInner {
-    fn get(&mut self, fd: FD) -> Option<Arc<File>> {
-        self.files.get(&fd).map(|f| f.file.clone())
-    }
-
-    fn find_available(&mut self, from: FD) -> FD {
-        self.files
-            .range(&from..)
-            .fold_while(from, |current, (&key, _)| {
-                if current == key {
-                    Continue(FD(current.0 + 1))
-                } else {
-                    Done(current)
-                }
-            })
-            .into_inner()
-    }
-
-    /// Allocate a new file descriptor starting from `from`.
-    ///
-    /// Returned file descriptor should be used immediately.
-    ///
-    fn allocate_fd(&mut self, from: FD) -> FD {
-        let from = FD::max(from, self.fd_min_avail);
-
-        if from == self.fd_min_avail {
-            let next_min_avail = self.find_available(FD(from.0 + 1));
-            let allocated = self.fd_min_avail;
-            self.fd_min_avail = next_min_avail;
-            allocated
-        } else {
-            self.find_available(from)
-        }
+    fn get(&mut self, fd: FD) -> Option<File> {
+        self.files.get_fd(fd).map(|open| open.file.clone())
     }
 
-    fn release_fd(&mut self, fd: FD) {
-        if fd < self.fd_min_avail {
-            self.fd_min_avail = fd;
+    /// Insert a file description into the file array.
+    fn do_insert(&mut self, fd: FD, flags: FDFlags, file: File) {
+        match self.files.entry(&fd) {
+            Entry::Occupied(_) => {
+                panic!("File descriptor {fd:?} already exists in the file array.");
+            }
+            Entry::Vacant(insert_cursor) => {
+                insert_cursor.insert(OpenFile::new(fd, flags, file));
+            }
         }
     }
 
-    fn next_fd(&mut self) -> FD {
-        self.allocate_fd(self.fd_min_avail)
-    }
-
-    /// Insert a file description to the file array.
-    fn do_insert(&mut self, fd: FD, flags: FDFlags, file: Arc<File>) {
-        assert!(self.files.insert(fd, OpenFile { flags, file }).is_none());
+    fn split_borrow(&mut self) -> (&mut RBTree<OpenFileAdapter>, &mut FDAllocator) {
+        let Self { files, fd_alloc } = self;
+        (files, fd_alloc)
     }
 }
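
split_borrow exists so a single `&mut FileArrayInner` can be split into two disjoint `&mut` borrows (the tree and the allocator), letting call sites pass `files` into `fd_alloc.next_fd()` while both remain mutable. The pattern itself is plain Rust field splitting; a stand-alone sketch with hypothetical field types:

    struct Inner {
        files: Vec<u32>,
        next: u32,
    }

    impl Inner {
        fn split_borrow(&mut self) -> (&mut Vec<u32>, &mut u32) {
            // Destructuring `&mut self` yields one mutable borrow per field,
            // and the compiler tracks them as disjoint.
            let Self { files, next } = self;
            (files, next)
        }

        fn push_next(&mut self) {
            let (files, next) = self.split_borrow();
            files.push(*next); // both borrows are live at the same time
            *next += 1;
        }
    }

    fn main() {
        let mut inner = Inner { files: Vec::new(), next: 0 };
        inner.push_next();
        inner.push_next();
        assert_eq!(inner.files, [0, 1]);
    }
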
 
@@ -323,6 +416,15 @@ impl FD {
     pub const AT_FDCWD: FD = FD(-100i32 as u32);
 }
 
+impl core::fmt::Debug for FD {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self {
+            &Self::AT_FDCWD => f.write_str("FD(AT_FDCWD)"),
+            FD(no) => f.debug_tuple("FD").field(&no).finish(),
+        }
+    }
+}
+
 impl FromSyscallArg for FD {
     fn from_arg(value: usize) -> Self {
         Self(value as u32)
@@ -334,3 +436,39 @@ impl SyscallRetVal for FD {
         Some(self.0 as usize)
     }
 }
+
+trait FilesExt {
+    fn get_fd(&self, fd: FD) -> Option<&OpenFile>;
+
+    fn pick<P>(&mut self, pred: P) -> Self
+    where
+        P: FnMut(&OpenFile) -> bool;
+}
+
+impl FilesExt for RBTree<OpenFileAdapter> {
+    fn get_fd(&self, fd: FD) -> Option<&OpenFile> {
+        self.find(&fd).get()
+    }
+
+    fn pick<P>(&mut self, mut pred: P) -> Self
+    where
+        P: FnMut(&OpenFile) -> bool,
+    {
+        let mut picked = RBTree::new(OpenFileAdapter::new());
+
+        // TODO: It might be better to start picking from somewhere else
+        //       or to use a different approach.
+        let mut cursor = self.front_mut();
+        while let Some(open_file) = cursor.get() {
+            if !pred(open_file) {
+                cursor.move_next();
+                continue;
+            }
+
+            picked.insert(cursor.remove().unwrap());
+            cursor.move_next();
+        }
+
+        picked
+    }
+}
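
pick drains every entry matching the predicate into a second tree, which on_exec then closes after the spin lock is gone. The same contract over a std BTreeMap, shown only to make the behaviour concrete (the real implementation removes nodes through the intrusive cursor and never clones keys):

    use std::collections::BTreeMap;

    /// Remove every entry for which `pred` returns true and return them.
    fn pick<K: Ord + Clone, V>(
        map: &mut BTreeMap<K, V>,
        mut pred: impl FnMut(&V) -> bool,
    ) -> BTreeMap<K, V> {
        let picked_keys: Vec<K> = map
            .iter()
            .filter(|&(_, value)| pred(value))
            .map(|(key, _)| key.clone())
            .collect();

        let mut picked = BTreeMap::new();
        for key in picked_keys {
            if let Some(value) = map.remove(&key) {
                picked.insert(key, value);
            }
        }
        picked
    }

    fn main() {
        // fd -> close_on_exec flag
        let mut files: BTreeMap<u32, bool> =
            [(0, false), (1, false), (3, true), (7, true)].into_iter().collect();

        let to_close = pick(&mut files, |&cloexec| cloexec);
        assert_eq!(to_close.keys().copied().collect::<Vec<_>>(), [3, 7]);
        assert_eq!(files.keys().copied().collect::<Vec<_>>(), [0, 1]);
    }
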

+ 150 - 12
src/kernel/vfs/inode.rs

@@ -1,10 +1,13 @@
-use super::{dentry::Dentry, s_isblk, s_ischr, vfs::Vfs, DevId};
+use super::{dentry::Dentry, vfs::Vfs, DevId};
 use crate::io::Stream;
 use crate::kernel::constants::{
     EINVAL, EISDIR, ENOTDIR, EPERM, STATX_ATIME, STATX_BLOCKS, STATX_CTIME, STATX_GID, STATX_INO,
-    STATX_MODE, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_TYPE, STATX_UID, S_IFDIR, S_IFMT,
+    STATX_MODE, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_TYPE, STATX_UID, S_IFBLK, S_IFCHR,
+    S_IFDIR, S_IFLNK, S_IFMT, S_IFREG,
 };
 use crate::kernel::mem::PageCache;
+use crate::kernel::syscall::{FromSyscallArg, SyscallRetVal};
+use crate::kernel::task::block_on;
 use crate::kernel::timer::Instant;
 use crate::{io::Buffer, prelude::*};
 use alloc::sync::{Arc, Weak};
@@ -14,7 +17,6 @@ use core::{
     ptr::addr_of_mut,
     sync::atomic::{AtomicU32, AtomicU64, Ordering},
 };
-use eonix_runtime::task::Task;
 use eonix_sync::RwLock;
 use posix_types::stat::StatX;
 
@@ -32,8 +34,11 @@ pub type AtomicUid = AtomicU32;
 #[allow(dead_code)]
 pub type Gid = u32;
 pub type AtomicGid = AtomicU32;
-pub type Mode = u32;
-pub type AtomicMode = AtomicU32;
+
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub struct Mode(u32);
+
+pub struct AtomicMode(AtomicU32);
 
 #[derive(Debug)]
 pub struct InodeData {
@@ -97,7 +102,7 @@ pub struct RenameData<'a, 'b> {
 #[allow(unused_variables)]
 pub trait Inode: Send + Sync + InodeInner + Any {
     fn is_dir(&self) -> bool {
-        self.mode.load(Ordering::SeqCst) & S_IFDIR != 0
+        self.mode.load().is_dir()
     }
 
     fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
@@ -136,7 +141,7 @@ pub trait Inode: Send + Sync + InodeInner + Any {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
-    fn write_direct(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+    fn write_direct(&self, stream: &mut dyn Stream, offset: usize) -> KResult<usize> {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
@@ -181,7 +186,7 @@ pub trait Inode: Send + Sync + InodeInner + Any {
         let vfs = self.vfs.upgrade().expect("Vfs is dropped");
 
         let size = self.size.load(Ordering::Relaxed);
-        let mode = self.mode.load(Ordering::Relaxed);
+        let mode = self.mode.load();
 
         if mask & STATX_NLINK != 0 {
             stat.stx_nlink = self.nlink.load(Ordering::Acquire) as _;
@@ -213,13 +218,13 @@ pub trait Inode: Send + Sync + InodeInner + Any {
 
         stat.stx_mode = 0;
         if mask & STATX_MODE != 0 {
-            stat.stx_mode |= (mode & !S_IFMT) as u16;
+            stat.stx_mode |= mode.non_format_bits() as u16;
             stat.stx_mask |= STATX_MODE;
         }
 
         if mask & STATX_TYPE != 0 {
-            stat.stx_mode |= (mode & S_IFMT) as u16;
-            if s_isblk(mode) || s_ischr(mode) {
+            stat.stx_mode |= mode.format_bits() as u16;
+            if mode.is_blk() || mode.is_chr() {
                 let devid = self.devid();
                 stat.stx_rdev_major = (devid? >> 8) & 0xff;
                 stat.stx_rdev_minor = devid? & 0xff;
@@ -280,7 +285,7 @@ pub trait Inode: Send + Sync + InodeInner + Any {
         f(
             uninit_mut.as_mut_ptr(),
             // SAFETY: `idata` is initialized and we will never move the lock.
-            &Task::block_on(unsafe { idata.assume_init_ref() }.rwsem.read()),
+            &block_on(unsafe { idata.assume_init_ref() }.rwsem.read()),
         );
 
         // Safety: `uninit` is initialized
@@ -354,3 +359,136 @@ macro_rules! define_struct_inode {
 }
 
 pub(crate) use define_struct_inode;
+
+impl Mode {
+    pub const REG: Self = Self(S_IFREG);
+    pub const DIR: Self = Self(S_IFDIR);
+    pub const LNK: Self = Self(S_IFLNK);
+    pub const BLK: Self = Self(S_IFBLK);
+    pub const CHR: Self = Self(S_IFCHR);
+
+    pub const fn new(bits: u32) -> Self {
+        Self(bits)
+    }
+
+    pub const fn is_blk(&self) -> bool {
+        (self.0 & S_IFMT) == S_IFBLK
+    }
+
+    pub const fn is_chr(&self) -> bool {
+        (self.0 & S_IFMT) == S_IFCHR
+    }
+
+    pub const fn is_reg(&self) -> bool {
+        (self.0 & S_IFMT) == S_IFREG
+    }
+
+    pub const fn is_dir(&self) -> bool {
+        (self.0 & S_IFMT) == S_IFDIR
+    }
+
+    pub const fn is_lnk(&self) -> bool {
+        (self.0 & S_IFMT) == S_IFLNK
+    }
+
+    pub const fn bits(&self) -> u32 {
+        self.0
+    }
+
+    pub const fn format_bits(&self) -> u32 {
+        self.0 & S_IFMT
+    }
+
+    pub const fn format(&self) -> Self {
+        Self::new(self.format_bits())
+    }
+
+    pub const fn non_format_bits(&self) -> u32 {
+        self.0 & !S_IFMT
+    }
+
+    pub const fn non_format(&self) -> Self {
+        Self::new(self.non_format_bits())
+    }
+
+    pub const fn perm(self, perm: u32) -> Self {
+        Self::new((self.0 & !0o777) | (perm & 0o777))
+    }
+
+    pub const fn set_perm(&mut self, perm: u32) {
+        *self = self.perm(perm);
+    }
+
+    pub const fn mask_perm(&mut self, perm_mask: u32) {
+        let perm_mask = perm_mask & 0o777;
+        let self_perm = self.non_format_bits() & 0o777;
+
+        *self = self.perm(self_perm & perm_mask);
+    }
+}
+
+impl AtomicMode {
+    pub const fn new(bits: u32) -> Self {
+        Self(AtomicU32::new(bits))
+    }
+
+    pub const fn from(mode: Mode) -> Self {
+        Self::new(mode.0)
+    }
+
+    pub fn load(&self) -> Mode {
+        Mode(self.0.load(Ordering::Relaxed))
+    }
+
+    pub fn store(&self, mode: Mode) {
+        self.0.store(mode.0, Ordering::Relaxed);
+    }
+}
+
+impl core::fmt::Debug for AtomicMode {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.debug_struct("AtomicMode")
+            .field("bits", &self.load().0)
+            .finish()
+    }
+}
+
+impl core::fmt::Debug for Mode {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        let format_name = match self.format() {
+            Mode::REG => "REG",
+            Mode::DIR => "DIR",
+            Mode::LNK => "LNK",
+            Mode::BLK => "BLK",
+            Mode::CHR => "CHR",
+            _ => "UNK",
+        };
+
+        match self.non_format_bits() & !0o777 {
+            0 => write!(
+                f,
+                "Mode({format_name}, {perm:#o})",
+                perm = self.non_format_bits()
+            )?,
+            rem => write!(
+                f,
+                "Mode({format_name}, {perm:#o}, rem={rem:#x})",
+                perm = self.non_format_bits() & 0o777
+            )?,
+        }
+
+        Ok(())
+    }
+}
+
+impl FromSyscallArg for Mode {
+    fn from_arg(value: usize) -> Self {
+        Mode::new(value as u32)
+    }
+}
+
+impl SyscallRetVal for Mode {
+    fn into_retval(self) -> Option<usize> {
+        Some(self.bits() as usize)
+    }
+}
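
Mode packs the file type and the permission bits into the same word, exactly as st_mode does: the type lives in the S_IFMT field and the permissions in the low 0o777 bits. A worked example of the helpers' arithmetic using the conventional octal values (a sketch with locally defined constants, not the kernel's constants module):

    const S_IFMT: u32 = 0o170000;
    const S_IFREG: u32 = 0o100000;

    fn main() {
        let mode = S_IFREG | 0o644; // a regular file, rw-r--r--

        // What `format_bits` / `non_format_bits` split out.
        assert_eq!(mode & S_IFMT, S_IFREG); // file type
        assert_eq!(mode & !S_IFMT, 0o644);  // permissions (and any extra bits)

        // `perm`: replace the low nine bits, keep everything else.
        let chmod_755 = (mode & !0o777) | (0o755 & 0o777);
        assert_eq!(chmod_755, S_IFREG | 0o755);

        // `mask_perm`: keep only the permission bits allowed by the mask.
        let owner_only = (chmod_755 & !0o777) | ((chmod_755 & 0o777) & 0o700);
        assert_eq!(owner_only, S_IFREG | 0o700);
    }
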

+ 4 - 23
src/kernel/vfs/mod.rs

@@ -1,4 +1,3 @@
-use crate::kernel::constants::{S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFMT, S_IFREG};
 use crate::prelude::*;
 use alloc::sync::Arc;
 use dentry::Dentry;
@@ -6,33 +5,15 @@ use eonix_sync::LazyLock;
 use inode::Mode;
 
 pub mod dentry;
-pub mod file;
+mod file;
 pub mod filearray;
 pub mod inode;
 pub mod mount;
 pub mod vfs;
 
-pub type DevId = u32;
-
-pub fn s_isreg(mode: Mode) -> bool {
-    (mode & S_IFMT) == S_IFREG
-}
+pub use file::{File, FileType, PollEvent, SeekOption, TerminalFile};
 
-pub fn s_isdir(mode: Mode) -> bool {
-    (mode & S_IFMT) == S_IFDIR
-}
-
-pub fn s_ischr(mode: Mode) -> bool {
-    (mode & S_IFMT) == S_IFCHR
-}
-
-pub fn s_isblk(mode: Mode) -> bool {
-    (mode & S_IFMT) == S_IFBLK
-}
-
-pub fn s_islnk(mode: Mode) -> bool {
-    (mode & S_IFMT) == S_IFLNK
-}
+pub type DevId = u32;
 
 pub struct FsContext {
     pub fsroot: Arc<Dentry>,
@@ -44,7 +25,7 @@ static GLOBAL_FS_CONTEXT: LazyLock<Arc<FsContext>> = LazyLock::new(|| {
     Arc::new(FsContext {
         fsroot: Dentry::root().clone(),
         cwd: Spin::new(Dentry::root().clone()),
-        umask: Spin::new(0o022),
+        umask: Spin::new(Mode::new(0o022)),
     })
 });
 

+ 72 - 24
src/lib.rs

@@ -1,5 +1,6 @@
 #![no_std]
 #![no_main]
+#![feature(allocator_api)]
 #![feature(c_size_t)]
 #![feature(concat_idents)]
 #![feature(arbitrary_self_types)]
@@ -8,6 +9,9 @@
 
 extern crate alloc;
 
+#[cfg(any(target_arch = "riscv64", target_arch = "x86_64"))]
+extern crate unwinding;
+
 mod driver;
 mod fs;
 mod hash;
@@ -15,6 +19,8 @@ mod io;
 mod kernel;
 mod kernel_init;
 mod net;
+#[cfg(any(target_arch = "riscv64", target_arch = "x86_64"))]
+mod panic;
 mod path;
 mod prelude;
 mod rcu;
@@ -24,21 +30,23 @@ use crate::kernel::task::alloc_pid;
 use alloc::{ffi::CString, sync::Arc};
 use core::{
     hint::spin_loop,
-    sync::atomic::{AtomicBool, Ordering},
+    sync::atomic::{AtomicBool, AtomicUsize, Ordering},
 };
 use eonix_hal::{
-    arch_exported::bootstrap::shutdown, processor::CPU, traits::trap::IrqState,
+    arch_exported::bootstrap::shutdown,
+    context::TaskContext,
+    processor::{halt, CPU, CPU_COUNT},
+    traits::{context::RawTaskContext, trap::IrqState},
     trap::disable_irqs_save,
 };
 use eonix_mm::address::PRange;
-use eonix_runtime::{run::FutureRun, scheduler::Scheduler, task::Task};
+use eonix_runtime::{executor::Stack, scheduler::RUNTIME};
 use kernel::{
     mem::GlobalPageAlloc,
-    task::{
-        new_thread_runnable, KernelStack, ProcessBuilder, ProcessList, ProgramLoader, ThreadBuilder,
-    },
+    task::{KernelStack, ProcessBuilder, ProcessList, ProgramLoader, ThreadBuilder},
     vfs::{
         dentry::Dentry,
+        inode::Mode,
         mount::{do_mount, MS_NOATIME, MS_NODEV, MS_NOSUID, MS_RDONLY},
         FsContext,
     },
@@ -50,6 +58,9 @@ use prelude::*;
 
 #[cfg(any(target_arch = "riscv64", target_arch = "loongarch64"))]
 fn do_panic() -> ! {
+    #[cfg(target_arch = "riscv64")]
+    panic::stack_trace();
+
     shutdown();
 }
 
@@ -80,6 +91,25 @@ fn panic(info: &core::panic::PanicInfo) -> ! {
 }
 
 static BSP_OK: AtomicBool = AtomicBool::new(false);
+static CPU_SHUTTING_DOWN: AtomicUsize = AtomicUsize::new(0);
+
+fn shutdown_system() -> ! {
+    let cpu_count = CPU_COUNT.load(Ordering::Relaxed);
+
+    if CPU_SHUTTING_DOWN.fetch_add(1, Ordering::AcqRel) + 1 == cpu_count {
+        println_info!("All CPUs are shutting down. Gracefully powering off...");
+        shutdown();
+    } else {
+        println_info!(
+            "CPU {} is shutting down. Waiting for other CPUs...",
+            CPU::local().cpuid()
+        );
+
+        loop {
+            halt();
+        }
+    }
+}
 
 #[eonix_hal::main]
 fn kernel_init(mut data: eonix_hal::bootstrap::BootStrapData) -> ! {
@@ -90,21 +120,26 @@ fn kernel_init(mut data: eonix_hal::bootstrap::BootStrapData) -> ! {
         driver::sbi_console::init_console();
     }
 
-    // To satisfy the `Scheduler` "preempt count == 0" assertion.
-    eonix_preempt::disable();
+    BSP_OK.store(true, Ordering::Release);
 
-    // We need root dentry to be present in constructor of `FsContext`.
-    // So call `init_vfs` first, then `init_multitasking`.
-    Scheduler::init_local_scheduler::<KernelStack>();
+    RUNTIME.spawn(init_process(data.get_early_stack()));
 
-    Scheduler::get().spawn::<KernelStack, _>(FutureRun::new(init_process(data.get_early_stack())));
+    drop(data);
 
-    BSP_OK.store(true, Ordering::Release);
+    let mut ctx = TaskContext::new();
+    let stack_bottom = {
+        let stack = KernelStack::new();
+        let bottom = stack.get_bottom().addr().get();
+        core::mem::forget(stack);
+
+        bottom
+    };
+    ctx.set_interrupt_enabled(true);
+    ctx.set_program_counter(standard_main as usize);
+    ctx.set_stack_pointer(stack_bottom);
 
-    drop(data);
     unsafe {
-        // SAFETY: `preempt::count()` == 1.
-        Scheduler::goto_scheduler_noreturn()
+        TaskContext::switch_to_noreturn(&mut ctx);
     }
 }
 
@@ -115,18 +150,30 @@ fn kernel_ap_main(_stack_range: PRange) -> ! {
         spin_loop();
     }
 
-    Scheduler::init_local_scheduler::<KernelStack>();
     println_debug!("AP{} started", CPU::local().cpuid());
 
-    eonix_preempt::disable();
+    let mut ctx = TaskContext::new();
+    let stack_bottom = {
+        let stack = KernelStack::new();
+        let bottom = stack.get_bottom().addr().get();
+        core::mem::forget(stack);
+
+        bottom
+    };
+    ctx.set_interrupt_enabled(true);
+    ctx.set_program_counter(standard_main as usize);
+    ctx.set_stack_pointer(stack_bottom);
 
-    // TODO!!!!!: Free the stack after having switched to idle task.
     unsafe {
-        // SAFETY: `preempt::count()` == 1.
-        Scheduler::goto_scheduler_noreturn()
+        TaskContext::switch_to_noreturn(&mut ctx);
     }
 }
 
+fn standard_main() -> ! {
+    RUNTIME.enter();
+    shutdown_system();
+}
+
 async fn init_process(early_kstack: PRange) {
     unsafe {
         let irq_ctx = disable_irqs_save();
@@ -176,7 +223,7 @@ async fn init_process(early_kstack: PRange) {
         let fs_context = FsContext::global();
         let mnt_dir = Dentry::open(fs_context, Path::new(b"/mnt/").unwrap(), true).unwrap();
 
-        mnt_dir.mkdir(0o755).unwrap();
+        mnt_dir.mkdir(Mode::new(0o755)).unwrap();
 
         do_mount(
             &mnt_dir,
@@ -216,6 +263,7 @@ async fn init_process(early_kstack: PRange) {
         ProgramLoader::parse(fs_context, init_name, init.clone(), argv, envp)
             .expect("Failed to parse init program")
             .load()
+            .await
             .expect("Failed to load init program")
     };
 
@@ -223,7 +271,7 @@ async fn init_process(early_kstack: PRange) {
         .name(Arc::from(&b"busybox"[..]))
         .entry(load_info.entry_ip, load_info.sp);
 
-    let mut process_list = Task::block_on(ProcessList::get().write());
+    let mut process_list = ProcessList::get().write().await;
     let (thread, process) = ProcessBuilder::new()
         .pid(alloc_pid())
         .mm_list(load_info.mm_list)
@@ -235,5 +283,5 @@ async fn init_process(early_kstack: PRange) {
     // TODO!!!: Remove this.
     thread.files.open_console();
 
-    Scheduler::get().spawn::<KernelStack, _>(new_thread_runnable(thread));
+    RUNTIME.spawn(thread.run());
 }
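
standard_main ends in shutdown_system, where every CPU that leaves the runtime bumps a counter and only the last arrival performs the actual power-off; everyone else parks in halt(). The same "last one out turns off the lights" shape, sketched with std threads (the worker names and cleanup message are placeholders):

    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::thread;

    static ARRIVED: AtomicUsize = AtomicUsize::new(0);

    fn shutting_down(total: usize, who: usize) {
        // `fetch_add` returns the previous value, so `+ 1` is this arrival's rank.
        if ARRIVED.fetch_add(1, Ordering::AcqRel) + 1 == total {
            println!("worker {who}: last one out, running the final cleanup");
        } else {
            println!("worker {who}: waiting for the others");
            // The kernel parks here in `halt()`; returning is enough for the sketch.
        }
    }

    fn main() {
        let total = 4;
        let handles: Vec<_> = (0..total)
            .map(|who| thread::spawn(move || shutting_down(total, who)))
            .collect();
        for handle in handles {
            handle.join().unwrap();
        }
    }
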

+ 29 - 0
src/panic.rs

@@ -0,0 +1,29 @@
+use core::ffi::c_void;
+
+use eonix_log::println_fatal;
+use unwinding::abi::{
+    UnwindContext, UnwindReasonCode, _Unwind_Backtrace, _Unwind_GetIP, _Unwind_GetRegionStart,
+};
+
+pub fn stack_trace() {
+    struct CallbackData {
+        counter: usize,
+    }
+
+    extern "C" fn callback(unwind_ctx: &UnwindContext<'_>, arg: *mut c_void) -> UnwindReasonCode {
+        let data = unsafe { &mut *(arg as *mut CallbackData) };
+        data.counter += 1;
+
+        println_fatal!(
+            "{:4}: {:#018x} - <unknown> at function {:#018x}",
+            data.counter,
+            _Unwind_GetIP(unwind_ctx),
+            _Unwind_GetRegionStart(unwind_ctx),
+        );
+
+        UnwindReasonCode::NO_REASON
+    }
+
+    let mut data = CallbackData { counter: 0 };
+    _Unwind_Backtrace(callback, &raw mut data as *mut c_void);
+}

+ 29 - 9
src/rcu.rs

@@ -1,11 +1,11 @@
-use crate::prelude::*;
+use crate::{kernel::task::block_on, prelude::*};
 use alloc::sync::Arc;
 use core::{
     ops::Deref,
     ptr::NonNull,
     sync::atomic::{AtomicPtr, Ordering},
 };
-use eonix_runtime::task::Task;
+use eonix_runtime::scheduler::RUNTIME;
 use eonix_sync::{Mutex, RwLock, RwLockReadGuard};
 use pointers::BorrowedArc;
 
@@ -21,7 +21,7 @@ impl<'data, T> RCUReadGuard<'data, BorrowedArc<'data, T>> {
     fn lock(value: BorrowedArc<'data, T>) -> Self {
         Self {
             value,
-            _guard: Task::block_on(GLOBAL_RCU_SEM.read()),
+            _guard: block_on(GLOBAL_RCU_SEM.read()),
             _phantom: PhantomData,
         }
     }
@@ -48,6 +48,14 @@ pub async fn rcu_sync() {
     let _ = GLOBAL_RCU_SEM.write().await;
 }
 
+pub fn call_rcu(func: impl FnOnce() + Send + 'static) {
+    RUNTIME.spawn(async move {
+        // Wait for all readers to finish.
+        rcu_sync().await;
+        func();
+    });
+}
+
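
call_rcu makes reclamation asynchronous: the closure is spawned onto the runtime and only runs once rcu_sync() has seen the read side go idle, so RCUPointer::drop no longer has to block. A rough analogue with std threads, using an RwLock as the grace-period gate (the gate and the sleep at the end are stand-ins for the sketch):

    use std::sync::{Arc, RwLock};
    use std::thread;
    use std::time::Duration;

    fn call_rcu(gate: Arc<RwLock<()>>, func: impl FnOnce() + Send + 'static) {
        thread::spawn(move || {
            // Acquiring the write side only succeeds once every reader is gone,
            // playing the role of `rcu_sync().await` in the kernel.
            let _grace_period = gate.write().unwrap();
            func();
        });
    }

    fn main() {
        let gate = Arc::new(RwLock::new(()));

        let reader = gate.read().unwrap(); // an in-flight read-side critical section
        call_rcu(gate.clone(), || println!("reclaimed after the grace period"));

        drop(reader); // the last reader leaves; the deferred callback may now run
        thread::sleep(Duration::from_millis(50));
    }
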
 pub trait RCUNode<MySelf> {
     fn rcu_prev(&self) -> &AtomicPtr<MySelf>;
     fn rcu_next(&self) -> &AtomicPtr<MySelf>;
@@ -154,7 +162,7 @@ impl<T: RCUNode<T>> RCUList<T> {
     }
 
     pub fn iter(&self) -> RCUIterator<T> {
-        let _lck = Task::block_on(self.reader_lock.read());
+        let _lck = block_on(self.reader_lock.read());
 
         RCUIterator {
             // SAFETY: We have a read lock, so the node is still alive.
@@ -186,9 +194,15 @@ impl<'lt, T: RCUNode<T>> Iterator for RCUIterator<'lt, T> {
     }
 }
 
-pub struct RCUPointer<T>(AtomicPtr<T>);
+pub struct RCUPointer<T>(AtomicPtr<T>)
+where
+    T: Send + Sync + 'static;
 
-impl<T: core::fmt::Debug> core::fmt::Debug for RCUPointer<T> {
+impl<T> core::fmt::Debug for RCUPointer<T>
+where
+    T: core::fmt::Debug,
+    T: Send + Sync + 'static,
+{
     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         match NonNull::new(self.0.load(Ordering::Acquire)) {
             Some(pointer) => {
@@ -201,7 +215,10 @@ impl<T: core::fmt::Debug> core::fmt::Debug for RCUPointer<T> {
     }
 }
 
-impl<T> RCUPointer<T> {
+impl<T> RCUPointer<T>
+where
+    T: Send + Sync + 'static,
+{
     pub const fn empty() -> Self {
         Self(AtomicPtr::new(core::ptr::null_mut()))
     }
@@ -258,13 +275,16 @@ impl<T> RCUPointer<T> {
     }
 }
 
-impl<T> Drop for RCUPointer<T> {
+impl<T> Drop for RCUPointer<T>
+where
+    T: Send + Sync + 'static,
+{
     fn drop(&mut self) {
         // SAFETY: We call `rcu_sync()` to ensure that all readers are done.
         if let Some(arc) = unsafe { self.swap(None) } {
             // We only wait if there are other references.
             if Arc::strong_count(&arc) == 1 {
-                Task::block_on(rcu_sync());
+                call_rcu(move || drop(arc));
             }
         }
     }