
Merge branch 'master' into loongarch64

Signed-off-by: greatbridf <greatbridf@icloud.com>
greatbridf 6 months ago
parent commit 333f3907d4

+ 10 - 10
Cargo.lock

@@ -25,9 +25,9 @@ version = "0.1.0"
 
 [[package]]
 name = "autocfg"
-version = "1.4.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 
 [[package]]
 name = "bit_field"
@@ -51,9 +51,9 @@ dependencies = [
 
 [[package]]
 name = "cfg-if"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268"
 
 [[package]]
 name = "critical-section"
@@ -403,9 +403,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.101"
+version = "2.0.103"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
+checksum = "e4307e30089d6fd6aff212f2da3a1f9e32f3223b1f010fb09b7c95f90f3ca1e8"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -470,18 +470,18 @@ checksum = "2fe21bcc34ca7fe6dd56cc2cb1261ea59d6b93620215aefb5ea6032265527784"
 
 [[package]]
 name = "zerocopy"
-version = "0.8.25"
+version = "0.8.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
+checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
 dependencies = [
  "zerocopy-derive",
 ]
 
 [[package]]
 name = "zerocopy-derive"
-version = "0.8.25"
+version = "0.8.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
+checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
 dependencies = [
  "proc-macro2",
  "quote",

+ 5 - 2
Cargo.toml

@@ -44,6 +44,9 @@ log_trace = ["trace_pci", "trace_syscall", "trace_scheduler"]
 log_debug = []
 smp = []
 
+[profile.release]
+debug = true
+
 [profile.dev]
 panic = "abort"
 
@@ -51,7 +54,7 @@ panic = "abort"
 opt-level = 2
 
 [profile.dev.package.eonix_runtime]
-opt-level = 0
+opt-level = 2
 
 [profile.dev.package.eonix_sync]
 opt-level = 2
@@ -60,7 +63,7 @@ opt-level = 2
 opt-level = 2
 
 [profile.dev.package.eonix_hal]
-opt-level = 0
+opt-level = 2
 
 [profile.dev.package."*"]
 opt-level = "s"

+ 35 - 5
Makefile.src

@@ -64,6 +64,30 @@ CARGO_FLAGS += --target riscv64gc-unknown-none-elf
 .PHONY: build
 build: $(BINARY_DIR)/eonix_kernel build/boot-riscv64.img
 
+else ifeq ($(ARCH),loongarch64)
+
+BINARY_DIR_BASE := build/loongarch64-unknown-none-softfloat
+BINARY_DIR := $(BINARY_DIR_BASE)/$(MODE)
+
+QEMU_ARGS += \
+	-machine virt -kernel $(BINARY_DIR)/eonix_kernel -m 1G \
+	-device virtio-blk-pci,drive=disk0 \
+	-device virtio-net-pci,netdev=mynet0 \
+	-drive id=disk0,file=build/boot-loongarch64.img,format=raw,if=none \
+	-netdev user,id=mynet0,hostfwd=tcp::5555-:5555,hostfwd=udp::5555-:5555 \
+	-rtc base=utc
+
+ifneq ($(IMG),)
+QEMU_ARGS += \
+	-drive id=disk1,file=$(IMG),format=raw,if=none \
+	-device virtio-blk-pci,drive=disk1
+endif
+
+CARGO_FLAGS += --target loongarch64-unknown-none-softfloat
+
+.PHONY: build
+build: $(BINARY_DIR)/eonix_kernel build/boot-loongarch64.img
+
 else ifeq ($(ARCH),x86_64)
 
 BINARY_DIR_BASE := build/x86_64-unknown-none
@@ -128,22 +152,22 @@ tmux-debug:
 	tmux kill-session -t gbos-debug
 
 $(BINARY_DIR)/eonix_kernel: $(KERNEL_DEPS)
-	cargo build $(CARGO_FLAGS)
+	CARGO_TARGET_DIR=build cargo build $(CARGO_FLAGS)
 
 build/kernel.sym: $(BINARY_DIR)/eonix_kernel
-	cargo objcopy -q $(CARGO_FLAGS) -- --only-keep-debug build/kernel.sym
+	CARGO_TARGET_DIR=build cargo objcopy -q $(CARGO_FLAGS) -- --only-keep-debug build/kernel.sym
 
 build/fs-%.img: user-programs/init_script_%.sh script/build-img.sh $(USER_PROGRAMS)
 	ARCH=$* OUTPUT=$@ sh script/build-img.sh
 
 build/mbr.bin: $(BINARY_DIR)/eonix_kernel
-	cargo objcopy -q $(CARGO_FLAGS) -- -O binary -j .mbr build/mbr.bin
+	CARGO_TARGET_DIR=build cargo objcopy -q $(CARGO_FLAGS) -- -O binary -j .mbr build/mbr.bin
 
 build/stage1.bin: $(BINARY_DIR)/eonix_kernel
-	cargo objcopy -q $(CARGO_FLAGS) -- -O binary -j .stage1 build/stage1.bin
+	CARGO_TARGET_DIR=build cargo objcopy -q $(CARGO_FLAGS) -- -O binary -j .stage1 build/stage1.bin
 
 build/kernel.bin: $(BINARY_DIR)/eonix_kernel
-	cargo objcopy -q $(CARGO_FLAGS) -- -O binary --strip-debug \
+	CARGO_TARGET_DIR=build cargo objcopy -q $(CARGO_FLAGS) -- -O binary --strip-debug \
 		-R .mbr -R .stage1 build/kernel.bin
 
 build/boot-x86_64.img: build/fs-x86_64.img build/mbr.bin build/stage1.bin build/kernel.bin
@@ -161,4 +185,10 @@ build/boot-riscv64.img: build/fs-riscv64.img
 	sh -c 'echo n; echo; echo; echo 8192; echo; echo a; echo w' \
 		| $(FDISK) $@ 2> /dev/null > /dev/null
 
+build/boot-loongarch64.img: build/fs-loongarch64.img
+	dd if=$< of=$@ bs=$(shell expr 4 \* 1024 \* 1024) \
+		seek=1 conv=notrunc 2> /dev/null
+	sh -c 'echo n; echo; echo; echo 8192; echo; echo a; echo w' \
+		| $(FDISK) $@ 2> /dev/null > /dev/null
+
 .DEFAULT_GOAL := build

+ 163 - 9
crates/eonix_hal/src/arch/riscv64/bootstrap.rs

@@ -8,19 +8,23 @@ use super::{
 use crate::{
     arch::{
         cpu::CPU,
-        fdt::{init_dtb_and_fdt, FdtExt},
+        fdt::{init_dtb_and_fdt, FdtExt, FDT},
         mm::{ArchPhysAccess, FreeRam, PageAttribute64, GLOBAL_PAGE_TABLE},
     },
     bootstrap::BootStrapData,
     mm::{ArchMemory, ArchPagingMode, BasicPageAlloc, BasicPageAllocRef, ScopedAllocator},
 };
-use core::arch::naked_asm;
 use core::{
     alloc::Allocator,
     arch::asm,
     cell::RefCell,
     sync::atomic::{AtomicBool, AtomicUsize},
 };
+use core::{
+    arch::{global_asm, naked_asm},
+    hint::spin_loop,
+    sync::atomic::{AtomicPtr, Ordering},
+};
 use eonix_hal_traits::mm::Memory;
 use eonix_mm::{
     address::{Addr as _, PAddr, PRange, PhysAccess, VAddr, VRange},
@@ -30,13 +34,18 @@ use eonix_mm::{
 use eonix_percpu::PercpuArea;
 use fdt::Fdt;
 use riscv::{asm::sfence_vma_all, register::satp};
-use sbi::legacy::console_putchar;
+use sbi::{hsm::hart_start, legacy::console_putchar, PhysicalAddress};
 
 #[unsafe(link_section = ".bootstrap.stack")]
 static BOOT_STACK: [u8; 4096 * 16] = [0; 4096 * 16];
 
 static BOOT_STACK_START: &'static [u8; 4096 * 16] = &BOOT_STACK;
 
+#[unsafe(link_section = ".bootstrap.stack")]
+static TEMP_AP_STACK: [u8; 256] = [0; 256];
+
+static TEMP_AP_STACK_START: &'static [u8; 256] = &TEMP_AP_STACK;
+
 #[repr(C, align(4096))]
 struct PageTable([u64; PTES_PER_PAGE]);
 
@@ -60,6 +69,12 @@ static PT1: PageTable = {
     PageTable(arr)
 };
 
+static BSP_PAGE_ALLOC: AtomicPtr<RefCell<BasicPageAlloc>> = AtomicPtr::new(core::ptr::null_mut());
+
+static AP_COUNT: AtomicUsize = AtomicUsize::new(0);
+static AP_STACK: AtomicUsize = AtomicUsize::new(0);
+static AP_SEM: AtomicBool = AtomicBool::new(false);
+
 /// bootstrap in rust
 #[unsafe(naked)]
 #[unsafe(no_mangle)]
@@ -94,8 +109,6 @@ unsafe extern "C" fn _start(hart_id: usize, dtb_addr: usize) -> ! {
     )
 }
 
-/// TODO:
-/// Start all the CPUs
 pub unsafe extern "C" fn riscv64_start(hart_id: usize, dtb_addr: PAddr) -> ! {
     let fdt = Fdt::from_ptr(ArchPhysAccess::as_ptr(dtb_addr).as_ptr())
         .expect("Failed to parse DTB from static memory.");
@@ -131,6 +144,9 @@ pub unsafe extern "C" fn riscv64_start(hart_id: usize, dtb_addr: PAddr) -> ! {
         allocator: Some(real_allocator),
     };
 
+    // set current hart's mtimecmp register
+    set_next_timer();
+
     unsafe {
         _eonix_hal_main(bootstrap_data);
     }
@@ -138,7 +154,6 @@ pub unsafe extern "C" fn riscv64_start(hart_id: usize, dtb_addr: PAddr) -> ! {
 
 unsafe extern "C" {
     fn BSS_LENGTH();
-    fn KIMAGE_PAGES();
 }
 
 /// TODO:
@@ -225,13 +240,152 @@ fn setup_cpu(alloc: impl PageAlloc, hart_id: usize) {
             .as_mut()
             .set_kernel_tp(PercpuArea::get_for(cpu.cpuid()).unwrap().cast());
     }
+}
+
+fn get_ap_start_addr() -> usize {
+    unsafe extern "C" {
+        fn _ap_start();
+    }
+    static AP_START_VALUE: &'static unsafe extern "C" fn() =
+        &(_ap_start as unsafe extern "C" fn());
+    unsafe { (AP_START_VALUE as *const _ as *const usize).read_volatile() }
+}
+
+fn bootstrap_smp(alloc: impl Allocator, page_alloc: &RefCell<BasicPageAlloc>) {
+    let local_hart_id = CPU::local().cpuid();
+    let mut ap_count = 0;
+
+    for hart_id in FDT.harts().filter(|&id| id != local_hart_id) {
+        let stack_range = {
+            let page_alloc = BasicPageAllocRef::new(&page_alloc);
+            let ap_stack = Page::alloc_order_in(4, page_alloc);
+            let stack_range = ap_stack.range();
+            ap_stack.into_raw();
+            stack_range
+        };
+
+        let old = BSP_PAGE_ALLOC.swap((&raw const *page_alloc) as *mut _, Ordering::Release);
+        assert!(old.is_null());
+
+        while AP_STACK
+            .compare_exchange_weak(
+                0,
+                stack_range.end().addr(),
+                Ordering::Release,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            spin_loop();
+        }
+
+        unsafe {
+            hart_start(hart_id, PhysicalAddress::new(get_ap_start_addr()), 0);
+        }
+
+        while AP_COUNT.load(Ordering::Acquire) == ap_count {
+            spin_loop();
+        }
+
+        let old = BSP_PAGE_ALLOC.swap(core::ptr::null_mut(), Ordering::Acquire);
+        assert_eq!(old as *const _, &raw const *page_alloc);
+        ap_count += 1;
+    }
+}
+
+#[unsafe(naked)]
+#[unsafe(no_mangle)]
+#[unsafe(link_section = ".bootstrap.apentry")]
+unsafe extern "C" fn _ap_start(hart_id: usize) -> ! {
+    naked_asm!(
+        "
+            la    sp, 1f        // set temp stack
+            mv    s0, a0        // save hart id
+
+            ld    t0, 2f
+            srli  t0, t0, 12
+            li    t1, 9 << 60
+            or    t0, t0, t1
+            csrw  satp, t0
+            sfence.vma
+
+            ld    t0, 3f
+            jalr  t0
+            mv    sp, a0
+
+            mv    a0, s0
+            ld    t0, 4f
+            jalr  t0
+
+            .pushsection .bootstrap.data, \"aw\", @progbits
+            1: .8byte {temp_stack}
+            2: .8byte {page_table}
+            3: .8byte {get_ap_stack}
+            4: .8byte {ap_entry}
+            .popsection
+        ",
+        temp_stack = sym TEMP_AP_STACK_START,
+        page_table = sym BOOT_PAGE_TABLE,
+        get_ap_stack = sym get_ap_stack,
+        ap_entry = sym ap_entry,
+    )
+}
+
+fn get_ap_stack() -> usize {
+    while AP_SEM
+        .compare_exchange_weak(false, true, Ordering::Acquire, Ordering::Relaxed)
+        .is_err()
+    {
+        core::hint::spin_loop();
+    }
+
+    let stack_addr = loop {
+        let addr = AP_STACK.swap(0, Ordering::AcqRel);
+        if addr != 0 {
+            break addr;
+        }
+        core::hint::spin_loop();
+    };
+
+    AP_SEM.store(false, Ordering::Release);
+
+    stack_addr
+}
+
+fn ap_entry(hart_id: usize, stack_bottom: PAddr) -> ! {
+    let stack_range = PRange::new(stack_bottom - (1 << 3) * PAGE_SIZE, stack_bottom);
+
+    {
+        // SAFETY: Acquire all the work done by the BSP and other APs.
+        let alloc = loop {
+            let alloc = BSP_PAGE_ALLOC.swap(core::ptr::null_mut(), Ordering::AcqRel);
+
+            if !alloc.is_null() {
+                break alloc;
+            }
+        };
+
+        let ref_alloc = unsafe { &*alloc };
+        setup_cpu(BasicPageAllocRef::new(&ref_alloc), hart_id);
+
+        // SAFETY: Release our allocation work.
+        BSP_PAGE_ALLOC.store(alloc, Ordering::Release);
+    }
+
+    // SAFETY: Make sure the allocator is set before we increment the AP count.
+    AP_COUNT.fetch_add(1, Ordering::Release);
+
+    unsafe extern "Rust" {
+        fn _eonix_hal_ap_main(stack_range: PRange) -> !;
+    }
 
     // set current hart's mtimecmp register
     set_next_timer();
-}
 
-/// TODO
-fn bootstrap_smp(alloc: impl Allocator, page_alloc: &RefCell<BasicPageAlloc>) {}
+    unsafe {
+        _eonix_hal_ap_main(stack_range);
+    }
+}
 
 pub fn early_console_write(s: &str) {
     write_str(s);
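
The handoff above publishes each AP's stack through a single AtomicUsize mailbox (AP_STACK) guarded by the AP_SEM flag, with Release/Acquire pairs ordering the stack data against the flag itself. A minimal, self-contained sketch of the same one-slot mailbox pattern, with std threads standing in for harts (all names below are illustrative, not kernel symbols):

use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::{hint::spin_loop, thread};

// One-slot mailbox: the producer stores a nonzero value, the consumer
// swaps it back to zero. Release/Acquire pairs order the data handoff.
static MAILBOX: AtomicUsize = AtomicUsize::new(0);
static DONE: AtomicBool = AtomicBool::new(false);

fn main() {
    let consumer = thread::spawn(|| {
        // Spin until the producer has published a value (Acquire side).
        let value = loop {
            let v = MAILBOX.swap(0, Ordering::Acquire);
            if v != 0 {
                break v;
            }
            spin_loop();
        };
        println!("AP got stack top: {value:#x}");
        DONE.store(true, Ordering::Release);
    });

    // Publish the stack address (Release side), then wait for the AP.
    MAILBOX.store(0x8020_0000, Ordering::Release);
    while !DONE.load(Ordering::Acquire) {
        spin_loop();
    }
    consumer.join().unwrap();
}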

+ 0 - 9
crates/eonix_hal/src/arch/riscv64/cpu.rs

@@ -57,15 +57,6 @@ impl CPU {
         sscratch::write(TRAP_SCRATCH.as_ptr() as usize);
     }
 
-    /// Boot all other hart.
-    pub unsafe fn bootstrap_cpus(&self) {
-        let total_harts = FDT.hart_count();
-        for i in (0..total_harts).filter(|&i| i != self.cpuid()) {
-            sbi::hsm::hart_start(i, todo!("AP entry"), 0)
-                .expect("Failed to start secondary hart via SBI");
-        }
-    }
-
     pub unsafe fn load_interrupt_stack(self: Pin<&mut Self>, sp: u64) {
         TRAP_SCRATCH
             .as_mut()

+ 2 - 1
crates/eonix_hal/src/arch/riscv64/link.x

@@ -1,7 +1,8 @@
 SECTIONS {
     .bootstrap ORIGIN(RAM) :
     {
-        KEEP(*(.bootstrap.entry .bootstrap.data));
+        KEEP(*(.bootstrap.entry));
+        KEEP(*(.bootstrap.apentry .bootstrap.data));
 
         . = ORIGIN(RAM) + 0x1000;
         KEEP(*(.bootstrap.page_table.1));

+ 1 - 1
crates/eonix_mm/src/page_table/pte.rs

@@ -10,7 +10,7 @@ bitflags! {
         const GLOBAL = 8;
     }
 
-    #[derive(Clone, Copy, PartialEq)]
+    #[derive(Debug, Clone, Copy, PartialEq)]
     pub struct PageAttribute: usize {
         const PRESENT = 1;
         const READ = 2;

+ 1 - 0
crates/posix_types/src/lib.rs

@@ -4,6 +4,7 @@ pub mod constants;
 pub mod ctypes;
 pub mod namei;
 pub mod open;
+pub mod poll;
 pub mod result;
 pub mod signal;
 pub mod stat;

+ 5 - 0
crates/posix_types/src/poll.rs

@@ -0,0 +1,5 @@
+pub const FDSET_LENGTH: usize = 1024 / (8 * size_of::<usize>());
+
+pub struct FDSet {
+    fds_bits: [usize; FDSET_LENGTH],
+}
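
FDSet mirrors the kernel's fd_set layout: 1024 bits packed into an array of usize words. The commit only adds the type itself; the set/is_set helpers below are hypothetical, sketched to show the word/bit indexing such a layout implies:

const FDSET_LENGTH: usize = 1024 / (8 * size_of::<usize>());

pub struct FDSet {
    fds_bits: [usize; FDSET_LENGTH],
}

impl FDSet {
    // Hypothetical helpers for illustration; not part of this commit.
    fn set(&mut self, fd: usize) {
        let bits = 8 * size_of::<usize>();
        self.fds_bits[fd / bits] |= 1 << (fd % bits);
    }

    fn is_set(&self, fd: usize) -> bool {
        let bits = 8 * size_of::<usize>();
        (self.fds_bits[fd / bits] & (1 << (fd % bits))) != 0
    }
}

fn main() {
    let mut set = FDSet { fds_bits: [0; FDSET_LENGTH] };
    set.set(5);
    assert!(set.is_set(5) && !set.is_set(6));
}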

+ 8 - 0
crates/posix_types/src/stat.rs

@@ -1,3 +1,5 @@
+use core::time::Duration;
+
 #[repr(C)]
 #[derive(Debug, Default, Copy, Clone)]
 pub struct StatXTimestamp {
@@ -100,3 +102,9 @@ impl From<StatX> for Stat {
         }
     }
 }
+
+impl From<TimeSpec> for Duration {
+    fn from(value: TimeSpec) -> Self {
+        Self::new(value.tv_sec, value.tv_nsec)
+    }
+}

+ 2 - 2
crates/posix_types/src/syscall_no/riscv64.rs

@@ -71,7 +71,7 @@ pub const SYS_PWRITE64: usize = 68;
 pub const SYS_PREADV: usize = 69;
 pub const SYS_PWRITEV: usize = 70;
 pub const SYS_SENDFILE64: usize = 71;
-pub const SYS_PSELECT6_TIME32: usize = 72;
+pub const SYS_PSELECT6: usize = 72;
 pub const SYS_PPOLL: usize = 73;
 pub const SYS_SIGNALFD4: usize = 74;
 pub const SYS_VMSPLICE: usize = 75;
@@ -114,7 +114,7 @@ pub const SYS_TIMER_DELETE: usize = 111;
 pub const SYS_CLOCK_SETTIME: usize = 404;
 pub const SYS_CLOCK_GETTIME: usize = 113;
 pub const SYS_CLOCK_GETRES: usize = 406;
-pub const SYS_CLOCK_NANOSLEEP: usize = 407;
+pub const SYS_CLOCK_NANOSLEEP: usize = 115;
 pub const SYS_SYSLOG: usize = 116;
 pub const SYS_PTRACE: usize = 117;
 pub const SYS_SCHED_SETPARAM: usize = 118;

+ 1 - 1
crates/slab_allocator/src/slab_cache.rs

@@ -89,7 +89,7 @@ where
     Allocator: PageAlloc<RawPage = Raw>,
 {
     pub(crate) const fn new_in(object_size: u32) -> Self {
-        // avoid uncessary branch in alloc and dealloc
+        // avoid unnecessary branch in alloc and dealloc
         assert!(object_size <= PAGE_SIZE as u32 / 2);
 
         Self {

+ 41 - 4
src/fs/ext4.rs

@@ -1,5 +1,6 @@
-use core::sync::atomic::{AtomicU32, AtomicU64};
+use core::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 
+use crate::kernel::mem::{PageCache, PageCacheBackend};
 use crate::{
     io::{Buffer, ByteBuffer},
     kernel::{
@@ -18,6 +19,7 @@ use crate::{
     path::Path,
     prelude::*,
 };
+use alloc::sync::Weak;
 use alloc::{
     collections::btree_map::{BTreeMap, Entry},
     sync::Arc,
@@ -92,7 +94,7 @@ impl Ext4Fs {
                 let mode = *idata.mode.get_mut();
                 if s_isreg(mode) {
                     vacant
-                        .insert(Ext4Inode::File(Arc::new(FileInode { idata })))
+                        .insert(Ext4Inode::File(FileInode::new(idata)))
                         .clone()
                         .into_inner()
                 } else if s_isdir(mode) {
@@ -103,7 +105,7 @@ impl Ext4Fs {
                 } else {
                     println_warn!("ext4: Unsupported inode type: {mode:#o}");
                     vacant
-                        .insert(Ext4Inode::File(Arc::new(FileInode { idata })))
+                        .insert(Ext4Inode::File(FileInode::new(idata)))
                         .clone()
                         .into_inner()
                 }
@@ -174,15 +176,50 @@ impl Ext4Inode {
 }
 
 define_struct_inode! {
-    struct FileInode;
+    struct FileInode {
+        page_cache: PageCache,
+    }
 }
 
 define_struct_inode! {
     struct DirInode;
 }
 
+impl FileInode {
+    fn new(idata: InodeData) -> Arc<Self> {
+        Arc::new_cyclic(|weak_self: &Weak<FileInode>| Self {
+            idata,
+            page_cache: PageCache::new(weak_self.clone()),
+        })
+    }
+}
+
+impl PageCacheBackend for FileInode {
+    fn read_page(&self, page: &mut crate::kernel::mem::CachePage, offset: usize) -> KResult<usize> {
+        self.read_direct(page, offset)
+    }
+
+    fn write_page(&self, page: &crate::kernel::mem::CachePage, offset: usize) -> KResult<usize> {
+        todo!()
+    }
+
+    fn size(&self) -> usize {
+        self.size.load(Ordering::Relaxed) as usize
+    }
+}
+
 impl Inode for FileInode {
+    fn page_cache(&self) -> Option<&PageCache> {
+        Some(&self.page_cache)
+    }
+
     fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        Task::block_on(self.page_cache.read(buffer, offset))
+    }
+
+    fn read_direct(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
         let vfs = self.vfs.upgrade().ok_or(EIO)?;
         let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
 

+ 66 - 6
src/fs/fat32.rs

@@ -1,13 +1,19 @@
 mod dir;
 mod file;
 
+use crate::io::Stream;
 use crate::kernel::constants::EIO;
+use crate::kernel::mem::AsMemoryBlock;
+use crate::kernel::vfs::inode::WriteOffset;
 use crate::{
     io::{Buffer, ByteBuffer, UninitBuffer},
     kernel::{
         block::{make_device, BlockDevice, BlockDeviceRequest},
         constants::{S_IFDIR, S_IFREG},
-        mem::paging::Page,
+        mem::{
+            paging::Page,
+            {CachePage, PageCache, PageCacheBackend},
+        },
         vfs::{
             dentry::Dentry,
             inode::{define_struct_inode, Ino, Inode, InodeData},
@@ -32,6 +38,8 @@ use file::ClusterRead;
 
 type ClusterNo = u32;
 
+const SECTOR_SIZE: usize = 512;
+
 #[derive(Clone, Copy)]
 #[repr(C, packed)]
 struct Bootsector {
@@ -231,13 +239,16 @@ impl FatInode {
 }
 
 define_struct_inode! {
-    struct FileInode;
+    struct FileInode {
+        page_cache: PageCache,
+    }
 }
 
 impl FileInode {
     fn new(ino: Ino, weak: Weak<FatFs>, size: u32) -> Arc<Self> {
-        let inode = Arc::new(Self {
+        let inode = Arc::new_cyclic(|weak_self: &Weak<FileInode>| Self {
             idata: InodeData::new(ino, weak),
+            page_cache: PageCache::new(weak_self.clone()),
         });
 
         // Safety: We are initializing the inode
@@ -250,7 +261,15 @@ impl FileInode {
 }
 
 impl Inode for FileInode {
+    fn page_cache(&self) -> Option<&PageCache> {
+        Some(&self.page_cache)
+    }
+
     fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        Task::block_on(self.page_cache.read(buffer, offset))
+    }
+
+    fn read_direct(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
         let vfs = self.vfs.upgrade().ok_or(EIO)?;
         let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
         let fat = Task::block_on(vfs.fat.read());
@@ -259,16 +278,57 @@ impl Inode for FileInode {
             return Ok(0);
         }
 
-        let iter = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).read(vfs, offset);
+        let cluster_size = vfs.sectors_per_cluster as usize * SECTOR_SIZE;
+        assert!(cluster_size <= 0x1000, "Cluster size is too large");
+
+        let skip_clusters = offset / cluster_size;
+        let inner_offset = offset % cluster_size;
+
+        let cluster_iter =
+            ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).skip(skip_clusters);
+
+        let buffer_page = Page::alloc();
+        for cluster in cluster_iter {
+            vfs.read_cluster(cluster, &buffer_page)?;
+
+            let data = unsafe {
+                // SAFETY: We are the only one holding this page.
+                &buffer_page.as_memblk().as_bytes()[inner_offset..]
+            };
 
-        for data in iter {
-            if buffer.fill(data?)?.should_stop() {
+            let end = offset + data.len();
+            let real_end = core::cmp::min(end, self.size.load(Ordering::Relaxed) as usize);
+            let real_size = real_end - offset;
+
+            if buffer.fill(&data[..real_size])?.should_stop() {
                 break;
             }
         }
 
         Ok(buffer.wrote())
     }
+
+    fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+        todo!()
+    }
+
+    fn write_direct(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+        todo!()
+    }
+}
+
+impl PageCacheBackend for FileInode {
+    fn read_page(&self, page: &mut CachePage, offset: usize) -> KResult<usize> {
+        self.read_direct(page, offset)
+    }
+
+    fn write_page(&self, page: &CachePage, offset: usize) -> KResult<usize> {
+        todo!()
+    }
+
+    fn size(&self) -> usize {
+        self.size.load(Ordering::Relaxed) as usize
+    }
 }
 
 define_struct_inode! {
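
read_direct above decomposes a byte offset into a whole-cluster skip plus an intra-cluster offset before walking the FAT chain. A worked example of that arithmetic, assuming 8 sectors per cluster (4096-byte clusters):

fn main() {
    let cluster_size = 8 * 512; // sectors_per_cluster * SECTOR_SIZE = 4096
    let offset = 10_000usize;

    // 10_000 / 4096 = 2 full clusters skipped; 10_000 % 4096 = 1808 bytes in.
    let skip_clusters = offset / cluster_size;
    let inner_offset = offset % cluster_size;

    assert_eq!((skip_clusters, inner_offset), (2, 1808));
}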

+ 1 - 0
src/fs/mod.rs

@@ -1,4 +1,5 @@
 pub mod fat32;
 pub mod procfs;
+pub mod shm;
 pub mod tmpfs;
 pub mod ext4;

+ 146 - 0
src/fs/shm.rs

@@ -0,0 +1,146 @@
+use core::sync::atomic::{AtomicU32, Ordering};
+
+use alloc::{collections::btree_map::BTreeMap, sync::Arc};
+use bitflags::bitflags;
+use eonix_sync::{LazyLock, Mutex};
+
+use crate::{
+    fs::tmpfs::{DirectoryInode, FileInode, TmpFs},
+    kernel::{constants::ENOSPC, vfs::inode::Mode},
+    prelude::KResult,
+};
+
+bitflags! {
+    #[derive(Debug, Clone, Copy)]
+    pub struct ShmFlags: u32 {
+        /// Create a new segment. If this flag is not used, then shmget() will
+        /// find the segment associated with key and check to see if the user
+        /// has permission to access the segment.
+        const IPC_CREAT = 0o1000;
+        /// This flag is used with IPC_CREAT to ensure that this call creates
+        /// the segment.  If the segment already exists, the call fails.
+        const IPC_EXCL = 0o2000;
+
+        /// Attach the segment for read-only access. If this flag is not specified,
+        /// the segment is attached for read and write access, and the process
+        /// must have read and write permission for the segment.
+        const SHM_RDONLY = 0o10000;
+        /// round attach address to SHMLBA boundary
+        const SHM_RND = 0o20000;
+        /// Allow the contents of the segment to be executed.
+        const SHM_EXEC = 0o100000;
+    }
+}
+
+pub const IPC_PRIVATE: usize = 0;
+
+pub struct ShmManager {
+    tmpfs: Arc<TmpFs>,
+    root: Arc<DirectoryInode>,
+    areas: BTreeMap<u32, ShmArea>,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy, Debug)]
+pub struct IpcPerm {
+    key: i32,
+    uid: u32,
+    gid: u32,
+    cuid: u32,
+    cgid: u32,
+    mode: u16,
+    seq: u16,
+}
+
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+pub struct ShmIdDs {
+    // Ownership and permissions
+    pub shm_perm: IpcPerm,
+    // Size of segment (bytes). In our system, this must be aligned
+    pub shm_segsz: usize,
+    // Last attach time
+    pub shm_atime: usize,
+    // Last detach time
+    pub shm_dtime: usize,
+    // Creation time/time of last modification via shmctl()
+    pub shm_ctime: usize,
+    // PID of creator
+    pub shm_cpid: usize,
+    // PID of last shmat(2)/shmdt(2)
+    pub shm_lpid: usize,
+    // No. of current attaches
+    pub shm_nattch: usize,
+}
+
+impl ShmIdDs {
+    fn new(size: usize, pid: u32) -> Self {
+        Self {
+            shm_perm: IpcPerm::default(),
+            shm_segsz: size,
+            shm_atime: 0,
+            shm_dtime: 0,
+            shm_ctime: 0, // Should be set to the current instant
+            shm_cpid: pid as usize,
+            shm_lpid: 0,
+            shm_nattch: 0,
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct ShmArea {
+    pub area: Arc<FileInode>,
+    pub shmid_ds: ShmIdDs,
+}
+
+// A big lock here to protect the shared memory area.
+// Can be improved with finer-grained locking?
+pub static SHM_MANAGER: LazyLock<Mutex<ShmManager>> =
+    LazyLock::new(|| Mutex::new(ShmManager::new()));
+
+impl ShmManager {
+    fn new() -> Self {
+        let (tmpfs, root) = TmpFs::create(false).expect("should create shm_area successfully");
+        Self {
+            tmpfs,
+            root,
+            areas: BTreeMap::new(),
+        }
+    }
+
+    pub fn create_shared_area(&self, size: usize, pid: u32, mode: Mode) -> ShmArea {
+        let ino = self.tmpfs.assign_ino();
+        let vfs = Arc::downgrade(&self.tmpfs);
+        ShmArea {
+            area: FileInode::new(ino, vfs, size, mode),
+            shmid_ds: ShmIdDs::new(size, pid),
+        }
+    }
+
+    pub fn get(&self, shmid: u32) -> Option<&ShmArea> {
+        self.areas.get(&shmid)
+    }
+
+    pub fn insert(&mut self, shmid: u32, area: ShmArea) {
+        self.areas.insert(shmid, area);
+    }
+}
+
+pub fn gen_shm_id(key: usize) -> KResult<u32> {
+    const SHM_MAGIC: u32 = 114514000;
+
+    static NEXT_SHMID: AtomicU32 = AtomicU32::new(0);
+
+    if key == IPC_PRIVATE {
+        let shmid = NEXT_SHMID.fetch_add(1, Ordering::Relaxed);
+
+        if shmid >= SHM_MAGIC {
+            return Err(ENOSPC);
+        } else {
+            return Ok(shmid);
+        }
+    }
+
+    (key as u32).checked_add(SHM_MAGIC).ok_or(ENOSPC)
+}
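
gen_shm_id partitions the id space: IPC_PRIVATE allocations count up from 0 and must stay below SHM_MAGIC, while key-based ids are offset upward by SHM_MAGIC, so the two ranges cannot collide. A self-contained sketch of the same scheme (error type simplified to a string):

use std::sync::atomic::{AtomicU32, Ordering};

const SHM_MAGIC: u32 = 114514000;
const IPC_PRIVATE: usize = 0;
static NEXT_SHMID: AtomicU32 = AtomicU32::new(0);

fn gen_shm_id(key: usize) -> Result<u32, &'static str> {
    if key == IPC_PRIVATE {
        let shmid = NEXT_SHMID.fetch_add(1, Ordering::Relaxed);
        // Private ids live strictly below SHM_MAGIC.
        if shmid >= SHM_MAGIC { Err("ENOSPC") } else { Ok(shmid) }
    } else {
        // Key-derived ids live at or above SHM_MAGIC.
        (key as u32).checked_add(SHM_MAGIC).ok_or("ENOSPC")
    }
}

fn main() {
    assert_eq!(gen_shm_id(IPC_PRIVATE), Ok(0));
    assert_eq!(gen_shm_id(42), Ok(42 + SHM_MAGIC));
}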

+ 54 - 46
src/fs/tmpfs.rs

@@ -1,6 +1,8 @@
 use crate::io::Stream;
 use crate::kernel::constants::{EEXIST, EINVAL, EIO, EISDIR, ENOENT, ENOSYS, ENOTDIR};
+use crate::kernel::mem::{CachePage, PageCache, PageCacheBackend};
 use crate::kernel::timer::Instant;
+use crate::kernel::vfs::inode::InodeData;
 use crate::kernel::vfs::inode::RenameData;
 use crate::{
     io::Buffer,
@@ -16,7 +18,9 @@ use crate::{
     prelude::*,
 };
 use alloc::sync::{Arc, Weak};
+use core::fmt::Debug;
 use core::{ops::ControlFlow, sync::atomic::Ordering};
+use eonix_mm::paging::PAGE_SIZE;
 use eonix_runtime::task::Task;
 use eonix_sync::{AsProof as _, AsProofMut as _, Locked, Mutex, ProofMut};
 use itertools::Itertools;
@@ -58,7 +62,7 @@ impl Inode for NodeInode {
 }
 
 define_struct_inode! {
-    struct DirectoryInode {
+    pub(super) struct DirectoryInode {
         entries: Locked<Vec<(Arc<[u8]>, Ino)>, ()>,
     }
 }
@@ -152,7 +156,7 @@ impl Inode for DirectoryInode {
         let rwsem = Task::block_on(self.rwsem.write());
 
         let ino = vfs.assign_ino();
-        let file = FileInode::new(ino, self.vfs.clone(), mode);
+        let file = FileInode::new(ino, self.vfs.clone(), 0, mode);
 
         self.link(at.get_name(), file.as_ref(), rwsem.prove_mut());
         at.save_reg(file)
@@ -460,40 +464,60 @@ impl Inode for SymlinkInode {
 }
 
 define_struct_inode! {
-    struct FileInode {
-        filedata: Locked<Vec<u8>, ()>,
+    pub struct FileInode {
+        pages: PageCache,
+    }
+}
+
+impl Debug for FileInode {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "FileInode({:?})", self.idata)
     }
 }
 
 impl FileInode {
-    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
-        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
-            addr_of_mut_field!(inode, filedata).write(Locked::new(vec![], rwsem));
+    pub fn new(ino: Ino, vfs: Weak<dyn Vfs>, size: usize, mode: Mode) -> Arc<Self> {
+        let inode = Arc::new_cyclic(|weak_self: &Weak<FileInode>| FileInode {
+            idata: InodeData::new(ino, vfs),
+            pages: PageCache::new(weak_self.clone()),
+        });
 
-            addr_of_mut_field!(&mut *inode, mode).write((S_IFREG | (mode & 0o777)).into());
-            addr_of_mut_field!(&mut *inode, nlink).write(1.into());
-            addr_of_mut_field!(&mut *inode, ctime).write(Spin::new(Instant::now()));
-            addr_of_mut_field!(&mut *inode, mtime).write(Spin::new(Instant::now()));
-            addr_of_mut_field!(&mut *inode, atime).write(Spin::new(Instant::now()));
-        })
+        inode
+            .mode
+            .store(S_IFREG | (mode & 0o777), Ordering::Relaxed);
+        inode.nlink.store(1, Ordering::Relaxed);
+        inode.size.store(size as u64, Ordering::Relaxed);
+        inode
+    }
+}
+
+impl PageCacheBackend for FileInode {
+    fn read_page(&self, _cache_page: &mut CachePage, _offset: usize) -> KResult<usize> {
+        Ok(PAGE_SIZE)
+    }
+
+    fn write_page(&self, _page: &CachePage, _offset: usize) -> KResult<usize> {
+        Ok(PAGE_SIZE)
+    }
+
+    fn size(&self) -> usize {
+        self.size.load(Ordering::Relaxed) as usize
     }
 }
 
 impl Inode for FileInode {
-    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
-        // TODO: We don't need that strong guarantee, find some way to avoid locks
-        let lock = Task::block_on(self.rwsem.read());
+    fn page_cache(&self) -> Option<&PageCache> {
+        Some(&self.pages)
+    }
 
-        match self.filedata.access(lock.prove()).split_at_checked(offset) {
-            Some((_, data)) => buffer.fill(data).map(|result| result.allow_partial()),
-            None => Ok(0),
-        }
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        let lock = Task::block_on(self.rwsem.write());
+        Task::block_on(self.pages.read(buffer, offset))
     }
 
     fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
         // TODO: We don't need that strong guarantee, find some way to avoid locks
         let lock = Task::block_on(self.rwsem.write());
-        let filedata = self.filedata.access_mut(lock.prove_mut());
 
         let mut store_new_end = None;
         let offset = match offset {
@@ -506,41 +530,25 @@ impl Inode for FileInode {
             }
         };
 
-        let mut pos = offset;
-        loop {
-            if pos >= filedata.len() {
-                filedata.resize(pos + 4096, 0);
-            }
+        let wrote = Task::block_on(self.pages.write(stream, offset))?;
+        let cursor_end = offset + wrote;
 
-            match stream.poll_data(&mut filedata[pos..])? {
-                Some(data) => pos += data.len(),
-                None => break,
-            }
-        }
-
-        filedata.resize(pos, 0);
         if let Some(store_end) = store_new_end {
-            *store_end = pos;
+            *store_end = cursor_end;
         }
 
         // SAFETY: `lock` has done the synchronization
-        self.size.store(pos as u64, Ordering::Relaxed);
         *self.mtime.lock() = Instant::now();
+        self.size.store(cursor_end as u64, Ordering::Relaxed);
 
-        Ok(pos - offset)
+        Ok(wrote)
     }
 
     fn truncate(&self, length: usize) -> KResult<()> {
-        // TODO: We don't need that strong guarantee, find some way to avoid locks
         let lock = Task::block_on(self.rwsem.write());
-        let filedata = self.filedata.access_mut(lock.prove_mut());
-
-        // SAFETY: `lock` has done the synchronization
+        Task::block_on(self.pages.resize(length))?;
         self.size.store(length as u64, Ordering::Relaxed);
         *self.mtime.lock() = Instant::now();
-
-        filedata.resize(length, 0);
-
         Ok(())
     }
 
@@ -559,7 +567,7 @@ impl Inode for FileInode {
 }
 
 impl_any!(TmpFs);
-struct TmpFs {
+pub(super) struct TmpFs {
     next_ino: AtomicIno,
     readonly: bool,
     rename_lock: Mutex<()>,
@@ -580,11 +588,11 @@ impl Vfs for TmpFs {
 }
 
 impl TmpFs {
-    fn assign_ino(&self) -> Ino {
+    pub(super) fn assign_ino(&self) -> Ino {
         self.next_ino.fetch_add(1, Ordering::AcqRel)
     }
 
-    pub fn create(readonly: bool) -> KResult<(Arc<dyn Vfs>, Arc<dyn Inode>)> {
+    pub fn create(readonly: bool) -> KResult<(Arc<TmpFs>, Arc<DirectoryInode>)> {
         let tmpfs = Arc::new(Self {
             next_ino: AtomicIno::new(1),
             readonly,

+ 1 - 0
src/io.rs

@@ -3,6 +3,7 @@ use crate::prelude::*;
 use core::{cmp, mem::MaybeUninit};
 
 #[must_use]
+#[derive(Debug)]
 pub enum FillResult {
     Done(usize),
     Partial(usize),

+ 2 - 0
src/kernel/constants.rs

@@ -15,6 +15,7 @@ pub const SIG_SETMASK: u32 = 2;
 
 pub const CLOCK_REALTIME: u32 = 0;
 pub const CLOCK_MONOTONIC: u32 = 1;
+pub const CLOCK_REALTIME_COARSE: u32 = 5;
 
 pub const EPERM: u32 = 1;
 pub const ENOENT: u32 = 2;
@@ -35,6 +36,7 @@ pub const ENOTDIR: u32 = 20;
 pub const EISDIR: u32 = 21;
 pub const EINVAL: u32 = 22;
 pub const ENOTTY: u32 = 25;
+pub const ENOSPC: u32 = 28;
 pub const ESPIPE: u32 = 29;
 // pub const EROFS: u32 = 30;
 pub const EPIPE: u32 = 32;

+ 2 - 0
src/kernel/mem.rs

@@ -6,9 +6,11 @@ mod allocator;
 mod mm_area;
 mod mm_list;
 mod page_alloc;
+mod page_cache;
 
 pub use access::{AsMemoryBlock, MemoryBlock, PhysAccess};
 pub(self) use mm_area::MMArea;
 pub use mm_list::{handle_kernel_page_fault, FileMapping, MMList, Mapping, Permission};
 pub use page_alloc::{GlobalPageAlloc, RawPage};
+pub use page_cache::{CachePage, PageCache, PageCacheBackend};
 pub use paging::{Page, PageBuffer};

+ 69 - 26
src/kernel/mem/mm_area.rs

@@ -1,18 +1,22 @@
 use super::mm_list::EMPTY_PAGE;
 use super::paging::AllocZeroed as _;
 use super::{AsMemoryBlock, Mapping, Page, Permission};
-use crate::io::ByteBuffer;
+use crate::kernel::constants::EINVAL;
+use crate::kernel::mem::page_cache::PageCacheRawPage;
 use crate::KResult;
-use core::{borrow::Borrow, cell::UnsafeCell, cmp::Ordering};
+use core::sync::atomic;
+use core::{borrow::Borrow, cell::UnsafeCell, cmp};
 use eonix_mm::address::{AddrOps as _, VAddr, VRange};
 use eonix_mm::page_table::{PageAttribute, RawAttribute, PTE};
-use eonix_mm::paging::PFN;
+use eonix_mm::paging::{PAGE_SIZE, PFN};
+use eonix_runtime::task::Task;
 
 #[derive(Debug)]
 pub struct MMArea {
     range: UnsafeCell<VRange>,
     pub(super) mapping: Mapping,
     pub(super) permission: Permission,
+    pub is_shared: bool,
 }
 
 impl Clone for MMArea {
@@ -21,16 +25,18 @@ impl Clone for MMArea {
             range: UnsafeCell::new(self.range()),
             mapping: self.mapping.clone(),
             permission: self.permission,
+            is_shared: self.is_shared,
         }
     }
 }
 
 impl MMArea {
-    pub fn new(range: VRange, mapping: Mapping, permission: Permission) -> Self {
+    pub fn new(range: VRange, mapping: Mapping, permission: Permission, is_shared: bool) -> Self {
         Self {
             range: range.into(),
             mapping,
             permission,
+            is_shared,
         }
     }
 
@@ -56,9 +62,9 @@ impl MMArea {
         assert!(at.is_page_aligned());
 
         match self.range_borrow().cmp(&VRange::from(at)) {
-            Ordering::Less => (Some(self), None),
-            Ordering::Greater => (None, Some(self)),
-            Ordering::Equal => {
+            cmp::Ordering::Less => (Some(self), None),
+            cmp::Ordering::Greater => (None, Some(self)),
+            cmp::Ordering::Equal => {
                 let diff = at - self.range_borrow().start();
                 if diff == 0 {
                     return (None, Some(self));
@@ -71,6 +77,7 @@ impl MMArea {
                         Mapping::Anonymous => Mapping::Anonymous,
                         Mapping::File(mapping) => Mapping::File(mapping.offset(diff)),
                     },
+                    is_shared: self.is_shared,
                 };
 
                 let new_range = self.range_borrow().shrink(self.range_borrow().end() - at);
@@ -119,35 +126,71 @@ impl MMArea {
 
     /// # Arguments
     /// * `offset`: The offset from the start of the mapping, aligned to 4KB boundary.
-    pub fn handle_mmap(
+    pub async fn handle_mmap(
         &self,
         pfn: &mut PFN,
         attr: &mut PageAttribute,
         offset: usize,
+        write: bool,
     ) -> KResult<()> {
-        // TODO: Implement shared mapping
-        let Mapping::File(mapping) = &self.mapping else {
+        let Mapping::File(file_mapping) = &self.mapping else {
             panic!("Anonymous mapping should not be PA_MMAP");
         };
 
-        assert!(offset < mapping.length, "Offset out of range");
-        unsafe {
-            Page::with_raw(*pfn, |page| {
-                // SAFETY: `page` is marked as mapped, so others trying to read or write to
-                //         it will be blocked and enter the page fault handler, where they will
-                //         be blocked by the mutex held by us.
-                let page_data = page.as_memblk().as_bytes_mut();
+        assert!(offset < file_mapping.length, "Offset out of range");
 
-                let cnt_to_read = (mapping.length - offset).min(0x1000);
-                let cnt_read = mapping.file.read(
-                    &mut ByteBuffer::new(&mut page_data[..cnt_to_read]),
-                    mapping.offset + offset,
-                )?;
+        let Some(page_cache) = file_mapping.file.page_cache() else {
+            panic!("Mapping file should have pagecache");
+        };
 
-                page_data[cnt_read..].fill(0);
+        let file_offset = file_mapping.offset + offset;
+        let cnt_to_read = (file_mapping.length - offset).min(0x1000);
+        let raw_page = page_cache.get_page(file_offset).await?.ok_or(EINVAL)?;
+
+        // Non-write faults: we find page in pagecache and do mapping
+        // Write fault: we need to care about shared or private mapping.
+        if !write {
+            // Bss is embarrassing in pagecache!
+            // We have to assume cnt_to_read < PAGE_SIZE all bss
+            if cnt_to_read < PAGE_SIZE {
+                let new_page = Page::zeroed();
+                unsafe {
+                    let page_data = new_page.as_memblk().as_bytes_mut();
+                    page_data[..cnt_to_read]
+                        .copy_from_slice(&raw_page.as_memblk().as_bytes()[..cnt_to_read]);
+                }
+                *pfn = new_page.into_raw();
+            } else {
+                raw_page.refcount().fetch_add(1, atomic::Ordering::Relaxed);
+                *pfn = Into::<PFN>::into(raw_page);
+            }
+
+            if self.permission.write {
+                if self.is_shared {
+                    // The page may never actually be written to,
+                    // but we conservatively assume it will be dirtied.
+                    raw_page.set_dirty();
+                    attr.insert(PageAttribute::WRITE);
+                } else {
+                    attr.insert(PageAttribute::COPY_ON_WRITE);
+                }
+            }
+        } else {
+            if self.is_shared {
+                raw_page.refcount().fetch_add(1, atomic::Ordering::Relaxed);
+                raw_page.set_dirty();
+                *pfn = Into::<PFN>::into(raw_page);
+            } else {
+                let new_page = Page::zeroed();
+                unsafe {
+                    let page_data = new_page.as_memblk().as_bytes_mut();
+                    page_data[..cnt_to_read]
+                        .copy_from_slice(&raw_page.as_memblk().as_bytes()[..cnt_to_read]);
+                }
+                *pfn = new_page.into_raw();
+            }
 
-                KResult::Ok(())
-            })?;
+            attr.insert(PageAttribute::WRITE);
         }
 
         attr.insert(PageAttribute::PRESENT);
@@ -164,7 +207,7 @@ impl MMArea {
         }
 
         if attr.contains(PageAttribute::MAPPED) {
-            self.handle_mmap(&mut pfn, &mut attr, offset)?;
+            Task::block_on(self.handle_mmap(&mut pfn, &mut attr, offset, write))?;
         }
 
         attr.insert(PageAttribute::ACCESSED);
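
handle_mmap above branches on the fault kind (read vs. write) and the mapping's visibility (shared vs. private). Reduced to a match for clarity (the enum and function below are illustrative, not kernel types; the zero-fill of partial pages is omitted):

#[derive(Debug, PartialEq)]
enum Action {
    MapReadOnly,    // map the cache page with no write permission
    MapCow,         // map the cache page; defer writes via copy-on-write
    MapSharedDirty, // map the cache page writable and mark it dirty
    CopyPrivate,    // copy into a fresh private page, writable
}

// Illustrative reduction of handle_mmap's branches.
fn decide(write_fault: bool, shared: bool, writable: bool) -> Action {
    match (write_fault, shared, writable) {
        (false, _, false) => Action::MapReadOnly,
        (false, true, true) => Action::MapSharedDirty,
        (false, false, true) => Action::MapCow,
        (true, true, _) => Action::MapSharedDirty,
        (true, false, _) => Action::CopyPrivate,
    }
}

fn main() {
    assert_eq!(decide(false, false, true), Action::MapCow);
    assert_eq!(decide(true, false, true), Action::CopyPrivate);
}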

+ 54 - 12
src/kernel/mem/mm_list.rs

@@ -6,6 +6,7 @@ use super::page_alloc::GlobalPageAlloc;
 use super::paging::AllocZeroed as _;
 use super::{AsMemoryBlock, MMArea, Page};
 use crate::kernel::constants::{EEXIST, EFAULT, EINVAL, ENOMEM};
+use crate::kernel::mem::page_alloc::RawPagePtr;
 use crate::{prelude::*, sync::ArcSwap};
 use alloc::collections::btree_set::BTreeSet;
 use core::fmt;
@@ -256,6 +257,7 @@ impl MMListInner<'_> {
         len: usize,
         mapping: Mapping,
         permission: Permission,
+        is_shared: bool,
     ) -> KResult<()> {
         assert_eq!(at.floor(), at);
         assert_eq!(len & (PAGE_SIZE - 1), 0);
@@ -271,13 +273,33 @@ impl MMListInner<'_> {
             Mapping::File(_) => self.page_table.set_mmapped(range, permission),
         }
 
-        self.areas.insert(MMArea::new(range, mapping, permission));
+        self.areas
+            .insert(MMArea::new(range, mapping, permission, is_shared));
         Ok(())
     }
 }
 
 impl Drop for MMListInner<'_> {
     fn drop(&mut self) {
+        // May be buggy.
+        for area in &self.areas {
+            if area.is_shared {
+                for pte in self.page_table.iter_user(area.range()) {
+                    let (pfn, _) = pte.take();
+                    let raw_page = RawPagePtr::from(pfn);
+                    if raw_page.refcount().fetch_sub(1, Ordering::Relaxed) == 1 {
+                        // Wrong here
+                        // unsafe { Page::from_raw(pfn) };
+                    }
+                }
+            } else {
+                for pte in self.page_table.iter_user(area.range()) {
+                    let (pfn, _) = pte.take();
+                    unsafe { Page::from_raw(pfn) };
+                }
+            }
+        }
+
         // TODO: Recycle all pages in the page table.
     }
 }
@@ -343,9 +365,15 @@ impl MMList {
             let list_inner = list_inner.lock().await;
 
             for area in list_inner.areas.iter() {
-                list_inner
-                    .page_table
-                    .set_copy_on_write(&mut inner.page_table, area.range());
+                if !area.is_shared {
+                    list_inner
+                        .page_table
+                        .set_copy_on_write(&mut inner.page_table, area.range());
+                } else {
+                    list_inner
+                        .page_table
+                        .set_copied(&mut inner.page_table, area.range());
+                }
             }
         }
 
@@ -507,21 +535,22 @@ impl MMList {
         len: usize,
         mapping: Mapping,
         permission: Permission,
+        is_shared: bool,
     ) -> KResult<VAddr> {
         let inner = self.inner.borrow();
         let mut inner = Task::block_on(inner.lock());
 
         if hint == VAddr::NULL {
             let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
-            inner.mmap(at, len, mapping, permission)?;
+            inner.mmap(at, len, mapping, permission, is_shared)?;
             return Ok(at);
         }
 
-        match inner.mmap(hint, len, mapping.clone(), permission) {
+        match inner.mmap(hint, len, mapping.clone(), permission, is_shared) {
             Ok(()) => Ok(hint),
             Err(EEXIST) => {
                 let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
-                inner.mmap(at, len, mapping, permission)?;
+                inner.mmap(at, len, mapping, permission, is_shared)?;
                 Ok(at)
             }
             Err(err) => Err(err),
@@ -534,9 +563,10 @@ impl MMList {
         len: usize,
         mapping: Mapping,
         permission: Permission,
+        is_shared: bool,
     ) -> KResult<VAddr> {
         Task::block_on(self.inner.borrow().lock())
-            .mmap(at, len, mapping.clone(), permission)
+            .mmap(at, len, mapping.clone(), permission, is_shared)
             .map(|_| at)
     }
 
@@ -571,6 +601,7 @@ impl MMList {
                     write: true,
                     execute: false,
                 },
+                false,
             ));
         }
 
@@ -644,6 +675,7 @@ impl MMList {
                 let page_start = current.floor() + idx * 0x1000;
                 let page_end = page_start + 0x1000;
 
+                // Prepare for the worst case that we might write to the page...
                 area.handle(pte, page_start - area_start, true)?;
 
                 let start_offset;
@@ -692,6 +724,7 @@ trait PageTableExt {
     fn set_anonymous(&self, range: VRange, permission: Permission);
     fn set_mmapped(&self, range: VRange, permission: Permission);
     fn set_copy_on_write(&self, from: &Self, range: VRange);
+    fn set_copied(&self, from: &Self, range: VRange);
 }
 
 impl PageTableExt for KernelPageTable<'_> {
@@ -715,10 +748,22 @@ impl PageTableExt for KernelPageTable<'_> {
             to.set_copy_on_write(from);
         }
     }
+
+    fn set_copied(&self, from: &Self, range: VRange) {
+        let to_iter = self.iter_user(range);
+        let from_iter = from.iter_user(range);
+
+        for (to, from) in to_iter.zip(from_iter) {
+            let (pfn, attr) = from.get();
+            to.set(pfn, attr);
+        }
+    }
 }
 
 trait PTEExt {
+    // private anonymous
     fn set_anonymous(&mut self, execute: bool);
+    // file mapped or shared anonymous
     fn set_mapped(&mut self, execute: bool);
     fn set_copy_on_write(&mut self, from: &mut Self);
 }
@@ -742,10 +787,7 @@ where
     fn set_mapped(&mut self, execute: bool) {
         // Writable flag is set during page fault handling while executable flag is
         // preserved across page faults, so we set executable flag now.
-        let mut attr = PageAttribute::READ
-            | PageAttribute::USER
-            | PageAttribute::MAPPED
-            | PageAttribute::COPY_ON_WRITE;
+        let mut attr = PageAttribute::READ | PageAttribute::USER | PageAttribute::MAPPED;
         attr.set(PageAttribute::EXECUTE, execute);
 
         self.set(EMPTY_PAGE.clone().into_raw(), T::Attr::from(attr));
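
On fork, private areas still go through set_copy_on_write, while shared areas are now copied verbatim with set_copied so both address spaces keep pointing at the same physical pages. A conceptual sketch, with a plain struct standing in for a PTE:

#[derive(Clone, Copy, Debug, PartialEq)]
struct Pte {
    pfn: u64,
    writable: bool,
    cow: bool,
}

// Private mapping: both parent and child lose write access and gain CoW,
// so the first write after fork triggers a page copy.
fn set_copy_on_write(parent: &mut Pte) -> Pte {
    parent.writable = false;
    parent.cow = true;
    *parent
}

// Shared mapping: the child gets an identical entry; the page stays shared.
fn set_copied(parent: &Pte) -> Pte {
    *parent
}

fn main() {
    let mut private = Pte { pfn: 42, writable: true, cow: false };
    let child = set_copy_on_write(&mut private);
    assert!(!child.writable && child.cow);

    let shared = Pte { pfn: 43, writable: true, cow: false };
    assert_eq!(set_copied(&shared), shared);
}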

+ 14 - 3
src/kernel/mem/mm_list/mapping.rs

@@ -1,23 +1,34 @@
-use crate::kernel::vfs::dentry::Dentry;
+use core::fmt::Debug;
+
+use crate::kernel::vfs::inode::Inode;
 use alloc::sync::Arc;
 use eonix_mm::paging::PAGE_SIZE;
 
 #[derive(Debug, Clone)]
 pub struct FileMapping {
-    pub file: Arc<Dentry>,
+    pub file: Arc<dyn Inode>,
     /// Offset in the file, aligned to 4KB boundary.
     pub offset: usize,
     /// Length of the mapping. Exceeding part will be zeroed.
     pub length: usize,
 }
+
+impl Debug for dyn Inode {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "Inode()")
+    }
+}
+
 #[derive(Debug, Clone)]
 pub enum Mapping {
+    // private anonymous memory
     Anonymous,
+    // file-backed memory or shared anonymous memory (tmpfs file)
     File(FileMapping),
 }
 
 impl FileMapping {
-    pub fn new(file: Arc<Dentry>, offset: usize, length: usize) -> Self {
+    pub fn new(file: Arc<dyn Inode>, offset: usize, length: usize) -> Self {
         assert_eq!(offset & (PAGE_SIZE - 1), 0);
         Self {
             file,

+ 47 - 3
src/kernel/mem/page_alloc/raw_page.rs

@@ -1,10 +1,12 @@
-use crate::kernel::mem::PhysAccess;
+use crate::kernel::mem::{page_cache::PageCacheRawPage, MemoryBlock};
+use crate::kernel::mem::{AsMemoryBlock, PhysAccess};
 use buddy_allocator::BuddyRawPage;
 use core::{
     ptr::NonNull,
     sync::atomic::{AtomicU32, AtomicUsize, Ordering},
 };
 use eonix_hal::mm::ArchPhysAccess;
+use eonix_mm::paging::PAGE_SIZE;
 use eonix_mm::{
     address::{PAddr, PhysAccess as _},
     paging::{RawPage as RawPageTrait, PFN},
@@ -31,11 +33,16 @@ impl SlabPageInner {
     }
 }
 
+struct PageCacheInner {
+    valid_size: usize,
+}
+
 pub struct BuddyPageInner {}
 
 enum PageType {
     Buddy(BuddyPageInner),
     Slab(SlabPageInner),
+    PageCache(PageCacheInner),
 }
 
 impl PageType {
@@ -46,6 +53,14 @@ impl PageType {
             unreachable!()
         }
     }
+
+    fn page_cache_data(&mut self) -> &mut PageCacheInner {
+        if let PageType::PageCache(cache_data) = self {
+            return cache_data;
+        } else {
+            unreachable!()
+        }
+    }
 }
 
 pub struct RawPage {
@@ -69,8 +84,8 @@ impl PageFlags {
     pub const PRESENT: u32 = 1 << 0;
     // pub const LOCKED: u32 = 1 << 1;
     pub const BUDDY: u32 = 1 << 2;
-    // pub const SLAB: u32 = 1 << 3;
-    // pub const DIRTY: u32 = 1 << 4;
+    pub const SLAB: u32 = 1 << 3;
+    pub const DIRTY: u32 = 1 << 4;
     pub const FREE: u32 = 1 << 5;
     pub const LOCAL: u32 = 1 << 6;
 
@@ -234,3 +249,32 @@ impl SlabRawPage for RawPagePtr {
         self.as_mut().shared_data = PageType::Slab(SlabPageInner::new(first_free));
     }
 }
+
+impl PageCacheRawPage for RawPagePtr {
+    fn valid_size(&self) -> &mut usize {
+        &mut self.as_mut().shared_data.page_cache_data().valid_size
+    }
+
+    fn is_dirty(&self) -> bool {
+        self.flags().has(PageFlags::DIRTY)
+    }
+
+    fn clear_dirty(&self) {
+        self.flags().clear(PageFlags::DIRTY);
+    }
+
+    fn set_dirty(&self) {
+        self.flags().set(PageFlags::DIRTY);
+    }
+
+    fn cache_init(&self) {
+        self.as_mut().shared_data = PageType::PageCache(PageCacheInner { valid_size: 0 });
+    }
+}
+
+/// SAFETY: `RawPagePtr` is a pointer to a valid `RawPage` struct.
+impl AsMemoryBlock for RawPagePtr {
+    fn as_memblk(&self) -> MemoryBlock {
+        unsafe { MemoryBlock::new(self.real_ptr::<()>().addr(), PAGE_SIZE) }
+    }
+}
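
shared_data gives each physical page a payload that depends on its current use (buddy, slab, or page cache), and the typed accessors deliberately panic if the page is of the wrong kind. A minimal sketch of that tagged-payload pattern (fields simplified):

// Per-page payload selected by how the page is currently used.
enum PageType {
    Buddy,
    Slab { first_free: usize },
    PageCache { valid_size: usize },
}

impl PageType {
    // Accessor that insists the page is a page-cache page, as in the kernel.
    fn page_cache_data(&mut self) -> &mut usize {
        match self {
            PageType::PageCache { valid_size } => valid_size,
            _ => unreachable!("not a page cache page"),
        }
    }
}

fn main() {
    let mut ty = PageType::PageCache { valid_size: 0 };
    *ty.page_cache_data() = 4096;
    let _ = (PageType::Buddy, PageType::Slab { first_free: 0 });
}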

+ 303 - 0
src/kernel/mem/page_cache.rs

@@ -0,0 +1,303 @@
+use super::access::AsMemoryBlock;
+use crate::{
+    io::{Buffer, FillResult, Stream},
+    kernel::mem::page_alloc::RawPagePtr,
+    prelude::KResult,
+    GlobalPageAlloc,
+};
+use align_ext::AlignExt;
+use alloc::{collections::btree_map::BTreeMap, sync::Weak};
+use eonix_mm::paging::{PageAlloc, RawPage, PAGE_SIZE, PAGE_SIZE_BITS};
+use eonix_sync::Mutex;
+
+pub struct PageCache {
+    pages: Mutex<BTreeMap<usize, CachePage>>,
+    backend: Weak<dyn PageCacheBackend>,
+}
+
+unsafe impl Send for PageCache {}
+unsafe impl Sync for PageCache {}
+
+#[derive(Clone, Copy)]
+pub struct CachePage(RawPagePtr);
+
+impl Buffer for CachePage {
+    fn total(&self) -> usize {
+        PAGE_SIZE
+    }
+
+    fn wrote(&self) -> usize {
+        self.valid_size()
+    }
+
+    fn fill(&mut self, data: &[u8]) -> KResult<FillResult> {
+        let valid_size = self.valid_size();
+        let available = &mut self.all_mut()[valid_size..];
+        if available.is_empty() {
+            return Ok(FillResult::Full);
+        }
+
+        let len = core::cmp::min(data.len(), available.len());
+        available[..len].copy_from_slice(&data[..len]);
+
+        *self.0.valid_size() += len;
+
+        if len < data.len() {
+            Ok(FillResult::Partial(len))
+        } else {
+            Ok(FillResult::Done(len))
+        }
+    }
+}
+
+impl CachePage {
+    pub fn new() -> Self {
+        let page = GlobalPageAlloc.alloc().unwrap();
+        page.cache_init();
+        Self(page)
+    }
+
+    pub fn new_zeroed() -> Self {
+        let page = GlobalPageAlloc.alloc().unwrap();
+        // SAFETY: We own the page exclusively, so we can safely zero it.
+        unsafe {
+            page.as_memblk().as_bytes_mut().fill(0);
+        }
+        page.cache_init();
+        Self(page)
+    }
+
+    pub fn valid_size(&self) -> usize {
+        *self.0.valid_size()
+    }
+
+    pub fn set_valid_size(&mut self, valid_size: usize) {
+        *self.0.valid_size() = valid_size;
+    }
+
+    pub fn all(&self) -> &[u8] {
+        unsafe {
+            self.0.as_memblk().as_bytes()
+        }
+    }
+
+    pub fn all_mut(&mut self) -> &mut [u8] {
+        unsafe {
+            self.0.as_memblk().as_bytes_mut()
+        }
+    }
+
+    pub fn valid_data(&self) -> &[u8] {
+        &self.all()[..self.valid_size()]
+    }
+
+    pub fn is_dirty(&self) -> bool {
+        self.0.is_dirty()
+    }
+
+    pub fn set_dirty(&self) {
+        self.0.set_dirty();
+    }
+
+    pub fn clear_dirty(&self) {
+        self.0.clear_dirty();
+    }
+}
+
+impl PageCache {
+    pub fn new(backend: Weak<dyn PageCacheBackend>) -> Self {
+        Self {
+            pages: Mutex::new(BTreeMap::new()),
+            backend,
+        }
+    }
+
+    pub async fn read(&self, buffer: &mut dyn Buffer, mut offset: usize) -> KResult<usize> {
+        let mut pages = self.pages.lock().await;
+
+        loop {
+            let page_id = offset >> PAGE_SIZE_BITS;
+            let page = pages.get(&page_id);
+
+            match page {
+                Some(page) => {
+                    let inner_offset = offset % PAGE_SIZE;
+
+                    // TODO: still causes unnecessary IO if valid_size < PAGE_SIZE
+                    //       and fill result is Done
+                    if page.valid_size() == 0
+                        || buffer
+                            .fill(&page.valid_data()[inner_offset..])?
+                            .should_stop()
+                        || buffer.available() == 0
+                    {
+                        break;
+                    }
+
+                    offset += PAGE_SIZE - inner_offset;
+                }
+                None => {
+                    let mut new_page = CachePage::new();
+                    self.backend
+                        .upgrade()
+                        .unwrap()
+                        .read_page(&mut new_page, offset.align_down(PAGE_SIZE))?;
+                    pages.insert(page_id, new_page);
+                }
+            }
+        }
+
+        Ok(buffer.wrote())
+    }
+
+    pub async fn write(&self, stream: &mut dyn Stream, mut offset: usize) -> KResult<usize> {
+        let mut pages = self.pages.lock().await;
+        let old_size = self.backend.upgrade().unwrap().size();
+        let mut wrote = 0;
+
+        loop {
+            let page_id = offset >> PAGE_SIZE_BITS;
+            let page = pages.get_mut(&page_id);
+
+            match page {
+                Some(page) => {
+                    let inner_offset = offset % PAGE_SIZE;
+                    let cursor_end = match stream.poll_data(&mut page.all_mut()[inner_offset..])? {
+                        Some(buf) => {
+                            wrote += buf.len();
+                            inner_offset + buf.len()
+                        }
+                        None => {
+                            break;
+                        }
+                    };
+
+                    if page.valid_size() < cursor_end {
+                        page.set_valid_size(cursor_end);
+                    }
+                    page.set_dirty();
+                    offset += PAGE_SIZE - inner_offset;
+                }
+                None => {
+                    let new_page = if (offset >> PAGE_SIZE_BITS) > (old_size >> PAGE_SIZE_BITS) {
+                        CachePage::new_zeroed()
+                    } else {
+                        let mut new_page = CachePage::new();
+                        self.backend
+                            .upgrade()
+                            .unwrap()
+                            .read_page(&mut new_page, offset.align_down(PAGE_SIZE))?;
+                        new_page
+                    };
+
+                    pages.insert(page_id, new_page);
+                }
+            }
+        }
+
+        Ok(wrote)
+    }
+
+    pub async fn fsync(&self) -> KResult<()> {
+        let pages = self.pages.lock().await;
+        for (page_id, page) in pages.iter() {
+            if page.is_dirty() {
+                self.backend
+                    .upgrade()
+                    .unwrap()
+                    .write_page(page, page_id << PAGE_SIZE_BITS)?;
+                page.clear_dirty();
+            }
+        }
+        Ok(())
+    }
+
+    // This function is used for extending writes and for truncation.
+    pub async fn resize(&self, new_size: usize) -> KResult<()> {
+        let mut pages = self.pages.lock().await;
+        let old_size = self.backend.upgrade().unwrap().size();
+
+        if new_size < old_size {
+            let begin = new_size.align_down(PAGE_SIZE) >> PAGE_SIZE_BITS;
+            let end = old_size.align_up(PAGE_SIZE) >> PAGE_SIZE_BITS;
+
+            for page_id in begin..end {
+                pages.remove(&page_id);
+            }
+        } else if new_size > old_size {
+            let begin = old_size.align_down(PAGE_SIZE) >> PAGE_SIZE_BITS;
+            let end = new_size.align_up(PAGE_SIZE) >> PAGE_SIZE_BITS;
+
+            // TODO: this replaces the old boundary page with zeros; its
+            //       previously cached contents are lost if not yet synced.
+            pages.remove(&begin);
+
+            for page_id in begin..end {
+                let mut new_page = CachePage::new_zeroed();
+
+                if page_id != end - 1 {
+                    new_page.set_valid_size(PAGE_SIZE);
+                } else {
+                    new_page.set_valid_size(new_size % PAGE_SIZE);
+                }
+                new_page.set_dirty();
+                pages.insert(page_id, new_page);
+            }
+        }
+
+        Ok(())
+    }
+
+    pub async fn get_page(&self, offset: usize) -> KResult<Option<RawPagePtr>> {
+        let offset_aligned = offset.align_down(PAGE_SIZE);
+        let page_id = offset_aligned >> PAGE_SIZE_BITS;
+        let size = self.backend.upgrade().unwrap().size();
+
+        if offset_aligned > size {
+            return Ok(None);
+        }
+
+        let mut pages = self.pages.lock().await;
+
+        if let Some(page) = pages.get(&page_id) {
+            Ok(Some(page.0))
+        } else {
+            let mut new_page = CachePage::new();
+            self.backend
+                .upgrade()
+                .unwrap()
+                .read_page(&mut new_page, offset_aligned)?;
+            // Take the raw pointer before the page is moved into the map.
+            let raw_page = new_page.0;
+            pages.insert(page_id, new_page);
+            Ok(Some(raw_page))
+        }
+    }
+}
+
+// With this trait, the "page cache" and the "block cache" are unified:
+// for a filesystem, offset is the file offset (aligned down to PAGE_SIZE);
+// for a block device, offset is the block index (aligned down to PAGE_SIZE / BLK_SIZE).
+// TODO: this unification can populate the cache unnecessarily.
+pub trait PageCacheBackend {
+    fn read_page(&self, page: &mut CachePage, offset: usize) -> KResult<usize>;
+
+    fn write_page(&self, page: &CachePage, offset: usize) -> KResult<usize>;
+
+    fn size(&self) -> usize;
+}
+
+pub trait PageCacheRawPage: RawPage {
+    fn valid_size(&self) -> usize;
+
+    fn set_valid_size(&self, size: usize);
+
+    fn is_dirty(&self) -> bool;
+
+    fn set_dirty(&self);
+
+    fn clear_dirty(&self);
+
+    fn cache_init(&self);
+}
+
+impl Drop for PageCache {
+    fn drop(&mut self) {
+        // `fsync` is async; block on it so that dirty pages are actually
+        // written back instead of the future being silently dropped.
+        let _ = Task::block_on(self.fsync());
+    }
+}
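A minimal, self-contained sketch of what a `PageCacheBackend` implementation could look like. The `MemBackend` type and its `Mutex<Vec<u8>>` "disk" are illustrative assumptions, and `KResult`/`CachePage` are simplified stand-ins for the kernel's own types, not code from this patch:

// Toy stand-ins so the sketch compiles on its own.
use std::sync::Mutex;

type KResult<T> = Result<T, u32>;
const PAGE_SIZE: usize = 4096;

struct CachePage {
    data: [u8; PAGE_SIZE],
    valid_size: usize,
}

trait PageCacheBackend {
    fn read_page(&self, page: &mut CachePage, offset: usize) -> KResult<usize>;
    fn write_page(&self, page: &CachePage, offset: usize) -> KResult<usize>;
    fn size(&self) -> usize;
}

// A toy backend whose "disk" is a Vec<u8> behind a Mutex, mirroring the
// interior mutability a real device driver would need.
struct MemBackend {
    disk: Mutex<Vec<u8>>,
}

impl PageCacheBackend for MemBackend {
    fn read_page(&self, page: &mut CachePage, offset: usize) -> KResult<usize> {
        let disk = self.disk.lock().unwrap();
        // Clamp the copy to whatever actually exists at `offset`.
        let start = offset.min(disk.len());
        let end = disk.len().min(offset + PAGE_SIZE);
        page.data[..end - start].copy_from_slice(&disk[start..end]);
        page.valid_size = end - start;
        Ok(end - start)
    }

    fn write_page(&self, page: &CachePage, offset: usize) -> KResult<usize> {
        let mut disk = self.disk.lock().unwrap();
        let end = offset + page.valid_size;
        if disk.len() < end {
            disk.resize(end, 0); // grow the backing store on extending writes
        }
        disk[offset..end].copy_from_slice(&page.data[..page.valid_size]);
        Ok(page.valid_size)
    }

    fn size(&self) -> usize {
        self.disk.lock().unwrap().len()
    }
}

fn main() {
    let backend = MemBackend { disk: Mutex::new(vec![7u8; 10_000]) };
    let mut page = CachePage { data: [0; PAGE_SIZE], valid_size: 0 };
    // The third page holds only the 10_000 - 8_192 = 1_808 trailing bytes.
    assert_eq!(backend.read_page(&mut page, 2 * PAGE_SIZE), Ok(1808));
    assert_eq!(backend.write_page(&page, 2 * PAGE_SIZE), Ok(1808));
    assert_eq!(backend.size(), 10_000);
}

A real backend would issue block-device or filesystem IO instead of memcpy, but the contract is the same: `read_page` fills a page and reports how many bytes are valid, and `write_page` persists `valid_size` bytes.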

+ 75 - 6
src/kernel/syscall/file_rw.rs

@@ -1,9 +1,12 @@
+use core::time::Duration;
+
 use super::FromSyscallArg;
 use crate::io::IntoStream;
 use crate::kernel::constants::{
-    EBADF, EFAULT, EINVAL, ENOENT, ENOTDIR, SEEK_CUR, SEEK_END, SEEK_SET, S_IFBLK, S_IFCHR,
+    EBADF, EFAULT, EINVAL, ENOENT, ENOSYS, ENOTDIR, SEEK_CUR, SEEK_END, SEEK_SET, S_IFBLK, S_IFCHR,
 };
 use crate::kernel::task::Thread;
+use crate::kernel::timer::sleep;
 use crate::kernel::vfs::filearray::FD;
 use crate::{
     io::{Buffer, BufferFill},
@@ -25,7 +28,8 @@ use eonix_runtime::task::Task;
 use posix_types::ctypes::{Long, PtrT};
 use posix_types::namei::RenameFlags;
 use posix_types::open::{AtFlags, OpenFlags};
-use posix_types::signal::SigSet;
+use posix_types::poll::FDSet;
+use posix_types::signal::{SigSet, Signal};
 use posix_types::stat::Stat;
 use posix_types::stat::{StatX, TimeSpec};
 use posix_types::syscall_no::*;
@@ -73,7 +77,20 @@ fn dentry_from(
 fn read(fd: FD, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
     let mut buffer = UserBuffer::new(buffer, bufsize)?;
 
-    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.read(&mut buffer))
+    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.read(&mut buffer, None))
+}
+
+#[eonix_macros::define_syscall(SYS_PREAD64)]
+fn pread64(fd: FD, buffer: *mut u8, bufsize: usize, offset: usize) -> KResult<usize> {
+    let mut buffer = UserBuffer::new(buffer, bufsize)?;
+
+    Task::block_on(
+        thread
+            .files
+            .get(fd)
+            .ok_or(EBADF)?
+            .read(&mut buffer, Some(offset)),
+    )
 }
 
 #[eonix_macros::define_syscall(SYS_WRITE)]
@@ -81,7 +98,21 @@ fn write(fd: FD, buffer: *const u8, count: usize) -> KResult<usize> {
     let buffer = CheckedUserPointer::new(buffer, count)?;
     let mut stream = buffer.into_stream();
 
-    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.write(&mut stream))
+    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.write(&mut stream, None))
+}
+
+#[eonix_macros::define_syscall(SYS_PWRITE64)]
+fn pwrite64(fd: FD, buffer: *const u8, count: usize, offset: usize) -> KResult<usize> {
+    let buffer = CheckedUserPointer::new(buffer, count)?;
+    let mut stream = buffer.into_stream();
+
+    Task::block_on(
+        thread
+            .files
+            .get(fd)
+            .ok_or(EBADF)?
+            .write(&mut stream, Some(offset)),
+    )
 }
 
 #[eonix_macros::define_syscall(SYS_OPENAT)]
@@ -229,6 +260,12 @@ fn mkdir(pathname: *const u8, mode: u32) -> KResult<()> {
     sys_mkdirat(thread, FD::AT_FDCWD, pathname, mode)
 }
 
+#[eonix_macros::define_syscall(SYS_FTRUNCATE64)]
+fn ftruncate64(fd: FD, length: usize) -> KResult<()> {
+    let file = thread.files.get(fd).ok_or(EBADF)?;
+    file.as_path().ok_or(EBADF)?.truncate(length)
+}
+
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_TRUNCATE)]
 fn truncate(pathname: *const u8, length: usize) -> KResult<()> {
@@ -353,7 +390,7 @@ fn readv(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
     let mut tot = 0usize;
     for mut buffer in iov_buffers.into_iter() {
         // TODO!!!: `readv`
-        let nread = Task::block_on(file.read(&mut buffer))?;
+        let nread = Task::block_on(file.read(&mut buffer, None))?;
         tot += nread;
 
         if nread != buffer.total() {
@@ -389,7 +426,7 @@ fn writev(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
 
     let mut tot = 0usize;
     for mut stream in iov_streams.into_iter() {
-        let nread = Task::block_on(file.write(&mut stream))?;
+        let nread = Task::block_on(file.write(&mut stream, None))?;
         tot += nread;
 
         if nread == 0 || !stream.is_drained() {
@@ -495,6 +532,38 @@ fn ppoll(
     do_poll(thread, fds, nfds, 0)
 }
 
+#[eonix_macros::define_syscall(SYS_PSELECT6)]
+fn pselect6(
+    nfds: u32,
+    _readfds: *mut FDSet,
+    _writefds: *mut FDSet,
+    _exceptfds: *mut FDSet,
+    timeout: *mut TimeSpec,
+    _sigmask: *const (),
+) -> KResult<usize> {
+    // According to [pselect6(2)](https://linux.die.net/man/2/pselect6):
+    // Some code calls select() with all three sets empty, nfds zero, and
+    // a non-NULL timeout as a fairly portable way to sleep with subsecond precision.
+    if nfds != 0 {
+        thread.raise(Signal::SIGSYS);
+        return Err(ENOSYS);
+    }
+
+    let timeout = UserPointerMut::new(timeout)?;
+
+    // Read here to check for invalid pointers.
+    let _timeout_value = timeout.read()?;
+
+    // Approximation: sleep for a fixed 10 ms rather than the requested timeout.
+    Task::block_on(sleep(Duration::from_millis(10)));
+
+    timeout.write(TimeSpec {
+        tv_sec: 0,
+        tv_nsec: 0,
+    })?;
+
+    Ok(0)
+}
+
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_POLL)]
 fn poll(fds: *mut UserPollFd, nfds: u32, timeout: u32) -> KResult<u32> {
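For context, the userspace idiom that the `pselect6` stub above caters to looks roughly like this. This is a hedged sketch assuming the `libc` crate; the `sleep_subsecond` helper is an illustrative name, not part of this patch:

// Userspace sketch of the "pselect as sub-second sleep" idiom: all FD
// sets empty, nfds == 0, and a non-NULL timeout.
use std::ptr;

fn sleep_subsecond(nanos: libc::c_long) {
    let ts = libc::timespec {
        tv_sec: 0,
        tv_nsec: nanos,
    };
    // SAFETY: every pointer is either null or points to a valid timespec.
    unsafe {
        libc::pselect(
            0,               // nfds: no file descriptors watched
            ptr::null_mut(), // readfds
            ptr::null_mut(), // writefds
            ptr::null_mut(), // exceptfds
            &ts,             // timeout: the requested sleep duration
            ptr::null(),     // sigmask: keep the current signal mask
        );
    }
}

fn main() {
    sleep_subsecond(50_000_000); // sleep for roughly 50 ms
}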

+ 140 - 45
src/kernel/syscall/mm.rs

@@ -1,6 +1,9 @@
 use super::FromSyscallArg;
-use crate::kernel::constants::{EINVAL, ENOMEM};
+use crate::fs::shm::{gen_shm_id, ShmFlags, IPC_PRIVATE, SHM_MANAGER};
+use crate::kernel::constants::{EBADF, EEXIST, EINVAL, ENOENT, ENOMEM};
+use crate::kernel::mem::FileMapping;
 use crate::kernel::task::Thread;
+use crate::kernel::vfs::filearray::FD;
 use crate::{
     kernel::{
         constants::{UserMmapFlags, UserMmapProtocol},
@@ -28,6 +31,7 @@ impl FromSyscallArg for UserMmapFlags {
 
 /// Check whether we are doing an implemented function.
 /// If `condition` is false, return `Err(err)`.
+#[allow(unused)]
 fn check_impl(condition: bool, err: u32) -> KResult<()> {
     if !condition {
         Err(err)
@@ -42,60 +46,58 @@ fn do_mmap2(
     len: usize,
     prot: UserMmapProtocol,
     flags: UserMmapFlags,
-    fd: u32,
+    fd: FD,
     pgoffset: usize,
 ) -> KResult<usize> {
     let addr = VAddr::from(addr);
-    if !addr.is_page_aligned() || len == 0 {
+    if !addr.is_page_aligned() || pgoffset % PAGE_SIZE != 0 || len == 0 {
         return Err(EINVAL);
     }
 
     let len = len.align_up(PAGE_SIZE);
-    check_impl(flags.contains(UserMmapFlags::MAP_ANONYMOUS), ENOMEM)?;
-    check_impl(flags.contains(UserMmapFlags::MAP_PRIVATE), EINVAL)?;
-    if fd != u32::MAX || pgoffset != 0 {
-        return Err(EINVAL);
-    }
-
     let mm_list = &thread.process.mm_list;
+    let is_shared = flags.contains(UserMmapFlags::MAP_SHARED);
 
-    // TODO!!!: If we are doing mmap's in 32-bit mode, we should check whether
-    //          `addr` is above user reachable memory.
-    let addr = if flags.contains(UserMmapFlags::MAP_FIXED) {
-        if prot.is_empty() {
-            Task::block_on(mm_list.protect(
-                addr,
-                len,
-                Permission {
-                    read: prot.contains(UserMmapProtocol::PROT_READ),
-                    write: prot.contains(UserMmapProtocol::PROT_WRITE),
-                    execute: prot.contains(UserMmapProtocol::PROT_EXEC),
-                },
-            ))
-            .map(|_| addr)
+    let mapping = if flags.contains(UserMmapFlags::MAP_ANONYMOUS) {
+        if pgoffset != 0 {
+            return Err(EINVAL);
+        }
+
+        if !is_shared {
+            Mapping::Anonymous
         } else {
-            mm_list.mmap_fixed(
-                addr,
+            // The mode is unimportant here, since we are checking prot in mm_area.
+            let shared_area = Task::block_on(SHM_MANAGER.lock()).create_shared_area(
                 len,
-                Mapping::Anonymous,
-                Permission {
-                    read: prot.contains(UserMmapProtocol::PROT_READ),
-                    write: prot.contains(UserMmapProtocol::PROT_WRITE),
-                    execute: prot.contains(UserMmapProtocol::PROT_EXEC),
-                },
-            )
+                thread.process.pid,
+                0o777,
+            );
+            Mapping::File(FileMapping::new(shared_area.area.clone(), 0, len))
         }
     } else {
-        mm_list.mmap_hint(
-            addr,
-            len,
-            Mapping::Anonymous,
-            Permission {
-                read: prot.contains(UserMmapProtocol::PROT_READ),
-                write: prot.contains(UserMmapProtocol::PROT_WRITE),
-                execute: prot.contains(UserMmapProtocol::PROT_EXEC),
-            },
-        )
+        let file = thread
+            .files
+            .get(fd)
+            .ok_or(EBADF)?
+            .get_inode()?
+            .ok_or(EBADF)?;
+
+        Mapping::File(FileMapping::new(file, pgoffset, len))
+    };
+
+    let permission = Permission {
+        read: prot.contains(UserMmapProtocol::PROT_READ),
+        write: prot.contains(UserMmapProtocol::PROT_WRITE),
+        execute: prot.contains(UserMmapProtocol::PROT_EXEC),
+    };
+
+    // TODO!!!: If we are doing mmap's in 32-bit mode, we should check whether
+    //          `addr` is above user reachable memory.
+    let addr = if flags.contains(UserMmapFlags::MAP_FIXED) {
+        Task::block_on(mm_list.unmap(addr, len));
+        mm_list.mmap_fixed(addr, len, mapping, permission, is_shared)
+    } else {
+        mm_list.mmap_hint(addr, len, mapping, permission, is_shared)
     };
 
     addr.map(|addr| addr.addr())
@@ -108,10 +110,10 @@ fn mmap(
     len: usize,
     prot: UserMmapProtocol,
     flags: UserMmapFlags,
-    fd: u32,
+    fd: FD,
     offset: usize,
 ) -> KResult<usize> {
-    do_mmap2(thread, addr, len, prot, flags, fd, offset / PAGE_SIZE)
+    do_mmap2(thread, addr, len, prot, flags, fd, offset)
 }
 
 #[cfg(target_arch = "x86_64")]
@@ -121,7 +123,7 @@ fn mmap2(
     len: usize,
     prot: UserMmapProtocol,
     flags: UserMmapFlags,
-    fd: u32,
+    fd: FD,
     pgoffset: usize,
 ) -> KResult<usize> {
     do_mmap2(thread, addr, len, prot, flags, fd, pgoffset)
@@ -169,6 +171,99 @@ fn mprotect(addr: usize, len: usize, prot: UserMmapProtocol) -> KResult<()> {
     ))
 }
 
+#[eonix_macros::define_syscall(SYS_SHMGET)]
+fn shmget(key: usize, size: usize, shmflg: u32) -> KResult<u32> {
+    let size = size.align_up(PAGE_SIZE);
+
+    let mut shm_manager = Task::block_on(SHM_MANAGER.lock());
+    let shmid = gen_shm_id(key)?;
+
+    let mode = shmflg & 0o777;
+    let shmflg = ShmFlags::from_bits_truncate(shmflg);
+
+    if key == IPC_PRIVATE {
+        let new_shm = shm_manager.create_shared_area(size, thread.process.pid, mode);
+        shm_manager.insert(shmid, new_shm);
+        return Ok(shmid);
+    }
+
+    if shm_manager.get(shmid).is_some() {
+        if shmflg.contains(ShmFlags::IPC_CREAT | ShmFlags::IPC_EXCL) {
+            return Err(EEXIST);
+        }
+
+        return Ok(shmid);
+    }
+
+    if shmflg.contains(ShmFlags::IPC_CREAT) {
+        let new_shm = shm_manager.create_shared_area(size, thread.process.pid, mode);
+        shm_manager.insert(shmid, new_shm);
+        return Ok(shmid);
+    }
+
+    Err(ENOENT)
+}
+
+#[eonix_macros::define_syscall(SYS_SHMAT)]
+fn shmat(shmid: u32, addr: usize, shmflg: u32) -> KResult<usize> {
+    let mm_list = &thread.process.mm_list;
+    let shm_manager = Task::block_on(SHM_MANAGER.lock());
+    let shm_area = shm_manager.get(shmid).ok_or(EINVAL)?;
+
+    let _mode = shmflg & 0o777; // mode bits are currently unused for attach
+    let shmflg = ShmFlags::from_bits_truncate(shmflg);
+
+    let mut permission = Permission {
+        read: true,
+        write: true,
+        execute: false,
+    };
+
+    if shmflg.contains(ShmFlags::SHM_EXEC) {
+        permission.execute = true;
+    }
+    if shmflg.contains(ShmFlags::SHM_RDONLY) {
+        permission.write = false;
+    }
+
+    let size = shm_area.shmid_ds.shm_segsz;
+
+    let mapping = Mapping::File(FileMapping {
+        file: shm_area.area.clone(),
+        offset: 0,
+        length: size,
+    });
+
+    let addr = if addr != 0 {
+        if addr % PAGE_SIZE != 0 && !shmflg.contains(ShmFlags::SHM_RND) {
+            return Err(EINVAL);
+        }
+        let addr = VAddr::from(addr.align_down(PAGE_SIZE));
+        mm_list.mmap_fixed(addr, size, mapping, permission, true)
+    } else {
+        mm_list.mmap_hint(VAddr::NULL, size, mapping, permission, true)
+    }?;
+
+    thread.process.shm_areas.lock().insert(addr, size);
+
+    Ok(addr.addr())
+}
+
+#[eonix_macros::define_syscall(SYS_SHMDT)]
+fn shmdt(addr: usize) -> KResult<usize> {
+    let addr = VAddr::from(addr);
+    let mut shm_areas = thread.process.shm_areas.lock();
+    let size = shm_areas.remove(&addr).ok_or(EINVAL)?;
+    drop(shm_areas);
+    Task::block_on(thread.process.mm_list.unmap(addr, size)).map(|_| 0)
+}
+
+#[eonix_macros::define_syscall(SYS_SHMCTL)]
+fn shmctl(_shmid: u32, _op: i32, _shmid_ds: usize) -> KResult<usize> {
+    // TODO: implement the shmctl operations; report success for now.
+    Ok(0)
+}
+
 #[eonix_macros::define_syscall(SYS_MEMBARRIER)]
 fn membarrier(_cmd: usize, _flags: usize) -> KResult<()> {
     Ok(())
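The three handlers above implement the classic System V shared-memory lifecycle. A hedged userspace sketch of that lifecycle, assuming the `libc` crate (`shm_roundtrip` is an illustrative name, not code from this patch):

use std::ptr;

fn shm_roundtrip() -> Result<(), &'static str> {
    unsafe {
        // shmget: create a private segment; the kernel rounds the size up
        // to a whole number of pages.
        let id = libc::shmget(libc::IPC_PRIVATE, 4096, libc::IPC_CREAT | 0o600);
        if id < 0 {
            return Err("shmget failed");
        }

        // shmat: map the segment at a kernel-chosen address (addr == NULL).
        let addr = libc::shmat(id, ptr::null(), 0);
        if addr as isize == -1 {
            return Err("shmat failed");
        }

        // Use the mapping, then detach it again with shmdt.
        *(addr as *mut u8) = 42;
        if libc::shmdt(addr) < 0 {
            return Err("shmdt failed");
        }
    }
    Ok(())
}

fn main() {
    shm_roundtrip().expect("shared memory round trip");
}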

+ 54 - 24
src/kernel/syscall/procops.rs

@@ -1,13 +1,15 @@
 use super::SyscallNoReturn;
 use crate::io::Buffer;
-use crate::kernel::constants::{EINVAL, ENOENT, ENOTDIR, ERANGE, ESRCH};
+use crate::kernel::constants::{
+    CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_REALTIME_COARSE, EINVAL, ENOENT, ENOTDIR, ERANGE, ESRCH,
+};
 use crate::kernel::constants::{
     ENOSYS, PR_GET_NAME, PR_SET_NAME, RLIMIT_STACK, SIG_BLOCK, SIG_SETMASK, SIG_UNBLOCK,
 };
 use crate::kernel::mem::PageBuffer;
 use crate::kernel::task::{
     do_clone, futex_wait, futex_wake, FutexFlags, FutexOp, ProcessList, ProgramLoader,
-    RobustListHead, SignalAction, Thread, WaitType,
+    RobustListHead, SignalAction, Thread, WaitId, WaitType,
 };
 use crate::kernel::task::{parse_futexop, CloneArgs};
 use crate::kernel::timer::sleep;
@@ -27,7 +29,6 @@ use eonix_hal::trap::TrapContext;
 use eonix_mm::address::{Addr as _, VAddr};
 use eonix_runtime::task::Task;
 use eonix_sync::AsProof as _;
-use posix_types::constants::{P_ALL, P_PID};
 use posix_types::ctypes::PtrT;
 use posix_types::signal::{SigAction, SigInfo, SigSet, Signal};
 use posix_types::stat::TimeVal;
@@ -67,6 +68,37 @@ fn nanosleep(req: *const (u32, u32), rem: *mut (u32, u32)) -> KResult<usize> {
     Ok(0)
 }
 
+#[eonix_macros::define_syscall(SYS_CLOCK_NANOSLEEP)]
+fn clock_nanosleep(
+    clock_id: u32,
+    _flags: u32, // TODO: handle TIMER_ABSTIME
+    req: *const (u32, u32),
+    rem: *mut (u32, u32),
+) -> KResult<usize> {
+    if clock_id != CLOCK_REALTIME
+        && clock_id != CLOCK_REALTIME_COARSE
+        && clock_id != CLOCK_MONOTONIC
+    {
+        unimplemented!("Unsupported clock_id: {}", clock_id);
+    }
+
+    let req = UserPointer::new(req)?.read()?;
+    let rem = if rem.is_null() {
+        None
+    } else {
+        Some(UserPointerMut::new(rem)?)
+    };
+
+    let duration = Duration::from_secs(req.0 as u64) + Duration::from_nanos(req.1 as u64);
+    Task::block_on(sleep(duration));
+
+    if let Some(rem) = rem {
+        rem.write((0, 0))?;
+    }
+
+    Ok(0)
+}
+
 #[eonix_macros::define_syscall(SYS_UMASK)]
 fn umask(mask: u32) -> KResult<u32> {
     let mut umask = thread.fs_context.umask.lock();
@@ -220,16 +252,11 @@ enum WaitInfo {
 
 fn do_waitid(
     thread: &Thread,
-    id_type: u32,
-    _id: u32,
+    wait_id: WaitId,
     info: WaitInfo,
     options: u32,
     rusage: *mut RUsage,
 ) -> KResult<u32> {
-    if id_type != P_ALL {
-        unimplemented!("waitid with id_type {id_type}");
-    }
-
     if !rusage.is_null() {
         unimplemented!("waitid with rusage pointer");
     }
@@ -240,6 +267,7 @@ fn do_waitid(
     };
 
     let Some(wait_object) = Task::block_on(thread.process.wait(
+        wait_id,
         options.contains(UserWaitOptions::WNOHANG),
         options.contains(UserWaitOptions::WUNTRACED),
         options.contains(UserWaitOptions::WCONTINUED),
@@ -278,15 +306,10 @@ fn waitid(
     options: u32,
     rusage: *mut RUsage,
 ) -> KResult<u32> {
+    let wait_id = WaitId::from_type_and_id(id_type, id)?;
+
     if let Some(info) = NonNull::new(info) {
-        do_waitid(
-            thread,
-            id_type,
-            id,
-            WaitInfo::SigInfo(info),
-            options,
-            rusage,
-        )
+        do_waitid(thread, wait_id, WaitInfo::SigInfo(info), options, rusage)
     } else {
         /*
          * According to POSIX.1-2008, an application calling waitid() must
@@ -301,24 +324,21 @@ fn waitid(
 }
 
 #[eonix_macros::define_syscall(SYS_WAIT4)]
-fn wait4(waitpid: u32, arg1: *mut u32, options: u32, rusage: *mut RUsage) -> KResult<u32> {
+fn wait4(wait_id: i32, arg1: *mut u32, options: u32, rusage: *mut RUsage) -> KResult<u32> {
     let waitinfo = if let Some(status) = NonNull::new(arg1) {
         WaitInfo::Status(status)
     } else {
         WaitInfo::None
     };
 
-    let idtype = match waitpid {
-        u32::MAX => P_ALL,
-        _ => P_PID,
-    };
+    let wait_id = WaitId::from_id(wait_id, thread);
 
-    do_waitid(thread, idtype, waitpid, waitinfo, options, rusage)
+    do_waitid(thread, wait_id, waitinfo, options, rusage)
 }
 
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_WAITPID)]
-fn waitpid(waitpid: u32, arg1: *mut u32, options: u32) -> KResult<u32> {
+fn waitpid(waitpid: i32, arg1: *mut u32, options: u32) -> KResult<u32> {
     sys_wait4(thread, waitpid, arg1, options, core::ptr::null_mut())
 }
 
@@ -425,6 +445,16 @@ fn getgid32() -> KResult<u32> {
     sys_getegid(thread)
 }
 
+#[eonix_macros::define_syscall(SYS_SYNC)]
+fn sync() -> KResult<()> {
+    // TODO: flush dirty page cache entries; currently a no-op.
+    Ok(())
+}
+
+#[eonix_macros::define_syscall(SYS_FSYNC)]
+fn fsync() -> KResult<()> {
+    // TODO: flush dirty page cache entries for the given file; currently a no-op.
+    Ok(())
+}
+
 #[eonix_macros::define_syscall(SYS_GETTID)]
 fn gettid() -> KResult<u32> {
     Ok(thread.tid)
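As a small illustration of the `(secs, nanos)` handling in `clock_nanosleep` above (a self-contained sketch; `timespec_to_duration` is an illustrative helper, not kernel code):

use core::time::Duration;

// Mirrors how the handler combines the two timespec fields.
fn timespec_to_duration(req: (u32, u32)) -> Duration {
    Duration::from_secs(req.0 as u64) + Duration::from_nanos(req.1 as u64)
}

fn main() {
    // 1.5 s expressed as a timespec-style (seconds, nanoseconds) pair.
    assert_eq!(
        timespec_to_duration((1, 500_000_000)),
        Duration::from_millis(1500)
    );
    // An uninterrupted sleep leaves no remaining time, which is why the
    // handler writes (0, 0) back through `rem` on completion.
}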

+ 5 - 2
src/kernel/syscall/sysinfo.rs

@@ -1,7 +1,7 @@
 use crate::{
     io::Buffer as _,
     kernel::{
-        constants::{CLOCK_MONOTONIC, CLOCK_REALTIME, EINTR, EINVAL},
+        constants::{CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_REALTIME_COARSE, EINTR, EINVAL},
         task::Thread,
         timer::{Instant, Ticks},
         user::{UserBuffer, UserPointerMut},
@@ -82,7 +82,10 @@ fn gettimeofday(timeval: *mut TimeVal, timezone: *mut ()) -> KResult<()> {
 }
 
 fn do_clock_gettime64(_thread: &Thread, clock_id: u32, timespec: *mut TimeSpec) -> KResult<()> {
-    if clock_id != CLOCK_REALTIME && clock_id != CLOCK_MONOTONIC {
+    if clock_id != CLOCK_REALTIME
+        && clock_id != CLOCK_REALTIME_COARSE
+        && clock_id != CLOCK_MONOTONIC
+    {
         unimplemented!("Unsupported clock_id: {}", clock_id);
     }
 

+ 1 - 1
src/kernel/task.rs

@@ -13,7 +13,7 @@ pub use clone::{do_clone, CloneArgs, CloneFlags};
 pub use futex::{futex_wait, futex_wake, parse_futexop, FutexFlags, FutexOp, RobustListHead};
 pub use kernel_stack::KernelStack;
 pub use loader::ProgramLoader;
-pub use process::{alloc_pid, Process, ProcessBuilder, WaitObject, WaitType};
+pub use process::{alloc_pid, Process, ProcessBuilder, WaitId, WaitObject, WaitType};
 pub use process_group::ProcessGroup;
 pub use process_list::ProcessList;
 pub use session::Session;

+ 4 - 1
src/kernel/task/loader/elf.rs

@@ -274,6 +274,7 @@ impl<E: ElfArch> Elf<E> {
                 write: true,
                 execute: false,
             },
+            false,
         )?;
 
         StackInitializer::new(&mm_list, E::STACK_BASE_ADDR, args, envs, aux_vec).init()
@@ -356,11 +357,12 @@ impl<E: ElfArch> Elf<E> {
                 vmap_start,
                 file_len,
                 Mapping::File(FileMapping::new(
-                    self.file.clone(),
+                    self.file.get_inode()?,
                     file_offset,
                     real_file_length,
                 )),
                 permission,
+                false,
             )?;
         }
 
@@ -370,6 +372,7 @@ impl<E: ElfArch> Elf<E> {
                 vmem_len - file_len,
                 Mapping::Anonymous,
                 permission,
+                false,
             )?;
         }
 

+ 59 - 4
src/kernel/task/process.rs

@@ -2,7 +2,7 @@ use super::{
     process_group::ProcessGroupBuilder, signal::RaiseResult, thread::ThreadBuilder, ProcessGroup,
     ProcessList, Session, Thread,
 };
-use crate::kernel::constants::{ECHILD, EINTR, EPERM, ESRCH};
+use crate::kernel::constants::{ECHILD, EINTR, EINVAL, EPERM, ESRCH};
 use crate::kernel::task::{CloneArgs, CloneFlags};
 use crate::{
     kernel::mem::MMList,
@@ -15,13 +15,17 @@ use alloc::{
     sync::{Arc, Weak},
 };
 use core::sync::atomic::{AtomicU32, Ordering};
+use eonix_mm::address::VAddr;
 use eonix_runtime::task::Task;
 use eonix_sync::{
     AsProof as _, AsProofMut as _, Locked, Proof, ProofMut, RwLockReadGuard, SpinGuard,
     UnlockableGuard as _, UnlockedGuard as _,
 };
 use pointers::BorrowedArc;
-use posix_types::constants::{CLD_CONTINUED, CLD_DUMPED, CLD_EXITED, CLD_KILLED, CLD_STOPPED};
+use posix_types::constants::{
+    CLD_CONTINUED, CLD_DUMPED, CLD_EXITED, CLD_KILLED, CLD_STOPPED, P_PGID, P_PIDFD,
+};
+use posix_types::constants::{P_ALL, P_PID};
 use posix_types::signal::Signal;
 use posix_types::SIGNAL_COREDUMP;
 
@@ -47,6 +51,8 @@ pub struct Process {
 
     pub exit_signal: Option<Signal>,
 
+    pub shm_areas: Spin<BTreeMap<VAddr, usize>>,
+
     /// Parent process
     ///
     /// `parent` must be valid during the whole life of the process.
@@ -95,12 +101,46 @@ pub struct Entry<'waitlist, 'proclist, 'cv> {
     cv: &'cv CondVar,
     want_stop: bool,
     want_continue: bool,
+    want_id: WaitId,
 }
 
 pub struct DrainExited<'waitlist> {
     wait_procs: SpinGuard<'waitlist, VecDeque<WaitObject>>,
 }
 
+#[derive(Clone, Copy)]
+pub enum WaitId {
+    Any,
+    Pid(u32),
+    Pgid(u32),
+}
+
+impl WaitId {
+    pub fn from_type_and_id(id_type: u32, id: u32) -> KResult<Self> {
+        match id_type {
+            P_ALL => Ok(WaitId::Any),
+            P_PID => Ok(WaitId::Pid(id)),
+            P_PGID => Ok(WaitId::Pgid(id)),
+            P_PIDFD => {
+                panic!("P_PIDFD type is unsupported")
+            }
+            _ => Err(EINVAL),
+        }
+    }
+
+    pub fn from_id(id: i32, thread: &Thread) -> Self {
+        if id < -1 {
+            WaitId::Pgid((-id).cast_unsigned())
+        } else if id == -1 {
+            WaitId::Any
+        } else if id == 0 {
+            let procs = Task::block_on(ProcessList::get().read());
+            WaitId::Pgid(thread.process.pgroup(procs.prove()).pgid)
+        } else {
+            WaitId::Pid(id.cast_unsigned())
+        }
+    }
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum WaitType {
     Exited(u32),
@@ -221,6 +261,7 @@ impl ProcessBuilder {
             pid: self.pid.expect("should set pid before building"),
             wait_list: WaitList::new(),
             mm_list,
+            shm_areas: Spin::new(BTreeMap::new()),
             exit_signal: self.exit_signal,
             parent: RCUPointer::empty(),
             pgroup: RCUPointer::empty(),
@@ -303,12 +344,13 @@ impl Process {
 
     pub async fn wait(
         &self,
+        wait_id: WaitId,
         no_block: bool,
         trace_stop: bool,
         trace_continue: bool,
     ) -> KResult<Option<WaitObject>> {
         let wait_object = {
-            let mut waits = self.wait_list.entry(trace_stop, trace_continue);
+            let mut waits = self.wait_list.entry(wait_id, trace_stop, trace_continue);
             loop {
                 if let Some(object) = waits.get() {
                     break object;
@@ -530,13 +572,14 @@ impl WaitList {
     /// # Safety
     /// Locks `ProcessList` and `WaitList` at the same time. When `wait` is called,
     /// releases the lock on `ProcessList` and `WaitList` and waits on `cv_wait_procs`.
-    pub fn entry(&self, want_stop: bool, want_continue: bool) -> Entry {
+    pub fn entry(&self, wait_id: WaitId, want_stop: bool, want_continue: bool) -> Entry {
         Entry {
             process_list: Task::block_on(ProcessList::get().read()),
             wait_procs: self.wait_procs.lock(),
             cv: &self.cv_wait_procs,
             want_stop,
             want_continue,
+            want_id: wait_id,
         }
     }
 }
@@ -556,6 +599,17 @@ impl Entry<'_, '_, '_> {
                     true
                 }
             })
+            .filter(|(_, item)| match self.want_id {
+                WaitId::Any => true,
+                WaitId::Pid(pid) => item.pid == pid,
+                WaitId::Pgid(pgid) => {
+                    let procs = Task::block_on(ProcessList::get().read());
+                    if let Some(process) = procs.try_find_process(item.pid) {
+                        return process.pgroup(procs.prove()).pgid == pgid;
+                    }
+                    false
+                }
+            })
             .map(|(idx, _)| idx)
             .next()
         {
@@ -581,6 +635,7 @@ impl Entry<'_, '_, '_> {
                     cv: self.cv,
                     want_stop: self.want_stop,
                     want_continue: self.want_continue,
+                    want_id: self.want_id,
                 })
             }
         }
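The `wait4`-style encoding that `WaitId::from_id` above decodes (pid < -1 means a process group, -1 means any child, 0 means the caller's group, > 0 means one pid) can be checked in isolation. A standalone sketch where `CALLER_PGID` stands in for the pgid the kernel looks up from the process list:

#[derive(Debug, PartialEq)]
enum WaitId {
    Any,
    Pid(u32),
    Pgid(u32),
}

const CALLER_PGID: u32 = 100; // assumption: the calling process's pgid

fn from_id(id: i32) -> WaitId {
    if id < -1 {
        WaitId::Pgid((-id) as u32) // wait for any child in group |id|
    } else if id == -1 {
        WaitId::Any // wait for any child
    } else if id == 0 {
        WaitId::Pgid(CALLER_PGID) // wait for children in the caller's group
    } else {
        WaitId::Pid(id as u32) // wait for the specific child `id`
    }
}

fn main() {
    assert_eq!(from_id(-1), WaitId::Any);
    assert_eq!(from_id(-7), WaitId::Pgid(7));
    assert_eq!(from_id(0), WaitId::Pgid(CALLER_PGID));
    assert_eq!(from_id(42), WaitId::Pid(42));
}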

+ 1 - 1
src/kernel/timer.rs

@@ -18,7 +18,7 @@ static SLEEPERS_LIST: Spin<BinaryHeap<Reverse<Sleepers>>> = Spin::new(BinaryHeap
 #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Ticks(usize);
 
-#[derive(Default, Clone, Copy)]
+#[derive(Debug, Default, Clone, Copy)]
 pub struct Instant {
     secs_since_epoch: u64,
     nsecs_within: u32,

+ 39 - 13
src/kernel/vfs/file.rs

@@ -11,6 +11,7 @@ use crate::{
         task::Thread,
         terminal::{Terminal, TerminalIORequest},
         user::{UserPointer, UserPointerMut},
+        vfs::inode::Inode,
         CharDevice,
     },
     prelude::*,
@@ -86,6 +87,15 @@ pub struct File {
     file_type: FileType,
 }
 
+impl File {
+    pub fn get_inode(&self) -> KResult<Option<Arc<dyn Inode>>> {
+        match &self.file_type {
+            FileType::Inode(inode_file) => Ok(Some(inode_file.dentry.get_inode()?)),
+            _ => Ok(None),
+        }
+    }
+}
+
 pub enum SeekOption {
     Set(usize),
     Current(isize),
@@ -324,7 +334,7 @@ impl InodeFile {
         Ok(new_cursor)
     }
 
-    fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
+    fn write(&self, stream: &mut dyn Stream, offset: Option<usize>) -> KResult<usize> {
         if !self.write {
             return Err(EBADF);
         }
@@ -336,23 +346,35 @@ impl InodeFile {
 
             Ok(nwrote)
         } else {
-            let nwrote = self.dentry.write(stream, WriteOffset::Position(*cursor))?;
+            let nwrote = if let Some(offset) = offset {
+                self.dentry.write(stream, WriteOffset::Position(offset))?
+            } else {
+                let nwrote = self.dentry.write(stream, WriteOffset::Position(*cursor))?;
+                *cursor += nwrote;
+                nwrote
+            };
 
-            *cursor += nwrote;
             Ok(nwrote)
         }
     }
 
-    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+    fn read(&self, buffer: &mut dyn Buffer, offset: Option<usize>) -> KResult<usize> {
         if !self.read {
             return Err(EBADF);
         }
 
-        let mut cursor = Task::block_on(self.cursor.lock());
+        let nread = if let Some(offset) = offset {
+            self.dentry.read(buffer, offset)?
+        } else {
+            let mut cursor = Task::block_on(self.cursor.lock());
 
-        let nread = self.dentry.read(buffer, *cursor)?;
+            let nread = self.dentry.read(buffer, *cursor)?;
+
+            *cursor += nread;
+            nread
+        };
 
-        *cursor += nread;
         Ok(nread)
     }
 
@@ -456,9 +478,9 @@ impl TerminalFile {
 }
 
 impl FileType {
-    pub async fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+    pub async fn read(&self, buffer: &mut dyn Buffer, offset: Option<usize>) -> KResult<usize> {
         match self {
-            FileType::Inode(inode) => inode.read(buffer),
+            FileType::Inode(inode) => inode.read(buffer, offset),
             FileType::PipeRead(pipe) => pipe.pipe.read(buffer).await,
             FileType::TTY(tty) => tty.read(buffer).await,
             FileType::CharDev(device) => device.read(buffer),
@@ -481,9 +503,9 @@ impl FileType {
     //     }
     // }
 
-    pub async fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
+    pub async fn write(&self, stream: &mut dyn Stream, offset: Option<usize>) -> KResult<usize> {
         match self {
-            FileType::Inode(inode) => inode.write(stream),
+            FileType::Inode(inode) => inode.write(stream, offset),
             FileType::PipeWrite(pipe) => pipe.pipe.write(stream).await,
             FileType::TTY(tty) => tty.write(stream),
             FileType::CharDev(device) => device.write(stream),
@@ -527,12 +549,16 @@ impl FileType {
             if Thread::current().signal_list.has_pending_signal() {
                 return if cur == 0 { Err(EINTR) } else { Ok(cur) };
             }
-            let nread = self.read(&mut ByteBuffer::new(&mut buffer[..len])).await?;
+            let nread = self
+                .read(&mut ByteBuffer::new(&mut buffer[..len]), None)
+                .await?;
             if nread == 0 {
                 break;
             }
 
-            let nwrote = dest_file.write(&mut buffer[..nread].into_stream()).await?;
+            let nwrote = dest_file
+                .write(&mut buffer[..nread].into_stream(), None)
+                .await?;
             nsent += nwrote;
 
             if nwrote != len {
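The `Option<usize>` offset threaded through `read`/`write` above encodes the POSIX cursor rule: positional IO (pread/pwrite) must not move the file cursor, while plain read/write advances it. A self-contained sketch of that rule, with a toy `CursorFile` type standing in for the kernel's `InodeFile`:

struct CursorFile {
    data: Vec<u8>,
    cursor: usize,
}

impl CursorFile {
    fn read(&mut self, buf: &mut [u8], offset: Option<usize>) -> usize {
        let pos = offset.unwrap_or(self.cursor);
        let start = pos.min(self.data.len());
        let end = self.data.len().min(pos + buf.len());
        let n = end - start;
        buf[..n].copy_from_slice(&self.data[start..end]);
        if offset.is_none() {
            self.cursor += n; // only sequential reads move the cursor
        }
        n
    }
}

fn main() {
    let mut f = CursorFile { data: b"hello world".to_vec(), cursor: 0 };
    let mut buf = [0u8; 5];
    f.read(&mut buf, Some(6)); // pread-style: cursor stays at 0
    assert_eq!(&buf, b"world");
    assert_eq!(f.cursor, 0);
    f.read(&mut buf, None); // read-style: cursor advances
    assert_eq!(&buf, b"hello");
    assert_eq!(f.cursor, 5);
}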

+ 18 - 4
src/kernel/vfs/inode.rs

@@ -4,6 +4,7 @@ use crate::kernel::constants::{
     EINVAL, EISDIR, ENOTDIR, EPERM, STATX_ATIME, STATX_BLOCKS, STATX_CTIME, STATX_GID, STATX_INO,
     STATX_MODE, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_TYPE, STATX_UID, S_IFDIR, S_IFMT,
 };
+use crate::kernel::mem::PageCache;
 use crate::kernel::timer::Instant;
 use crate::{io::Buffer, prelude::*};
 use alloc::sync::{Arc, Weak};
@@ -34,6 +35,7 @@ pub type AtomicGid = AtomicU32;
 pub type Mode = u32;
 pub type AtomicMode = AtomicU32;
 
+#[derive(Debug)]
 pub struct InodeData {
     pub ino: Ino,
     pub size: AtomicISize,
@@ -53,13 +55,13 @@ pub struct InodeData {
 }
 
 impl InodeData {
-    pub const fn new(ino: Ino, vfs: Weak<dyn Vfs>) -> Self {
+    pub fn new(ino: Ino, vfs: Weak<dyn Vfs>) -> Self {
         Self {
             ino,
             vfs,
-            atime: Spin::new(Instant::default()),
-            ctime: Spin::new(Instant::default()),
-            mtime: Spin::new(Instant::default()),
+            atime: Spin::new(Instant::now()),
+            ctime: Spin::new(Instant::now()),
+            mtime: Spin::new(Instant::now()),
             rwsem: RwLock::new(()),
             size: AtomicU64::new(0),
             nlink: AtomicNlink::new(0),
@@ -126,10 +128,18 @@ pub trait Inode: Send + Sync + InodeInner + Any {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
+    fn read_direct(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        Err(if self.is_dir() { EISDIR } else { EINVAL })
+    }
+
     fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
+    fn write_direct(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+        Err(if self.is_dir() { EISDIR } else { EINVAL })
+    }
+
     fn devid(&self) -> KResult<DevId> {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
@@ -162,6 +172,10 @@ pub trait Inode: Send + Sync + InodeInner + Any {
         Err(EPERM)
     }
 
+    fn page_cache(&self) -> Option<&PageCache> {
+        None
+    }
+
     fn statx(&self, stat: &mut StatX, mask: u32) -> KResult<()> {
         // Safety: ffi should have checked reference
         let vfs = self.vfs.upgrade().expect("Vfs is dropped");
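The new `page_cache()` hook (default `None`) together with `read_direct`/`write_direct` suggests a cached-vs-direct dispatch at the VFS layer. A hedged sketch of that dispatch, with simplified stand-ins rather than the kernel's real `Inode` trait:

type KResult<T> = Result<T, u32>;

struct PageCache; // stand-in

impl PageCache {
    fn read(&self, _buf: &mut [u8], _offset: usize) -> KResult<usize> {
        Ok(0) // would copy out of cached pages here
    }
}

trait Inode {
    fn read_direct(&self, buf: &mut [u8], offset: usize) -> KResult<usize>;

    fn page_cache(&self) -> Option<&PageCache> {
        None // inodes without a cache (e.g. char devices) keep the default
    }

    // Cached read when a cache is present, direct read otherwise.
    fn read(&self, buf: &mut [u8], offset: usize) -> KResult<usize> {
        match self.page_cache() {
            Some(cache) => cache.read(buf, offset),
            None => self.read_direct(buf, offset),
        }
    }
}

struct NoCacheInode;

impl Inode for NoCacheInode {
    fn read_direct(&self, _buf: &mut [u8], _offset: usize) -> KResult<usize> {
        Ok(0) // e.g. reading straight from a device
    }
}

fn main() {
    let inode = NoCacheInode;
    let mut buf = [0u8; 16];
    // Falls back to read_direct because page_cache() returns None.
    assert_eq!(inode.read(&mut buf, 0), Ok(0));
}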

+ 2 - 2
src/lib.rs

@@ -69,8 +69,6 @@ static BSP_OK: AtomicBool = AtomicBool::new(false);
 fn kernel_init(mut data: eonix_hal::bootstrap::BootStrapData) -> ! {
     setup_memory(&mut data);
 
-    BSP_OK.store(true, Ordering::Release);
-
     #[cfg(target_arch = "riscv64")]
     {
         driver::sbi_console::init_console();
@@ -85,6 +83,8 @@ fn kernel_init(mut data: eonix_hal::bootstrap::BootStrapData) -> ! {
 
     Scheduler::get().spawn::<KernelStack, _>(FutureRun::new(init_process(data.get_early_stack())));
 
+    // Mark the BSP ready only after the console is initialized and the
+    // init process has been spawned.
+    BSP_OK.store(true, Ordering::Release);
+
     drop(data);
     unsafe {
         // SAFETY: `preempt::count()` == 1.