
Merge pull request #36 from Shao-ZW/pagecache

Implement page cache, better mmap and shared memory syscalls
greatbridf 6 months ago
parent
commit
6c6195b183

+ 10 - 10
Cargo.lock

@@ -25,9 +25,9 @@ version = "0.1.0"
 
 [[package]]
 name = "autocfg"
-version = "1.4.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 
 [[package]]
 name = "bit_field"
@@ -51,9 +51,9 @@ dependencies = [
 
 [[package]]
 name = "cfg-if"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268"
 
 [[package]]
 name = "critical-section"
@@ -392,9 +392,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.101"
+version = "2.0.103"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
+checksum = "e4307e30089d6fd6aff212f2da3a1f9e32f3223b1f010fb09b7c95f90f3ca1e8"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -459,18 +459,18 @@ checksum = "2fe21bcc34ca7fe6dd56cc2cb1261ea59d6b93620215aefb5ea6032265527784"
 
 [[package]]
 name = "zerocopy"
-version = "0.8.25"
+version = "0.8.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
+checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
 dependencies = [
  "zerocopy-derive",
 ]
 
 [[package]]
 name = "zerocopy-derive"
-version = "0.8.25"
+version = "0.8.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
+checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
 dependencies = [
  "proc-macro2",
  "quote",

+ 5 - 2
Cargo.toml

@@ -43,6 +43,9 @@ log_trace = ["trace_syscall", "trace_scheduler"]
 log_debug = []
 smp = []
 
+[profile.release]
+debug = true
+
 [profile.dev]
 panic = "abort"
 
@@ -50,7 +53,7 @@ panic = "abort"
 opt-level = 2
 
 [profile.dev.package.eonix_runtime]
-opt-level = 0
+opt-level = 2
 
 [profile.dev.package.eonix_sync]
 opt-level = 2
@@ -59,7 +62,7 @@ opt-level = 2
 opt-level = 2
 
 [profile.dev.package.eonix_hal]
-opt-level = 0
+opt-level = 2
 
 [profile.dev.package."*"]
 opt-level = "s"

+ 1 - 1
crates/eonix_mm/src/page_table/pte.rs

@@ -10,7 +10,7 @@ bitflags! {
         const GLOBAL = 8;
     }
 
-    #[derive(Clone, Copy, PartialEq)]
+    #[derive(Debug, Clone, Copy, PartialEq)]
     pub struct PageAttribute: usize {
         const PRESENT = 1;
         const READ = 2;

+ 1 - 0
crates/posix_types/src/lib.rs

@@ -4,6 +4,7 @@ pub mod constants;
 pub mod ctypes;
 pub mod namei;
 pub mod open;
+pub mod poll;
 pub mod result;
 pub mod signal;
 pub mod stat;

+ 5 - 0
crates/posix_types/src/poll.rs

@@ -0,0 +1,5 @@
+pub const FDSET_LENGTH: usize = 1024 / (8 * size_of::<usize>());
+
+pub struct FDSet {
+    fds_bits: [usize; FDSET_LENGTH],
+}

+ 8 - 0
crates/posix_types/src/stat.rs

@@ -1,3 +1,5 @@
+use core::time::Duration;
+
 #[repr(C)]
 #[derive(Debug, Default, Copy, Clone)]
 pub struct StatXTimestamp {
@@ -100,3 +102,9 @@ impl From<StatX> for Stat {
         }
     }
 }
+
+impl From<TimeSpec> for Duration {
+    fn from(value: TimeSpec) -> Self {
+        Self::new(value.tv_sec, value.tv_nsec)
+    }
+}

+ 2 - 2
crates/posix_types/src/syscall_no/riscv64.rs

@@ -71,7 +71,7 @@ pub const SYS_PWRITE64: usize = 68;
 pub const SYS_PREADV: usize = 69;
 pub const SYS_PWRITEV: usize = 70;
 pub const SYS_SENDFILE64: usize = 71;
-pub const SYS_PSELECT6_TIME32: usize = 72;
+pub const SYS_PSELECT6: usize = 72;
 pub const SYS_PPOLL: usize = 73;
 pub const SYS_SIGNALFD4: usize = 74;
 pub const SYS_VMSPLICE: usize = 75;
@@ -114,7 +114,7 @@ pub const SYS_TIMER_DELETE: usize = 111;
 pub const SYS_CLOCK_SETTIME: usize = 404;
 pub const SYS_CLOCK_GETTIME: usize = 113;
 pub const SYS_CLOCK_GETRES: usize = 406;
-pub const SYS_CLOCK_NANOSLEEP: usize = 407;
+pub const SYS_CLOCK_NANOSLEEP: usize = 115;
 pub const SYS_SYSLOG: usize = 116;
 pub const SYS_PTRACE: usize = 117;
 pub const SYS_SCHED_SETPARAM: usize = 118;

+ 1 - 1
crates/slab_allocator/src/slab_cache.rs

@@ -89,7 +89,7 @@ where
     Allocator: PageAlloc<RawPage = Raw>,
 {
     pub(crate) const fn new_in(object_size: u32) -> Self {
-        // avoid uncessary branch in alloc and dealloc
+        // avoid unnecessary branch in alloc and dealloc
         assert!(object_size <= PAGE_SIZE as u32 / 2);
 
         Self {

+ 41 - 4
src/fs/ext4.rs

@@ -1,5 +1,6 @@
-use core::sync::atomic::{AtomicU32, AtomicU64};
+use core::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 
+use crate::kernel::mem::{PageCache, PageCacheBackend};
 use crate::{
     io::{Buffer, ByteBuffer},
     kernel::{
@@ -18,6 +19,7 @@ use crate::{
     path::Path,
     prelude::*,
 };
+use alloc::sync::Weak;
 use alloc::{
     collections::btree_map::{BTreeMap, Entry},
     sync::Arc,
@@ -92,7 +94,7 @@ impl Ext4Fs {
                 let mode = *idata.mode.get_mut();
                 if s_isreg(mode) {
                     vacant
-                        .insert(Ext4Inode::File(Arc::new(FileInode { idata })))
+                        .insert(Ext4Inode::File(FileInode::new(idata)))
                         .clone()
                         .into_inner()
                 } else if s_isdir(mode) {
@@ -103,7 +105,7 @@ impl Ext4Fs {
                 } else {
                     println_warn!("ext4: Unsupported inode type: {mode:#o}");
                     vacant
-                        .insert(Ext4Inode::File(Arc::new(FileInode { idata })))
+                        .insert(Ext4Inode::File(FileInode::new(idata)))
                         .clone()
                         .into_inner()
                 }
@@ -174,15 +176,50 @@ impl Ext4Inode {
 }
 
 define_struct_inode! {
-    struct FileInode;
+    struct FileInode {
+        page_cache: PageCache,
+    }
 }
 
 define_struct_inode! {
     struct DirInode;
 }
 
+impl FileInode {
+    fn new(idata: InodeData) -> Arc<Self> {
+        let inode = Arc::new_cyclic(|weak_self: &Weak<FileInode>| Self {
+            idata,
+            page_cache: PageCache::new(weak_self.clone()),
+        });
+
+        inode
+    }
+}
+
+impl PageCacheBackend for FileInode {
+    fn read_page(&self, page: &mut crate::kernel::mem::CachePage, offset: usize) -> KResult<usize> {
+        self.read_direct(page, offset)
+    }
+
+    fn write_page(&self, page: &crate::kernel::mem::CachePage, offset: usize) -> KResult<usize> {
+        todo!()
+    }
+
+    fn size(&self) -> usize {
+        self.size.load(Ordering::Relaxed) as usize
+    }
+}
+
 impl Inode for FileInode {
+    fn page_cache(&self) -> Option<&PageCache> {
+        Some(&self.page_cache)
+    }
+
     fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        Task::block_on(self.page_cache.read(buffer, offset))
+    }
+
+    fn read_direct(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
         let vfs = self.vfs.upgrade().ok_or(EIO)?;
         let ext4fs = vfs.as_any().downcast_ref::<Ext4Fs>().unwrap();
 

+ 66 - 6
src/fs/fat32.rs

@@ -1,13 +1,19 @@
 mod dir;
 mod file;
 
+use crate::io::Stream;
 use crate::kernel::constants::EIO;
+use crate::kernel::mem::AsMemoryBlock;
+use crate::kernel::vfs::inode::WriteOffset;
 use crate::{
     io::{Buffer, ByteBuffer, UninitBuffer},
     kernel::{
         block::{make_device, BlockDevice, BlockDeviceRequest},
         constants::{S_IFDIR, S_IFREG},
-        mem::paging::Page,
+        mem::{
+            paging::Page,
+            {CachePage, PageCache, PageCacheBackend},
+        },
         vfs::{
             dentry::Dentry,
             inode::{define_struct_inode, Ino, Inode, InodeData},
@@ -32,6 +38,8 @@ use file::ClusterRead;
 
 type ClusterNo = u32;
 
+const SECTOR_SIZE: usize = 512;
+
 #[derive(Clone, Copy)]
 #[repr(C, packed)]
 struct Bootsector {
@@ -231,13 +239,16 @@ impl FatInode {
 }
 
 define_struct_inode! {
-    struct FileInode;
+    struct FileInode {
+        page_cache: PageCache,
+    }
 }
 
 impl FileInode {
     fn new(ino: Ino, weak: Weak<FatFs>, size: u32) -> Arc<Self> {
-        let inode = Arc::new(Self {
+        let inode = Arc::new_cyclic(|weak_self: &Weak<FileInode>| Self {
             idata: InodeData::new(ino, weak),
+            page_cache: PageCache::new(weak_self.clone()),
         });
 
         // Safety: We are initializing the inode
@@ -250,7 +261,15 @@ impl FileInode {
 }
 
 impl Inode for FileInode {
+    fn page_cache(&self) -> Option<&PageCache> {
+        Some(&self.page_cache)
+    }
+
     fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        Task::block_on(self.page_cache.read(buffer, offset))
+    }
+
+    fn read_direct(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
         let vfs = self.vfs.upgrade().ok_or(EIO)?;
         let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
         let fat = Task::block_on(vfs.fat.read());
@@ -259,16 +278,57 @@ impl Inode for FileInode {
             return Ok(0);
         }
 
-        let iter = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).read(vfs, offset);
+        let cluster_size = vfs.sectors_per_cluster as usize * SECTOR_SIZE;
+        assert!(cluster_size <= 0x1000, "Cluster size is too large");
+
+        let skip_clusters = offset / cluster_size;
+        let inner_offset = offset % cluster_size;
+
+        let cluster_iter =
+            ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).skip(skip_clusters);
+
+        let buffer_page = Page::alloc();
+        for cluster in cluster_iter {
+            vfs.read_cluster(cluster, &buffer_page)?;
+
+            let data = unsafe {
+                // SAFETY: We are the only one holding this page.
+                &buffer_page.as_memblk().as_bytes()[inner_offset..]
+            };
 
-        for data in iter {
-            if buffer.fill(data?)?.should_stop() {
+            let end = offset + data.len();
+            let real_end = core::cmp::min(end, self.size.load(Ordering::Relaxed) as usize);
+            let real_size = real_end - offset;
+
+            if buffer.fill(&data[..real_size])?.should_stop() {
                 break;
             }
         }
 
         Ok(buffer.wrote())
     }
+
+    fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+        todo!()
+    }
+
+    fn write_direct(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+        todo!()
+    }
+}
+
+impl PageCacheBackend for FileInode {
+    fn read_page(&self, page: &mut CachePage, offset: usize) -> KResult<usize> {
+        self.read_direct(page, offset)
+    }
+
+    fn write_page(&self, page: &CachePage, offset: usize) -> KResult<usize> {
+        todo!()
+    }
+
+    fn size(&self) -> usize {
+        self.size.load(Ordering::Relaxed) as usize
+    }
 }
 
 define_struct_inode! {

+ 1 - 0
src/fs/mod.rs

@@ -1,4 +1,5 @@
 pub mod fat32;
 pub mod procfs;
+pub mod shm;
 pub mod tmpfs;
 pub mod ext4;

+ 146 - 0
src/fs/shm.rs

@@ -0,0 +1,146 @@
+use core::sync::atomic::{AtomicU32, Ordering};
+
+use alloc::{collections::btree_map::BTreeMap, sync::Arc};
+use bitflags::bitflags;
+use eonix_sync::{LazyLock, Mutex};
+
+use crate::{
+    fs::tmpfs::{DirectoryInode, FileInode, TmpFs},
+    kernel::{constants::ENOSPC, timer::Instant, vfs::inode::Mode},
+    prelude::KResult,
+};
+
+bitflags! {
+    #[derive(Debug, Clone, Copy)]
+    pub struct ShmFlags: u32 {
+        /// Create a new segment. If this flag is not used, then shmget() will
+        /// find the segment associated with key and check to see if the user
+        /// has permission to access the segment.
+        const IPC_CREAT = 0o1000;
+        /// This flag is used with IPC_CREAT to ensure that this call creates
+        /// the segment.  If the segment already exists, the call fails.
+        const IPC_EXCL = 0o2000;
+
+        /// Attach the segment for read-only access. If this flag is not specified,
+        /// the segment is attached for read and write access, and the process
+        /// must have read and write permission for the segment.
+        const SHM_RDONLY = 0o10000;
+        /// round attach address to SHMLBA boundary
+        const SHM_RND = 0o20000;
+        /// Allow the contents of the segment to be executed.
+        const SHM_EXEC = 0o100000;
+    }
+}
+
+pub const IPC_PRIVATE: usize = 0;
+
+pub struct ShmManager {
+    tmpfs: Arc<TmpFs>,
+    root: Arc<DirectoryInode>,
+    areas: BTreeMap<u32, ShmArea>,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy, Debug)]
+pub struct IpcPerm {
+    key: i32,
+    uid: u32,
+    gid: u32,
+    cuid: u32,
+    cgid: u32,
+    mode: u16,
+    seq: u16,
+}
+
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+pub struct ShmIdDs {
+    // Ownership and permissions
+    pub shm_perm: IpcPerm,
+    // Size of segment (bytes). In our system, this must be page-aligned.
+    pub shm_segsz: usize,
+    // Last attach time
+    pub shm_atime: usize,
+    // Last detach time
+    pub shm_dtime: usize,
+    // Creation time/time of last modification via shmctl()
+    pub shm_ctime: usize,
+    // PID of creator
+    pub shm_cpid: usize,
+    // PID of last shmat(2)/shmdt(2)
+    pub shm_lpid: usize,
+    // No. of current attaches
+    pub shm_nattch: usize,
+}
+
+impl ShmIdDs {
+    fn new(size: usize, pid: u32) -> Self {
+        Self {
+            shm_perm: IpcPerm::default(),
+            shm_segsz: size,
+            shm_atime: 0,
+            shm_dtime: 0,
+            shm_ctime: 0, // TODO: should be set to Instant::now()
+            shm_cpid: pid as usize,
+            shm_lpid: 0,
+            shm_nattch: 0,
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct ShmArea {
+    pub area: Arc<FileInode>,
+    pub shmid_ds: ShmIdDs,
+}
+
+// A big lock here to protect the shared memory area.
+// Can be improved with finer-grained locking?
+pub static SHM_MANAGER: LazyLock<Mutex<ShmManager>> =
+    LazyLock::new(|| Mutex::new(ShmManager::new()));
+
+impl ShmManager {
+    fn new() -> Self {
+        let (tmpfs, root) = TmpFs::create(false).expect("should create shm_area successfully");
+        Self {
+            tmpfs,
+            root,
+            areas: BTreeMap::new(),
+        }
+    }
+
+    pub fn create_shared_area(&self, size: usize, pid: u32, mode: Mode) -> ShmArea {
+        let ino = self.tmpfs.assign_ino();
+        let vfs = Arc::downgrade(&self.tmpfs);
+        ShmArea {
+            area: FileInode::new(ino, vfs, size, mode),
+            shmid_ds: ShmIdDs::new(size, pid),
+        }
+    }
+
+    pub fn get(&self, shmid: u32) -> Option<&ShmArea> {
+        self.areas.get(&shmid)
+    }
+
+    pub fn insert(&mut self, shmid: u32, area: ShmArea) {
+        self.areas.insert(shmid, area);
+    }
+}
+
+pub fn gen_shm_id(key: usize) -> KResult<u32> {
+    const SHM_MAGIC: u32 = 114514000;
+
+    static NEXT_SHMID: AtomicU32 = AtomicU32::new(0);
+
+    if key == IPC_PRIVATE {
+        let shmid = NEXT_SHMID.fetch_add(1, Ordering::Relaxed);
+
+        if shmid >= SHM_MAGIC {
+            return Err(ENOSPC);
+        } else {
+            return Ok(shmid);
+        }
+    }
+
+    (key as u32).checked_add(SHM_MAGIC).ok_or(ENOSPC)
+}
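The id scheme in gen_shm_id above is: IPC_PRIVATE keys draw sequential ids from a counter that must stay below SHM_MAGIC, while explicit keys map to key + SHM_MAGIC, so the two ranges cannot collide; ENOSPC is returned on exhaustion or overflow. A minimal standalone sketch of the same scheme, with the kernel's KResult and ENOSPC replaced by a plain Result purely for illustration:

use std::sync::atomic::{AtomicU32, Ordering};

const SHM_MAGIC: u32 = 114514000;
const IPC_PRIVATE: usize = 0;

static NEXT_SHMID: AtomicU32 = AtomicU32::new(0);

// Private keys get sequential ids below SHM_MAGIC; explicit keys are offset
// by SHM_MAGIC so the two id ranges never overlap.
fn gen_shm_id(key: usize) -> Result<u32, &'static str> {
    if key == IPC_PRIVATE {
        let shmid = NEXT_SHMID.fetch_add(1, Ordering::Relaxed);
        return if shmid >= SHM_MAGIC {
            Err("ENOSPC: private id space exhausted")
        } else {
            Ok(shmid)
        };
    }
    (key as u32)
        .checked_add(SHM_MAGIC)
        .ok_or("ENOSPC: key overflows the id space")
}

fn main() {
    assert_eq!(gen_shm_id(IPC_PRIVATE), Ok(0));
    assert_eq!(gen_shm_id(IPC_PRIVATE), Ok(1));
    assert_eq!(gen_shm_id(42), Ok(42 + SHM_MAGIC));
}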

+ 54 - 46
src/fs/tmpfs.rs

@@ -1,6 +1,8 @@
 use crate::io::Stream;
 use crate::kernel::constants::{EEXIST, EINVAL, EIO, EISDIR, ENOENT, ENOSYS, ENOTDIR};
+use crate::kernel::mem::{CachePage, PageCache, PageCacheBackend};
 use crate::kernel::timer::Instant;
+use crate::kernel::vfs::inode::InodeData;
 use crate::kernel::vfs::inode::RenameData;
 use crate::{
     io::Buffer,
@@ -16,7 +18,9 @@ use crate::{
     prelude::*,
 };
 use alloc::sync::{Arc, Weak};
+use core::fmt::Debug;
 use core::{ops::ControlFlow, sync::atomic::Ordering};
+use eonix_mm::paging::PAGE_SIZE;
 use eonix_runtime::task::Task;
 use eonix_sync::{AsProof as _, AsProofMut as _, Locked, Mutex, ProofMut};
 use itertools::Itertools;
@@ -58,7 +62,7 @@ impl Inode for NodeInode {
 }
 
 define_struct_inode! {
-    struct DirectoryInode {
+    pub(super) struct DirectoryInode {
         entries: Locked<Vec<(Arc<[u8]>, Ino)>, ()>,
     }
 }
@@ -152,7 +156,7 @@ impl Inode for DirectoryInode {
         let rwsem = Task::block_on(self.rwsem.write());
 
         let ino = vfs.assign_ino();
-        let file = FileInode::new(ino, self.vfs.clone(), mode);
+        let file = FileInode::new(ino, self.vfs.clone(), 0, mode);
 
         self.link(at.get_name(), file.as_ref(), rwsem.prove_mut());
         at.save_reg(file)
@@ -460,40 +464,60 @@ impl Inode for SymlinkInode {
 }
 
 define_struct_inode! {
-    struct FileInode {
-        filedata: Locked<Vec<u8>, ()>,
+    pub struct FileInode {
+        pages: PageCache,
+    }
+}
+
+impl Debug for FileInode {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "FileInode({:?})", self.idata)
     }
 }
 
 impl FileInode {
-    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
-        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
-            addr_of_mut_field!(inode, filedata).write(Locked::new(vec![], rwsem));
+    pub fn new(ino: Ino, vfs: Weak<dyn Vfs>, size: usize, mode: Mode) -> Arc<Self> {
+        let inode = Arc::new_cyclic(|weak_self: &Weak<FileInode>| FileInode {
+            idata: InodeData::new(ino, vfs),
+            pages: PageCache::new(weak_self.clone()),
+        });
 
-            addr_of_mut_field!(&mut *inode, mode).write((S_IFREG | (mode & 0o777)).into());
-            addr_of_mut_field!(&mut *inode, nlink).write(1.into());
-            addr_of_mut_field!(&mut *inode, ctime).write(Spin::new(Instant::now()));
-            addr_of_mut_field!(&mut *inode, mtime).write(Spin::new(Instant::now()));
-            addr_of_mut_field!(&mut *inode, atime).write(Spin::new(Instant::now()));
-        })
+        inode
+            .mode
+            .store(S_IFREG | (mode & 0o777), Ordering::Relaxed);
+        inode.nlink.store(1, Ordering::Relaxed);
+        inode.size.store(size as u64, Ordering::Relaxed);
+        inode
+    }
+}
+
+impl PageCacheBackend for FileInode {
+    fn read_page(&self, _cache_page: &mut CachePage, _offset: usize) -> KResult<usize> {
+        Ok(PAGE_SIZE)
+    }
+
+    fn write_page(&self, _page: &CachePage, _offset: usize) -> KResult<usize> {
+        Ok(PAGE_SIZE)
+    }
+
+    fn size(&self) -> usize {
+        self.size.load(Ordering::Relaxed) as usize
     }
 }
 
 impl Inode for FileInode {
-    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
-        // TODO: We don't need that strong guarantee, find some way to avoid locks
-        let lock = Task::block_on(self.rwsem.read());
+    fn page_cache(&self) -> Option<&PageCache> {
+        Some(&self.pages)
+    }
 
-        match self.filedata.access(lock.prove()).split_at_checked(offset) {
-            Some((_, data)) => buffer.fill(data).map(|result| result.allow_partial()),
-            None => Ok(0),
-        }
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        let lock = Task::block_on(self.rwsem.write());
+        Task::block_on(self.pages.read(buffer, offset))
     }
 
     fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
         // TODO: We don't need that strong guarantee, find some way to avoid locks
         let lock = Task::block_on(self.rwsem.write());
-        let filedata = self.filedata.access_mut(lock.prove_mut());
 
         let mut store_new_end = None;
         let offset = match offset {
@@ -506,41 +530,25 @@ impl Inode for FileInode {
             }
         };
 
-        let mut pos = offset;
-        loop {
-            if pos >= filedata.len() {
-                filedata.resize(pos + 4096, 0);
-            }
+        let wrote = Task::block_on(self.pages.write(stream, offset))?;
+        let cursor_end = offset + wrote;
 
-            match stream.poll_data(&mut filedata[pos..])? {
-                Some(data) => pos += data.len(),
-                None => break,
-            }
-        }
-
-        filedata.resize(pos, 0);
         if let Some(store_end) = store_new_end {
-            *store_end = pos;
+            *store_end = cursor_end;
         }
 
         // SAFETY: `lock` has done the synchronization
-        self.size.store(pos as u64, Ordering::Relaxed);
         *self.mtime.lock() = Instant::now();
+        self.size.store(cursor_end as u64, Ordering::Relaxed);
 
-        Ok(pos - offset)
+        Ok(wrote)
     }
 
     fn truncate(&self, length: usize) -> KResult<()> {
-        // TODO: We don't need that strong guarantee, find some way to avoid locks
         let lock = Task::block_on(self.rwsem.write());
-        let filedata = self.filedata.access_mut(lock.prove_mut());
-
-        // SAFETY: `lock` has done the synchronization
+        Task::block_on(self.pages.resize(length))?;
         self.size.store(length as u64, Ordering::Relaxed);
         *self.mtime.lock() = Instant::now();
-
-        filedata.resize(length, 0);
-
         Ok(())
     }
 
@@ -559,7 +567,7 @@ impl Inode for FileInode {
 }
 
 impl_any!(TmpFs);
-struct TmpFs {
+pub(super) struct TmpFs {
     next_ino: AtomicIno,
     readonly: bool,
     rename_lock: Mutex<()>,
@@ -580,11 +588,11 @@ impl Vfs for TmpFs {
 }
 
 impl TmpFs {
-    fn assign_ino(&self) -> Ino {
+    pub(super) fn assign_ino(&self) -> Ino {
         self.next_ino.fetch_add(1, Ordering::AcqRel)
     }
 
-    pub fn create(readonly: bool) -> KResult<(Arc<dyn Vfs>, Arc<dyn Inode>)> {
+    pub fn create(readonly: bool) -> KResult<(Arc<TmpFs>, Arc<DirectoryInode>)> {
         let tmpfs = Arc::new(Self {
             next_ino: AtomicIno::new(1),
             readonly,

+ 1 - 0
src/io.rs

@@ -3,6 +3,7 @@ use crate::prelude::*;
 use core::{cmp, mem::MaybeUninit};
 
 #[must_use]
+#[derive(Debug)]
 pub enum FillResult {
     Done(usize),
     Partial(usize),

+ 2 - 0
src/kernel/constants.rs

@@ -15,6 +15,7 @@ pub const SIG_SETMASK: u32 = 2;
 
 pub const CLOCK_REALTIME: u32 = 0;
 pub const CLOCK_MONOTONIC: u32 = 1;
+pub const CLOCK_REALTIME_COARSE: u32 = 5;
 
 pub const EPERM: u32 = 1;
 pub const ENOENT: u32 = 2;
@@ -35,6 +36,7 @@ pub const ENOTDIR: u32 = 20;
 pub const EISDIR: u32 = 21;
 pub const EINVAL: u32 = 22;
 pub const ENOTTY: u32 = 25;
+pub const ENOSPC: u32 = 28;
 pub const ESPIPE: u32 = 29;
 // pub const EROFS: u32 = 30;
 pub const EPIPE: u32 = 32;

+ 2 - 0
src/kernel/mem.rs

@@ -6,9 +6,11 @@ mod allocator;
 mod mm_area;
 mod mm_list;
 mod page_alloc;
+mod page_cache;
 
 pub use access::{AsMemoryBlock, MemoryBlock, PhysAccess};
 pub(self) use mm_area::MMArea;
 pub use mm_list::{handle_kernel_page_fault, FileMapping, MMList, Mapping, Permission};
 pub use page_alloc::{GlobalPageAlloc, RawPage};
+pub use page_cache::{CachePage, PageCache, PageCacheBackend};
 pub use paging::{Page, PageBuffer};

+ 82 - 27
src/kernel/mem/mm_area.rs

@@ -1,18 +1,23 @@
 use super::mm_list::EMPTY_PAGE;
 use super::paging::AllocZeroed as _;
 use super::{AsMemoryBlock, Mapping, Page, Permission};
-use crate::io::ByteBuffer;
+use crate::kernel::constants::EINVAL;
+use crate::kernel::mem::page_cache::PageCacheRawPage;
 use crate::KResult;
-use core::{borrow::Borrow, cell::UnsafeCell, cmp::Ordering};
+use core::sync::atomic;
+use core::{borrow::Borrow, cell::UnsafeCell, cmp};
+use eonix_hal::traits::fault::PageFaultErrorCode;
 use eonix_mm::address::{AddrOps as _, VAddr, VRange};
 use eonix_mm::page_table::{PageAttribute, RawAttribute, PTE};
-use eonix_mm::paging::PFN;
+use eonix_mm::paging::{PAGE_SIZE, PFN};
+use eonix_runtime::task::Task;
 
 #[derive(Debug)]
 pub struct MMArea {
     range: UnsafeCell<VRange>,
     pub(super) mapping: Mapping,
     pub(super) permission: Permission,
+    pub is_shared: bool,
 }
 
 impl Clone for MMArea {
@@ -21,16 +26,18 @@ impl Clone for MMArea {
             range: UnsafeCell::new(self.range()),
             mapping: self.mapping.clone(),
             permission: self.permission,
+            is_shared: self.is_shared,
         }
     }
 }
 
 impl MMArea {
-    pub fn new(range: VRange, mapping: Mapping, permission: Permission) -> Self {
+    pub fn new(range: VRange, mapping: Mapping, permission: Permission, is_shared: bool) -> Self {
         Self {
             range: range.into(),
             mapping,
             permission,
+            is_shared,
         }
     }
 
@@ -56,9 +63,9 @@ impl MMArea {
         assert!(at.is_page_aligned());
 
         match self.range_borrow().cmp(&VRange::from(at)) {
-            Ordering::Less => (Some(self), None),
-            Ordering::Greater => (None, Some(self)),
-            Ordering::Equal => {
+            cmp::Ordering::Less => (Some(self), None),
+            cmp::Ordering::Greater => (None, Some(self)),
+            cmp::Ordering::Equal => {
                 let diff = at - self.range_borrow().start();
                 if diff == 0 {
                     return (None, Some(self));
@@ -71,6 +78,7 @@ impl MMArea {
                         Mapping::Anonymous => Mapping::Anonymous,
                         Mapping::File(mapping) => Mapping::File(mapping.offset(diff)),
                     },
+                    is_shared: self.is_shared,
                 };
 
                 let new_range = self.range_borrow().shrink(self.range_borrow().end() - at);
@@ -119,35 +127,75 @@ impl MMArea {
 
     /// # Arguments
     /// * `offset`: The offset from the start of the mapping, aligned to 4KB boundary.
-    pub fn handle_mmap(
+    pub async fn handle_mmap(
         &self,
         pfn: &mut PFN,
         attr: &mut PageAttribute,
         offset: usize,
+        error: PageFaultErrorCode,
     ) -> KResult<()> {
-        // TODO: Implement shared mapping
-        let Mapping::File(mapping) = &self.mapping else {
+        let Mapping::File(file_mapping) = &self.mapping else {
             panic!("Anonymous mapping should not be PA_MMAP");
         };
 
-        assert!(offset < mapping.length, "Offset out of range");
-        unsafe {
-            Page::with_raw(*pfn, |page| {
-                // SAFETY: `page` is marked as mapped, so others trying to read or write to
-                //         it will be blocked and enter the page fault handler, where they will
-                //         be blocked by the mutex held by us.
-                let page_data = page.as_memblk().as_bytes_mut();
+        assert!(offset < file_mapping.length, "Offset out of range");
 
-                let cnt_to_read = (mapping.length - offset).min(0x1000);
-                let cnt_read = mapping.file.read(
-                    &mut ByteBuffer::new(&mut page_data[..cnt_to_read]),
-                    mapping.offset + offset,
-                )?;
+        let Some(page_cache) = file_mapping.file.page_cache() else {
+            panic!("Mapping file should have pagecache");
+        };
+
+        let file_offset = file_mapping.offset + offset;
+        let cnt_to_read = (file_mapping.length - offset).min(0x1000);
+        let raw_page = page_cache.get_page(file_offset).await?.ok_or(EINVAL)?;
+
+        // For a read or instruction-fetch fault, we find the page in the page cache and map it.
+        // For a write fault, we need to distinguish shared from private mappings.
+        if error.contains(PageFaultErrorCode::Read)
+            || error.contains(PageFaultErrorCode::InstructionFetch)
+        {
+            // BSS is awkward to handle in the page cache:
+            // when cnt_to_read < PAGE_SIZE, we assume the rest of the page is all BSS.
+            if cnt_to_read < PAGE_SIZE {
+                let new_page = Page::zeroed();
+                unsafe {
+                    let page_data = new_page.as_memblk().as_bytes_mut();
+                    page_data[..cnt_to_read]
+                        .copy_from_slice(&raw_page.as_memblk().as_bytes()[..cnt_to_read]);
+                }
+                *pfn = new_page.into_raw();
+            } else {
+                raw_page.refcount().fetch_add(1, atomic::Ordering::Relaxed);
+                *pfn = Into::<PFN>::into(raw_page);
+            }
 
-                page_data[cnt_read..].fill(0);
+            if self.permission.write {
+                if self.is_shared {
+                    // The page may never actually be written to,
+                    // but we conservatively assume it will become dirty.
+                    raw_page.set_dirty();
+                    attr.insert(PageAttribute::WRITE);
+                } else {
+                    attr.insert(PageAttribute::COPY_ON_WRITE);
+                }
+            }
+        } else if error.contains(PageFaultErrorCode::Write) {
+            if self.is_shared {
+                raw_page.refcount().fetch_add(1, atomic::Ordering::Relaxed);
+                raw_page.set_dirty();
+                *pfn = Into::<PFN>::into(raw_page);
+            } else {
+                let new_page = Page::zeroed();
+                unsafe {
+                    let page_data = new_page.as_memblk().as_bytes_mut();
+                    page_data[..cnt_to_read]
+                        .copy_from_slice(&raw_page.as_memblk().as_bytes()[..cnt_to_read]);
+                }
+                *pfn = new_page.into_raw();
+            }
 
-                KResult::Ok(())
-            })?;
+            attr.insert(PageAttribute::WRITE);
+        } else {
+            unreachable!("Unexpected page fault error code: {:?}", error);
         }
 
         attr.insert(PageAttribute::PRESENT);
@@ -155,7 +203,12 @@ impl MMArea {
         Ok(())
     }
 
-    pub fn handle(&self, pte: &mut impl PTE, offset: usize) -> KResult<()> {
+    pub fn handle(
+        &self,
+        pte: &mut impl PTE,
+        offset: usize,
+        error: Option<PageFaultErrorCode>,
+    ) -> KResult<()> {
         let mut attr = pte.get_attr().as_page_attr().expect("Not a page attribute");
         let mut pfn = pte.get_pfn();
 
@@ -164,7 +217,9 @@ impl MMArea {
         }
 
         if attr.contains(PageAttribute::MAPPED) {
-            self.handle_mmap(&mut pfn, &mut attr, offset)?;
+            let error =
+                error.expect("Mapped area should not be accessed without a page fault error code");
+            Task::block_on(self.handle_mmap(&mut pfn, &mut attr, offset, error))?;
         }
 
         attr.set(PageAttribute::ACCESSED, true);
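The branching in handle_mmap above reduces to a small decision over the fault kind, whether the area is shared, whether it is writable, and whether the page is only partially backed by the file (cnt_to_read < PAGE_SIZE). A standalone sketch of just that decision, using illustrative names (FaultKind, MappedFrame, Action, decide) that are not part of the kernel code:

#[derive(Clone, Copy)]
enum FaultKind { Read, InstructionFetch, Write }

#[derive(Debug, PartialEq)]
enum MappedFrame { CachePage, PrivateCopy }

#[derive(Debug, PartialEq)]
struct Action { frame: MappedFrame, writable: bool, copy_on_write: bool }

// `partial_page` corresponds to `cnt_to_read < PAGE_SIZE` in handle_mmap:
// the tail of the page is not backed by the file and must be zero-filled.
fn decide(fault: FaultKind, is_shared: bool, area_writable: bool, partial_page: bool) -> Action {
    match fault {
        FaultKind::Read | FaultKind::InstructionFetch => Action {
            frame: if partial_page { MappedFrame::PrivateCopy } else { MappedFrame::CachePage },
            // Shared writable areas are mapped writable right away (and the cache
            // page is marked dirty); private writable areas get copy-on-write.
            writable: area_writable && is_shared,
            copy_on_write: area_writable && !is_shared,
        },
        FaultKind::Write => Action {
            frame: if is_shared { MappedFrame::CachePage } else { MappedFrame::PrivateCopy },
            writable: true,
            copy_on_write: false,
        },
    }
}

fn main() {
    // Write fault on a shared file mapping: the cache page itself is mapped writable.
    assert_eq!(
        decide(FaultKind::Write, true, true, false),
        Action { frame: MappedFrame::CachePage, writable: true, copy_on_write: false }
    );
    // Read fault on a private writable mapping: map the cache page read-only, COW on write.
    assert_eq!(
        decide(FaultKind::Read, false, true, false),
        Action { frame: MappedFrame::CachePage, writable: false, copy_on_write: true }
    );
}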

+ 54 - 13
src/kernel/mem/mm_list.rs

@@ -6,6 +6,7 @@ use super::page_alloc::GlobalPageAlloc;
 use super::paging::AllocZeroed as _;
 use super::{AsMemoryBlock, MMArea, Page};
 use crate::kernel::constants::{EEXIST, EFAULT, EINVAL, ENOMEM};
+use crate::kernel::mem::page_alloc::RawPagePtr;
 use crate::{prelude::*, sync::ArcSwap};
 use alloc::collections::btree_set::BTreeSet;
 use core::fmt;
@@ -256,6 +257,7 @@ impl MMListInner<'_> {
         len: usize,
         mapping: Mapping,
         permission: Permission,
+        is_shared: bool,
     ) -> KResult<()> {
         assert_eq!(at.floor(), at);
         assert_eq!(len & (PAGE_SIZE - 1), 0);
@@ -271,13 +273,33 @@ impl MMListInner<'_> {
             Mapping::File(_) => self.page_table.set_mmapped(range, permission),
         }
 
-        self.areas.insert(MMArea::new(range, mapping, permission));
+        self.areas
+            .insert(MMArea::new(range, mapping, permission, is_shared));
         Ok(())
     }
 }
 
 impl Drop for MMListInner<'_> {
     fn drop(&mut self) {
+        // May be buggy.
+        for area in &self.areas {
+            if area.is_shared {
+                for pte in self.page_table.iter_user(area.range()) {
+                    let (pfn, _) = pte.take();
+                    let raw_page = RawPagePtr::from(pfn);
+                    if raw_page.refcount().fetch_sub(1, Ordering::Relaxed) == 1 {
+                        // Wrong here
+                        // unsafe { Page::from_raw(pfn) };
+                    }
+                }
+            } else {
+                for pte in self.page_table.iter_user(area.range()) {
+                    let (pfn, _) = pte.take();
+                    unsafe { Page::from_raw(pfn) };
+                }
+            }
+        }
+
         // TODO: Recycle all pages in the page table.
     }
 }
@@ -343,9 +365,15 @@ impl MMList {
             let list_inner = list_inner.lock().await;
 
             for area in list_inner.areas.iter() {
-                list_inner
-                    .page_table
-                    .set_copy_on_write(&mut inner.page_table, area.range());
+                if !area.is_shared {
+                    list_inner
+                        .page_table
+                        .set_copy_on_write(&mut inner.page_table, area.range());
+                } else {
+                    list_inner
+                        .page_table
+                        .set_copied(&mut inner.page_table, area.range());
+                }
             }
         }
 
@@ -507,21 +535,22 @@ impl MMList {
         len: usize,
         mapping: Mapping,
         permission: Permission,
+        is_shared: bool,
     ) -> KResult<VAddr> {
         let inner = self.inner.borrow();
         let mut inner = Task::block_on(inner.lock());
 
         if hint == VAddr::NULL {
             let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
-            inner.mmap(at, len, mapping, permission)?;
+            inner.mmap(at, len, mapping, permission, is_shared)?;
             return Ok(at);
         }
 
-        match inner.mmap(hint, len, mapping.clone(), permission) {
+        match inner.mmap(hint, len, mapping.clone(), permission, is_shared) {
             Ok(()) => Ok(hint),
             Err(EEXIST) => {
                 let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
-                inner.mmap(at, len, mapping, permission)?;
+                inner.mmap(at, len, mapping, permission, is_shared)?;
                 Ok(at)
             }
             Err(err) => Err(err),
@@ -534,9 +563,10 @@ impl MMList {
         len: usize,
         mapping: Mapping,
         permission: Permission,
+        is_shared: bool,
     ) -> KResult<VAddr> {
         Task::block_on(self.inner.borrow().lock())
-            .mmap(at, len, mapping.clone(), permission)
+            .mmap(at, len, mapping.clone(), permission, is_shared)
             .map(|_| at)
     }
 
@@ -571,6 +601,7 @@ impl MMList {
                     write: true,
                     execute: false,
                 },
+                false,
             ));
         }
 
@@ -644,7 +675,7 @@ impl MMList {
                 let page_start = current.floor() + idx * 0x1000;
                 let page_end = page_start + 0x1000;
 
-                area.handle(pte, page_start - area_start)?;
+                area.handle(pte, page_start - area_start, None)?;
 
                 let start_offset;
                 if page_start < current {
@@ -692,6 +723,7 @@ trait PageTableExt {
     fn set_anonymous(&self, range: VRange, permission: Permission);
     fn set_mmapped(&self, range: VRange, permission: Permission);
     fn set_copy_on_write(&self, from: &Self, range: VRange);
+    fn set_copied(&self, from: &Self, range: VRange);
 }
 
 impl PageTableExt for KernelPageTable<'_> {
@@ -715,10 +747,22 @@ impl PageTableExt for KernelPageTable<'_> {
             to.set_copy_on_write(from);
         }
     }
+
+    fn set_copied(&self, from: &Self, range: VRange) {
+        let to_iter = self.iter_user(range);
+        let from_iter = from.iter_user(range);
+
+        for (to, from) in to_iter.zip(from_iter) {
+            let (pfn, attr) = from.get();
+            to.set(pfn, attr);
+        }
+    }
 }
 
 trait PTEExt {
+    // private anonymous
     fn set_anonymous(&mut self, execute: bool);
+    // file mapped or shared anonymous
     fn set_mapped(&mut self, execute: bool);
     fn set_copy_on_write(&mut self, from: &mut Self);
 }
@@ -742,10 +786,7 @@ where
     fn set_mapped(&mut self, execute: bool) {
         // Writable flag is set during page fault handling while executable flag is
         // preserved across page faults, so we set executable flag now.
-        let mut attr = PageAttribute::READ
-            | PageAttribute::USER
-            | PageAttribute::MAPPED
-            | PageAttribute::COPY_ON_WRITE;
+        let mut attr = PageAttribute::READ | PageAttribute::USER | PageAttribute::MAPPED;
         attr.set(PageAttribute::EXECUTE, execute);
 
         self.set(EMPTY_PAGE.clone().into_raw(), T::Attr::from(attr));

+ 14 - 3
src/kernel/mem/mm_list/mapping.rs

@@ -1,23 +1,34 @@
-use crate::kernel::vfs::dentry::Dentry;
+use core::fmt::Debug;
+
+use crate::kernel::vfs::inode::Inode;
 use alloc::sync::Arc;
 use eonix_mm::paging::PAGE_SIZE;
 
 #[derive(Debug, Clone)]
 pub struct FileMapping {
-    pub file: Arc<Dentry>,
+    pub file: Arc<dyn Inode>,
     /// Offset in the file, aligned to 4KB boundary.
     pub offset: usize,
     /// Length of the mapping. Exceeding part will be zeroed.
     pub length: usize,
 }
+
+impl Debug for dyn Inode {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "Inode()")
+    }
+}
+
 #[derive(Debug, Clone)]
 pub enum Mapping {
+    // private anonymous memory
     Anonymous,
+    // file-backed memory or shared anonymous memory (tmpfs file)
     File(FileMapping),
 }
 
 impl FileMapping {
-    pub fn new(file: Arc<Dentry>, offset: usize, length: usize) -> Self {
+    pub fn new(file: Arc<dyn Inode>, offset: usize, length: usize) -> Self {
         assert_eq!(offset & (PAGE_SIZE - 1), 0);
         Self {
             file,

+ 4 - 2
src/kernel/mem/mm_list/page_fault.rs

@@ -90,7 +90,7 @@ impl MMList {
             .next()
             .expect("If we can find the mapped area, we should be able to find the PTE");
 
-        area.handle(pte, addr.floor() - area.range().start())
+        area.handle(pte, addr.floor() - area.range().start(), Some(error))
             .map_err(|_| Signal::SIGBUS)?;
 
         flush_tlb(addr.floor().addr());
@@ -160,9 +160,11 @@ pub fn handle_kernel_page_fault(
         .next()
         .expect("If we can find the mapped area, we should be able to find the PTE");
 
-    if let Err(_) = area.handle(pte, addr.floor() - area.range().start()) {
+    if let Err(_) = area.handle(pte, addr.floor() - area.range().start(), Some(error)) {
         return Some(try_page_fault_fix(fault_pc, addr));
     }
 
+    flush_tlb(addr.addr());
+
     None
 }

+ 46 - 4
src/kernel/mem/page_alloc/raw_page.rs

@@ -10,8 +10,8 @@ use eonix_mm::{
 use intrusive_list::{container_of, Link};
 use slab_allocator::SlabRawPage;
 
-use crate::kernel::mem::access::RawPageAccess;
-use crate::kernel::mem::PhysAccess;
+use crate::kernel::mem::{access::RawPageAccess, page_cache::PageCacheRawPage, MemoryBlock};
+use crate::kernel::mem::{AsMemoryBlock, PhysAccess};
 
 const PAGE_ARRAY: NonNull<RawPage> =
     unsafe { NonNull::new_unchecked(0xffffff8040000000 as *mut _) };
@@ -32,11 +32,16 @@ impl SlabPageInner {
     }
 }
 
+struct PageCacheInner {
+    valid_size: usize,
+}
+
 pub struct BuddyPageInner {}
 
 enum PageType {
     Buddy(BuddyPageInner),
     Slab(SlabPageInner),
+    PageCache(PageCacheInner),
 }
 
 impl PageType {
@@ -47,6 +52,14 @@ impl PageType {
             unreachable!()
         }
     }
+
+    fn page_cache_data(&mut self) -> &mut PageCacheInner {
+        if let PageType::PageCache(cache_data) = self {
+            return cache_data;
+        } else {
+            unreachable!()
+        }
+    }
 }
 
 pub struct RawPage {
@@ -70,8 +83,8 @@ impl PageFlags {
     pub const PRESENT: u32 = 1 << 0;
     // pub const LOCKED: u32 = 1 << 1;
     pub const BUDDY: u32 = 1 << 2;
-    // pub const SLAB: u32 = 1 << 3;
-    // pub const DIRTY: u32 = 1 << 4;
+    pub const SLAB: u32 = 1 << 3;
+    pub const DIRTY: u32 = 1 << 4;
     pub const FREE: u32 = 1 << 5;
     pub const LOCAL: u32 = 1 << 6;
 
@@ -226,3 +239,32 @@ impl SlabRawPage for RawPagePtr {
         self.as_mut().shared_data = PageType::Slab(SlabPageInner::new(first_free));
     }
 }
+
+impl PageCacheRawPage for RawPagePtr {
+    fn valid_size(&self) -> &mut usize {
+        &mut self.as_mut().shared_data.page_cache_data().valid_size
+    }
+
+    fn is_dirty(&self) -> bool {
+        self.flags().has(PageFlags::DIRTY)
+    }
+
+    fn clear_dirty(&self) {
+        self.flags().clear(PageFlags::DIRTY);
+    }
+
+    fn set_dirty(&self) {
+        self.flags().set(PageFlags::DIRTY);
+    }
+
+    fn cache_init(&self) {
+        self.as_mut().shared_data = PageType::PageCache(PageCacheInner { valid_size: 0 });
+    }
+}
+
+/// SAFETY: `RawPagePtr` is a pointer to a valid `RawPage` struct.
+impl AsMemoryBlock for RawPagePtr {
+    fn as_memblk(&self) -> MemoryBlock {
+        unsafe { MemoryBlock::new(self.real_ptr::<()>().addr(), PAGE_SIZE) }
+    }
+}

+ 303 - 0
src/kernel/mem/page_cache.rs

@@ -0,0 +1,303 @@
+use super::access::AsMemoryBlock;
+use crate::{
+    io::{Buffer, FillResult, Stream},
+    kernel::mem::page_alloc::RawPagePtr,
+    prelude::KResult,
+    GlobalPageAlloc,
+};
+use align_ext::AlignExt;
+use alloc::{collections::btree_map::BTreeMap, sync::Weak};
+use eonix_mm::paging::{PageAlloc, RawPage, PAGE_SIZE, PAGE_SIZE_BITS};
+use eonix_sync::Mutex;
+
+pub struct PageCache {
+    pages: Mutex<BTreeMap<usize, CachePage>>,
+    backend: Weak<dyn PageCacheBackend>,
+}
+
+unsafe impl Send for PageCache {}
+unsafe impl Sync for PageCache {}
+
+#[derive(Clone, Copy)]
+pub struct CachePage(RawPagePtr);
+
+impl Buffer for CachePage {
+    fn total(&self) -> usize {
+        PAGE_SIZE
+    }
+
+    fn wrote(&self) -> usize {
+        self.valid_size()
+    }
+
+    fn fill(&mut self, data: &[u8]) -> KResult<FillResult> {
+        let valid_size = self.valid_size();
+        let available = &mut self.all_mut()[valid_size..];
+        if available.len() == 0 {
+            return Ok(FillResult::Full);
+        }
+
+        let len = core::cmp::min(data.len(), available.len());
+        available[..len].copy_from_slice(&data[..len]);
+
+        *self.0.valid_size() += len;
+
+        if len < data.len() {
+            Ok(FillResult::Partial(len))
+        } else {
+            Ok(FillResult::Done(len))
+        }
+    }
+}
+
+impl CachePage {
+    pub fn new() -> Self {
+        let page = GlobalPageAlloc.alloc().unwrap();
+        page.cache_init();
+        Self(page)
+    }
+
+    pub fn new_zeroed() -> Self {
+        let page = GlobalPageAlloc.alloc().unwrap();
+        // SAFETY: We own the page exclusively, so we can safely zero it.
+        unsafe {
+            page.as_memblk().as_bytes_mut().fill(0);
+        }
+        page.cache_init();
+        Self(page)
+    }
+
+    pub fn valid_size(&self) -> usize {
+        *self.0.valid_size()
+    }
+
+    pub fn set_valid_size(&mut self, valid_size: usize) {
+        *self.0.valid_size() = valid_size;
+    }
+
+    pub fn all(&self) -> &[u8] {
+        unsafe {
+            self.0.as_memblk().as_bytes()
+        }
+    }
+
+    pub fn all_mut(&mut self) -> &mut [u8] {
+        unsafe {
+            self.0.as_memblk().as_bytes_mut()
+        }
+    }
+
+    pub fn valid_data(&self) -> &[u8] {
+        &self.all()[..self.valid_size()]
+    }
+
+    pub fn is_dirty(&self) -> bool {
+        self.0.is_dirty()
+    }
+
+    pub fn set_dirty(&self) {
+        self.0.set_dirty();
+    }
+
+    pub fn clear_dirty(&self) {
+        self.0.clear_dirty();
+    }
+}
+
+impl PageCache {
+    pub fn new(backend: Weak<dyn PageCacheBackend>) -> Self {
+        Self {
+            pages: Mutex::new(BTreeMap::new()),
+            backend: backend,
+        }
+    }
+
+    pub async fn read(&self, buffer: &mut dyn Buffer, mut offset: usize) -> KResult<usize> {
+        let mut pages = self.pages.lock().await;
+
+        loop {
+            let page_id = offset >> PAGE_SIZE_BITS;
+            let page = pages.get(&page_id);
+
+            match page {
+                Some(page) => {
+                    let inner_offset = offset % PAGE_SIZE;
+
+                    // TODO: this still causes unnecessary I/O if valid_size < PAGE_SIZE
+                    //       and the fill result is Done.
+                    if page.valid_size() == 0
+                        || buffer
+                            .fill(&page.valid_data()[inner_offset..])?
+                            .should_stop()
+                        || buffer.available() == 0
+                    {
+                        break;
+                    }
+
+                    offset += PAGE_SIZE - inner_offset;
+                }
+                None => {
+                    let mut new_page = CachePage::new();
+                    self.backend
+                        .upgrade()
+                        .unwrap()
+                        .read_page(&mut new_page, offset.align_down(PAGE_SIZE))?;
+                    pages.insert(page_id, new_page);
+                }
+            }
+        }
+
+        Ok(buffer.wrote())
+    }
+
+    pub async fn write(&self, stream: &mut dyn Stream, mut offset: usize) -> KResult<usize> {
+        let mut pages = self.pages.lock().await;
+        let old_size = self.backend.upgrade().unwrap().size();
+        let mut wrote = 0;
+
+        loop {
+            let page_id = offset >> PAGE_SIZE_BITS;
+            let page = pages.get_mut(&page_id);
+
+            match page {
+                Some(page) => {
+                    let inner_offset = offset % PAGE_SIZE;
+                    let cursor_end = match stream.poll_data(&mut page.all_mut()[inner_offset..])? {
+                        Some(buf) => {
+                            wrote += buf.len();
+                            inner_offset + buf.len()
+                        }
+                        None => {
+                            break;
+                        }
+                    };
+
+                    if page.valid_size() < cursor_end {
+                        page.set_valid_size(cursor_end);
+                    }
+                    page.set_dirty();
+                    offset += PAGE_SIZE - inner_offset;
+                }
+                None => {
+                    let new_page = if (offset >> PAGE_SIZE_BITS) > (old_size >> PAGE_SIZE_BITS) {
+                        let new_page = CachePage::new_zeroed();
+                        new_page
+                    } else {
+                        let mut new_page = CachePage::new();
+                        self.backend
+                            .upgrade()
+                            .unwrap()
+                            .read_page(&mut new_page, offset.align_down(PAGE_SIZE))?;
+                        new_page
+                    };
+
+                    pages.insert(page_id, new_page);
+                }
+            }
+        }
+
+        Ok(wrote)
+    }
+
+    pub async fn fsync(&self) -> KResult<()> {
+        let pages = self.pages.lock().await;
+        for (page_id, page) in pages.iter() {
+            if page.is_dirty() {
+                self.backend
+                    .upgrade()
+                    .unwrap()
+                    .write_page(page, page_id << PAGE_SIZE_BITS)?;
+                page.clear_dirty();
+            }
+        }
+        Ok(())
+    }
+
+    // This function is used for extending writes and for truncation.
+    pub async fn resize(&self, new_size: usize) -> KResult<()> {
+        let mut pages = self.pages.lock().await;
+        let old_size = self.backend.upgrade().unwrap().size();
+
+        if new_size < old_size {
+            let begin = new_size.align_down(PAGE_SIZE) >> PAGE_SIZE_BITS;
+            let end = old_size.align_up(PAGE_SIZE) >> PAGE_SIZE_BITS;
+
+            for page_id in begin..end {
+                pages.remove(&page_id);
+            }
+        } else if new_size > old_size {
+            let begin = old_size.align_down(PAGE_SIZE) >> PAGE_SIZE_BITS;
+            let end = new_size.align_up(PAGE_SIZE) >> PAGE_SIZE_BITS;
+
+            pages.remove(&begin);
+
+            for page_id in begin..end {
+                let mut new_page = CachePage::new_zeroed();
+
+                if page_id != end - 1 {
+                    new_page.set_valid_size(PAGE_SIZE);
+                } else {
+                    new_page.set_valid_size(new_size % PAGE_SIZE);
+                }
+                new_page.set_dirty();
+                pages.insert(page_id, new_page);
+            }
+        }
+
+        Ok(())
+    }
+
+    pub async fn get_page(&self, offset: usize) -> KResult<Option<RawPagePtr>> {
+        let offset_aligned = offset.align_down(PAGE_SIZE);
+        let page_id = offset_aligned >> PAGE_SIZE_BITS;
+        let size = self.backend.upgrade().unwrap().size();
+
+        if offset_aligned > size {
+            return Ok(None);
+        }
+
+        let mut pages = self.pages.lock().await;
+
+        if let Some(page) = pages.get(&page_id) {
+            Ok(Some(page.0))
+        } else {
+            let mut new_page = CachePage::new();
+            self.backend
+                .upgrade()
+                .unwrap()
+                .read_page(&mut new_page, offset_aligned)?;
+            pages.insert(page_id, new_page);
+            Ok(Some(new_page.0))
+        }
+    }
+}
+
+// With this trait, "page cache" and "block cache" are unified:
+// for a filesystem, `offset` is the file offset (floor-aligned to PAGE_SIZE);
+// for a block device, `offset` is the block index (floor-aligned to PAGE_SIZE / BLK_SIZE).
+// Note: unifying them this way can still lead to unnecessary caching.
+pub trait PageCacheBackend {
+    fn read_page(&self, page: &mut CachePage, offset: usize) -> KResult<usize>;
+
+    fn write_page(&self, page: &CachePage, offset: usize) -> KResult<usize>;
+
+    fn size(&self) -> usize;
+}
+
+pub trait PageCacheRawPage: RawPage {
+    fn valid_size(&self) -> &mut usize;
+
+    fn is_dirty(&self) -> bool;
+
+    fn set_dirty(&self);
+
+    fn clear_dirty(&self);
+
+    fn cache_init(&self);
+}
+
+impl Drop for PageCache {
+    fn drop(&mut self) {
+        let _ = self.fsync();
+    }
+}
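A simplified userspace model of the read path implemented by PageCache::read and PageCacheBackend::read_page above: pages are keyed by page index, a miss is read through from the backend together with its valid size, and the caller's buffer is then served from the valid bytes. VecBackend, cached_read and all types below are illustrative stand-ins (not the kernel API); locking, dirty tracking and the write path are omitted:

use std::collections::BTreeMap;

const PAGE_SIZE: usize = 4096;

// A toy backend standing in for PageCacheBackend: the "file" is a byte vector.
struct VecBackend(Vec<u8>);

impl VecBackend {
    // Analogue of read_page: fill one page starting at `offset`,
    // returning how many bytes are actually valid.
    fn read_page(&self, page: &mut [u8; PAGE_SIZE], offset: usize) -> usize {
        let start = offset.min(self.0.len());
        let end = self.0.len().min(offset + PAGE_SIZE);
        let n = end - start;
        page[..n].copy_from_slice(&self.0[start..end]);
        n
    }
}

// Analogue of PageCache::read: look pages up by index, read them through
// from the backend on a miss, then serve the caller from the valid bytes.
fn cached_read(
    cache: &mut BTreeMap<usize, ([u8; PAGE_SIZE], usize)>, // page_id -> (data, valid_size)
    backend: &VecBackend,
    buf: &mut [u8],
    mut offset: usize,
) -> usize {
    let mut wrote = 0;
    while wrote < buf.len() {
        let page_id = offset / PAGE_SIZE;
        let inner = offset % PAGE_SIZE;
        let (data, valid) = cache.entry(page_id).or_insert_with(|| {
            let mut page = [0u8; PAGE_SIZE];
            let valid = backend.read_page(&mut page, page_id * PAGE_SIZE);
            (page, valid)
        });
        if inner >= *valid {
            break; // nothing valid at this offset: end of file
        }
        let n = (*valid - inner).min(buf.len() - wrote);
        buf[wrote..wrote + n].copy_from_slice(&data[inner..inner + n]);
        wrote += n;
        offset += n;
        if *valid < PAGE_SIZE {
            break; // short page: end of file reached
        }
    }
    wrote
}

fn main() {
    let backend = VecBackend(vec![7u8; 10_000]);
    let mut cache = BTreeMap::new();
    let mut buf = vec![0u8; 6_000];
    let n = cached_read(&mut cache, &backend, &mut buf, 1_000);
    assert_eq!(n, 6_000);
    assert_eq!(cache.len(), 2); // pages 0 and 1 were read through and cached
}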

+ 75 - 6
src/kernel/syscall/file_rw.rs

@@ -1,9 +1,12 @@
+use core::time::Duration;
+
 use super::FromSyscallArg;
 use crate::io::IntoStream;
 use crate::kernel::constants::{
-    EBADF, EFAULT, EINVAL, ENOENT, ENOTDIR, SEEK_CUR, SEEK_END, SEEK_SET, S_IFBLK, S_IFCHR,
+    EBADF, EFAULT, EINVAL, ENOENT, ENOSYS, ENOTDIR, SEEK_CUR, SEEK_END, SEEK_SET, S_IFBLK, S_IFCHR,
 };
 use crate::kernel::task::Thread;
+use crate::kernel::timer::sleep;
 use crate::kernel::vfs::filearray::FD;
 use crate::{
     io::{Buffer, BufferFill},
@@ -25,7 +28,8 @@ use eonix_runtime::task::Task;
 use posix_types::ctypes::{Long, PtrT};
 use posix_types::namei::RenameFlags;
 use posix_types::open::{AtFlags, OpenFlags};
-use posix_types::signal::SigSet;
+use posix_types::poll::FDSet;
+use posix_types::signal::{SigSet, Signal};
 use posix_types::stat::Stat;
 use posix_types::stat::{StatX, TimeSpec};
 use posix_types::syscall_no::*;
@@ -73,7 +77,20 @@ fn dentry_from(
 fn read(fd: FD, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
     let mut buffer = UserBuffer::new(buffer, bufsize)?;
 
-    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.read(&mut buffer))
+    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.read(&mut buffer, None))
+}
+
+#[eonix_macros::define_syscall(SYS_PREAD64)]
+fn pread64(fd: FD, buffer: *mut u8, bufsize: usize, offset: usize) -> KResult<usize> {
+    let mut buffer = UserBuffer::new(buffer, bufsize)?;
+
+    Task::block_on(
+        thread
+            .files
+            .get(fd)
+            .ok_or(EBADF)?
+            .read(&mut buffer, Some(offset)),
+    )
 }
 
 #[eonix_macros::define_syscall(SYS_WRITE)]
@@ -81,7 +98,21 @@ fn write(fd: FD, buffer: *const u8, count: usize) -> KResult<usize> {
     let buffer = CheckedUserPointer::new(buffer, count)?;
     let mut stream = buffer.into_stream();
 
-    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.write(&mut stream))
+    Task::block_on(thread.files.get(fd).ok_or(EBADF)?.write(&mut stream, None))
+}
+
+#[eonix_macros::define_syscall(SYS_PWRITE64)]
+fn pwrite64(fd: FD, buffer: *const u8, count: usize, offset: usize) -> KResult<usize> {
+    let buffer = CheckedUserPointer::new(buffer, count)?;
+    let mut stream = buffer.into_stream();
+
+    Task::block_on(
+        thread
+            .files
+            .get(fd)
+            .ok_or(EBADF)?
+            .write(&mut stream, Some(offset)),
+    )
 }
 
 #[eonix_macros::define_syscall(SYS_OPENAT)]
@@ -229,6 +260,12 @@ fn mkdir(pathname: *const u8, mode: u32) -> KResult<()> {
     sys_mkdirat(thread, FD::AT_FDCWD, pathname, mode)
 }
 
+#[eonix_macros::define_syscall(SYS_FTRUNCATE64)]
+fn truncate64(fd: FD, length: usize) -> KResult<()> {
+    let file = thread.files.get(fd).ok_or(EBADF)?;
+    file.as_path().ok_or(EBADF)?.truncate(length)
+}
+
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_TRUNCATE)]
 fn truncate(pathname: *const u8, length: usize) -> KResult<()> {
@@ -353,7 +390,7 @@ fn readv(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
     let mut tot = 0usize;
     for mut buffer in iov_buffers.into_iter() {
         // TODO!!!: `readv`
-        let nread = Task::block_on(file.read(&mut buffer))?;
+        let nread = Task::block_on(file.read(&mut buffer, None))?;
         tot += nread;
 
         if nread != buffer.total() {
@@ -389,7 +426,7 @@ fn writev(fd: FD, iov_user: *const IoVec, iovcnt: u32) -> KResult<usize> {
 
     let mut tot = 0usize;
     for mut stream in iov_streams.into_iter() {
-        let nread = Task::block_on(file.write(&mut stream))?;
+        let nread = Task::block_on(file.write(&mut stream, None))?;
         tot += nread;
 
         if nread == 0 || !stream.is_drained() {
@@ -495,6 +532,38 @@ fn ppoll(
     do_poll(thread, fds, nfds, 0)
 }
 
+#[eonix_macros::define_syscall(SYS_PSELECT6)]
+fn pselect6(
+    nfds: u32,
+    _readfds: *mut FDSet,
+    _writefds: *mut FDSet,
+    _exceptfds: *mut FDSet,
+    timeout: *mut TimeSpec,
+    _sigmask: *const (),
+) -> KResult<usize> {
+    // According to [pselect6(2)](https://linux.die.net/man/2/pselect6):
+    // Some code calls select() with all three sets empty, nfds zero, and
+    // a non-NULL timeout as a fairly portable way to sleep with subsecond precision.
+    if nfds != 0 {
+        thread.raise(Signal::SIGSYS);
+        return Err(ENOSYS);
+    }
+
+    let timeout = UserPointerMut::new(timeout)?;
+    
+    // Read here to check for invalid pointers.
+    let _timeout_value = timeout.read()?;
+
+    Task::block_on(sleep(Duration::from_millis(10)));
+
+    timeout.write(TimeSpec {
+        tv_sec: 0,
+        tv_nsec: 0,
+    })?;
+
+    Ok(0)
+}
+
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_POLL)]
 fn poll(fds: *mut UserPollFd, nfds: u32, timeout: u32) -> KResult<u32> {
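The pselect6 comment above refers to a userspace idiom: calling select/pselect with all three fd sets empty, nfds zero, and a non-NULL timeout as a portable way to sleep with subsecond precision. A small illustration of that caller side, assuming a 64-bit Linux/glibc target and the libc crate (this is not part of the kernel code):

fn subsecond_sleep(nanos: i64) {
    // All fd sets empty, nfds == 0: pselect degenerates into a high-resolution sleep.
    let ts = libc::timespec { tv_sec: 0, tv_nsec: nanos };
    let ret = unsafe {
        libc::pselect(
            0,
            std::ptr::null_mut(), // readfds
            std::ptr::null_mut(), // writefds
            std::ptr::null_mut(), // exceptfds
            &ts,
            std::ptr::null(),     // sigmask
        )
    };
    assert_eq!(ret, 0);
}

fn main() {
    subsecond_sleep(10_000_000); // ~10 ms, matching the kernel's placeholder sleep
}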

+ 140 - 45
src/kernel/syscall/mm.rs

@@ -1,6 +1,9 @@
 use super::FromSyscallArg;
-use crate::kernel::constants::{EINVAL, ENOMEM};
+use crate::fs::shm::{gen_shm_id, ShmFlags, IPC_PRIVATE, SHM_MANAGER};
+use crate::kernel::constants::{EBADF, EEXIST, EINVAL, ENOENT, ENOMEM};
+use crate::kernel::mem::FileMapping;
 use crate::kernel::task::Thread;
+use crate::kernel::vfs::filearray::FD;
 use crate::{
     kernel::{
         constants::{UserMmapFlags, UserMmapProtocol},
@@ -28,6 +31,7 @@ impl FromSyscallArg for UserMmapFlags {
 
 /// Check whether we are doing an implemented function.
 /// If `condition` is false, return `Err(err)`.
+#[allow(unused)]
 fn check_impl(condition: bool, err: u32) -> KResult<()> {
     if !condition {
         Err(err)
@@ -42,60 +46,58 @@ fn do_mmap2(
     len: usize,
     prot: UserMmapProtocol,
     flags: UserMmapFlags,
-    fd: u32,
+    fd: FD,
     pgoffset: usize,
 ) -> KResult<usize> {
     let addr = VAddr::from(addr);
-    if !addr.is_page_aligned() || len == 0 {
+    if !addr.is_page_aligned() || pgoffset % PAGE_SIZE != 0 || len == 0 {
         return Err(EINVAL);
     }
 
     let len = len.align_up(PAGE_SIZE);
-    check_impl(flags.contains(UserMmapFlags::MAP_ANONYMOUS), ENOMEM)?;
-    check_impl(flags.contains(UserMmapFlags::MAP_PRIVATE), EINVAL)?;
-    if fd != u32::MAX || pgoffset != 0 {
-        return Err(EINVAL);
-    }
-
     let mm_list = &thread.process.mm_list;
+    let is_shared = flags.contains(UserMmapFlags::MAP_SHARED);
 
-    // TODO!!!: If we are doing mmap's in 32-bit mode, we should check whether
-    //          `addr` is above user reachable memory.
-    let addr = if flags.contains(UserMmapFlags::MAP_FIXED) {
-        if prot.is_empty() {
-            Task::block_on(mm_list.protect(
-                addr,
-                len,
-                Permission {
-                    read: prot.contains(UserMmapProtocol::PROT_READ),
-                    write: prot.contains(UserMmapProtocol::PROT_WRITE),
-                    execute: prot.contains(UserMmapProtocol::PROT_EXEC),
-                },
-            ))
-            .map(|_| addr)
+    let mapping = if flags.contains(UserMmapFlags::MAP_ANONYMOUS) {
+        if pgoffset != 0 {
+            return Err(EINVAL);
+        }
+
+        if !is_shared {
+            Mapping::Anonymous
         } else {
-            mm_list.mmap_fixed(
-                addr,
+            // The mode is unimportant here, since we are checking prot in mm_area.
+            let shared_area = Task::block_on(SHM_MANAGER.lock()).create_shared_area(
                 len,
-                Mapping::Anonymous,
-                Permission {
-                    read: prot.contains(UserMmapProtocol::PROT_READ),
-                    write: prot.contains(UserMmapProtocol::PROT_WRITE),
-                    execute: prot.contains(UserMmapProtocol::PROT_EXEC),
-                },
-            )
+                thread.process.pid,
+                0o777,
+            );
+            Mapping::File(FileMapping::new(shared_area.area.clone(), 0, len))
         }
     } else {
-        mm_list.mmap_hint(
-            addr,
-            len,
-            Mapping::Anonymous,
-            Permission {
-                read: prot.contains(UserMmapProtocol::PROT_READ),
-                write: prot.contains(UserMmapProtocol::PROT_WRITE),
-                execute: prot.contains(UserMmapProtocol::PROT_EXEC),
-            },
-        )
+        let file = thread
+            .files
+            .get(fd)
+            .ok_or(EBADF)?
+            .get_inode()?
+            .ok_or(EBADF)?;
+
+        Mapping::File(FileMapping::new(file, pgoffset, len))
+    };
+
+    let permission = Permission {
+        read: prot.contains(UserMmapProtocol::PROT_READ),
+        write: prot.contains(UserMmapProtocol::PROT_WRITE),
+        execute: prot.contains(UserMmapProtocol::PROT_EXEC),
+    };
+
+    // TODO!!!: If we are doing mmap's in 32-bit mode, we should check whether
+    //          `addr` is above user reachable memory.
+    let addr = if flags.contains(UserMmapFlags::MAP_FIXED) {
+        Task::block_on(mm_list.unmap(addr, len));
+        mm_list.mmap_fixed(addr, len, mapping, permission, is_shared)
+    } else {
+        mm_list.mmap_hint(addr, len, mapping, permission, is_shared)
     };
 
     addr.map(|addr| addr.addr())
@@ -108,10 +110,10 @@ fn mmap(
     len: usize,
     prot: UserMmapProtocol,
     flags: UserMmapFlags,
-    fd: u32,
+    fd: FD,
     offset: usize,
 ) -> KResult<usize> {
-    do_mmap2(thread, addr, len, prot, flags, fd, offset / PAGE_SIZE)
+    do_mmap2(thread, addr, len, prot, flags, fd, offset)
 }
 
 #[cfg(target_arch = "x86_64")]
@@ -121,7 +123,7 @@ fn mmap2(
     len: usize,
     prot: UserMmapProtocol,
     flags: UserMmapFlags,
-    fd: u32,
+    fd: FD,
     pgoffset: usize,
 ) -> KResult<usize> {
     do_mmap2(thread, addr, len, prot, flags, fd, pgoffset)
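
The rewritten do_mmap2 above picks the mapping kind from the flags: MAP_ANONYMOUS with MAP_PRIVATE stays Mapping::Anonymous, MAP_ANONYMOUS with MAP_SHARED is backed by a shared area from SHM_MANAGER, and any other request becomes a FileMapping over the fd's inode, with the offset now taken in bytes and required to be page aligned. A hedged userspace sketch of the three call shapes, assuming the libc crate (illustrative only, not part of this commit):

// The three mmap shapes do_mmap2 distinguishes. Assumes the `libc` crate;
// return values and error handling are omitted for brevity.
use std::ptr;

unsafe fn mmap_examples(fd: libc::c_int, len: usize) {
    // 1. Private anonymous mapping -> Mapping::Anonymous.
    libc::mmap(
        ptr::null_mut(), len,
        libc::PROT_READ | libc::PROT_WRITE,
        libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
        -1, 0,
    );

    // 2. Shared anonymous mapping -> a shared area created through
    //    SHM_MANAGER, wrapped in a Mapping::File.
    libc::mmap(
        ptr::null_mut(), len,
        libc::PROT_READ | libc::PROT_WRITE,
        libc::MAP_SHARED | libc::MAP_ANONYMOUS,
        -1, 0,
    );

    // 3. File-backed mapping -> Mapping::File over the fd's inode;
    //    the byte offset must be a multiple of the page size.
    libc::mmap(
        ptr::null_mut(), len,
        libc::PROT_READ,
        libc::MAP_PRIVATE,
        fd, 0,
    );
}
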
@@ -169,6 +171,99 @@ fn mprotect(addr: usize, len: usize, prot: UserMmapProtocol) -> KResult<()> {
     ))
 }
 
+#[eonix_macros::define_syscall(SYS_SHMGET)]
+fn shmget(key: usize, size: usize, shmflg: u32) -> KResult<u32> {
+    let size = size.align_up(PAGE_SIZE);
+
+    let mut shm_manager = Task::block_on(SHM_MANAGER.lock());
+    let shmid = gen_shm_id(key)?;
+
+    let mode = shmflg & 0o777;
+    let shmflg = ShmFlags::from_bits_truncate(shmflg);
+
+    if key == IPC_PRIVATE {
+        let new_shm = shm_manager.create_shared_area(size, thread.process.pid, mode);
+        shm_manager.insert(shmid, new_shm);
+        return Ok(shmid);
+    }
+
+    if shm_manager.get(shmid).is_some() {
+        if shmflg.contains(ShmFlags::IPC_CREAT | ShmFlags::IPC_EXCL) {
+            return Err(EEXIST);
+        }
+
+        return Ok(shmid);
+    }
+
+    if shmflg.contains(ShmFlags::IPC_CREAT) {
+        let new_shm = shm_manager.create_shared_area(size, thread.process.pid, mode);
+        shm_manager.insert(shmid, new_shm);
+        return Ok(shmid);
+    }
+
+    Err(ENOENT)
+}
+
+#[eonix_macros::define_syscall(SYS_SHMAT)]
+fn shmat(shmid: u32, addr: usize, shmflg: u32) -> KResult<usize> {
+    let mm_list = &thread.process.mm_list;
+    let shm_manager = Task::block_on(SHM_MANAGER.lock());
+    let shm_area = shm_manager.get(shmid).ok_or(EINVAL)?;
+
+    let _mode = shmflg & 0o777;
+    let shmflg = ShmFlags::from_bits_truncate(shmflg);
+
+    let mut permission = Permission {
+        read: true,
+        write: true,
+        execute: false,
+    };
+
+    if shmflg.contains(ShmFlags::SHM_EXEC) {
+        permission.execute = true;
+    }
+    if shmflg.contains(ShmFlags::SHM_RDONLY) {
+        permission.write = false;
+    }
+
+    let size = shm_area.shmid_ds.shm_segsz;
+
+    let mapping = Mapping::File(FileMapping {
+        file: shm_area.area.clone(),
+        offset: 0,
+        length: size,
+    });
+
+    let addr = if addr != 0 {
+        if addr % PAGE_SIZE != 0 && !shmflg.contains(ShmFlags::SHM_RND) {
+            return Err(EINVAL);
+        }
+        let addr = VAddr::from(addr.align_down(PAGE_SIZE));
+        mm_list.mmap_fixed(addr, size, mapping, permission, true)
+    } else {
+        mm_list.mmap_hint(VAddr::NULL, size, mapping, permission, true)
+    }?;
+
+    thread.process.shm_areas.lock().insert(addr, size);
+
+    Ok(addr.addr())
+}
+
+#[eonix_macros::define_syscall(SYS_SHMDT)]
+fn shmdt(addr: usize) -> KResult<usize> {
+    let addr = VAddr::from(addr);
+    let mut shm_areas = thread.process.shm_areas.lock();
+    let size = *shm_areas.get(&addr).ok_or(EINVAL)?;
+    shm_areas.remove(&addr);
+    drop(shm_areas);
+    Task::block_on(thread.process.mm_list.unmap(addr, size)).map(|_| 0)
+}
+
+#[eonix_macros::define_syscall(SYS_SHMCTL)]
+fn shmctl(_shmid: u32, _op: i32, _shmid_ds: usize) -> KResult<usize> {
+    Ok(0)
+}
+
 #[eonix_macros::define_syscall(SYS_MEMBARRIER)]
 fn membarrier(_cmd: usize, _flags: usize) -> KResult<()> {
     Ok(())

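The shmget/shmat/shmdt syscalls above give the usual System V shared memory round trip: shmget creates or looks up a segment, shmat maps it through the same FileMapping machinery as mmap and records the address and size in the process's shm_areas map, and shmdt uses that record to unmap it; shmctl is still a stub that accepts any operation. A hedged userspace sketch, assuming the libc crate (illustrative only, not part of this commit):

// System V shared memory round trip. Assumes the `libc` crate; error
// handling is omitted for brevity.
use std::ptr;

unsafe fn shm_round_trip() {
    // IPC_PRIVATE always creates a fresh segment; the size is rounded up
    // to a whole number of pages by the kernel.
    let id = libc::shmget(libc::IPC_PRIVATE, 4096, libc::IPC_CREAT | 0o600);

    // A NULL address lets the kernel choose where to map the segment
    // (the mmap_hint path); SHM_RDONLY would drop the write permission.
    let addr = libc::shmat(id, ptr::null(), 0);

    // shmdt looks the mapping size up by address and unmaps it.
    libc::shmdt(addr);
}
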
+ 44 - 1
src/kernel/syscall/procops.rs

@@ -1,6 +1,8 @@
 use super::SyscallNoReturn;
 use crate::io::Buffer;
-use crate::kernel::constants::{EINVAL, ENOENT, ENOTDIR, ERANGE, ESRCH};
+use crate::kernel::constants::{
+    CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_REALTIME_COARSE, EINVAL, ENOENT, ENOTDIR, ERANGE, ESRCH,
+};
 use crate::kernel::constants::{
     ENOSYS, PR_GET_NAME, PR_SET_NAME, RLIMIT_STACK, SIG_BLOCK, SIG_SETMASK, SIG_UNBLOCK,
 };
@@ -66,6 +68,37 @@ fn nanosleep(req: *const (u32, u32), rem: *mut (u32, u32)) -> KResult<usize> {
     Ok(0)
 }
 
+#[eonix_macros::define_syscall(SYS_CLOCK_NANOSLEEP)]
+fn clock_nanosleep(
+    clock_id: u32,
+    _flags: u32,
+    req: *const (u32, u32),
+    rem: *mut (u32, u32),
+) -> KResult<usize> {
+    if clock_id != CLOCK_REALTIME
+        && clock_id != CLOCK_REALTIME_COARSE
+        && clock_id != CLOCK_MONOTONIC
+    {
+        unimplemented!("Unsupported clock_id: {}", clock_id);
+    }
+
+    let req = UserPointer::new(req)?.read()?;
+    let rem = if rem.is_null() {
+        None
+    } else {
+        Some(UserPointerMut::new(rem)?)
+    };
+
+    let duration = Duration::from_secs(req.0 as u64) + Duration::from_nanos(req.1 as u64);
+    Task::block_on(sleep(duration));
+
+    if let Some(rem) = rem {
+        rem.write((0, 0))?;
+    }
+
+    Ok(0)
+}
+
 #[eonix_macros::define_syscall(SYS_UMASK)]
 fn umask(mask: u32) -> KResult<u32> {
     let mut umask = thread.fs_context.umask.lock();
@@ -401,6 +434,16 @@ fn getgid() -> KResult<u32> {
     Ok(0)
 }
 
+#[eonix_macros::define_syscall(SYS_SYNC)]
+fn sync() -> KResult<()> {
+    Ok(())
+}
+
+#[eonix_macros::define_syscall(SYS_FSYNC)]
+fn fsync() -> KResult<()> {
+    Ok(())
+}
+
 #[cfg(target_arch = "x86_64")]
 #[eonix_macros::define_syscall(SYS_GETGID32)]
 fn getgid32() -> KResult<u32> {

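clock_nanosleep above accepts CLOCK_REALTIME, CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC, sleeps for the requested duration, and reports zero remaining time on return; the flags argument (for example TIMER_ABSTIME) is not interpreted yet. A hedged userspace sketch of a relative sleep that this stub satisfies, assuming the libc crate (illustrative only, not part of this commit):

// Relative sleep on CLOCK_MONOTONIC through clock_nanosleep. Assumes the
// `libc` crate; error handling is omitted for brevity.
fn sleep_relative(secs: libc::time_t, nanos: libc::c_long) {
    let req = libc::timespec { tv_sec: secs, tv_nsec: nanos };
    let mut rem = libc::timespec { tv_sec: 0, tv_nsec: 0 };
    unsafe {
        // flags == 0 requests a sleep relative to now; on return `rem`
        // holds the unslept time (always zero with the stub above).
        libc::clock_nanosleep(libc::CLOCK_MONOTONIC, 0, &req, &mut rem);
    }
}
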
+ 5 - 2
src/kernel/syscall/sysinfo.rs

@@ -1,6 +1,6 @@
 use crate::{
     kernel::{
-        constants::{CLOCK_MONOTONIC, CLOCK_REALTIME, EINVAL},
+        constants::{CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_REALTIME_COARSE, EINVAL},
         task::Thread,
         timer::{Instant, Ticks},
         user::UserPointerMut,
@@ -78,7 +78,10 @@ fn gettimeofday(timeval: *mut TimeVal, timezone: *mut ()) -> KResult<()> {
 }
 
 fn do_clock_gettime64(_thread: &Thread, clock_id: u32, timespec: *mut TimeSpec) -> KResult<()> {
-    if clock_id != CLOCK_REALTIME && clock_id != CLOCK_MONOTONIC {
+    if clock_id != CLOCK_REALTIME
+        && clock_id != CLOCK_REALTIME_COARSE
+        && clock_id != CLOCK_MONOTONIC
+    {
         unimplemented!("Unsupported clock_id: {}", clock_id);
     }
 

+ 4 - 1
src/kernel/task/loader/elf.rs

@@ -274,6 +274,7 @@ impl<E: ElfArch> Elf<E> {
                 write: true,
                 execute: false,
             },
+            false,
         )?;
 
         StackInitializer::new(&mm_list, E::STACK_BASE_ADDR, args, envs, aux_vec).init()
@@ -356,11 +357,12 @@ impl<E: ElfArch> Elf<E> {
                 vmap_start,
                 file_len,
                 Mapping::File(FileMapping::new(
-                    self.file.clone(),
+                    self.file.get_inode()?,
                     file_offset,
                     real_file_length,
                 )),
                 permission,
+                false,
             )?;
         }
 
@@ -370,6 +372,7 @@ impl<E: ElfArch> Elf<E> {
                 vmem_len - file_len,
                 Mapping::Anonymous,
                 permission,
+                false,
             )?;
         }
 

+ 4 - 0
src/kernel/task/process.rs

@@ -15,6 +15,7 @@ use alloc::{
     sync::{Arc, Weak},
 };
 use core::sync::atomic::{AtomicU32, Ordering};
+use eonix_mm::address::VAddr;
 use eonix_runtime::task::Task;
 use eonix_sync::{
     AsProof as _, AsProofMut as _, Locked, Proof, ProofMut, RwLockReadGuard, SpinGuard,
@@ -50,6 +51,8 @@ pub struct Process {
 
     pub exit_signal: Option<Signal>,
 
+    pub shm_areas: Spin<BTreeMap<VAddr, usize>>,
+
     /// Parent process
     ///
     /// `parent` must be valid during the whole life of the process.
@@ -258,6 +261,7 @@ impl ProcessBuilder {
             pid: self.pid.expect("should set pid before building"),
             wait_list: WaitList::new(),
             mm_list,
+            shm_areas: Spin::new(BTreeMap::new()),
             exit_signal: self.exit_signal,
             parent: RCUPointer::empty(),
             pgroup: RCUPointer::empty(),

+ 1 - 1
src/kernel/timer.rs

@@ -18,7 +18,7 @@ static SLEEPERS_LIST: Spin<BinaryHeap<Reverse<Sleepers>>> = Spin::new(BinaryHeap
 #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Ticks(usize);
 
-#[derive(Default, Clone, Copy)]
+#[derive(Debug, Default, Clone, Copy)]
 pub struct Instant {
     secs_since_epoch: u64,
     nsecs_within: u32,

+ 39 - 13
src/kernel/vfs/file.rs

@@ -11,6 +11,7 @@ use crate::{
         task::Thread,
         terminal::{Terminal, TerminalIORequest},
         user::{UserPointer, UserPointerMut},
+        vfs::inode::Inode,
         CharDevice,
     },
     prelude::*,
@@ -86,6 +87,15 @@ pub struct File {
     file_type: FileType,
 }
 
+impl File {
+    pub fn get_inode(&self) -> KResult<Option<Arc<dyn Inode>>> {
+        match &self.file_type {
+            FileType::Inode(inode_file) => Ok(Some(inode_file.dentry.get_inode()?)),
+            _ => Ok(None),
+        }
+    }
+}
+
 pub enum SeekOption {
     Set(usize),
     Current(isize),
@@ -324,7 +334,7 @@ impl InodeFile {
         Ok(new_cursor)
     }
 
-    fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
+    fn write(&self, stream: &mut dyn Stream, offset: Option<usize>) -> KResult<usize> {
         if !self.write {
             return Err(EBADF);
         }
@@ -336,23 +346,35 @@ impl InodeFile {
 
             Ok(nwrote)
         } else {
-            let nwrote = self.dentry.write(stream, WriteOffset::Position(*cursor))?;
+            let nwrote = if let Some(offset) = offset {
+                self.dentry.write(stream, WriteOffset::Position(offset))?
+            } else {
+                let nwrote = self.dentry.write(stream, WriteOffset::Position(*cursor))?;
+                *cursor += nwrote;
+                nwrote
+            };
 
-            *cursor += nwrote;
             Ok(nwrote)
         }
     }
 
-    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+    fn read(&self, buffer: &mut dyn Buffer, offset: Option<usize>) -> KResult<usize> {
         if !self.read {
             return Err(EBADF);
         }
 
-        let mut cursor = Task::block_on(self.cursor.lock());
+        let nread = if let Some(offset) = offset {
+            let nread = self.dentry.read(buffer, offset)?;
+            nread
+        } else {
+            let mut cursor = Task::block_on(self.cursor.lock());
 
-        let nread = self.dentry.read(buffer, *cursor)?;
+            let nread = self.dentry.read(buffer, *cursor)?;
+
+            *cursor += nread;
+            nread
+        };
 
-        *cursor += nread;
         Ok(nread)
     }
 
@@ -456,9 +478,9 @@ impl TerminalFile {
 }
 
 impl FileType {
-    pub async fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+    pub async fn read(&self, buffer: &mut dyn Buffer, offset: Option<usize>) -> KResult<usize> {
         match self {
-            FileType::Inode(inode) => inode.read(buffer),
+            FileType::Inode(inode) => inode.read(buffer, offset),
             FileType::PipeRead(pipe) => pipe.pipe.read(buffer).await,
             FileType::TTY(tty) => tty.read(buffer).await,
             FileType::CharDev(device) => device.read(buffer),
@@ -481,9 +503,9 @@ impl FileType {
     //     }
     // }
 
-    pub async fn write(&self, stream: &mut dyn Stream) -> KResult<usize> {
+    pub async fn write(&self, stream: &mut dyn Stream, offset: Option<usize>) -> KResult<usize> {
         match self {
-            FileType::Inode(inode) => inode.write(stream),
+            FileType::Inode(inode) => inode.write(stream, offset),
             FileType::PipeWrite(pipe) => pipe.pipe.write(stream).await,
             FileType::TTY(tty) => tty.write(stream),
             FileType::CharDev(device) => device.write(stream),
@@ -527,12 +549,16 @@ impl FileType {
             if Thread::current().signal_list.has_pending_signal() {
                 return if cur == 0 { Err(EINTR) } else { Ok(cur) };
             }
-            let nread = self.read(&mut ByteBuffer::new(&mut buffer[..len])).await?;
+            let nread = self
+                .read(&mut ByteBuffer::new(&mut buffer[..len]), None)
+                .await?;
             if nread == 0 {
                 break;
             }
 
-            let nwrote = dest_file.write(&mut buffer[..nread].into_stream()).await?;
+            let nwrote = dest_file
+                .write(&mut buffer[..nread].into_stream(), None)
+                .await?;
             nsent += nwrote;
 
             if nwrote != len {

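File reads and writes now take an Option<usize> offset: None goes through the per-file cursor and advances it, while Some(offset) is a positional access that leaves the cursor untouched, which is the shape pread/pwrite-style callers need. Only the inode-backed file type forwards the offset; pipes, TTYs and character devices ignore it. A hedged sketch of a positional read built on this API; the do_pread helper is hypothetical, only the FileType::read signature comes from the change above:

// Hypothetical positional-read helper on top of the new offset-aware API.
async fn do_pread(file: &FileType, buf: &mut dyn Buffer, offset: usize) -> KResult<usize> {
    // Some(offset) reads at the given position without moving the cursor;
    // None would read at the cursor and advance it.
    file.read(buf, Some(offset)).await
}
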
+ 18 - 4
src/kernel/vfs/inode.rs

@@ -4,6 +4,7 @@ use crate::kernel::constants::{
     EINVAL, EISDIR, ENOTDIR, EPERM, STATX_ATIME, STATX_BLOCKS, STATX_CTIME, STATX_GID, STATX_INO,
     STATX_MODE, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_TYPE, STATX_UID, S_IFDIR, S_IFMT,
 };
+use crate::kernel::mem::PageCache;
 use crate::kernel::timer::Instant;
 use crate::{io::Buffer, prelude::*};
 use alloc::sync::{Arc, Weak};
@@ -34,6 +35,7 @@ pub type AtomicGid = AtomicU32;
 pub type Mode = u32;
 pub type AtomicMode = AtomicU32;
 
+#[derive(Debug)]
 pub struct InodeData {
     pub ino: Ino,
     pub size: AtomicISize,
@@ -53,13 +55,13 @@ pub struct InodeData {
 }
 
 impl InodeData {
-    pub const fn new(ino: Ino, vfs: Weak<dyn Vfs>) -> Self {
+    pub fn new(ino: Ino, vfs: Weak<dyn Vfs>) -> Self {
         Self {
             ino,
             vfs,
-            atime: Spin::new(Instant::default()),
-            ctime: Spin::new(Instant::default()),
-            mtime: Spin::new(Instant::default()),
+            atime: Spin::new(Instant::now()),
+            ctime: Spin::new(Instant::now()),
+            mtime: Spin::new(Instant::now()),
             rwsem: RwLock::new(()),
             size: AtomicU64::new(0),
             nlink: AtomicNlink::new(0),
@@ -126,10 +128,18 @@ pub trait Inode: Send + Sync + InodeInner + Any {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
+    fn read_direct(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        Err(if self.is_dir() { EISDIR } else { EINVAL })
+    }
+
     fn write(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
+    fn write_direct(&self, stream: &mut dyn Stream, offset: WriteOffset) -> KResult<usize> {
+        Err(if self.is_dir() { EISDIR } else { EINVAL })
+    }
+
     fn devid(&self) -> KResult<DevId> {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
@@ -162,6 +172,10 @@ pub trait Inode: Send + Sync + InodeInner + Any {
         Err(EPERM)
     }
 
+    fn page_cache(&self) -> Option<&PageCache> {
+        None
+    }
+
     fn statx(&self, stat: &mut StatX, mask: u32) -> KResult<()> {
         // Safety: ffi should have checked reference
         let vfs = self.vfs.upgrade().expect("Vfs is dropped");