Sfoglia il codice sorgente

feat: rewrite filearray with rust

greatbridf 1 mese fa
parent
commit
7930b71762
49 file modificati con 2422 aggiunte e 2103 eliminazioni
  1. 0 4
      CMakeLists.txt
  2. 1 0
      build.rs
  3. 2 0
      gblibc/include/errno.h
  4. 6 10
      include/kernel/mem/paging.hpp
  5. 3 5
      include/kernel/process.hpp
  6. 12 18
      include/kernel/syscall.hpp
  7. 2 0
      include/kernel/tty.hpp
  8. 31 44
      include/kernel/vfs.hpp
  9. 0 2
      include/kernel/vfs/dentry.hpp
  10. 0 106
      include/kernel/vfs/file.hpp
  11. 0 51
      include/kernel/vfs/filearr.hpp
  12. 0 7
      include/kernel/vfs/vfsfwd.hpp
  13. 5 3
      include/types/elf.hpp
  14. 0 1
      src/driver/ahci/mod.rs
  15. 7 13
      src/fs/fat32.rs
  16. 12 16
      src/fs/procfs.rs
  17. 38 34
      src/fs/tmpfs.rs
  18. 23 51
      src/io.rs
  19. 8 5
      src/kernel.ld
  20. 2 0
      src/kernel.rs
  21. 1 1
      src/kernel/interrupt.cpp
  22. 1 0
      src/kernel/mem/mm_list.cc
  23. 37 4
      src/kernel/mem/paging.cc
  24. 65 67
      src/kernel/process.cpp
  25. 34 190
      src/kernel/syscall.cpp
  26. 128 0
      src/kernel/syscall.rs
  27. 363 0
      src/kernel/syscall/file_rw.rs
  28. 0 401
      src/kernel/syscall/fileops.cc
  29. 0 22
      src/kernel/syscall/mount.cc
  30. 3 62
      src/kernel/syscall/procops.cc
  31. 171 0
      src/kernel/syscall/procops.rs
  32. 50 14
      src/kernel/tty.cpp
  33. 1 0
      src/kernel/user.rs
  34. 202 0
      src/kernel/user/dataflow.rs
  35. 0 243
      src/kernel/vfs.cpp
  36. 178 53
      src/kernel/vfs/dentry.rs
  37. 4 301
      src/kernel/vfs/ffi.rs
  38. 551 0
      src/kernel/vfs/file.rs
  39. 0 290
      src/kernel/vfs/filearr.cc
  40. 307 0
      src/kernel/vfs/filearray.rs
  41. 13 10
      src/kernel/vfs/inode.rs
  42. 58 11
      src/kernel/vfs/mod.rs
  43. 16 26
      src/kernel/vfs/mount.rs
  44. 4 0
      src/kinit.cpp
  45. 64 30
      src/lib.rs
  46. 3 1
      src/rcu.rs
  47. 1 1
      src/sync/condvar.rs
  48. 10 1
      src/sync/lock.rs
  49. 5 5
      src/types/elf.cpp

+ 0 - 4
CMakeLists.txt

@@ -49,7 +49,6 @@ set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
                         src/kernel/syscall.cpp
                         src/kernel/syscall/fileops.cc
                         src/kernel/syscall/infoops.cc
-                        src/kernel/syscall/mount.cc
                         src/kernel/syscall/procops.cc
                         src/kernel/mem/mm_list.cc
                         src/kernel/mem/paging.cc
@@ -64,7 +63,6 @@ set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
                         src/kernel/task/thread.cc
                         src/kernel/task/readyqueue.cc
                         src/kernel/user/thread_local.cc
-                        src/kernel/vfs/filearr.cc
                         src/kernel/signal.cpp
                         src/net/ethernet.cc
                         src/types/crc.cc
@@ -87,8 +85,6 @@ set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
                         include/kernel/utsname.hpp
                         include/kernel/vfs.hpp
                         include/kernel/vfs/dentry.hpp
-                        include/kernel/vfs/file.hpp
-                        include/kernel/vfs/filearr.hpp
                         include/kernel/vga.hpp
                         include/kernel/signal.hpp
                         include/kernel/task/forward.hpp

+ 1 - 0
build.rs

@@ -5,6 +5,7 @@ fn main() {
     let headers = [
         "include/kernel/process.hpp",
         "include/kernel/hw/pci.hpp",
+        "include/types/elf.hpp",
     ];
 
     let bindings = bindgen::Builder::default()

+ 2 - 0
gblibc/include/errno.h

@@ -30,7 +30,9 @@ extern int* __errno_location(void);
 #define ESPIPE 29
 #define EROFS 30
 #define EPIPE 32
+#define ERANGE 34
 #define ELOOP 40
+#define EOVERFLOW 75
 
 #ifdef __cplusplus
 }

+ 6 - 10
include/kernel/mem/paging.hpp

@@ -6,6 +6,7 @@
 
 #include <stdint.h>
 
+#include <kernel/interrupt.hpp>
 #include <kernel/mem/paging_asm.h>
 #include <kernel/mem/phys.hpp>
 
@@ -27,10 +28,8 @@ constexpr int idx_p1(uintptr_t vaddr) noexcept {
     return (vaddr >> 12) & 0x1ff;
 }
 
-constexpr std::tuple<int, int, int, int, int> idx_all(
-    uintptr_t vaddr) noexcept {
-    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr),
-            idx_p1(vaddr)};
+constexpr std::tuple<int, int, int, int, int> idx_all(uintptr_t vaddr) noexcept {
+    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr), idx_p1(vaddr)};
 }
 
 // page frame number
@@ -74,9 +73,7 @@ class PSE {
 
     constexpr pfn_t pfn() const noexcept { return *m_ptrbase & ~PA_MASK; }
 
-    constexpr psattr_t attributes() const noexcept {
-        return *m_ptrbase & PA_MASK;
-    }
+    constexpr psattr_t attributes() const noexcept { return *m_ptrbase & PA_MASK; }
 
     constexpr PSE operator[](std::size_t nth) const noexcept {
         return PSE{m_ptrbase.phys() + 8 * nth};
@@ -135,7 +132,7 @@ constexpr unsigned long PAGE_FAULT_PK = 0x00000020;
 constexpr unsigned long PAGE_FAULT_SS = 0x00000040;
 constexpr unsigned long PAGE_FAULT_SGX = 0x00008000;
 
-void handle_page_fault(unsigned long err);
+void handle_page_fault(interrupt_stack* int_stack);
 
 class vaddr_range {
     std::size_t n;
@@ -156,8 +153,7 @@ class vaddr_range {
     bool is_privilege;
 
    public:
-    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end,
-                         bool is_privilege = false);
+    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool is_privilege = false);
     explicit vaddr_range(std::nullptr_t);
 
     vaddr_range begin() const noexcept;

+ 3 - 5
include/kernel/process.hpp

@@ -7,6 +7,7 @@
 #include <utility>
 
 #include <assert.h>
+#include <errno.h>
 #include <fcntl.h>
 #include <stdint.h>
 #include <sys/types.h>
@@ -27,7 +28,6 @@
 #include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/filearr.hpp>
 
 class process;
 
@@ -57,9 +57,8 @@ class process {
     std::list<wait_obj> waitprocs;
 
     process_attr attr{};
-    fs::filearray files;
-    fs::dentry_pointer cwd{};
-    mode_t umask{0022};
+    fs::rust_file_array files;
+    fs::rust_fs_context fs_context;
 
     pid_t pid{};
     pid_t ppid{};
@@ -67,7 +66,6 @@ class process {
     pid_t sid{};
 
     kernel::tty::tty* control_tty{};
-    struct fs::fs_context fs_context;
     std::set<pid_t> children;
 
    public:

+ 12 - 18
include/kernel/syscall.hpp

@@ -33,7 +33,6 @@ void handle_syscall64(int no, interrupt_stack* data, mmx_registers* mmxregs);
 namespace syscall {
     // in fileops.cc
     ssize_t do_write(int fd, const char __user* buf, size_t n);
-    ssize_t do_read(int fd, char __user* buf, size_t n);
     int do_close(int fd);
     int do_dup(int old_fd);
     int do_dup2(int old_fd, int new_fd);
@@ -42,19 +41,17 @@ namespace syscall {
     ssize_t do_getdents64(int fd, char __user* buf, size_t cnt);
     int do_open(const char __user* path, int flags, mode_t mode);
     int do_symlink(const char __user* target, const char __user* linkpath);
-    int do_readlink(const char __user* pathname, char __user* buf,
-                    size_t buf_size);
+    int do_readlink(const char __user* pathname, char __user* buf, size_t buf_size);
     int do_ioctl(int fd, unsigned long request, uintptr_t arg3);
     ssize_t do_readv(int fd, const iovec* iov, int iovcnt);
     ssize_t do_writev(int fd, const iovec* iov, int iovcnt);
     off_t do_lseek(int fd, off_t offset, int whence);
-    uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags,
-                            int fd, off_t pgoffset);
+    uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags, int fd,
+                            off_t pgoffset);
     int do_munmap(uintptr_t addr, size_t len);
-    ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset,
-                        size_t count);
-    int do_statx(int dirfd, const char __user* path, int flags,
-                 unsigned int mask, statx __user* statxbuf);
+    ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset, size_t count);
+    int do_statx(int dirfd, const char __user* path, int flags, unsigned int mask,
+                 statx __user* statxbuf);
     int do_fcntl(int fd, int cmd, unsigned long arg);
     int do_poll(pollfd __user* fds, nfds_t nfds, int timeout);
     int do_mknod(const char __user* pathname, mode_t mode, dev_t dev);
@@ -88,10 +85,9 @@ namespace syscall {
     int do_kill(pid_t pid, int sig);
     int do_tkill(pid_t pid, int sig);
     int do_rt_sigprocmask(int how, const kernel::sigmask_type __user* set,
-                          kernel::sigmask_type __user* oldset,
-                          size_t sigsetsize);
-    int do_rt_sigaction(int signum, const sigaction __user* act,
-                        sigaction __user* oldact, size_t sigsetsize);
+                          kernel::sigmask_type __user* oldset, size_t sigsetsize);
+    int do_rt_sigaction(int signum, const sigaction __user* act, sigaction __user* oldact,
+                        size_t sigsetsize);
     int do_newuname(new_utsname __user* buf);
 
     struct execve_retval {
@@ -100,14 +96,12 @@ namespace syscall {
         int status;
     };
 
-    execve_retval do_execve(const std::string& exec,
-                            const std::vector<std::string>& args,
+    execve_retval do_execve(const std::string& exec, const std::vector<std::string>& args,
                             const std::vector<std::string>& envs);
 
     // in mount.cc
-    int do_mount(const char __user* source, const char __user* target,
-                 const char __user* fstype, unsigned long flags,
-                 const void __user* _fsdata);
+    int do_mount(const char __user* source, const char __user* target, const char __user* fstype,
+                 unsigned long flags, const void __user* _fsdata);
 
     // in infoops.cc
     int do_clock_gettime(clockid_t clk_id, timespec __user* tp);

+ 2 - 0
include/kernel/tty.hpp

@@ -45,6 +45,8 @@ class tty : public types::non_copyable {
     // TODO: formal poll support
     int poll();
 
+    int ioctl(int request, unsigned long arg3);
+
     constexpr void set_pgrp(pid_t pgid) { fg_pgroup = pgid; }
 
     constexpr pid_t get_pgrp(void) const { return fg_pgroup; }

+ 31 - 44
include/kernel/vfs.hpp

@@ -9,7 +9,6 @@
 
 #include <kernel/mem/paging.hpp>
 #include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/file.hpp>
 
 #define NODE_MAJOR(node) (((node) >> 8) & 0xFFU)
 #define NODE_MINOR(node) ((node) & 0xFFU)
@@ -31,60 +30,48 @@ struct chrdev_ops {
     chrdev_write write;
 };
 
-struct PACKED user_dirent {
-    ino_t d_ino;       // inode number
-    uint32_t d_off;    // ignored
-    uint16_t d_reclen; // length of this struct user_dirent
-    char d_name[1];    // file name with a padding zero
-    // uint8_t d_type; // file type, with offset of (d_reclen - 1)
-};
-
-struct PACKED user_dirent64 {
-    ino64_t d_ino;     // inode number
-    uint64_t d_off;    // implementation-defined field, ignored
-    uint16_t d_reclen; // length of this struct user_dirent
-    uint8_t d_type;    // file type, with offset of (d_reclen - 1)
-    char d_name[1];    // file name with a padding zero
-};
-
-struct fs_context {
-    dentry_pointer root;
-};
-
 int register_char_device(dev_t node, const chrdev_ops& ops);
 ssize_t char_device_read(dev_t node, char* buf, size_t buf_size, size_t n);
 ssize_t char_device_write(dev_t node, const char* buf, size_t n);
 
-extern "C" int fs_creat(struct dentry* at, mode_t mode);
-extern "C" int fs_mkdir(struct dentry* at, mode_t mode);
-extern "C" int fs_mknod(struct dentry* at, mode_t mode, dev_t sn);
-extern "C" int fs_unlink(struct dentry* at);
-extern "C" int fs_symlink(struct dentry* at, const char* target);
+class rust_file_array {
+   public:
+    struct handle;
 
-extern "C" int fs_statx(struct dentry* file, struct statx* stat, unsigned int mask);
-extern "C" int fs_readlink(struct dentry* file, char* buf, size_t buf_size);
-extern "C" int fs_truncate(struct dentry* file, size_t size);
-extern "C" size_t fs_read(struct dentry* file, char* buf, size_t buf_size, size_t offset, size_t n);
-extern "C" size_t fs_write(struct dentry* file, const char* buf, size_t offset, size_t n);
+   private:
+    struct handle* m_handle;
+
+   public:
+    rust_file_array(struct handle* handle);
+    rust_file_array(const rust_file_array&) = delete;
+    ~rust_file_array();
+
+    constexpr rust_file_array(rust_file_array&& other) noexcept
+        : m_handle(std::exchange(other.m_handle, nullptr)) {}
 
-using readdir_callback_fn = std::function<int(const char*, size_t, ino_t)>;
+    struct handle* get() const;
+    void drop();
+};
 
-extern "C" ssize_t fs_readdir(struct dentry* dir, size_t offset,
-                              const readdir_callback_fn* callback);
+class rust_fs_context {
+   public:
+    struct handle;
 
-extern "C" int fs_mount(dentry* mnt, const char* source, const char* mount_point,
-                        const char* fstype, unsigned long flags, const void* data);
+   private:
+    struct handle* m_handle;
 
-extern "C" mode_t r_dentry_get_mode(struct dentry* dentry);
-extern "C" size_t r_dentry_get_size(struct dentry* dentry);
-extern "C" bool r_dentry_is_directory(struct dentry* dentry);
-extern "C" bool r_dentry_is_invalid(struct dentry* dentry);
+   public:
+    rust_fs_context(struct handle* handle);
+    rust_fs_context(const rust_fs_context&) = delete;
+    ~rust_fs_context();
 
-extern "C" struct dentry* r_get_root_dentry();
+    constexpr rust_fs_context(rust_fs_context&& other) noexcept
+        : m_handle(std::exchange(other.m_handle, nullptr)) {}
 
-#define current_open(...) fs::open(current_process->fs_context, current_process->cwd, __VA_ARGS__)
+    struct handle* get() const;
+    void drop();
+};
 
-std::pair<dentry_pointer, int> open(const fs_context& context, const dentry_pointer& cwd,
-                                    types::string_view path, bool follow_symlinks = true);
+extern "C" size_t fs_read(struct dentry* file, char* buf, size_t buf_size, size_t offset, size_t n);
 
 } // namespace fs

+ 0 - 2
include/kernel/vfs/dentry.hpp

@@ -21,8 +21,6 @@ struct dentry_deleter {
 };
 
 using dentry_pointer = std::unique_ptr<struct dentry, dentry_deleter>;
-extern "C" int d_path(struct dentry* dentry, struct dentry* root,
-                      char* out_path, size_t buflen);
 dentry_pointer d_get(const dentry_pointer& dp);
 
 } // namespace fs

+ 0 - 106
include/kernel/vfs/file.hpp

@@ -1,106 +0,0 @@
-#pragma once
-
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/types.h>
-
-#include <types/buffer.hpp>
-#include <types/types.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-namespace fs {
-
-class pipe : public types::non_copyable {
-   private:
-    static constexpr size_t PIPE_SIZE = 4096;
-    static constexpr uint32_t READABLE = 1;
-    static constexpr uint32_t WRITABLE = 2;
-
-   private:
-    types::buffer buf;
-    uint32_t flags;
-    kernel::async::mutex mtx;
-
-    kernel::async::wait_list waitlist_r;
-    kernel::async::wait_list waitlist_w;
-
-   public:
-    pipe();
-
-    void close_read();
-    void close_write();
-
-    int write(const char* buf, size_t n);
-    int read(char* buf, size_t n);
-
-    constexpr bool is_readable() const { return flags & READABLE; }
-
-    constexpr bool is_writeable() const { return flags & WRITABLE; }
-};
-
-struct file {
-    struct file_flags {
-        uint32_t read : 1;
-        uint32_t write : 1;
-        uint32_t append : 1;
-    } flags{};
-
-    file(file_flags flags) : flags(flags) {}
-
-    virtual ~file() = default;
-
-    virtual ssize_t read(char* __user buf, size_t n) = 0;
-    virtual ssize_t do_write(const char* __user buf, size_t n) = 0;
-
-    virtual off_t seek(off_t n, int whence) {
-        return (void)n, (void)whence, -ESPIPE;
-    }
-
-    ssize_t write(const char* __user buf, size_t n) {
-        if (!flags.write)
-            return -EBADF;
-
-        if (flags.append) {
-            seek(0, SEEK_END);
-        }
-
-        return do_write(buf, n);
-    }
-
-    // regular files should override this method
-    virtual int getdents(char* __user buf, size_t cnt) {
-        return (void)buf, (void)cnt, -ENOTDIR;
-    }
-    virtual int getdents64(char* __user buf, size_t cnt) {
-        return (void)buf, (void)cnt, -ENOTDIR;
-    }
-};
-
-struct regular_file : public virtual file {
-    virtual ~regular_file() = default;
-    std::size_t cursor{};
-    dentry_pointer dentry;
-
-    regular_file(file_flags flags, size_t cursor, dentry_pointer dentry);
-
-    virtual ssize_t read(char* __user buf, size_t n) override;
-    virtual ssize_t do_write(const char* __user buf, size_t n) override;
-    virtual off_t seek(off_t n, int whence) override;
-    virtual int getdents(char* __user buf, size_t cnt) override;
-    virtual int getdents64(char* __user buf, size_t cnt) override;
-};
-
-struct fifo_file : public virtual file {
-    virtual ~fifo_file() override;
-    std::shared_ptr<pipe> ppipe;
-
-    fifo_file(file_flags flags, std::shared_ptr<fs::pipe> ppipe);
-
-    virtual ssize_t read(char* __user buf, size_t n) override;
-    virtual ssize_t do_write(const char* __user buf, size_t n) override;
-};
-
-} // namespace fs

+ 0 - 51
include/kernel/vfs/filearr.hpp

@@ -1,51 +0,0 @@
-#pragma once
-
-#include "dentry.hpp"
-#include "file.hpp"
-
-#include <memory>
-
-#include <types/path.hpp>
-
-#include <kernel/vfs.hpp>
-
-namespace fs {
-
-class filearray {
-   private:
-    struct impl;
-    std::shared_ptr<impl> pimpl;
-    filearray(std::shared_ptr<impl>);
-
-   public:
-    filearray(const fs_context* ctx);
-    filearray(filearray&& other) = default;
-
-    filearray copy() const;
-    filearray share() const;
-
-    // dup old_fd to some random fd
-    int dup(int old_fd);
-
-    // dup old_fd to new_fd, close new_fd if it is already open
-    int dup(int old_fd, int new_fd, int flags);
-
-    // dup old_fd to the first available fd starting from min_fd
-    int dupfd(int fd, int min_fd, int flags);
-
-    fs::file* operator[](int i) const;
-    int set_flags(int fd, int flags);
-
-    int pipe(int (&pipefd)[2]);
-    int open(const dentry_pointer& cwd, types::string_view filepath, int flags,
-             mode_t mode);
-    int open(types::string_view filepath, int flags, mode_t mode);
-
-    int close(int fd);
-
-    // any call to member methods will be invalid after clear()
-    void clear();
-    void onexec();
-};
-
-} // namespace fs

+ 0 - 7
include/kernel/vfs/vfsfwd.hpp

@@ -6,13 +6,6 @@ namespace fs {
 struct dcache;
 struct dentry;
 
-// in file.hpp
-struct file;
-struct regular_file;
-struct fifo_file;
-
-class pipe;
-
 // in filearray.hpp
 class file_array;
 

+ 5 - 3
include/types/elf.hpp

@@ -148,9 +148,11 @@ struct PACKED elf32_section_header_entry {
 };
 
 struct elf32_load_data {
-    fs::dentry_pointer exec_dent;
-    const std::vector<std::string>& argv;
-    const std::vector<std::string>& envp;
+    struct dentry* exec_dent; // Owned
+    const char* const* argv;
+    size_t argv_count;
+    const char* const* envp;
+    size_t envp_count;
     uintptr_t ip;
     uintptr_t sp;
 };

+ 0 - 1
src/driver/ahci/mod.rs

@@ -3,7 +3,6 @@ use crate::{
     kernel::{
         block::{make_device, BlockDevice},
         interrupt::register_irq_handler,
-        mem::paging::PageBuffer,
     },
     prelude::*,
 };

+ 7 - 13
src/fs/fat32.rs

@@ -1,4 +1,4 @@
-use core::sync::atomic::Ordering;
+use core::{ops::ControlFlow, sync::atomic::Ordering};
 
 use alloc::{
     collections::btree_map::BTreeMap,
@@ -19,7 +19,7 @@ use crate::{
             inode::{define_struct_inode, Ino, Inode, InodeData},
             mount::{register_filesystem, Mount, MountCreator},
             vfs::Vfs,
-            DevId, ReadDirCallback,
+            DevId,
         },
     },
     prelude::*,
@@ -447,10 +447,10 @@ impl Inode for DirInode {
         }
     }
 
-    fn readdir<'cb, 'r: 'cb>(
-        &'r self,
+    fn do_readdir(
+        &self,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
         let vfs = self.vfs.upgrade().ok_or(EIO)?;
         let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
@@ -474,7 +474,7 @@ impl Inode for DirInode {
 
             vfs.get_or_alloc_inode(ino, entry.is_directory(), entry.size);
 
-            if callback(name.as_ref(), ino).is_err() {
+            if callback(name.as_ref(), ino)?.is_break() {
                 break;
             }
 
@@ -488,13 +488,7 @@ impl Inode for DirInode {
 struct FatMountCreator;
 
 impl MountCreator for FatMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        _flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, _flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let (fatfs, root_inode) = FatFs::create(make_device(8, 1))?;
 
         Mount::new(mp, fatfs, root_inode)

+ 12 - 16
src/fs/procfs.rs

@@ -3,7 +3,8 @@ use alloc::{
     sync::{Arc, Weak},
 };
 use bindings::{EACCES, ENOTDIR, S_IFDIR, S_IFREG};
-use core::sync::atomic::Ordering;
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use itertools::Itertools;
 use lazy_static::lazy_static;
 
 use crate::{
@@ -15,7 +16,7 @@ use crate::{
             inode::{define_struct_inode, AtomicIno, Ino, Inode, InodeData},
             mount::{dump_mounts, register_filesystem, Mount, MountCreator},
             vfs::Vfs,
-            DevId, ReadDirCallback,
+            DevId,
         },
     },
     prelude::*,
@@ -143,19 +144,20 @@ impl Inode for DirInode {
             }))
     }
 
-    fn readdir<'cb, 'r: 'cb>(
-        &'r self,
+    fn do_readdir(
+        &self,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
         let lock = self.rwsem.lock_shared();
-        Ok(self
-            .entries
+        self.entries
             .access(lock.as_ref())
             .iter()
             .skip(offset)
-            .take_while(|(name, node)| callback(name, node.ino()).is_ok())
-            .count())
+            .map(|(name, node)| callback(name.as_ref(), node.ino()))
+            .take_while(|result| result.map_or(true, |flow| flow.is_continue()))
+            .take_while_inclusive(|result| result.is_ok())
+            .fold_ok(0, |acc, _| acc + 1)
     }
 }
 
@@ -204,13 +206,7 @@ impl ProcFsMountCreator {
 }
 
 impl MountCreator for ProcFsMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        _flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, _flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let vfs = ProcFsMountCreator::get();
         let root_inode = vfs.root_node.clone();
         Mount::new(mp, vfs, root_inode)

+ 38 - 34
src/fs/tmpfs.rs

@@ -1,16 +1,17 @@
 use alloc::sync::{Arc, Weak};
 use bindings::{EINVAL, EIO, EISDIR, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFREG};
-use core::sync::atomic::Ordering;
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use itertools::Itertools;
 
 use crate::{
     io::Buffer,
     kernel::vfs::{
         dentry::{dcache, Dentry},
-        inode::{define_struct_inode, AtomicIno, Ino, Inode, Mode},
+        inode::{define_struct_inode, AtomicIno, Ino, Inode, Mode, WriteOffset},
         mount::{register_filesystem, Mount, MountCreator, MS_RDONLY},
         s_isblk, s_ischr,
         vfs::Vfs,
-        DevId, ReadDirCallback,
+        DevId,
     },
     prelude::*,
     sync::Locked,
@@ -68,11 +69,11 @@ impl DirectoryInode {
     }
 
     fn link(&self, name: Arc<[u8]>, file: &dyn Inode, dlock: &mut ()) {
-        // Safety: Only `unlink` will do something based on `nlink` count
+        // SAFETY: Only `unlink` will do something based on `nlink` count
         //         No need to synchronize here
         file.nlink.fetch_add(1, Ordering::Relaxed);
 
-        // Safety: `rwsem` has done the synchronization
+        // SAFETY: `rwsem` has done the synchronization
         self.size.fetch_add(1, Ordering::Relaxed);
 
         self.entries.access_mut(dlock).push((name, file.ino));
@@ -80,19 +81,20 @@ impl DirectoryInode {
 }
 
 impl Inode for DirectoryInode {
-    fn readdir<'cb, 'r: 'cb>(
-        &'r self,
+    fn do_readdir(
+        &self,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
         let lock = self.rwsem.lock_shared();
-        Ok(self
-            .entries
+        self.entries
             .access(lock.as_ref())
             .iter()
             .skip(offset)
-            .take_while(|(name, ino)| callback(name, *ino).is_ok())
-            .count())
+            .map(|(name, ino)| callback(&name, *ino))
+            .take_while(|result| result.map_or(true, |flow| flow.is_continue()))
+            .take_while_inclusive(|result| result.is_ok())
+            .fold_ok(0, |acc, _| acc + 1)
     }
 
     fn creat(&self, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
@@ -108,7 +110,7 @@ impl Inode for DirectoryInode {
         at.save_reg(file)
     }
 
-    fn mknod(&self, at: &Arc<Dentry>, mode: Mode, dev: DevId) -> KResult<()> {
+    fn mknod(&self, at: &Dentry, mode: Mode, dev: DevId) -> KResult<()> {
         if !s_ischr(mode) && !s_isblk(mode) {
             return Err(EINVAL);
         }
@@ -143,7 +145,7 @@ impl Inode for DirectoryInode {
         at.save_symlink(file)
     }
 
-    fn mkdir(&self, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
+    fn mkdir(&self, at: &Dentry, mode: Mode) -> KResult<()> {
         let vfs = acquire(&self.vfs)?;
         let vfs = astmp(&vfs);
 
@@ -165,7 +167,7 @@ impl Inode for DirectoryInode {
         let file = at.get_inode()?;
         let _flock = file.rwsem.lock();
 
-        // Safety: `flock` has done the synchronization
+        // SAFETY: `flock` has done the synchronization
         if file.mode.load(Ordering::Relaxed) & S_IFDIR != 0 {
             return Err(EISDIR);
         }
@@ -175,11 +177,11 @@ impl Inode for DirectoryInode {
 
         assert_eq!(
             entries.len() as u64,
-            // Safety: `dlock` has done the synchronization
+            // SAFETY: `dlock` has done the synchronization
             self.size.fetch_sub(1, Ordering::Relaxed) - 1
         );
 
-        // Safety: `flock` has done the synchronization
+        // SAFETY: `flock` has done the synchronization
         let file_nlink = file.nlink.fetch_sub(1, Ordering::Relaxed) - 1;
 
         if file_nlink == 0 {
@@ -251,27 +253,35 @@ impl Inode for FileInode {
         // TODO: We don't need that strong guarantee, find some way to avoid locks
         let lock = self.rwsem.lock_shared();
 
-        let (_, data) = self
-            .filedata
-            .access(lock.as_ref())
-            .split_at_checked(offset)
-            .ok_or(EINVAL)?;
-
-        buffer.fill(data).map(|result| result.allow_partial())
+        match self.filedata.access(lock.as_ref()).split_at_checked(offset) {
+            Some((_, data)) => buffer.fill(data).map(|result| result.allow_partial()),
+            None => Ok(0),
+        }
     }
 
-    fn write(&self, buffer: &[u8], offset: usize) -> KResult<usize> {
+    fn write(&self, buffer: &[u8], offset: WriteOffset) -> KResult<usize> {
         // TODO: We don't need that strong guarantee, find some way to avoid locks
         let mut lock = self.rwsem.lock();
         let filedata = self.filedata.access_mut(lock.as_mut());
 
+        let offset = match offset {
+            WriteOffset::Position(offset) => offset,
+            // SAFETY (`Relaxed` load in the arm below): `lock` has done the synchronization
+            WriteOffset::End(end) => {
+                let size = self.size.load(Ordering::Relaxed) as usize;
+                *end = size + buffer.len();
+
+                size
+            }
+        };
+
         if filedata.len() < offset + buffer.len() {
             filedata.resize(offset + buffer.len(), 0);
         }
 
         filedata[offset..offset + buffer.len()].copy_from_slice(&buffer);
 
-        // Safety: `lock` has done the synchronization
+        // SAFETY: `lock` has done the synchronization
         self.size.store(filedata.len() as u64, Ordering::Relaxed);
 
         Ok(buffer.len())
@@ -282,7 +292,7 @@ impl Inode for FileInode {
         let mut lock = self.rwsem.lock();
         let filedata = self.filedata.access_mut(lock.as_mut());
 
-        // Safety: `lock` has done the synchronization
+        // SAFETY: `lock` has done the synchronization
         self.size.store(length as u64, Ordering::Relaxed);
         filedata.resize(length, 0);
 
@@ -331,13 +341,7 @@ impl TmpFs {
 struct TmpFsMountCreator;
 
 impl MountCreator for TmpFsMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let (fs, root_inode) = TmpFs::create(flags & MS_RDONLY != 0)?;
 
         Mount::new(mp, fs, root_inode)

+ 23 - 51
src/io.rs

@@ -2,7 +2,7 @@ use bindings::EFAULT;
 
 use crate::prelude::*;
 
-use core::{ffi::c_char, fmt::Write, mem::MaybeUninit};
+use core::{fmt::Write, mem::MaybeUninit};
 
 pub enum FillResult {
     Done(usize),
@@ -34,6 +34,24 @@ pub trait Buffer {
     fn total(&self) -> usize;
     fn wrote(&self) -> usize;
     fn fill(&mut self, data: &[u8]) -> KResult<FillResult>;
+
+    fn available(&self) -> usize {
+        self.total() - self.wrote()
+    }
+}
+
+pub trait BufferFill<T: Copy> {
+    fn copy(&mut self, object: &T) -> KResult<FillResult>;
+}
+
+impl<T: Copy, B: Buffer + ?Sized> BufferFill<T> for B {
+    fn copy(&mut self, object: &T) -> KResult<FillResult> {
+        let ptr = object as *const T as *const u8;
+        let len = core::mem::size_of::<T>();
+
+        // SAFETY: `object` is a live `&T`, so `ptr` is readable for `size_of::<T>()` bytes.
+        self.fill(unsafe { core::slice::from_raw_parts(ptr, len) })
+    }
 }
 
 pub struct UninitBuffer<'lt, T: Copy + Sized> {
@@ -49,10 +67,7 @@ impl<'lt, T: Copy + Sized> UninitBuffer<'lt, T> {
         Self {
             data,
             buffer: RawBuffer::new_from_slice(unsafe {
-                core::slice::from_raw_parts_mut(
-                    ptr as *mut u8,
-                    core::mem::size_of::<T>(),
-                )
+                core::slice::from_raw_parts_mut(ptr as *mut u8, core::mem::size_of::<T>())
             }),
         }
     }
@@ -106,9 +121,9 @@ impl<'lt> RawBuffer<'lt> {
         }
     }
 
-    pub fn new_from_raw(buf: &'lt mut *mut u8, tot: usize) -> Self {
+    pub fn new_from_raw(buf: *mut u8, tot: usize) -> Self {
         Self {
-            buf: *buf,
+            buf,
             tot,
             cur: 0,
             _phantom: core::marker::PhantomData,
@@ -136,11 +151,7 @@ impl<'lt> RawBuffer<'lt> {
             n if n == 0 => Ok(FillResult::Full),
             n if n < data.len() => {
                 unsafe {
-                    core::ptr::copy_nonoverlapping(
-                        data.as_ptr(),
-                        self.buf.add(self.count()),
-                        n,
-                    );
+                    core::ptr::copy_nonoverlapping(data.as_ptr(), self.buf.add(self.count()), n);
                 }
                 self.cur += n;
                 Ok(FillResult::Partial(n))
@@ -227,42 +238,3 @@ impl Write for RawBuffer<'_> {
         }
     }
 }
-
-pub fn get_str_from_cstr<'a>(cstr: *const c_char) -> KResult<&'a str> {
-    if cstr.is_null() {
-        return Err(EFAULT);
-    }
-
-    let cstr = unsafe { core::ffi::CStr::from_ptr::<'a>(cstr) };
-    cstr.to_str().map_err(|_| EFAULT)
-}
-
-/// Copy data from src to dst, starting from offset, and copy at most count bytes.
-///
-/// # Return
-///
-/// The number of bytes copied.
-pub fn copy_offset_count(
-    src: &[u8],
-    dst: &mut [u8],
-    offset: usize,
-    count: usize,
-) -> usize {
-    if offset >= src.len() {
-        return 0;
-    }
-
-    let count = {
-        let count = count.min(dst.len());
-
-        if offset + count > src.len() {
-            src.len() - offset
-        } else {
-            count
-        }
-    };
-
-    dst[..count].copy_from_slice(&src[offset..offset + count]);
-
-    count
-}

+ 8 - 5
src/kernel.ld

@@ -61,6 +61,11 @@ SECTIONS
         KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
         end_ctors = .;
 
+        . = ALIGN(16);
+        START_SYSCALL_HANDLERS = .;
+        KEEP(*(.syscall_handlers));
+        END_SYSCALL_HANDLERS = .;
+
         . = ALIGN(16);
         *(.data.kinit)
 
@@ -91,15 +96,13 @@ SECTIONS
 
         . = ALIGN(16);
         KMOD_LOADERS_START = .;
-
         KEEP(*(.kmods));
         QUAD(0);
 
         . = ALIGN(16);
-        late_init_start = .;
-        KEEP(*(.late_init));
-        QUAD(0);
-        late_init_end = .;
+        FIX_START = .;
+        KEEP(*(.fix));
+        FIX_END = .;
 
         . = ALIGN(16);
 

+ 2 - 0
src/kernel.rs

@@ -2,4 +2,6 @@ pub mod block;
 pub mod console;
 pub mod interrupt;
 pub mod mem;
+pub mod syscall;
+pub mod user;
 pub mod vfs;

+ 1 - 1
src/kernel/interrupt.cpp

@@ -112,7 +112,7 @@ static inline void fault_handler(interrupt_stack* context, mmx_registers*) {
                 kill_current(SIGILL); // noreturn
         } break;
         case 14: {
-            kernel::mem::paging::handle_page_fault(context->error_code);
+            kernel::mem::paging::handle_page_fault(context);
             return;
         } break;
     }

+ 1 - 0
src/kernel/mem/mm_list.cc

@@ -1,4 +1,5 @@
 #include <assert.h>
+#include <errno.h>
 #include <stdint.h>
 
 #include <kernel/mem/mm_list.hpp>

+ 37 - 4
src/kernel/mem/paging.cc

@@ -270,10 +270,34 @@ void kernel::mem::paging::increase_refcount(page* pg) {
     pg->refcount++;
 }
 
-void kernel::mem::paging::handle_page_fault(unsigned long err) {
+// One record of the fix-up table that lies between the FIX_START and FIX_END symbols.
+struct fix_entry {
+    uint64_t start;        // first instruction address covered by this entry
+    uint64_t length;       // extent of the covered range: [start, start + length)
+    uint64_t jump_address; // value stored into v_rip when a fault hits inside the range
+    uint64_t type;         // not read by page_fault_fix yet (see the TODO there)
+};
+
+extern "C" fix_entry FIX_START[], FIX_END[];
+// Tries to recover from a page fault by consulting the fix-up table.
+//
+// When int_stack->v_rip lies inside [start, start + length) of some entry,
+// v_rip is redirected to that entry's jump_address and true is returned.
+// Otherwise int_stack is left untouched and false is returned.
+// The entry's `type` field is not consulted.
+bool page_fault_fix(interrupt_stack* int_stack) {
+    // TODO: type load
+
+    // type store
+    const auto rip = int_stack->v_rip;
+    for (fix_entry* entry = FIX_START; entry < FIX_END; ++entry) {
+        const bool before_range = rip < entry->start;
+        const bool past_range = rip >= entry->start + entry->length;
+        if (before_range || past_range)
+            continue;
+
+        int_stack->v_rip = entry->jump_address;
+        return true;
+    }
+
+    return false;
+}
+
+void kernel::mem::paging::handle_page_fault(interrupt_stack* int_stack) {
     using namespace kernel::mem;
     using namespace paging;
 
+    auto err = int_stack->error_code;
+
     uintptr_t vaddr;
     asm volatile("mov %%cr2, %0" : "=g"(vaddr) : :);
     auto& mms = current_process->mms;
@@ -284,7 +308,11 @@ void kernel::mem::paging::handle_page_fault(unsigned long err) {
         if (err & PAGE_FAULT_U)
             kill_current(SIGSEGV);
 
-        __page_fault_die(vaddr);
+        if (!page_fault_fix(int_stack)) {
+            __page_fault_die(vaddr);
+        } else {
+            return;
+        }
     }
 
     // user access to a present page caused the fault
@@ -312,8 +340,13 @@ void kernel::mem::paging::handle_page_fault(unsigned long err) {
     bool mmapped = mm_area->flags & MM_MAPPED;
     assert(!mmapped || mm_area->mapped_file);
 
-    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]]
-        __page_fault_die(vaddr);
+    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]] {
+        if (!page_fault_fix(int_stack)) {
+            __page_fault_die(vaddr);
+        } else {
+            return;
+        }
+    }
 
     pfn_t pfn = pe.pfn();
     auto attr = pe.attributes();

+ 65 - 67
src/kernel/process.cpp

@@ -21,25 +21,67 @@
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
 
+extern "C" fs::rust_file_array::handle* r_filearray_new_for_init();
+extern "C" fs::rust_fs_context::handle* r_fs_context_new_for_init();
+extern "C" fs::rust_file_array::handle* r_filearray_new_cloned(
+    struct fs::rust_file_array::handle* other);
+extern "C" fs::rust_fs_context::handle* r_fs_context_new_cloned(
+    struct fs::rust_fs_context::handle* other);
+extern "C" void r_filearray_drop(struct fs::rust_file_array::handle* other);
+extern "C" void r_fs_context_drop(struct fs::rust_fs_context::handle* other);
+
+// Stores the Rust-side fs context handle; drop() later hands it to r_fs_context_drop.
+fs::rust_fs_context::rust_fs_context(rust_fs_context::handle* handle) : m_handle(handle) {}
+// Stores the Rust-side file array handle; drop() later hands it to r_filearray_drop.
+fs::rust_file_array::rust_file_array(rust_file_array::handle* handle) : m_handle(handle) {}
+
+// Releases the handle on destruction; a no-op if drop() was already called.
+fs::rust_fs_context::~rust_fs_context() {
+    drop();
+}
+
+// Releases the handle on destruction; a no-op if drop() was already called.
+fs::rust_file_array::~rust_file_array() {
+    drop();
+}
+
+// Hands the fs context handle to r_fs_context_drop, if one is still held.
+// The handle is nulled afterwards, so repeated calls do nothing.
+void fs::rust_fs_context::drop() {
+    if (!m_handle)
+        return;
+
+    r_fs_context_drop(m_handle);
+    m_handle = nullptr;
+}
+
+// Hands the file array handle to r_filearray_drop, if one is still held.
+// The handle is nulled afterwards, so repeated calls do nothing.
+void fs::rust_file_array::drop() {
+    if (!m_handle)
+        return;
+
+    r_filearray_drop(m_handle);
+    m_handle = nullptr;
+}
+
+// Returns the raw handle. Asserts that it has not been dropped.
+fs::rust_fs_context::handle* fs::rust_fs_context::get() const {
+    assert(m_handle);
+    return m_handle;
+}
+
+// Returns the raw handle. Asserts that it has not been dropped.
+fs::rust_file_array::handle* fs::rust_file_array::get() const {
+    assert(m_handle);
+    return m_handle;
+}
+
 process::process(const process& parent, pid_t pid)
     : mms{parent.mms}
     , attr{parent.attr}
-    , files{parent.files.copy()}
-    , umask{parent.umask}
+    , files{r_filearray_new_cloned(parent.files.get())}
+    , fs_context{r_fs_context_new_cloned(parent.fs_context.get())}
     , pid{pid}
     , ppid{parent.pid}
     , pgid{parent.pgid}
     , sid{parent.sid}
-    , control_tty{parent.control_tty} {
-    assert(parent.cwd);
-    cwd = fs::d_get(parent.cwd);
-
-    assert(parent.fs_context.root);
-    fs_context.root = fs::d_get(parent.fs_context.root);
-}
+    , control_tty{parent.control_tty} {}
 
 process::process(pid_t pid, pid_t ppid)
-    : attr{.system = true}, files{&fs_context}, pid{pid}, ppid{ppid} {
+    : attr{.system = true}
+    , files{r_filearray_new_for_init()}
+    , fs_context{r_fs_context_new_for_init()}
+    , pid{pid}
+    , ppid{ppid} {
     bool inserted;
     std::tie(std::ignore, inserted) = thds.emplace("", pid);
     assert(inserted);
@@ -85,6 +127,9 @@ proclist::proclist() {
     auto thd = init.thds.begin();
     thd->name.assign("[kernel init]");
 
+    init.attr.system = 0;
+    thd->attr &= ~kernel::task::thread::SYSTEM;
+
     current_process = &init;
     current_thread = &thd;
 
@@ -134,15 +179,14 @@ void proclist::kill(pid_t pid, int exit_code) {
     //       files should only be closed when this is the last thread
     //
     // write back mmap'ped files and close them
-    proc.files.clear();
+    proc.files.drop();
+
+    // free fs_context
+    proc.fs_context.drop();
 
     // unmap all user memory areas
     proc.mms.clear();
 
-    // free cwd and fs_context dentry
-    proc.cwd.reset();
-    proc.fs_context.root.reset();
-
     // make child processes orphans (children of init)
     this->make_children_orphans(pid);
 
@@ -191,16 +235,14 @@ static void release_kinit() {
     extern uintptr_t volatile KINIT_START_ADDR, KINIT_END_ADDR, KINIT_PAGES;
 
     std::size_t pages = KINIT_PAGES;
-    auto range =
-        vaddr_range{KERNEL_PML4, KINIT_START_ADDR, KINIT_END_ADDR, true};
+    auto range = vaddr_range{KERNEL_PML4, KINIT_START_ADDR, KINIT_END_ADDR, true};
     for (auto pte : range)
         pte.clear();
 
     create_zone(KERNEL_IMAGE_PADDR, KERNEL_IMAGE_PADDR + 0x1000 * pages);
 }
 
-extern "C" void (*const late_init_start[])();
-extern "C" void late_init_rust();
+extern "C" void late_init_rust(uintptr_t* out_sp, uintptr_t* out_ip);
 
 void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn) {
     kernel::mem::paging::free_pages(kernel_stack_pfn, 9);
@@ -208,58 +250,15 @@ void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn) {
 
     kernel::kmod::load_internal_modules();
 
-    late_init_rust();
+    uintptr_t sp, ip;
+    late_init_rust(&sp, &ip);
 
     asm volatile("sti");
 
-    current_process->fs_context.root = fs::r_get_root_dentry();
-    current_process->cwd = fs::r_get_root_dentry();
-
     // ------------------------------------------
     // interrupt enabled
     // ------------------------------------------
 
-    for (auto* init = late_init_start; *init; ++init)
-        (*init)();
-
-    const auto& context = current_process->fs_context;
-
-    // mount fat32 /mnt directory
-    // TODO: parse kernel parameters
-    if (1) {
-        auto [mnt, status] = fs::open(context, context.root, "/mnt");
-        assert(mnt && status == -ENOENT);
-
-        if (int ret = fs::fs_mkdir(mnt.get(), 0755); 1)
-            assert(ret == 0);
-
-        int ret = fs::fs_mount(mnt.get(), "/dev/sda", "/mnt", "fat32",
-                               MS_RDONLY | MS_NOATIME | MS_NODEV | MS_NOSUID,
-                               "ro,nodev");
-
-        assert(ret == 0);
-    }
-
-    current_process->attr.system = 0;
-    current_thread->attr &= ~kernel::task::thread::SYSTEM;
-
-    types::elf::elf32_load_data d{
-        .exec_dent{},
-        .argv{"/mnt/busybox", "sh", "/mnt/initsh"},
-        .envp{"LANG=C", "HOME=/root", "PATH=/mnt", "PWD=/"},
-        .ip{},
-        .sp{}};
-
-    auto [exec, ret] = fs::open(context, context.root.get(), d.argv[0]);
-    if (!exec || ret) {
-        kmsg("kernel panic: init not found!");
-        freeze();
-    }
-
-    d.exec_dent = std::move(exec);
-    if (int ret = types::elf::elf32_load(d); 1)
-        assert(ret == 0);
-
     int ds = 0x33, cs = 0x2b;
 
     asm volatile(
@@ -277,7 +276,7 @@ void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn) {
 
         "iretq\n"
         :
-        : "g"(ds), "g"(cs), "g"(d.sp), "g"(d.ip)
+        : "g"(ds), "g"(cs), "g"(sp), "g"(ip)
         : "eax", "memory");
 
     freeze();
@@ -315,8 +314,7 @@ void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn) {
         "%=:\n"
         "ud2"
         :
-        : "a"(current_thread->kstack.sp), "c"(_kernel_init),
-          "g"(kernel_stack_pfn)
+        : "a"(current_thread->kstack.sp), "c"(_kernel_init), "g"(kernel_stack_pfn)
         : "memory");
 
     freeze();

+ 34 - 190
src/kernel/syscall.cpp

@@ -1,8 +1,6 @@
 #include <assert.h>
 #include <bits/alltypes.h>
-#include <bits/ioctl.h>
 #include <errno.h>
-#include <fcntl.h>
 #include <poll.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -84,20 +82,16 @@
 #define _DEFINE_SYSCALL32_END_PARAMS6(type, name, ...) name __VA_OPT__(, void)
 
 #define _DEFINE_SYSCALL32_END(name, ...) \
-    kernel::syscall::do_##name(          \
-        __VA_OPT__(_DEFINE_SYSCALL32_END_PARAMS1(__VA_ARGS__)))
-
-#define DEFINE_SYSCALL32_TO(name, to, ...)                      \
-    static uint32_t _syscall32_##name(interrupt_stack* data,    \
-                                      mmx_registers* mmxregs) { \
-        (void)data, (void)mmxregs;                              \
-        __VA_OPT__(_DEFINE_SYSCALL32_ARGS1(__VA_ARGS__);)       \
-        return (uint32_t)(uintptr_t)_DEFINE_SYSCALL32_END(      \
-            to __VA_OPT__(, __VA_ARGS__));                      \
+    kernel::syscall::do_##name(__VA_OPT__(_DEFINE_SYSCALL32_END_PARAMS1(__VA_ARGS__)))
+
+#define DEFINE_SYSCALL32_TO(sname, to, ...)                                              \
+    static uint32_t _syscall32_##sname(interrupt_stack* data, mmx_registers* mmxregs) {  \
+        (void)data, (void)mmxregs;                                                       \
+        __VA_OPT__(_DEFINE_SYSCALL32_ARGS1(__VA_ARGS__);)                                \
+        return (uint32_t)(uintptr_t)_DEFINE_SYSCALL32_END(to __VA_OPT__(, __VA_ARGS__)); \
     }
 
-#define DEFINE_SYSCALL32(name, ...) \
-    DEFINE_SYSCALL32_TO(name, name __VA_OPT__(, ) __VA_ARGS__)
+#define DEFINE_SYSCALL32(name, ...) DEFINE_SYSCALL32_TO(name, name __VA_OPT__(, ) __VA_ARGS__)
 
 #define DEFINE_SYSCALL32_NORETURN(name, ...)                                 \
     [[noreturn]] static uint32_t _syscall32_##name(interrupt_stack* data,    \
@@ -122,50 +116,18 @@ static inline void not_implemented(const char* pos, int line) {
     current_thread->send_signal(SIGSYS);
 }
 
-DEFINE_SYSCALL32(write, int, fd, const char __user*, buf, size_t, n)
-DEFINE_SYSCALL32(read, int, fd, char __user*, buf, size_t, n)
-DEFINE_SYSCALL32(close, int, fd)
-DEFINE_SYSCALL32(dup, int, old_fd)
-DEFINE_SYSCALL32(dup2, int, old_fd, int, new_fd)
-DEFINE_SYSCALL32(pipe, int __user*, pipefd)
-DEFINE_SYSCALL32(getdents, int, fd, char __user*, buf, size_t, cnt)
-DEFINE_SYSCALL32(getdents64, int, fd, char __user*, buf, size_t, cnt)
-DEFINE_SYSCALL32(open, const char __user*, path, int, flags, mode_t, mode)
-DEFINE_SYSCALL32(chdir, const char __user*, path)
-DEFINE_SYSCALL32(symlink, const char __user*, target, const char __user*,
-                 linkpath)
-DEFINE_SYSCALL32(readlink, const char __user*, pathname, char __user*, buf,
-                 size_t, buf_size)
-DEFINE_SYSCALL32(ioctl, int, fd, unsigned long, request, uintptr_t, arg3)
 DEFINE_SYSCALL32(munmap, uintptr_t, addr, size_t, len)
 DEFINE_SYSCALL32(poll, pollfd __user*, fds, nfds_t, nfds, int, timeout)
-DEFINE_SYSCALL32(mknod, const char __user*, pathname, mode_t, mode, dev_t, dev)
-DEFINE_SYSCALL32(access, const char __user*, pathname, int, mode)
-DEFINE_SYSCALL32(unlink, const char __user*, pathname)
-DEFINE_SYSCALL32(truncate, const char __user*, pathname, long, length)
-DEFINE_SYSCALL32(mkdir, const char __user*, pathname, mode_t, mode)
 DEFINE_SYSCALL32(socket, int, domain, int, type, int, protocol)
-DEFINE_SYSCALL32_TO(fcntl64, fcntl, int, fd, int, cmd, unsigned long, arg)
-
-DEFINE_SYSCALL32_TO(sendfile64, sendfile, int, out_fd, int, in_fd,
-                    off_t __user*, offset, size_t, count)
-
-DEFINE_SYSCALL32(statx, int, dirfd, const char __user*, path, int, flags,
-                 unsigned int, mask, statx __user*, statxbuf)
 
-DEFINE_SYSCALL32(mmap_pgoff, uintptr_t, addr, size_t, len, int, prot, int,
-                 flags, int, fd, off_t, pgoffset)
-
-DEFINE_SYSCALL32(mount, const char __user*, source, const char __user*, target,
-                 const char __user*, fstype, unsigned long, flags,
-                 const void __user*, _fsdata)
+DEFINE_SYSCALL32(mmap_pgoff, uintptr_t, addr, size_t, len, int, prot, int, flags, int, fd, off_t,
+                 pgoffset)
 
 DEFINE_SYSCALL32(waitpid, pid_t, waitpid, int __user*, arg1, int, options)
 DEFINE_SYSCALL32(getsid, pid_t, pid)
 DEFINE_SYSCALL32(setsid)
 DEFINE_SYSCALL32(getpgid, pid_t, pid)
 DEFINE_SYSCALL32(setpgid, pid_t, pid, pid_t, pgid)
-DEFINE_SYSCALL32(getcwd, char __user*, buf, size_t, buf_size)
 DEFINE_SYSCALL32(getpid)
 DEFINE_SYSCALL32(getppid)
 DEFINE_SYSCALL32(getuid)
@@ -179,26 +141,23 @@ DEFINE_SYSCALL32(set_tid_address, int __user*, tidptr)
 DEFINE_SYSCALL32(prctl, int, option, uintptr_t, arg2)
 DEFINE_SYSCALL32(arch_prctl, int, option, uintptr_t, arg2)
 DEFINE_SYSCALL32(brk, uintptr_t, addr)
-DEFINE_SYSCALL32(umask, mode_t, mask)
 DEFINE_SYSCALL32(kill, pid_t, pid, int, sig)
 DEFINE_SYSCALL32(tkill, pid_t, tid, int, sig)
-DEFINE_SYSCALL32(rt_sigprocmask, int, how, const kernel::sigmask_type __user*,
-                 set, kernel::sigmask_type __user*, oldset, size_t, sigsetsize)
-DEFINE_SYSCALL32(rt_sigaction, int, signum, const kernel::sigaction __user*,
-                 act, kernel::sigaction __user*, oldact, size_t, sigsetsize)
+DEFINE_SYSCALL32(rt_sigprocmask, int, how, const kernel::sigmask_type __user*, set,
+                 kernel::sigmask_type __user*, oldset, size_t, sigsetsize)
+DEFINE_SYSCALL32(rt_sigaction, int, signum, const kernel::sigaction __user*, act,
+                 kernel::sigaction __user*, oldact, size_t, sigsetsize)
 DEFINE_SYSCALL32(newuname, new_utsname __user*, buf)
 
 DEFINE_SYSCALL32_NORETURN(exit, int, status)
 
 DEFINE_SYSCALL32(gettimeofday, timeval __user*, tv, void __user*, tz)
-DEFINE_SYSCALL32_TO(clock_gettime64, clock_gettime, clockid_t, clk_id,
-                    timespec __user*, tp)
+DEFINE_SYSCALL32_TO(clock_gettime64, clock_gettime, clockid_t, clk_id, timespec __user*, tp)
 
 extern "C" void NORETURN ISR_stub_restore();
 static uint32_t _syscall32_fork(interrupt_stack* data, mmx_registers* mmxregs) {
     auto& newproc = procs->copy_from(*current_process);
-    auto [iter_newthd, inserted] =
-        newproc.thds.emplace(*current_thread, newproc.pid);
+    auto [iter_newthd, inserted] = newproc.thds.emplace(*current_thread, newproc.pid);
     assert(inserted);
     auto* newthd = &*iter_newthd;
 
@@ -231,108 +190,12 @@ static uint32_t _syscall32_fork(interrupt_stack* data, mmx_registers* mmxregs) {
     return newproc.pid;
 }
 
-static uint32_t _syscall32_llseek(interrupt_stack* data, mmx_registers*) {
-    SYSCALL32_ARG1(unsigned int, fd);
-    SYSCALL32_ARG2(unsigned long, offset_high);
-    SYSCALL32_ARG3(unsigned long, offset_low);
-    SYSCALL32_ARG4(off_t __user*, result);
-    SYSCALL32_ARG5(unsigned int, whence);
-
-    if (!result)
-        return -EFAULT;
-
-    off_t offset = offset_low | (offset_high << 32);
-
-    auto ret = kernel::syscall::do_lseek(fd, offset, whence);
-    if (ret < 0)
-        return ret;
-
-    // TODO: copy_to_user
-    *result = ret;
-
-    return 0;
-}
-
-static uint32_t _syscall32_readv(interrupt_stack* data, mmx_registers*) {
-    SYSCALL32_ARG1(int, fd);
-    SYSCALL32_ARG2(const types::iovec32 __user*, _iov);
-    SYSCALL32_ARG3(int, iovcnt);
-
-    // TODO: use copy_from_user
-    if (!_iov)
-        return -EFAULT;
-
-    std::vector<iovec> iov(iovcnt);
-    for (int i = 0; i < iovcnt; ++i) {
-        // TODO: check access right
-        uintptr_t base = _iov[i].iov_base;
-        iov[i].iov_base = (void*)base;
-        iov[i].iov_len = _iov[i].iov_len;
-    }
-
-    return kernel::syscall::do_readv(fd, iov.data(), iovcnt);
-}
-
-static uint32_t _syscall32_writev(interrupt_stack* data, mmx_registers*) {
-    SYSCALL32_ARG1(int, fd);
-    SYSCALL32_ARG2(const types::iovec32 __user*, _iov);
-    SYSCALL32_ARG3(int, iovcnt);
-
-    // TODO: use copy_from_user
-    if (!_iov)
-        return -EFAULT;
-
-    std::vector<iovec> iov(iovcnt);
-    for (int i = 0; i < iovcnt; ++i) {
-        // TODO: check access right
-        uintptr_t base = _iov[i].iov_base;
-        iov[i].iov_base = (void*)base;
-        iov[i].iov_len = _iov[i].iov_len;
-    }
-
-    return kernel::syscall::do_writev(fd, iov.data(), iovcnt);
-}
-
-[[noreturn]] static uint32_t _syscall32_exit_group(interrupt_stack* data,
-                                                   mmx_registers* mmxregs) {
+[[noreturn]] static uint32_t _syscall32_exit_group(interrupt_stack* data, mmx_registers* mmxregs) {
     // we implement exit_group as exit for now
     _syscall32_exit(data, mmxregs);
 }
 
-static uint32_t _syscall32_execve(interrupt_stack* data, mmx_registers*) {
-    SYSCALL32_ARG1(const char __user*, exec);
-    SYSCALL32_ARG2(const uint32_t __user*, argv);
-    SYSCALL32_ARG3(const uint32_t __user*, envp);
-
-    if (!exec || !argv || !envp)
-        return -EFAULT;
-
-    std::vector<std::string> args, envs;
-
-    // TODO: use copy_from_user
-    while (*argv) {
-        uintptr_t addr = *(argv++);
-        args.push_back((char __user*)addr);
-    }
-
-    while (*envp) {
-        uintptr_t addr = *(envp++);
-        envs.push_back((char __user*)addr);
-    }
-
-    auto retval = kernel::syscall::do_execve(exec, args, envs);
-
-    if (retval.status == 0) {
-        // TODO: switch cs ans ss
-        data->v_rip = retval.ip;
-        data->rsp = retval.sp;
-    }
-
-    return retval.status;
-}
-
-static uint32_t _syscall32_wait4(interrupt_stack* data,
-                                 mmx_registers* mmxregs) {
+static uint32_t _syscall32_wait4(interrupt_stack* data, mmx_registers* mmxregs) {
     SYSCALL32_ARG4(void __user*, rusage);
 
     // TODO: getrusage
@@ -342,8 +205,7 @@ static uint32_t _syscall32_wait4(interrupt_stack* data,
     return _syscall32_waitpid(data, mmxregs);
 }
 
-void kernel::handle_syscall32(int no, interrupt_stack* data,
-                              mmx_registers* mmxregs) {
+void kernel::handle_syscall32(int no, interrupt_stack* data, mmx_registers* mmxregs) {
     if (no >= SYSCALL_HANDLERS_SIZE || !syscall_handlers[no].handler) {
         kmsgf("[kernel] syscall %d(%x) isn't implemented", no, no);
         NOT_IMPLEMENTED;
@@ -353,7 +215,7 @@ void kernel::handle_syscall32(int no, interrupt_stack* data,
         return;
     }
 
-    // kmsgf_debug("[kernel:debug] (pid\t%d) %s()", current_process->pid,
+    // kmsgf_debug("[kernel:debug] (pid\t%d) %s() => {{", current_process->pid,
     // syscall_handlers[no].name);
 
     asm volatile("sti");
@@ -367,76 +229,58 @@ void kernel::handle_syscall32(int no, interrupt_stack* data,
     data->regs.r14 = 0;
     data->regs.r15 = 0;
 
+    // kmsgf_debug("[kernel:debug] }} => %x", data->regs.rax);
+
     if (current_thread->signals.pending_signal())
         current_thread->signals.handle(data, mmxregs);
 }
 
-#define REGISTER_SYSCALL_HANDLER(no, _name)              \
-    syscall_handlers[(no)].handler = _syscall32_##_name; \
-    syscall_handlers[(no)].name = #_name;
+#define REGISTER_SYSCALL_HANDLER(no, _name) register_syscall_handler(no, _syscall32_##_name, #_name)
+
+// Installs `handler` and its printable `name` into slot `no` of the syscall table.
+//
+// Exported with C linkage so handlers can be registered from outside this
+// translation unit (the Rust side declares and calls it).
+extern "C" void register_syscall_handler(uint32_t no,
+                                         uint32_t (*handler)(interrupt_stack*, mmx_registers*),
+                                         const char* name) {
+    // `no` arrives unchecked from the caller; refuse to write past the end of the table.
+    assert(no < (uint32_t)SYSCALL_HANDLERS_SIZE);
+
+    syscall_handlers[no].handler = handler;
+    syscall_handlers[no].name = name;
+}
+
+extern "C" void r_register_syscall();
 
 SECTION(".text.kinit")
 void kernel::init_syscall_table() {
-    // 32bit syscalls
     REGISTER_SYSCALL_HANDLER(0x01, exit);
     REGISTER_SYSCALL_HANDLER(0x02, fork);
-    REGISTER_SYSCALL_HANDLER(0x03, read);
-    REGISTER_SYSCALL_HANDLER(0x04, write);
-    REGISTER_SYSCALL_HANDLER(0x05, open);
-    REGISTER_SYSCALL_HANDLER(0x06, close);
     REGISTER_SYSCALL_HANDLER(0x07, waitpid);
-    REGISTER_SYSCALL_HANDLER(0x0a, unlink);
-    REGISTER_SYSCALL_HANDLER(0x0b, execve);
-    REGISTER_SYSCALL_HANDLER(0x0c, chdir);
-    REGISTER_SYSCALL_HANDLER(0x0e, mknod);
     REGISTER_SYSCALL_HANDLER(0x14, getpid);
-    REGISTER_SYSCALL_HANDLER(0x15, mount);
-    REGISTER_SYSCALL_HANDLER(0x21, access);
     REGISTER_SYSCALL_HANDLER(0x25, kill);
-    REGISTER_SYSCALL_HANDLER(0x27, mkdir);
-    REGISTER_SYSCALL_HANDLER(0x29, dup);
-    REGISTER_SYSCALL_HANDLER(0x2a, pipe);
     REGISTER_SYSCALL_HANDLER(0x2d, brk);
     REGISTER_SYSCALL_HANDLER(0x2f, getgid);
-    REGISTER_SYSCALL_HANDLER(0x36, ioctl);
     REGISTER_SYSCALL_HANDLER(0x39, setpgid);
-    REGISTER_SYSCALL_HANDLER(0x3c, umask);
-    REGISTER_SYSCALL_HANDLER(0x3f, dup2);
     REGISTER_SYSCALL_HANDLER(0x40, getppid);
     REGISTER_SYSCALL_HANDLER(0x42, setsid);
     REGISTER_SYSCALL_HANDLER(0x4e, gettimeofday);
-    REGISTER_SYSCALL_HANDLER(0x53, symlink);
-    REGISTER_SYSCALL_HANDLER(0x55, readlink);
     REGISTER_SYSCALL_HANDLER(0x5b, munmap);
-    REGISTER_SYSCALL_HANDLER(0x5c, truncate);
     REGISTER_SYSCALL_HANDLER(0x72, wait4);
     REGISTER_SYSCALL_HANDLER(0x7a, newuname);
     REGISTER_SYSCALL_HANDLER(0x84, getpgid);
-    REGISTER_SYSCALL_HANDLER(0x8c, llseek);
-    REGISTER_SYSCALL_HANDLER(0x8d, getdents);
-    REGISTER_SYSCALL_HANDLER(0x91, readv);
-    REGISTER_SYSCALL_HANDLER(0x92, writev);
     REGISTER_SYSCALL_HANDLER(0x93, getsid);
     REGISTER_SYSCALL_HANDLER(0xa8, poll);
     REGISTER_SYSCALL_HANDLER(0xac, prctl);
     REGISTER_SYSCALL_HANDLER(0xae, rt_sigaction);
     REGISTER_SYSCALL_HANDLER(0xaf, rt_sigprocmask);
-    REGISTER_SYSCALL_HANDLER(0xb7, getcwd);
     REGISTER_SYSCALL_HANDLER(0xc0, mmap_pgoff);
     REGISTER_SYSCALL_HANDLER(0xc7, getuid);
     REGISTER_SYSCALL_HANDLER(0xc8, getgid32);
     REGISTER_SYSCALL_HANDLER(0xc9, geteuid);
     REGISTER_SYSCALL_HANDLER(0xca, geteuid32);
-    REGISTER_SYSCALL_HANDLER(0xdc, getdents64);
-    REGISTER_SYSCALL_HANDLER(0xdd, fcntl64);
     REGISTER_SYSCALL_HANDLER(0xe0, gettid);
     REGISTER_SYSCALL_HANDLER(0xee, tkill);
-    REGISTER_SYSCALL_HANDLER(0xef, sendfile64);
     REGISTER_SYSCALL_HANDLER(0xf3, set_thread_area);
     REGISTER_SYSCALL_HANDLER(0xfc, exit_group);
     REGISTER_SYSCALL_HANDLER(0x102, set_tid_address);
     REGISTER_SYSCALL_HANDLER(0x167, socket);
-    REGISTER_SYSCALL_HANDLER(0x17f, statx);
     REGISTER_SYSCALL_HANDLER(0x180, arch_prctl);
     REGISTER_SYSCALL_HANDLER(0x193, clock_gettime64);
+
+    r_register_syscall();
 }

+ 128 - 0
src/kernel/syscall.rs

@@ -0,0 +1,128 @@
+use crate::bindings::root::{interrupt_stack, mmx_registers};
+
+mod file_rw;
+mod procops;
+
+/// Converts the success value of a syscall handler into the raw `u32` result.
+///
+/// `syscall32_call!` applies it to the `Ok` payload of every handler.
+pub(self) trait MapReturnValue {
+    fn map(self) -> u32;
+}
+
+/// Handlers with nothing to return report `0`.
+impl MapReturnValue for () {
+    fn map(self) -> u32 {
+        0
+    }
+}
+
+/// A `u32` is passed through unchanged.
+impl MapReturnValue for u32 {
+    fn map(self) -> u32 {
+        self
+    }
+}
+
+/// A `usize` is narrowed with `as u32`; bits above the low 32 are discarded.
+impl MapReturnValue for usize {
+    fn map(self) -> u32 {
+        self as u32
+    }
+}
+
+/// Pulls one to six syscall arguments out of the saved registers, calls `$handler` with them
+/// and folds its `Result` into a single `u32`.
+///
+/// Arguments are read, in order, from `rbx`, `rcx`, `rdx`, `rsi`, `rdi` and `rbp` of
+/// `$int_stack.regs`, each converted to its declared type with `as`.
+/// `Ok(val)` becomes `MapReturnValue::map(val)`; `Err(err)` becomes `-(err as i32)` cast to `u32`.
+///
+/// There is one arm per arity from one to six; no arm accepts an empty argument list.
+macro_rules! syscall32_call {
+    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        match $handler($arg1) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty, $arg2:ident: $argt2:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        match $handler($arg1, $arg2) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty, $arg2:ident: $argt2:ty, $arg3:ident: $argt3:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
+        match $handler($arg1, $arg2, $arg3) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident,
+     $arg1:ident: $argt1:ty,
+     $arg2:ident: $argt2:ty,
+     $arg3:ident: $argt3:ty,
+     $arg4:ident: $argt4:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
+        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
+        match $handler($arg1, $arg2, $arg3, $arg4) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident,
+     $arg1:ident: $argt1:ty,
+     $arg2:ident: $argt2:ty,
+     $arg3:ident: $argt3:ty,
+     $arg4:ident: $argt4:ty,
+     $arg5:ident: $argt5:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
+        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
+        let $arg5: $argt5 = $int_stack.regs.rdi as $argt5;
+        match $handler($arg1, $arg2, $arg3, $arg4, $arg5) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident,
+     $arg1:ident: $argt1:ty,
+     $arg2:ident: $argt2:ty,
+     $arg3:ident: $argt3:ty,
+     $arg4:ident: $argt4:ty,
+     $arg5:ident: $argt5:ty,
+     $arg6:ident: $argt6:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
+        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
+        let $arg5: $argt5 = $int_stack.regs.rdi as $argt5;
+        let $arg6: $argt6 = $int_stack.regs.rbp as $argt6;
+        match $handler($arg1, $arg2, $arg3, $arg4, $arg5, $arg6) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+}
+
+/// Defines `unsafe extern "C" fn $name(int_stack, _mmxregs) -> u32`, the C-callable entry
+/// point that forwards to `$handler` through `syscall32_call!`.
+///
+/// The generated function panics (via `unwrap`) if `int_stack` is null; `_mmxregs` is unused.
+macro_rules! define_syscall32 {
+    ($name:ident, $handler:ident, $($arg:ident: $argt:ty),*) => {
+        unsafe extern "C" fn $name(
+            int_stack: *mut $crate::bindings::root::interrupt_stack,
+            _mmxregs: *mut $crate::bindings::root::mmx_registers) -> u32 {
+            let int_stack = int_stack.as_mut().unwrap();
+            $crate::kernel::syscall::syscall32_call!(int_stack, $handler, $($arg: $argt),*)
+        }
+    };
+}
+
+pub(self) use {define_syscall32, syscall32_call};
+
+extern "C" {
+    /// Implemented in `src/kernel/syscall.cpp`: stores `handler` and `name` in slot `no` of
+    /// the syscall table.
+    fn register_syscall_handler(
+        no: u32,
+        handler: unsafe extern "C" fn(*mut interrupt_stack, *mut mmx_registers) -> u32,
+        name: *const i8,
+    );
+}
+
+/// Invokes the `register` functions of the `file_rw` and `procops` modules.
+///
+/// Called from the C++ `kernel::init_syscall_table` after it has registered its own handlers.
+#[no_mangle]
+pub unsafe extern "C" fn r_register_syscall() {
+    file_rw::register();
+    procops::register();
+}

+ 363 - 0
src/kernel/syscall/file_rw.rs

@@ -0,0 +1,363 @@
+use core::mem::MaybeUninit;
+
+use bindings::{
+    statx, AT_FDCWD, AT_STATX_SYNC_AS_STAT, AT_STATX_SYNC_TYPE, AT_SYMLINK_NOFOLLOW, EBADF, EFAULT,
+    EINVAL, ENOENT, SEEK_CUR, SEEK_END, SEEK_SET, S_IFBLK, S_IFCHR,
+};
+
+use crate::{
+    io::{Buffer, BufferFill},
+    kernel::{
+        user::dataflow::{CheckedUserPointer, UserBuffer, UserString},
+        vfs::{dentry::Dentry, file::SeekOption, filearray::FileArray, FsContext},
+    },
+    path::Path,
+    prelude::*,
+};
+
+use super::{define_syscall32, register_syscall_handler};
+
+/// `read(2)`: reads from the file behind `fd` into the user buffer
+/// `buffer[..bufsize]` and returns the value reported by the file's `read`.
+///
+/// The user buffer is validated first, so an invalid buffer is reported before
+/// an invalid descriptor (`EBADF`).
+fn do_read(fd: u32, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let mut user_buf = UserBuffer::new(buffer, bufsize)?;
+
+    let table = FileArray::get_current();
+    let file = table.get(fd).ok_or(EBADF)?;
+    file.read(&mut user_buf)
+}
+
+/// `write(2)`: writes `count` bytes starting at the user pointer `buffer` to
+/// the file behind `fd` and returns the value reported by the file's `write`.
+///
+/// # Errors
+/// * `EBADF` if `fd` does not name an open file.
+/// * Whatever `CheckedUserPointer::new` reports when `buffer[..count]` is not
+///   an accessible user range.
+fn do_write(fd: u32, buffer: *const u8, count: usize) -> KResult<usize> {
+    let files = FileArray::get_current();
+    let file = files.get(fd).ok_or(EBADF)?;
+
+    // A zero-length write never touches user memory, so do not dereference
+    // (or even validate) the pointer: building a slice from a null or dangling
+    // pointer is undefined behaviour even when the length is zero.
+    if count == 0 {
+        return file.write(&[]);
+    }
+
+    // Validate the user range before exposing it as a slice, exactly as
+    // `do_writev` does for every iovec entry.
+    // TODO!!!!!: copy from user
+    let data = CheckedUserPointer::new(buffer, count)?;
+    file.write(data.as_slice())
+}
+
+/// `open(2)`: opens `path` with `flags` in the current file array and returns
+/// the new descriptor. The creation `mode` has the process umask bits cleared.
+fn do_open(path: *const u8, flags: u32, mode: u32) -> KResult<u32> {
+    let user_path = UserString::new(path)?;
+    let parsed = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let table = FileArray::get_current();
+    let fs_context = FsContext::get_current();
+
+    // The umask lock is only held while the mask is read.
+    let allowed_bits = !*fs_context.umask.lock();
+    let effective_mode = mode & allowed_bits;
+
+    table.open(&fs_context, parsed, flags, effective_mode)
+}
+
+/// `close(2)`: closes `fd` in the current process's file array.
+fn do_close(fd: u32) -> KResult<()> {
+    FileArray::get_current().close(fd)
+}
+
+/// `dup(2)`: duplicates `fd` and returns the new descriptor chosen by the
+/// file array.
+fn do_dup(fd: u32) -> KResult<u32> {
+    FileArray::get_current().dup(fd)
+}
+
+/// `dup2(2)`: duplicates `old_fd` onto `new_fd` with no extra descriptor flags.
+fn do_dup2(old_fd: u32, new_fd: u32) -> KResult<u32> {
+    const NO_FLAGS: u64 = 0;
+    FileArray::get_current().dup_to(old_fd, new_fd, NO_FLAGS as _)
+}
+
+/// `pipe(2)`: creates a pipe and stores `[read_fd, write_fd]` in the user
+/// array `pipe_fd`.
+///
+/// The destination is validated before the pipe is created; `EFAULT` is
+/// returned if the two descriptors cannot be copied out in full.
+fn do_pipe(pipe_fd: *mut [u32; 2]) -> KResult<()> {
+    let out_len = core::mem::size_of::<[u32; 2]>();
+    let mut out = UserBuffer::new(pipe_fd as *mut u8, out_len)?;
+
+    let table = FileArray::get_current();
+    let (read_fd, write_fd) = table.pipe()?;
+
+    let fds = [read_fd, write_fd];
+    out.copy(&fds)?.ok_or(EFAULT)
+}
+
+/// `getdents(2)`: fills the user buffer with directory entries of `fd` and
+/// returns the number of bytes written into it.
+fn do_getdents(fd: u32, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let mut out = UserBuffer::new(buffer, bufsize)?;
+
+    let table = FileArray::get_current();
+    let dir = table.get(fd).ok_or(EBADF)?;
+    dir.getdents(&mut out)?;
+
+    Ok(out.wrote())
+}
+
+/// `getdents64(2)`: like `do_getdents`, but delegates to the file's
+/// `getdents64`. Returns the number of bytes written into the user buffer.
+fn do_getdents64(fd: u32, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let mut out = UserBuffer::new(buffer, bufsize)?;
+
+    let table = FileArray::get_current();
+    let dir = table.get(fd).ok_or(EBADF)?;
+    dir.getdents64(&mut out)?;
+
+    Ok(out.wrote())
+}
+
+/// `statx(2)`: looks up `path` and copies its `statx` record to `buffer`.
+///
+/// Only the default synchronisation mode (`AT_STATX_SYNC_AS_STAT`) and
+/// `dirfd == AT_FDCWD` are supported. The final symlink is followed unless
+/// `AT_SYMLINK_NOFOLLOW` is set in `flags`.
+///
+/// # Errors
+/// * `EINVAL` for an unsupported sync type or a `dirfd` other than `AT_FDCWD`.
+///   These values come straight from userspace, so they are rejected instead
+///   of panicking the kernel.
+/// * `EFAULT` if the full record cannot be copied to `buffer`.
+/// * Any error from reading the path, resolving it, or `Dentry::statx`.
+fn do_statx(dirfd: u32, path: *const u8, flags: u32, mask: u32, buffer: *mut u8) -> KResult<()> {
+    // TODO: support the other AT_STATX_SYNC_TYPE modes.
+    if (flags & AT_STATX_SYNC_TYPE) != AT_STATX_SYNC_AS_STAT {
+        return Err(EINVAL);
+    }
+
+    // TODO: support lookups relative to a directory descriptor.
+    if dirfd != AT_FDCWD as u32 {
+        return Err(EINVAL);
+    }
+
+    let path = UserString::new(path)?;
+    let path = Path::new(path.as_cstr().to_bytes())?;
+    let mut buffer = UserBuffer::new(buffer, core::mem::size_of::<statx>())?;
+
+    let follow_symlink = (flags & AT_SYMLINK_NOFOLLOW) != AT_SYMLINK_NOFOLLOW;
+    let file = Dentry::open(&FsContext::get_current(), path, follow_symlink)?;
+
+    // Start from an all-zero record so fields the filesystem does not fill in
+    // are never leaked to userspace uninitialised.
+    let mut stat: statx = unsafe { MaybeUninit::zeroed().assume_init() };
+
+    file.statx(&mut stat, mask)?;
+    buffer.copy(&stat)?.ok_or(EFAULT)
+}
+
+/// `mkdir(2)`: creates a directory at `pathname`. Only the permission bits of
+/// `mode` that are not masked by the process umask are kept.
+fn do_mkdir(pathname: *const u8, mode: u32) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let parsed = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let fs_context = FsContext::get_current();
+    // The umask lock is only held while the mask is read.
+    let allowed_bits = !*fs_context.umask.lock() & 0o777;
+    let effective_mode = mode & allowed_bits;
+
+    Dentry::open(&fs_context, parsed, true)?.mkdir(effective_mode)
+}
+
+/// `truncate(2)`: resolves `pathname` (following symlinks) and truncates the
+/// entry to `length` bytes.
+fn do_truncate(pathname: *const u8, length: usize) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let parsed = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let fs_context = FsContext::get_current();
+    let target = Dentry::open(&fs_context, parsed, true)?;
+    target.truncate(length)
+}
+
+/// `unlink(2)`: resolves `pathname` without following a final symlink and
+/// unlinks the resulting entry.
+fn do_unlink(pathname: *const u8) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let parsed = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let fs_context = FsContext::get_current();
+    let victim = Dentry::open(&fs_context, parsed, false)?;
+    victim.unlink()
+}
+
+/// `symlink(2)`: creates a symbolic link at `linkpath` whose contents are the
+/// bytes of `target`. `linkpath` is resolved without following a final symlink.
+fn do_symlink(target: *const u8, linkpath: *const u8) -> KResult<()> {
+    let link_contents = UserString::new(target)?;
+    let user_linkpath = UserString::new(linkpath)?;
+    let parsed = Path::new(user_linkpath.as_cstr().to_bytes())?;
+
+    let fs_context = FsContext::get_current();
+    let link = Dentry::open(&fs_context, parsed, false)?;
+    link.symlink(link_contents.as_cstr().to_bytes())
+}
+
+/// `mknod(2)`: creates a device node at `pathname` for device number `dev`.
+///
+/// `mode` keeps the permission bits allowed by the umask plus the bits of
+/// `S_IFBLK | S_IFCHR`.
+/// NOTE(review): the removed C++ implementation kept all of `S_IFMT` here;
+/// confirm that narrowing the type mask to block/char devices is intended.
+fn do_mknod(pathname: *const u8, mode: u32, dev: u32) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let parsed = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let fs_context = FsContext::get_current();
+    // The umask lock is only held while the mask is read.
+    let permission_bits = !*fs_context.umask.lock() & 0o777;
+    let type_bits = S_IFBLK | S_IFCHR;
+    let effective_mode = mode & (permission_bits | type_bits);
+
+    let node = Dentry::open(&fs_context, parsed, true)?;
+    node.mknod(effective_mode, dev)
+}
+
+/// `readlink(2)`: resolves `pathname` without following a final symlink and
+/// lets the entry write its link target into the user buffer. Returns the
+/// value reported by `Dentry::readlink`.
+fn do_readlink(pathname: *const u8, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let user_path = UserString::new(pathname)?;
+    let parsed = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let fs_context = FsContext::get_current();
+    let link = Dentry::open(&fs_context, parsed, false)?;
+
+    // The output buffer is validated only after the path has been resolved.
+    let mut out = UserBuffer::new(buffer, bufsize)?;
+    link.readlink(&mut out)
+}
+
+/// `_llseek(2)`: repositions the offset of `fd` and stores the new 64-bit
+/// offset in the user location `result`.
+///
+/// The offset is assembled from `offset_high` (upper 32 bits) and `offset_low`
+/// (lower 32 bits). `whence` must be `SEEK_SET`, `SEEK_CUR` or `SEEK_END`;
+/// anything else yields `EINVAL`. `EFAULT` is returned if the result cannot be
+/// copied out in full.
+fn do_llseek(
+    fd: u32,
+    offset_high: u32,
+    offset_low: u32,
+    result: *mut u64,
+    whence: u32,
+) -> KResult<()> {
+    let mut out = UserBuffer::new(result as *mut u8, core::mem::size_of::<u64>())?;
+    let table = FileArray::get_current();
+    let file = table.get(fd).ok_or(EBADF)?;
+
+    let offset = (u64::from(offset_high) << 32) | u64::from(offset_low);
+
+    let request = match whence {
+        SEEK_SET => SeekOption::Set(offset as usize),
+        SEEK_CUR => SeekOption::Current(offset as isize),
+        SEEK_END => SeekOption::End(offset as isize),
+        _ => return Err(EINVAL),
+    };
+    let new_offset = file.seek(request)? as u64;
+
+    out.copy(&new_offset)?.ok_or(EFAULT)
+}
+
+/// One scatter/gather entry as `do_readv` and `do_writev` copy it from user
+/// memory: a 32-bit base address followed by a 32-bit length, laid out with
+/// `#[repr(C)]`.
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+struct IoVec32 {
+    // User-space address of the segment, stored as a 32-bit value.
+    base: u32,
+    // Length of the segment in bytes.
+    len: u32,
+}
+
+/// `readv(2)`: reads from `fd` into the user segments described by the
+/// `iovcnt` entries of the `IoVec32` array at `iov_user`.
+///
+/// Entries are used up to (not including) the first one with a zero length.
+/// Reading stops after the first segment that is not filled completely. The
+/// return value is the total number of bytes read.
+fn do_readv(fd: u32, iov_user: *const u8, iovcnt: u32) -> KResult<usize> {
+    let table = FileArray::get_current();
+    let file = table.get(fd).ok_or(EBADF)?;
+
+    // Copy the iovec table out of user memory before looking at it.
+    let user_table =
+        CheckedUserPointer::new(iov_user, iovcnt as usize * core::mem::size_of::<IoVec32>())?;
+    let mut entries = vec![IoVec32::default(); iovcnt as usize];
+
+    user_table.read(
+        entries.as_mut_ptr() as *mut (),
+        entries.len() * core::mem::size_of::<IoVec32>(),
+    )?;
+
+    // Validate every segment up front so a bad entry fails the whole call
+    // before any data is read.
+    let mut segments = entries
+        .into_iter()
+        .take_while(|entry| entry.len != 0)
+        .map(|entry| UserBuffer::new(entry.base as *mut u8, entry.len as usize))
+        .collect::<KResult<Vec<_>>>()?;
+
+    let mut total = 0usize;
+    for segment in segments.iter_mut() {
+        // TODO!!!: `readv`
+        let got = file.read(segment)?;
+        total += got;
+
+        // A short (or empty) read ends the transfer.
+        if got == 0 || got != segment.total() {
+            break;
+        }
+    }
+
+    Ok(total)
+}
+
+/// `writev(2)`: writes the user segments described by the `iovcnt` entries of
+/// the `IoVec32` array at `iov_user` to `fd`.
+///
+/// Zero-length entries are skipped. Writing stops after the first segment that
+/// is not written completely. The return value is the total number of bytes
+/// written.
+fn do_writev(fd: u32, iov_user: *const u8, iovcnt: u32) -> KResult<usize> {
+    let table = FileArray::get_current();
+    let file = table.get(fd).ok_or(EBADF)?;
+
+    // Copy the iovec table out of user memory before looking at it.
+    let user_table =
+        CheckedUserPointer::new(iov_user, iovcnt as usize * core::mem::size_of::<IoVec32>())?;
+    let mut entries = vec![IoVec32::default(); iovcnt as usize];
+
+    user_table.read(
+        entries.as_mut_ptr() as *mut (),
+        entries.len() * core::mem::size_of::<IoVec32>(),
+    )?;
+
+    // Validate every non-empty segment up front so a bad entry fails the whole
+    // call before any data is written.
+    let segments = entries
+        .into_iter()
+        .filter(|entry| entry.len != 0)
+        .map(|entry| CheckedUserPointer::new(entry.base as *mut u8, entry.len as usize))
+        .collect::<KResult<Vec<_>>>()?;
+
+    let mut total = 0usize;
+    for segment in segments.iter() {
+        // TODO!!!: atomic `writev`
+        // TODO!!!!!: copy from user
+        let bytes = segment.as_slice();
+        let written = file.write(bytes)?;
+        total += written;
+
+        // A short (or empty) write ends the transfer.
+        if written == 0 || written != bytes.len() {
+            break;
+        }
+    }
+
+    Ok(total)
+}
+
+/// `access(2)`: succeeds if `pathname` resolves to a valid entry, otherwise
+/// returns `ENOENT`. The requested `_mode` is currently ignored.
+fn do_access(pathname: *const u8, _mode: u32) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let parsed = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let fs_context = FsContext::get_current();
+    let entry = Dentry::open(&fs_context, parsed, true)?;
+
+    // TODO: check permission
+    // match mode {
+    //     F_OK => todo!(),
+    //     R_OK => todo!(),
+    //     W_OK => todo!(),
+    //     X_OK => todo!(),
+    //     _ => Err(EINVAL),
+    // }
+    if entry.is_valid() {
+        Ok(())
+    } else {
+        Err(ENOENT)
+    }
+}
+
+/// `sendfile64(2)`: transfers up to `count` bytes from `in_fd` to `out_fd`
+/// and returns the value reported by the input file's `sendfile`.
+///
+/// # Errors
+/// * `EBADF` if either descriptor is not open.
+/// * `EINVAL` if `offset` is non-null: explicit offsets are not implemented
+///   yet, and a userspace-controlled argument must not panic the kernel.
+fn do_sendfile64(out_fd: u32, in_fd: u32, offset: *mut u8, count: usize) -> KResult<usize> {
+    let files = FileArray::get_current();
+    let in_file = files.get(in_fd).ok_or(EBADF)?;
+    let out_file = files.get(out_fd).ok_or(EBADF)?;
+
+    // TODO: implement sendfile64 with an explicit offset.
+    if !offset.is_null() {
+        return Err(EINVAL);
+    }
+
+    in_file.sendfile(&out_file, count)
+}
+
+/// `ioctl(2)`: forwards `request` and its argument to the file behind `fd`.
+/// Returns `EBADF` if `fd` is not open.
+fn do_ioctl(fd: u32, request: usize, arg3: usize) -> KResult<usize> {
+    let table = FileArray::get_current();
+
+    match table.get(fd) {
+        Some(file) => file.ioctl(request, arg3),
+        None => Err(EBADF),
+    }
+}
+
+/// `fcntl64(2)`: delegates the command to the current file array.
+fn do_fcntl64(fd: u32, cmd: u32, arg: usize) -> KResult<usize> {
+    let table = FileArray::get_current();
+    table.fcntl(fd, cmd, arg)
+}
+
+// C-ABI entry points (`sys_*`) generated for the `do_*` handlers in this file.
+// Each invocation lists the handler's parameters in order; `register()` below
+// binds the generated functions to syscall numbers.
+define_syscall32!(sys_read, do_read, fd: u32, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_write, do_write, fd: u32, buffer: *const u8, count: usize);
+define_syscall32!(sys_open, do_open, path: *const u8, flags: u32, mode: u32);
+define_syscall32!(sys_close, do_close, fd: u32);
+define_syscall32!(sys_dup, do_dup, fd: u32);
+define_syscall32!(sys_dup2, do_dup2, old_fd: u32, new_fd: u32);
+define_syscall32!(sys_pipe, do_pipe, pipe_fd: *mut [u32; 2]);
+define_syscall32!(sys_getdents, do_getdents, fd: u32, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_getdents64, do_getdents64, fd: u32, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_statx, do_statx, fd: u32, path: *const u8, flags: u32, mask: u32, buffer: *mut u8);
+define_syscall32!(sys_mkdir, do_mkdir, pathname: *const u8, mode: u32);
+define_syscall32!(sys_truncate, do_truncate, pathname: *const u8, length: usize);
+define_syscall32!(sys_unlink, do_unlink, pathname: *const u8);
+define_syscall32!(sys_symlink, do_symlink, target: *const u8, linkpath: *const u8);
+define_syscall32!(sys_readlink, do_readlink, pathname: *const u8, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_llseek, do_llseek, fd: u32, offset_high: u32, offset_low: u32, result: *mut u64, whence: u32);
+define_syscall32!(sys_mknod, do_mknod, pathname: *const u8, mode: u32, dev: u32);
+define_syscall32!(sys_readv, do_readv, fd: u32, iov_user: *const u8, iovcnt: u32);
+define_syscall32!(sys_writev, do_writev, fd: u32, iov_user: *const u8, iovcnt: u32);
+define_syscall32!(sys_access, do_access, pathname: *const u8, mode: u32);
+define_syscall32!(sys_sendfile64, do_sendfile64, out_fd: u32, in_fd: u32, offset: *mut u8, count: usize);
+define_syscall32!(sys_ioctl, do_ioctl, fd: u32, request: usize, arg3: usize);
+define_syscall32!(sys_fcntl64, do_fcntl64, fd: u32, cmd: u32, arg: usize);
+
+/// Registers every file-related syscall entry point of this module under its
+/// syscall number, together with a NUL-terminated name.
+///
+/// # Safety
+/// Calls the foreign function `register_syscall_handler`.
+/// NOTE(review): presumably the syscall table must already be initialised
+/// when this runs -- confirm against the C++ side.
+pub(super) unsafe fn register() {
+    register_syscall_handler(0x03, sys_read, b"read\0".as_ptr() as *const _);
+    register_syscall_handler(0x04, sys_write, b"write\0".as_ptr() as *const _);
+    register_syscall_handler(0x05, sys_open, b"open\0".as_ptr() as *const _);
+    register_syscall_handler(0x06, sys_close, b"close\0".as_ptr() as *const _);
+    register_syscall_handler(0x0a, sys_unlink, b"unlink\0".as_ptr() as *const _);
+    register_syscall_handler(0x0e, sys_mknod, b"mknod\0".as_ptr() as *const _);
+    register_syscall_handler(0x21, sys_access, b"access\0".as_ptr() as *const _);
+    register_syscall_handler(0x27, sys_mkdir, b"mkdir\0".as_ptr() as *const _);
+    register_syscall_handler(0x29, sys_dup, b"dup\0".as_ptr() as *const _);
+    register_syscall_handler(0x2a, sys_pipe, b"pipe\0".as_ptr() as *const _);
+    register_syscall_handler(0x36, sys_ioctl, b"ioctl\0".as_ptr() as *const _);
+    register_syscall_handler(0x3f, sys_dup2, b"dup2\0".as_ptr() as *const _);
+    register_syscall_handler(0x53, sys_symlink, b"symlink\0".as_ptr() as *const _);
+    register_syscall_handler(0x55, sys_readlink, b"readlink\0".as_ptr() as *const _);
+    register_syscall_handler(0x5c, sys_truncate, b"truncate\0".as_ptr() as *const _);
+    register_syscall_handler(0x8c, sys_llseek, b"llseek\0".as_ptr() as *const _);
+    register_syscall_handler(0x8d, sys_getdents, b"getdents\0".as_ptr() as *const _);
+    register_syscall_handler(0x91, sys_readv, b"readv\0".as_ptr() as *const _);
+    register_syscall_handler(0x92, sys_writev, b"writev\0".as_ptr() as *const _);
+    register_syscall_handler(0xdc, sys_getdents64, b"getdents64\0".as_ptr() as *const _);
+    register_syscall_handler(0xdd, sys_fcntl64, b"fcntl64\0".as_ptr() as *const _);
+    register_syscall_handler(0xef, sys_sendfile64, b"sendfile64\0".as_ptr() as *const _);
+    register_syscall_handler(0x17f, sys_statx, b"statx\0".as_ptr() as *const _);
+}

+ 0 - 401
src/kernel/syscall/fileops.cc

@@ -1,4 +1,3 @@
-#include <bits/ioctl.h>
 #include <errno.h>
 #include <poll.h>
 #include <sys/mman.h>
@@ -22,241 +21,6 @@ static inline void not_implemented(const char* pos, int line) {
     current_thread->send_signal(SIGSYS);
 }
 
-ssize_t kernel::syscall::do_write(int fd, const char __user* buf, size_t n) {
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    return file->write(buf, n);
-}
-
-ssize_t kernel::syscall::do_read(int fd, char __user* buf, size_t n) {
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    return file->read(buf, n);
-}
-
-int kernel::syscall::do_close(int fd) {
-    current_process->files.close(fd);
-    return 0;
-}
-
-int kernel::syscall::do_dup(int old_fd) {
-    return current_process->files.dup(old_fd);
-}
-
-int kernel::syscall::do_dup2(int old_fd, int new_fd) {
-    return current_process->files.dup(old_fd, new_fd, 0);
-}
-
-int kernel::syscall::do_pipe(int __user* pipefd) {
-    // TODO: use copy_from_user and copy_to_user
-    return current_process->files.pipe(*(int(*)[2])pipefd);
-}
-
-ssize_t kernel::syscall::do_getdents(int fd, char __user* buf, size_t cnt) {
-    auto* dir = current_process->files[fd];
-    if (!dir)
-        return -EBADF;
-
-    return dir->getdents(buf, cnt);
-}
-
-ssize_t kernel::syscall::do_getdents64(int fd, char __user* buf, size_t cnt) {
-    auto* dir = current_process->files[fd];
-    if (!dir)
-        return -EBADF;
-
-    return dir->getdents64(buf, cnt);
-}
-
-int kernel::syscall::do_open(const char __user* path, int flags, mode_t mode) {
-    mode &= ~current_process->umask;
-
-    // TODO: use copy_from_user
-    return current_process->files.open(current_process->cwd, path, flags, mode);
-}
-
-int kernel::syscall::do_symlink(const char __user* target, const char __user* linkpath) {
-    // TODO: use copy_from_user
-    auto [dent, status] = current_open(linkpath, false);
-    if (!dent)
-        return status;
-
-    if (status == 0)
-        return -EEXIST;
-
-    assert(status == -ENOENT);
-    return fs::fs_symlink(dent.get(), target);
-}
-
-int kernel::syscall::do_readlink(const char __user* pathname, char __user* buf, size_t buf_size) {
-    // TODO: use copy_from_user
-    auto [dent, status] = current_open(pathname, false);
-
-    if (!dent || status)
-        return status;
-
-    if (buf_size & (1ull << 63))
-        return -EINVAL;
-
-    // TODO: use copy_to_user
-    return fs::fs_readlink(dent.get(), buf, buf_size);
-}
-
-int kernel::syscall::do_ioctl(int fd, unsigned long request, uintptr_t arg3) {
-    // TODO: check fd type and get tty* from fd
-    //
-    //       we use a trick for now, check whether
-    //       the file that fd points to is a pipe or
-    //       not. and we suppose that stdin will be
-    //       either a tty or a pipe.
-    auto* file = current_process->files[fd];
-    // TODO!!!: check whether the file is a tty or not
-    if (!file) // || !S_ISCHR(file->mode))
-        return -ENOTTY;
-
-    switch (request) {
-        case TIOCGPGRP: {
-            auto* pgid = (pid_t __user*)arg3;
-            auto* ctrl_tty = current_process->control_tty;
-
-            if (!ctrl_tty)
-                return -ENOTTY;
-
-            // TODO: copy_to_user
-            *pgid = ctrl_tty->get_pgrp();
-            break;
-        }
-        case TIOCSPGRP: {
-            // TODO: copy_from_user
-            auto pgid = *(const pid_t __user*)arg3;
-            auto* ctrl_tty = current_process->control_tty;
-
-            if (!ctrl_tty)
-                return -ENOTTY;
-
-            ctrl_tty->set_pgrp(pgid);
-            break;
-        }
-        case TIOCGWINSZ: {
-            auto* ws = (winsize __user*)arg3;
-            // TODO: copy_to_user
-            ws->ws_col = 80;
-            ws->ws_row = 10;
-            break;
-        }
-        case TCGETS: {
-            auto* argp = (struct termios __user*)arg3;
-
-            auto* ctrl_tty = current_process->control_tty;
-            if (!ctrl_tty)
-                return -EINVAL;
-
-            // TODO: use copy_to_user
-            memcpy(argp, &ctrl_tty->termio, sizeof(ctrl_tty->termio));
-
-            break;
-        }
-        case TCSETS: {
-            auto* argp = (const struct termios __user*)arg3;
-
-            auto* ctrl_tty = current_process->control_tty;
-            if (!ctrl_tty)
-                return -EINVAL;
-
-            // TODO: use copy_from_user
-            memcpy(&ctrl_tty->termio, argp, sizeof(ctrl_tty->termio));
-
-            break;
-        }
-        default:
-            kmsgf("[error] the ioctl() function %x is not implemented", request);
-            return -EINVAL;
-    }
-
-    return 0;
-}
-
-ssize_t kernel::syscall::do_readv(int fd, const iovec* iov, int iovcnt) {
-    auto* file = current_process->files[fd];
-
-    if (!file)
-        return -EBADF;
-
-    // TODO: fix fake EOF
-    ssize_t totn = 0;
-    for (int i = 0; i < iovcnt; ++i) {
-        auto* base = (char*)iov[i].iov_base;
-        auto len = iov[i].iov_len;
-
-        if (len == 0)
-            break;
-
-        if (len < 0)
-            return -EINVAL;
-
-        if (!base)
-            return -EFAULT;
-
-        ssize_t ret = file->read(base, len);
-
-        if (ret < 0)
-            return ret;
-
-        if (ret == 0)
-            break;
-
-        totn += ret;
-
-        if ((size_t)ret != iov[i].iov_len)
-            break;
-    }
-
-    return totn;
-}
-
-// TODO: this operation SHOULD be atomic
-ssize_t kernel::syscall::do_writev(int fd, const iovec* iov, int iovcnt) {
-    auto* file = current_process->files[fd];
-
-    if (!file)
-        return -EBADF;
-
-    ssize_t totn = 0;
-    for (int i = 0; i < iovcnt; ++i) {
-        auto* base = (const char*)iov[i].iov_base;
-        auto len = iov[i].iov_len;
-
-        if (len == 0)
-            continue;
-
-        if (len < 0)
-            return -EINVAL;
-
-        if (!base)
-            return -EFAULT;
-
-        ssize_t ret = file->write(base, len);
-
-        if (ret < 0)
-            return ret;
-        totn += ret;
-    }
-
-    return totn;
-}
-
-off_t kernel::syscall::do_lseek(int fd, off_t offset, int whence) {
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    return file->seek(offset, whence);
-}
-
 uintptr_t kernel::syscall::do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags, int fd,
                                          off_t pgoffset) {
     if (addr & 0xfff)
@@ -324,147 +88,6 @@ int kernel::syscall::do_munmap(uintptr_t addr, size_t len) {
     return current_process->mms.unmap(addr, len, true);
 }
 
-ssize_t kernel::syscall::do_sendfile(int out_fd, int in_fd, off_t __user* offset, size_t count) {
-    auto* out_file = current_process->files[out_fd];
-    auto* in_file = current_process->files[in_fd];
-
-    if (!out_file || !in_file)
-        return -EBADF;
-
-    // TODO: check whether in_fd supports mmapping
-    // TODO!!!: figure a way to recover this
-    // if (!S_ISREG(in_file->mode) && !S_ISBLK(in_file->mode))
-    return -EINVAL;
-
-    if (offset) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    constexpr size_t bufsize = 4096;
-    std::vector<char> buf(bufsize);
-    size_t totn = 0;
-    while (totn < count) {
-        if (current_thread->signals.pending_signal() != 0)
-            return (totn == 0) ? -EINTR : totn;
-
-        size_t n = std::min(count - totn, bufsize);
-        ssize_t ret = in_file->read(buf.data(), n);
-        if (ret < 0)
-            return ret;
-        if (ret == 0)
-            break;
-        ret = out_file->write(buf.data(), ret);
-        if (ret < 0)
-            return ret;
-        totn += ret;
-    }
-
-    return totn;
-}
-
-int kernel::syscall::do_statx(int dirfd, const char __user* path, int flags, unsigned int mask,
-                              statx __user* statxbuf) {
-    // AT_STATX_SYNC_AS_STAT is the default value
-    if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_SYNC_AS_STAT) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    if (dirfd != AT_FDCWD) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    auto [dent, status] = current_open(path, !(flags & AT_SYMLINK_NOFOLLOW));
-    if (!dent || status)
-        return status;
-
-    // TODO: copy to user
-    return fs::fs_statx(dent.get(), statxbuf, mask);
-}
-
-int kernel::syscall::do_fcntl(int fd, int cmd, unsigned long arg) {
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    switch (cmd) {
-        case F_SETFD:
-            return current_process->files.set_flags(fd, arg);
-        case F_DUPFD:
-        case F_DUPFD_CLOEXEC: {
-            return current_process->files.dupfd(fd, arg, FD_CLOEXEC);
-        }
-        default:
-            NOT_IMPLEMENTED;
-            return -EINVAL;
-    }
-}
-
-int kernel::syscall::do_mkdir(const char __user* pathname, mode_t mode) {
-    mode &= (~current_process->umask & 0777);
-
-    // TODO: use copy_from_user
-    auto [dent, status] = current_open(pathname);
-    if (!dent)
-        return status;
-
-    if (status == 0)
-        return -EEXIST;
-
-    assert(status == -ENOENT);
-    return fs::fs_mkdir(dent.get(), mode);
-}
-
-int kernel::syscall::do_truncate(const char __user* pathname, long length) {
-    auto [dent, status] = current_open(pathname);
-    if (!dent || status)
-        return status;
-
-    return fs::fs_truncate(dent.get(), length);
-}
-
-int kernel::syscall::do_unlink(const char __user* pathname) {
-    auto [dent, status] = current_open(pathname, false);
-
-    if (!dent || status)
-        return status;
-
-    return fs::fs_unlink(dent.get());
-}
-
-int kernel::syscall::do_access(const char __user* pathname, int mode) {
-    auto [dent, status] = current_open(pathname);
-    if (!dent || status)
-        return status;
-
-    switch (mode) {
-        case F_OK:
-            return 0;
-        case R_OK:
-        case W_OK:
-        case X_OK:
-            // TODO: check privilege
-            return 0;
-        default:
-            return -EINVAL;
-    }
-}
-
-int kernel::syscall::do_mknod(const char __user* pathname, mode_t mode, dev_t dev) {
-    mode &= S_IFMT | (~current_process->umask & 0777);
-    auto [dent, status] = current_open(pathname);
-    if (!dent)
-        return status;
-
-    if (status == 0)
-        return -EEXIST;
-
-    assert(status == -ENOENT);
-    return fs::fs_mknod(dent.get(), mode, dev);
-}
-
 int kernel::syscall::do_poll(pollfd __user* fds, nfds_t nfds, int timeout) {
     if (nfds == 0)
         return 0;
@@ -505,27 +128,3 @@ int kernel::syscall::do_poll(pollfd __user* fds, nfds_t nfds, int timeout) {
 int kernel::syscall::do_socket(int domain, int type, int protocol) {
     return -EINVAL;
 }
-
-/* TODO: implement vfs_stat(stat*)
-int do_stat(const char __user* pathname, stat __user* buf)
-{
-    auto* dent = fs::vfs_open(*current_process->root,
-        types::make_path(pathname, current_process->pwd));
-
-    if (!dent)
-        return -ENOENT;
-
-    return fs::vfs_stat(dent, buf);
-}
-*/
-
-/* TODO: implement vfs_stat(stat*)
-int do_fstat(int fd, stat __user* buf)
-{
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    return fs::vfs_stat(file, buf);
-}
-*/

+ 0 - 22
src/kernel/syscall/mount.cc

@@ -1,22 +0,0 @@
-#include <errno.h>
-
-#include <types/path.hpp>
-
-#include <kernel/process.hpp>
-#include <kernel/syscall.hpp>
-#include <kernel/vfs.hpp>
-
-int kernel::syscall::do_mount(const char __user* source,
-                              const char __user* target,
-                              const char __user* fstype, unsigned long flags,
-                              const void __user* _fsdata) {
-    if (!fstype)
-        return -EINVAL;
-
-    // TODO: use copy_from_user
-    auto [mountpoint, status] = current_open(target);
-    if (!mountpoint || status)
-        return status;
-
-    return fs::fs_mount(mountpoint.get(), source, target, fstype, flags, _fsdata);
-}

+ 3 - 62
src/kernel/syscall/procops.cc

@@ -28,49 +28,6 @@ static inline void not_implemented(const char* pos, int line) {
     current_thread->send_signal(SIGSYS);
 }
 
-int kernel::syscall::do_chdir(const char __user* path) {
-    // TODO: use copy_from_user
-    auto [dir, ret] = current_open(path);
-    if (!dir || ret)
-        return ret;
-
-    if (!fs::r_dentry_is_directory(dir.get()))
-        return -ENOTDIR;
-
-    current_process->cwd = std::move(dir);
-    return 0;
-}
-
-execve_retval kernel::syscall::do_execve(const std::string& exec,
-                                         const std::vector<std::string>& args,
-                                         const std::vector<std::string>& envs) {
-    auto [dent, ret] = current_open(exec);
-
-    if (ret)
-        return {0, 0, ret};
-
-    types::elf::elf32_load_data d{
-        .exec_dent{std::move(dent)},
-        .argv{args},
-        .envp{envs},
-        .ip{},
-        .sp{},
-    };
-
-    current_process->files.onexec();
-
-    // TODO: set cs and ss to compatibility mode
-    if (int ret = types::elf::elf32_load(d); ret != 0) {
-        if (ret == types::elf::ELF_LOAD_FAIL_NORETURN)
-            kill_current(SIGSEGV);
-
-        return {0, 0, ret};
-    }
-
-    current_thread->signals.on_exec();
-    return {d.ip, d.sp, 0};
-}
-
 int kernel::syscall::do_exit(int status) {
     // TODO: terminating a thread only
     assert(current_process->thds.size() == 1);
@@ -129,12 +86,6 @@ int kernel::syscall::do_waitpid(pid_t waitpid, int __user* arg1, int options) {
     return -EINVAL;
 }
 
-int kernel::syscall::do_getcwd(char __user* buf, size_t buf_size) {
-    // TODO: use copy_to_user
-    return fs::d_path(current_process->cwd.get(),
-                      current_process->fs_context.root.get(), buf, buf_size);
-}
-
 pid_t kernel::syscall::do_setsid() {
     if (current_process->pid == current_process->pgid)
         return -EPERM;
@@ -240,13 +191,6 @@ int kernel::syscall::do_arch_prctl(int option, uintptr_t arg2) {
     return 0;
 }
 
-int kernel::syscall::do_umask(mode_t mask) {
-    mode_t old = current_process->umask;
-    current_process->umask = mask;
-
-    return old;
-}
-
 int kernel::syscall::do_kill(pid_t pid, int sig) {
     auto [pproc, found] = procs->try_find(pid);
     if (!found)
@@ -285,8 +229,7 @@ int kernel::syscall::do_tkill(pid_t tid, int sig) {
 }
 
 int kernel::syscall::do_rt_sigprocmask(int how, const sigmask_type __user* set,
-                                       sigmask_type __user* oldset,
-                                       size_t sigsetsize) {
+                                       sigmask_type __user* oldset, size_t sigsetsize) {
     if (sigsetsize != sizeof(sigmask_type))
         return -EINVAL;
 
@@ -316,13 +259,11 @@ int kernel::syscall::do_rt_sigprocmask(int how, const sigmask_type __user* set,
 }
 
 int kernel::syscall::do_rt_sigaction(int signum, const sigaction __user* act,
-                                     sigaction __user* oldact,
-                                     size_t sigsetsize) {
+                                     sigaction __user* oldact, size_t sigsetsize) {
     if (sigsetsize != sizeof(sigmask_type))
         return -EINVAL;
 
-    if (!kernel::signal_list::check_valid(signum) || signum == SIGKILL ||
-        signum == SIGSTOP)
+    if (!kernel::signal_list::check_valid(signum) || signum == SIGKILL || signum == SIGSTOP)
         return -EINVAL;
 
     // TODO: use copy_to_user

+ 171 - 0
src/kernel/syscall/procops.rs

@@ -0,0 +1,171 @@
+use core::ffi::CStr;
+
+use alloc::borrow::ToOwned;
+use alloc::ffi::CString;
+use alloc::sync::Arc;
+use bindings::types::elf::{elf32_load, elf32_load_data, ELF_LOAD_FAIL_NORETURN};
+use bindings::{
+    current_process, current_thread, interrupt_stack, kill_current, mmx_registers, EFAULT, EINVAL,
+    ENOENT, ENOTDIR, SIGSEGV,
+};
+
+use crate::io::Buffer;
+use crate::kernel::user::dataflow::UserString;
+use crate::kernel::vfs::dentry::Dentry;
+use crate::kernel::vfs::filearray::FileArray;
+use crate::path::Path;
+use crate::{kernel::user::dataflow::UserBuffer, prelude::*};
+
+use crate::kernel::vfs::{self, FsContext};
+
+use super::{define_syscall32, register_syscall_handler};
+
+/// `umask(2)`: replaces the process umask with the low nine permission bits of
+/// `mask` and returns the previous value.
+fn do_umask(mask: u32) -> KResult<u32> {
+    let fs_context = FsContext::get_current();
+    let mut current = fs_context.umask.lock();
+
+    // Swap in the new mask and hand back the old one while the lock is held.
+    let previous = core::mem::replace(&mut *current, mask & 0o777);
+    Ok(previous)
+}
+
+/// `getcwd(2)`: writes the path of the current working directory into the
+/// user buffer and returns the number of bytes written.
+fn do_getcwd(buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let fs_context = FsContext::get_current();
+    let mut out = UserBuffer::new(buffer, bufsize)?;
+
+    {
+        // The cwd lock is held only while the path is produced.
+        let cwd = fs_context.cwd.lock();
+        cwd.get_path(&fs_context, &mut out)?;
+    }
+
+    Ok(out.wrote())
+}
+
+/// `chdir(2)`: makes `path` the current working directory.
+///
+/// Returns `ENOENT` if the path does not resolve to a valid entry and
+/// `ENOTDIR` if the entry is not a directory.
+fn do_chdir(path: *const u8) -> KResult<()> {
+    let fs_context = FsContext::get_current();
+    let user_path = UserString::new(path)?;
+    let parsed = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let target = Dentry::open(&fs_context, parsed, true)?;
+
+    // Validity is checked first; the directory test only runs on valid entries.
+    let verdict: KResult<()> = if !target.is_valid() {
+        Err(ENOENT)
+    } else if !target.is_directory() {
+        Err(ENOTDIR)
+    } else {
+        Ok(())
+    };
+    verdict?;
+
+    *fs_context.cwd.lock() = target;
+    Ok(())
+}
+
+/// `mount(2)`: mounts `source` with filesystem type `fstype` on `target`.
+///
+/// Returns `ENOENT` if `target` does not resolve to a valid entry and `EINVAL`
+/// if any of the three strings is not valid UTF-8.
+fn do_mount(source: *const u8, target: *const u8, fstype: *const u8, flags: usize) -> KResult<()> {
+    let source = UserString::new(source)?;
+    let target = UserString::new(target)?;
+    let fstype = UserString::new(fstype)?;
+
+    let fs_context = FsContext::get_current();
+    let target_path = Path::new(target.as_cstr().to_bytes())?;
+    let mountpoint = Dentry::open(&fs_context, target_path, true)?;
+    if !mountpoint.is_valid() {
+        return Err(ENOENT);
+    }
+
+    // Converted in the same order as they are passed on: source, target, fstype.
+    let source_str = source.as_cstr().to_str().map_err(|_| EINVAL)?;
+    let target_str = target.as_cstr().to_str().map_err(|_| EINVAL)?;
+    let fstype_str = fstype.as_cstr().to_str().map_err(|_| EINVAL)?;
+
+    vfs::mount::do_mount(&mountpoint, source_str, target_str, fstype_str, flags as u64)
+}
+
+/// Resolves `exec` and asks the foreign ELF loader (`elf32_load`) to load it
+/// with the given argument and environment strings.
+///
+/// # Return
+/// `(ip, sp)`
+///
+/// # Errors
+/// * Errors from `Path::new` / `Dentry::open`, or `ENOENT` if the resolved
+///   entry is not valid.
+/// * If `elf32_load` returns a non-zero `n`, the error is `-n as u32`; when
+///   `n == ELF_LOAD_FAIL_NORETURN` the current process is additionally sent
+///   `SIGSEGV` via `kill_current`.
+fn do_execve(exec: &[u8], argv: &[CString], envp: &[CString]) -> KResult<(usize, usize)> {
+    let context = FsContext::get_current();
+    let dentry = Dentry::open(&context, Path::new(exec)?, true)?;
+    if !dentry.is_valid() {
+        return Err(ENOENT);
+    }
+
+    // Raw pointer tables handed to the loader; they point into `argv` / `envp`,
+    // which outlive the `elf32_load` call below.
+    let argv_array = argv.iter().map(|x| x.as_ptr()).collect::<Vec<_>>();
+    let envp_array = envp.iter().map(|x| x.as_ptr()).collect::<Vec<_>>();
+
+    let mut load_data = elf32_load_data {
+        // NOTE(review): `Arc::into_raw` gives up this reference without
+        // reclaiming it here; presumably the loader takes ownership -- confirm
+        // it is also released when loading fails.
+        exec_dent: Arc::into_raw(dentry) as *mut _,
+        argv: argv_array.as_ptr(),
+        argv_count: argv_array.len(),
+        envp: envp_array.as_ptr(),
+        envp_count: envp_array.len(),
+        ip: 0,
+        sp: 0,
+    };
+
+    // `on_exec` runs on the current process's file array before the image is
+    // loaded, so it takes effect even if `elf32_load` fails afterwards.
+    BorrowedArc::<FileArray>::from_raw(
+        unsafe { current_process.as_mut() }.unwrap().files.m_handle as *const _,
+    )
+    .on_exec();
+
+    match unsafe { elf32_load(&mut load_data) } {
+        0 => {
+            // Signal state is only reset once the new image loaded successfully.
+            unsafe { current_thread.as_mut().unwrap().signals.on_exec() };
+            Ok((load_data.ip, load_data.sp))
+        }
+        n => {
+            if n == ELF_LOAD_FAIL_NORETURN {
+                unsafe { kill_current(SIGSEGV as i32) }
+            }
+            Err(-n as u32)
+        }
+    }
+}
+
+/// Raw `execve` entry point.
+///
+/// Reads the executable path from `rbx` and the `argv` / `envp` pointer arrays
+/// from `rcx` / `rdx`, copies every string into an owned `CString`, and calls
+/// `do_execve`. On success the saved `v_rip` and `rsp` of the interrupt stack
+/// are overwritten with the new entry point and stack pointer and `0` is
+/// returned; on failure the negated error code is returned as `u32`.
+///
+/// # Safety
+/// `int_stack` must be a valid, non-null pointer (it is unwrapped). The
+/// `argv` / `envp` arrays and the strings they reference are read directly
+/// from the supplied addresses without validation (see the TODO below).
+unsafe extern "C" fn sys_execve(
+    int_stack: *mut interrupt_stack,
+    _mmxregs: *mut mmx_registers,
+) -> u32 {
+    match (|| -> KResult<()> {
+        let exec = int_stack.as_mut().unwrap().regs.rbx as *const u8;
+        let exec = UserString::new(exec)?;
+
+        // TODO!!!!!: copy from user
+        let mut argv = int_stack.as_mut().unwrap().regs.rcx as *const u32;
+        let mut envp = int_stack.as_mut().unwrap().regs.rdx as *const u32;
+
+        // Both arrays are required; a null array pointer is rejected.
+        if argv.is_null() || envp.is_null() {
+            return Err(EFAULT);
+        }
+
+        let mut argv_vec = Vec::new();
+        let mut envp_vec = Vec::new();
+
+        // Each array holds 32-bit string addresses and ends with a zero entry.
+        while argv.read() != 0 {
+            argv_vec.push(CStr::from_ptr(argv.read() as *const i8).to_owned());
+            argv = argv.add(1);
+        }
+
+        while envp.read() != 0 {
+            envp_vec.push(CStr::from_ptr(envp.read() as *const i8).to_owned());
+            envp = envp.add(1);
+        }
+
+        let (ip, sp) = do_execve(exec.as_cstr().to_bytes(), &argv_vec, &envp_vec)?;
+
+        // Redirect the saved instruction and stack pointers to the new image.
+        int_stack.as_mut().unwrap().v_rip = ip;
+        int_stack.as_mut().unwrap().rsp = sp;
+        Ok(())
+    })() {
+        Ok(_) => 0,
+        Err(err) => -(err as i32) as u32,
+    }
+}
+
+// C-ABI entry points (`sys_*`) generated for the `do_*` handlers in this file.
+// `sys_execve` is written by hand because it also rewrites the interrupt stack.
+define_syscall32!(sys_chdir, do_chdir, path: *const u8);
+define_syscall32!(sys_umask, do_umask, mask: u32);
+define_syscall32!(sys_mount, do_mount, source: *const u8, target: *const u8, fstype: *const u8, flags: usize);
+define_syscall32!(sys_getcwd, do_getcwd, buffer: *mut u8, bufsize: usize);
+
+/// Registers the process-related syscall entry points of this module under
+/// their syscall numbers, together with a NUL-terminated name.
+///
+/// # Safety
+/// Calls the foreign function `register_syscall_handler`.
+/// NOTE(review): presumably the syscall table must already be initialised
+/// when this runs -- confirm against the C++ side.
+pub(super) unsafe fn register() {
+    register_syscall_handler(0x0b, sys_execve, b"execve\0".as_ptr() as *const _);
+    register_syscall_handler(0x0c, sys_chdir, b"chdir\0".as_ptr() as *const _);
+    register_syscall_handler(0x15, sys_mount, b"mount\0".as_ptr() as *const _);
+    register_syscall_handler(0x3c, sys_umask, b"umask\0".as_ptr() as *const _);
+    register_syscall_handler(0xb7, sys_getcwd, b"getcwd\0".as_ptr() as *const _);
+}

+ 50 - 14
src/kernel/tty.cpp

@@ -1,5 +1,6 @@
 #include <algorithm>
 
+#include <bits/ioctl.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <termios.h>
@@ -12,17 +13,12 @@
 
 #define CTRL(key) ((key)-0x40)
 
-#define TERMIOS_ISET(termios, option) \
-    ((option) == ((termios).c_iflag & (option)))
-#define TERMIOS_OSET(termios, option) \
-    ((option) == ((termios).c_oflag & (option)))
-#define TERMIOS_CSET(termios, option) \
-    ((option) == ((termios).c_cflag & (option)))
-#define TERMIOS_LSET(termios, option) \
-    ((option) == ((termios).c_lflag & (option)))
+#define TERMIOS_ISET(termios, option) ((option) == ((termios).c_iflag & (option)))
+#define TERMIOS_OSET(termios, option) ((option) == ((termios).c_oflag & (option)))
+#define TERMIOS_CSET(termios, option) ((option) == ((termios).c_cflag & (option)))
+#define TERMIOS_LSET(termios, option) ((option) == ((termios).c_lflag & (option)))
 
-#define TERMIOS_TESTCC(c, termios, cc) \
-    ((c != 0xff) && (c == ((termios).c_cc[cc])))
+#define TERMIOS_TESTCC(c, termios, cc) ((c != 0xff) && (c == ((termios).c_cc[cc])))
 
 using namespace kernel::tty;
 
@@ -71,6 +67,48 @@ int tty::poll() {
     return 1;
 }
 
+// Handles terminal ioctl requests: foreground process group get/set, window size query and
+// termios get/set. `arg3` is interpreted as a user-space pointer whose pointee type depends
+// on `request`; it is accessed directly (no copy_to_user/copy_from_user yet).
+// Returns 0 on success and -EINVAL for any request that is not handled here.
+int tty::ioctl(int request, unsigned long arg3) {
+    switch (request) {
+        case TIOCGPGRP: {
+            // TODO: copy_to_user
+            auto* out_pgid = (pid_t __user*)arg3;
+            *out_pgid = this->get_pgrp();
+            return 0;
+        }
+        case TIOCSPGRP: {
+            // TODO: copy_from_user
+            this->set_pgrp(*(const pid_t __user*)arg3);
+            return 0;
+        }
+        case TIOCGWINSZ: {
+            // TODO: copy_to_user
+            // Only the column and row counts are written; they are fixed at 80x40.
+            auto* out_size = (winsize __user*)arg3;
+            out_size->ws_col = 80;
+            out_size->ws_row = 40;
+            return 0;
+        }
+        case TCGETS: {
+            // TODO: use copy_to_user
+            auto* user_termios = (struct termios __user*)arg3;
+            memcpy(user_termios, &this->termio, sizeof(this->termio));
+            return 0;
+        }
+        case TCSETS: {
+            // TODO: use copy_from_user
+            auto* user_termios = (const struct termios __user*)arg3;
+            memcpy(&this->termio, user_termios, sizeof(this->termio));
+            return 0;
+        }
+        default: {
+            kmsgf("[kernel:error] ioctl(%x, %x) is not implemented", request, arg3);
+            return -EINVAL;
+        }
+    }
+}
+
 ssize_t tty::read(char* buf, size_t buf_size, size_t n) {
     n = std::max(buf_size, n);
     size_t orig_n = n;
@@ -144,8 +182,7 @@ void tty::_real_commit_char(int c) {
         case '\n':
             buf.put(c);
 
-            if (TERMIOS_LSET(this->termio, ECHONL) ||
-                TERMIOS_LSET(this->termio, ECHO))
+            if (TERMIOS_LSET(this->termio, ECHONL) || TERMIOS_LSET(this->termio, ECHO))
                 this->_echo_char(c);
 
             // if ICANON is set, we notify all waiting processes
@@ -171,8 +208,7 @@ void tty::_real_commit_char(int c) {
 void tty::_echo_char(int c) {
     // ECHOCTL
     do {
-        if (c < 0 || c >= 32 ||
-            !TERMIOS_LSET(this->termio, ECHO | ECHOCTL | IEXTEN))
+        if (c < 0 || c >= 32 || !TERMIOS_LSET(this->termio, ECHO | ECHOCTL | IEXTEN))
             break;
 
         if (c == '\t' || c == '\n' || c == CTRL('Q') || c == CTRL('S'))

+ 1 - 0
src/kernel/user.rs

@@ -0,0 +1 @@
+pub mod dataflow;

+ 202 - 0
src/kernel/user/dataflow.rs

@@ -0,0 +1,202 @@
+use core::{arch::asm, ffi::CStr};
+
+use bindings::{EFAULT, EINVAL};
+
+use crate::{
+    io::{Buffer, FillResult},
+    prelude::*,
+};
+
+/// A raw pointer plus length whose address range was validated by `CheckedUserPointer::new`:
+/// non-null, no overflow, and ending at or below the user-space address limit.
+pub struct CheckedUserPointer {
+    // Start address of the range.
+    ptr: *const u8,
+    // Length of the range in bytes.
+    len: usize,
+}
+
+/// A `Buffer` implementation that writes into a checked user-space memory range.
+pub struct UserBuffer<'lt> {
+    // Validated destination range.
+    ptr: CheckedUserPointer,
+    // Capacity in bytes (the same value passed to `CheckedUserPointer::new`).
+    size: usize,
+    // Number of bytes written so far; the next write starts at this offset.
+    cur: usize,
+    // Carries the `'lt` lifetime parameter without storing a reference.
+    _phantom: core::marker::PhantomData<&'lt ()>,
+}
+
+/// A NUL-terminated string located in user memory, measured by `UserString::new`.
+pub struct UserString<'lt> {
+    // Validated pointer to the first byte of the string.
+    ptr: CheckedUserPointer,
+    // String length in bytes, not counting the terminating NUL.
+    len: usize,
+    // Carries the `'lt` lifetime parameter without storing a reference.
+    _phantom: core::marker::PhantomData<&'lt ()>,
+}
+
+impl CheckedUserPointer {
+    /// Validates that `ptr` is non-null and that `ptr + len` neither overflows nor exceeds
+    /// the highest user-space address.
+    ///
+    /// Only the address range is checked; whether the memory is mapped is not.
+    ///
+    /// # Errors
+    ///
+    /// Returns `EFAULT` when any of the checks above fails.
+    pub fn new(ptr: *const u8, len: usize) -> KResult<Self> {
+        const USER_MAX_ADDR: usize = 0x7ff_fff_fff_fff;
+
+        if ptr.is_null() {
+            return Err(EFAULT);
+        }
+
+        match (ptr as usize).checked_add(len) {
+            Some(end) if end <= USER_MAX_ADDR => Ok(Self { ptr, len }),
+            _ => Err(EFAULT),
+        }
+    }
+
+    /// Returns the start address cast to `*mut T`.
+    pub fn get_mut<T>(&self) -> *mut T {
+        self.ptr as *mut T
+    }
+
+    /// Returns the start address cast to `*const T`.
+    pub fn get_const<T>(&self) -> *const T {
+        self.ptr as *const T
+    }
+
+    /// Views the whole checked range as a byte slice.
+    pub fn as_slice(&self) -> &[u8] {
+        // SAFETY: `new` verified that the range lies inside the user address space.
+        // NOTE(review): that check does not prove the pages are mapped or readable.
+        unsafe { core::slice::from_raw_parts(self.ptr, self.len) }
+    }
+
+    /// Copies `total` bytes from the start of the checked range into `buffer`.
+    ///
+    /// # Errors
+    ///
+    /// `EINVAL` if `total` exceeds the checked length; `EFAULT` if the copy stopped with
+    /// bytes still left to transfer.
+    pub fn read(&self, buffer: *mut (), total: usize) -> KResult<()> {
+        if total > self.len {
+            return Err(EINVAL);
+        }
+
+        // `rep movsb` counts `rcx` down to zero, so a non-zero value afterwards means the
+        // copy did not finish. The `.fix` entry records the instruction address, its length,
+        // the resume address and an entry type; presumably the fault handler uses it to
+        // resume at label 3 -- confirm against the handler.
+        let error_bytes: usize;
+        unsafe {
+            asm!(
+                "2:",
+                "rep movsb",
+                "3:",
+                "nop",
+                ".pushsection .fix",
+                ".align 32",
+                ".quad 2b",  // instruction address
+                ".quad 3b - 2b",  // instruction length
+                ".quad 3b",  // fix jump address
+                ".quad 0x3", // type: load
+                ".popsection",
+                inout("rcx") total => error_bytes,
+                inout("rsi") self.ptr => _,
+                inout("rdi") buffer => _,
+            )
+        }
+
+        if error_bytes == 0 {
+            Ok(())
+        } else {
+            Err(EFAULT)
+        }
+    }
+}
+
+impl UserBuffer<'_> {
+    /// Creates an empty buffer over `size` bytes of user memory starting at `ptr`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from `CheckedUserPointer::new` when the range is rejected.
+    pub fn new(ptr: *mut u8, size: usize) -> KResult<Self> {
+        CheckedUserPointer::new(ptr, size).map(|ptr| Self {
+            ptr,
+            size,
+            cur: 0,
+            _phantom: core::marker::PhantomData,
+        })
+    }
+
+    /// Number of bytes that can still be written.
+    fn remaining(&self) -> usize {
+        self.size - self.cur
+    }
+}
+
+impl<'lt> Buffer for UserBuffer<'lt> {
+    /// Capacity of the buffer in bytes.
+    fn total(&self) -> usize {
+        self.size
+    }
+
+    /// Number of bytes written so far.
+    fn wrote(&self) -> usize {
+        self.cur
+    }
+
+    /// Copies as much of `data` as still fits into the user buffer.
+    ///
+    /// Returns `FillResult::Full` when no space is left, otherwise `FillResult::Done(n)` with
+    /// the number of bytes copied (input longer than the free space is truncated).
+    ///
+    /// # Errors
+    ///
+    /// `EFAULT` if the copy stopped with bytes still left to transfer.
+    fn fill(&mut self, data: &[u8]) -> KResult<FillResult> {
+        let writable = self.remaining();
+        if writable == 0 {
+            return Ok(FillResult::Full);
+        }
+
+        // Clamp the input to the free space.
+        let chunk = &data[..data.len().min(writable)];
+
+        // TODO: align to 8 bytes when doing copy for performance
+        // `rep movsb` counts `rcx` down to zero, so a non-zero value afterwards means the
+        // copy did not finish. The `.fix` entry records the instruction address, its length,
+        // the resume address and an entry type.
+        let error_bytes: usize;
+        unsafe {
+            asm!(
+                "2:",
+                "rep movsb",
+                "3:",
+                "nop",
+                ".pushsection .fix",
+                ".align 32",
+                ".quad 2b",  // instruction address
+                ".quad 3b - 2b",  // instruction length
+                ".quad 3b",  // fix jump address
+                ".quad 0x1", // type: store
+                ".popsection",
+                inout("rcx") chunk.len() => error_bytes,
+                inout("rsi") chunk.as_ptr() => _,
+                inout("rdi") self.ptr.get_mut::<u8>().add(self.cur) => _,
+            )
+        };
+
+        if error_bytes != 0 {
+            return Err(EFAULT);
+        }
+
+        let copied = chunk.len();
+        self.cur += copied;
+        Ok(FillResult::Done(copied))
+    }
+}
+
+impl<'lt> UserString<'lt> {
+    /// Measures the NUL-terminated string at `ptr`, scanning at most `MAX_LEN` bytes.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from `CheckedUserPointer::new` if `[ptr, ptr + MAX_LEN)` is
+    /// rejected. Returns `EFAULT` if the scan ends with the counter at zero, i.e. no NUL byte
+    /// was found within `MAX_LEN` bytes.
+    /// NOTE(review): a faulting load presumably also ends up here with a zero counter via the
+    /// `.fix` entry -- confirm against the fault handler.
+    pub fn new(ptr: *const u8) -> KResult<Self> {
+        const MAX_LEN: usize = 4096;
+        // TODO
+        let ptr = CheckedUserPointer::new(ptr, MAX_LEN)?;
+
+        // The loop loads one byte into `al`, stops at the first NUL, otherwise advances `rdx`
+        // and lets `loop` decrement `rcx`. Afterwards `rcx` holds `MAX_LEN - index_of_nul`.
+        //
+        // `rdx` is advanced and `al` is overwritten by the loop, so both are declared as
+        // modified: a register that is not an output must keep its value across `asm!`.
+        let result: usize;
+        unsafe {
+            asm!(
+                "2:",
+                "mov al, byte ptr [rdx]",
+                "4:",
+                "test al, al",
+                "jz 3f",
+                "add rdx, 1",
+                "loop 2b",
+                "3:",
+                "nop",
+                ".pushsection .fix",
+                ".align 32",
+                ".quad 2b",  // instruction address
+                ".quad 4b - 2b",  // instruction length
+                ".quad 3b",  // fix jump address
+                ".quad 0x2", // type: string
+                ".popsection",
+                inout("rdx") ptr.get_const::<u8>() => _,
+                inout("rcx") MAX_LEN => result,
+                out("rax") _,
+            )
+        };
+
+        if result == 0 {
+            Err(EFAULT)
+        } else {
+            Ok(Self {
+                ptr,
+                len: MAX_LEN - result,
+                _phantom: core::marker::PhantomData,
+            })
+        }
+    }
+
+    /// Borrows the string, including its terminating NUL, as a `CStr`.
+    pub fn as_cstr(&self) -> &'lt CStr {
+        // SAFETY: `new` found the first NUL byte at offset `len`, so the `len + 1` bytes
+        // starting at `ptr` end in a NUL and contain no interior NUL.
+        // NOTE(review): this relies on the user memory not changing after `new` ran.
+        unsafe {
+            CStr::from_bytes_with_nul_unchecked(core::slice::from_raw_parts(
+                self.ptr.get_const(),
+                self.len + 1,
+            ))
+        }
+    }
+}

+ 0 - 243
src/kernel/vfs.cpp

@@ -17,136 +17,6 @@
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
 
-fs::regular_file::regular_file(file_flags flags, size_t cursor, dentry_pointer dentry)
-    : file(flags), cursor(cursor), dentry(std::move(dentry)) {}
-
-ssize_t fs::regular_file::read(char* __user buf, size_t n) {
-    if (!flags.read)
-        return -EBADF;
-
-    // TODO: copy to user function !IMPORTANT
-    ssize_t n_wrote = fs_read(dentry.get(), buf, n, cursor, n);
-    if (n_wrote >= 0)
-        cursor += n_wrote;
-
-    return n_wrote;
-}
-
-ssize_t fs::regular_file::do_write(const char* __user buf, size_t n) {
-    // TODO: check privilege of user ptr
-    ssize_t n_wrote = fs_write(dentry.get(), buf, cursor, n);
-    if (n_wrote >= 0)
-        cursor += n_wrote;
-
-    return n_wrote;
-}
-
-off_t fs::regular_file::seek(off_t n, int whence) {
-    size_t ind_size = r_dentry_get_size(dentry.get());
-    size_t pos;
-    switch (whence) {
-        case SEEK_SET:
-            pos = n;
-            break;
-        case SEEK_CUR:
-            pos = cursor + n;
-            break;
-        case SEEK_END:
-            pos = ind_size + n;
-            break;
-        default:
-            return -EINVAL;
-    }
-
-    if (pos > ind_size)
-        return -EINVAL;
-
-    cursor = pos;
-
-    return cursor;
-}
-
-int fs::regular_file::getdents(char* __user buf, size_t cnt) {
-    size_t orig_cnt = cnt;
-    auto callback = readdir_callback_fn([&buf, &cnt](const char* fn, size_t fnlen, ino_t ino) {
-        size_t reclen = sizeof(fs::user_dirent) + 1 + fnlen;
-        if (cnt < reclen)
-            return -EFAULT;
-
-        auto* dirp = (fs::user_dirent*)buf;
-        dirp->d_ino = ino;
-        dirp->d_reclen = reclen;
-        // TODO: show offset
-        // dirp->d_off = 0;
-        // TODO: use copy_to_user
-        memcpy(dirp->d_name, fn, fnlen);
-        buf[reclen - 2] = 0;
-        buf[reclen - 1] = 0;
-
-        buf += reclen;
-        cnt -= reclen;
-        return 0;
-    });
-
-    int nread = fs_readdir(dentry.get(), cursor, &callback);
-
-    if (nread > 0)
-        cursor += nread;
-
-    return orig_cnt - cnt;
-}
-
-int fs::regular_file::getdents64(char* __user buf, size_t cnt) {
-    size_t orig_cnt = cnt;
-    auto callback = readdir_callback_fn([&buf, &cnt](const char* fn, size_t fnlen, ino_t ino) {
-        size_t reclen = sizeof(fs::user_dirent64) + fnlen;
-        if (cnt < reclen)
-            return -EFAULT;
-
-        auto* dirp = (fs::user_dirent64*)buf;
-        dirp->d_ino = ino;
-        dirp->d_off = 114514;
-        dirp->d_reclen = reclen;
-        dirp->d_type = 0;
-        // TODO: use copy_to_user
-        memcpy(dirp->d_name, fn, fnlen);
-        buf[reclen - 1] = 0;
-
-        buf += reclen;
-        cnt -= reclen;
-        return 0;
-    });
-
-    int nread = fs_readdir(dentry.get(), cursor, &callback);
-
-    if (nread > 0)
-        cursor += nread;
-
-    return orig_cnt - cnt;
-}
-
-fs::fifo_file::fifo_file(file_flags flags, std::shared_ptr<fs::pipe> ppipe)
-    : file(flags), ppipe(ppipe) {}
-
-ssize_t fs::fifo_file::read(char* __user buf, size_t n) {
-    if (!flags.read)
-        return -EBADF;
-
-    return ppipe->read(buf, n);
-}
-
-ssize_t fs::fifo_file::do_write(const char* __user buf, size_t n) {
-    return ppipe->write(buf, n);
-}
-
-fs::fifo_file::~fifo_file() {
-    assert(flags.read ^ flags.write);
-    if (flags.read)
-        ppipe->close_read();
-    else
-        ppipe->close_write();
-}
-
 static fs::chrdev_ops** chrdevs[256];
 
 int fs::register_char_device(dev_t node, const fs::chrdev_ops& ops) {
@@ -191,119 +61,6 @@ ssize_t fs::char_device_write(dev_t node, const char* buf, size_t n) {
     return write(buf, n);
 }
 
-fs::pipe::pipe(void) : buf{PIPE_SIZE}, flags{READABLE | WRITABLE} {}
-
-void fs::pipe::close_read(void) {
-    kernel::async::lock_guard lck{mtx};
-    flags &= (~READABLE);
-    waitlist_w.notify_all();
-}
-
-void fs::pipe::close_write(void) {
-    kernel::async::lock_guard lck{mtx};
-    flags &= (~WRITABLE);
-    waitlist_r.notify_all();
-}
-
-int fs::pipe::write(const char* buf, size_t n) {
-    // TODO: check privilege
-    // TODO: check EPIPE
-    kernel::async::lock_guard lck{mtx};
-
-    if (!is_readable()) {
-        current_thread->send_signal(SIGPIPE);
-        return -EPIPE;
-    }
-
-    if (n <= PIPE_SIZE) {
-        while (this->buf.avail() < n) {
-            bool interrupted = waitlist_w.wait(mtx);
-            if (interrupted)
-                return -EINTR;
-
-            if (!is_readable()) {
-                current_thread->send_signal(SIGPIPE);
-                return -EPIPE;
-            }
-        }
-
-        for (size_t i = 0; i < n; ++i)
-            this->buf.put(*(buf++));
-
-        waitlist_r.notify_all();
-
-        return n;
-    }
-
-    size_t orig_n = n;
-    while (true) {
-        bool write = false;
-        while (n && !this->buf.full()) {
-            --n, this->buf.put(*(buf++));
-            write = true;
-        }
-
-        if (write)
-            waitlist_r.notify_all();
-
-        if (n == 0)
-            break;
-
-        bool interrupted = waitlist_w.wait(mtx);
-        if (interrupted)
-            return -EINTR;
-
-        if (!is_readable()) {
-            current_thread->send_signal(SIGPIPE);
-            return -EPIPE;
-        }
-    }
-
-    return orig_n - n;
-}
-
-int fs::pipe::read(char* buf, size_t n) {
-    // TODO: check privilege
-    kernel::async::lock_guard lck{mtx};
-    size_t orig_n = n;
-
-    while (is_writeable() && this->buf.size() == 0) {
-        bool interrupted = waitlist_r.wait(mtx);
-        if (interrupted)
-            return -EINTR;
-    }
-
-    while (!this->buf.empty() && n)
-        --n, *(buf++) = this->buf.get();
-
-    waitlist_w.notify_all();
-    return orig_n - n;
-}
-
-extern "C" int call_callback(const fs::readdir_callback_fn* func, const char* filename,
-                             size_t fnlen, ino_t ino) {
-    return (*func)(filename, fnlen, ino);
-}
-
-extern "C" struct dentry* dentry_open(struct dentry* context_root, struct dentry* cwd,
-                                      const char* path, size_t path_length, bool follow);
-
-std::pair<fs::dentry_pointer, int> fs::open(const fs::fs_context& context,
-                                            const fs::dentry_pointer& cwd, types::string_view path,
-                                            bool follow_symlinks) {
-    auto result =
-        dentry_open(context.root.get(), cwd.get(), path.data(), path.size(), follow_symlinks);
-    auto result_int = reinterpret_cast<intptr_t>(result);
-
-    if (result_int > -128)
-        return {nullptr, result_int};
-
-    if (fs::r_dentry_is_invalid(result))
-        return {result, -ENOENT};
-
-    return {result, 0};
-}
-
 extern "C" void r_dput(struct dentry* dentry);
 extern "C" struct dentry* r_dget(struct dentry* dentry);
 

+ 178 - 53
src/kernel/vfs/dentry.rs

@@ -2,27 +2,38 @@ pub mod dcache;
 
 use core::{
     hash::{BuildHasher, BuildHasherDefault, Hasher},
-    sync::atomic::AtomicPtr,
+    ops::ControlFlow,
+    sync::atomic::{AtomicPtr, Ordering},
 };
 
 use crate::{
     hash::KernelHasher,
-    io::{ByteBuffer, RawBuffer},
+    io::{Buffer, ByteBuffer},
+    kernel::block::BlockDevice,
     path::{Path, PathComponent},
     prelude::*,
     rcu::{RCUNode, RCUPointer},
 };
 
 use alloc::sync::Arc;
-use bindings::{EINVAL, ELOOP, ENOENT, ENOTDIR};
+use bindings::{statx, EEXIST, EINVAL, EISDIR, ELOOP, ENOENT, ENOTDIR, ERANGE, O_CREAT, O_EXCL};
 
-use super::inode::Inode;
+use super::{
+    inode::{Ino, Inode, Mode, WriteOffset},
+    s_isblk, s_ischr, s_isdir, s_isreg, DevId, FsContext,
+};
 
 struct DentryData {
     inode: Arc<dyn Inode>,
     flags: u64,
 }
 
+/// # Safety
+///
+/// We wrap `Dentry` in `Arc` to ensure that the `Dentry` is not dropped while it is still in use.
+///
+/// Since a `Dentry` is created and marked as live(some data is saved to it), it keeps alive until
+/// the last reference is dropped.
 pub struct Dentry {
     // Const after insertion into dcache
     parent: Arc<Dentry>,
@@ -128,8 +139,11 @@ impl Dentry {
     fn save_data(&self, inode: Arc<dyn Inode>, flags: u64) -> KResult<()> {
         let new = DentryData { inode, flags };
 
-        let old = self.data.swap(Some(Arc::new(new)));
-        // Safety: old data is `None`, so it's safe to emit the `drop` call
+        // TODO!!!: We don't actually need to use `RCUPointer` here
+        // Safety: this function may only be called from `create`-like functions which requires the
+        // superblock's write locks to be held, so only one creation can happen at a time and we
+        // can't get a reference to the old data.
+        let old = unsafe { self.data.swap(Some(Arc::new(new))) };
         assert!(old.is_none());
 
         Ok(())
@@ -151,7 +165,7 @@ impl Dentry {
         self.data
             .load()
             .as_ref()
-            .ok_or(EINVAL)
+            .ok_or(ENOENT)
             .map(|data| data.inode.clone())
     }
 
@@ -167,11 +181,30 @@ impl Dentry {
         data.as_ref()
             .map_or(false, |data| data.flags & D_DIRECTORY != 0)
     }
-}
 
-#[repr(C)]
-pub struct FsContext {
-    root: *const Dentry,
+    /// Returns `true` when this dentry currently has data (inode and flags) attached.
+    pub fn is_valid(&self) -> bool {
+        self.data.load().is_some()
+    }
+
+    /// Applies the `O_CREAT` / `O_EXCL` open rules to this dentry.
+    ///
+    /// * Existing dentry: fails with `EEXIST` when both `O_CREAT` and `O_EXCL` are set,
+    ///   otherwise succeeds.
+    /// * Missing dentry: fails with `ENOENT` without `O_CREAT`, otherwise asks the parent's
+    ///   inode to create the file with `mode`.
+    pub fn open_check(self: &Arc<Self>, flags: u32, mode: Mode) -> KResult<()> {
+        let data = self.data.load();
+        let wants_create = flags & O_CREAT != 0;
+        let exclusive = flags & O_EXCL != 0;
+
+        match (data.is_some(), wants_create) {
+            (true, true) if exclusive => Err(EEXIST),
+            (true, _) => Ok(()),
+            (false, false) => Err(ENOENT),
+            (false, true) => {
+                let parent_inode = self.parent().get_inode()?;
+                parent_inode.creat(self, mode as u32)
+            }
+        }
+    }
 }
 
 impl Dentry {
@@ -205,7 +238,7 @@ impl Dentry {
         }
     }
 
-    fn open_recursive(
+    pub fn open_recursive(
         context: &FsContext,
         cwd: &Arc<Self>,
         path: Path,
@@ -219,13 +252,11 @@ impl Dentry {
         }
 
         let mut cwd = if path.is_absolute() {
-            Dentry::from_raw(&context.root).clone()
+            context.fsroot.clone()
         } else {
             cwd.clone()
         };
 
-        let root_dentry = Dentry::from_raw(&context.root);
-
         for item in path.iter() {
             if let PathComponent::TrailingEmpty = item {
                 if cwd.data.load().as_ref().is_none() {
@@ -238,7 +269,7 @@ impl Dentry {
             match item {
                 PathComponent::TrailingEmpty | PathComponent::Current => {} // pass
                 PathComponent::Parent => {
-                    if !cwd.hash_eq(root_dentry.as_ref()) {
+                    if !cwd.hash_eq(&context.fsroot) {
                         cwd = Self::resolve_directory(context, cwd.parent.clone(), nrecur)?;
                     }
                     continue;
@@ -269,53 +300,31 @@ impl Dentry {
 
         Ok(cwd)
     }
-}
 
-#[no_mangle]
-pub extern "C" fn dentry_open(
-    context_root: *const Dentry,
-    cwd: *const Dentry, // borrowed
-    path: *const u8,
-    path_len: usize,
-    follow: bool,
-) -> *const Dentry {
-    match (|| -> KResult<Arc<Dentry>> {
-        let path = Path::new(unsafe { core::slice::from_raw_parts(path, path_len) })?;
-
-        let context = FsContext { root: context_root };
-
-        Dentry::open_recursive(&context, Dentry::from_raw(&cwd).as_ref(), path, follow, 0)
-    })() {
-        Ok(dentry) => Arc::into_raw(dentry),
-        Err(err) => (-(err as i32) as usize) as *const Dentry,
+    /// Resolves `path` starting from the context's current working directory.
+    ///
+    /// Takes a clone of `context.cwd` under its lock and delegates to `open_recursive` with a
+    /// recursion depth of 0.
+    pub fn open(context: &FsContext, path: Path, follow_symlinks: bool) -> KResult<Arc<Self>> {
+        let cwd = context.cwd.lock().clone();
+        Dentry::open_recursive(context, &cwd, path, follow_symlinks, 0)
     }
-}
 
-#[no_mangle]
-pub extern "C" fn d_path(
-    dentry: *const Dentry,
-    root: *const Dentry,
-    mut buffer: *mut u8,
-    bufsize: usize,
-) -> i32 {
-    let mut buffer = RawBuffer::new_from_raw(&mut buffer, bufsize);
-
-    match (|| {
-        let mut dentry = Dentry::from_raw(&dentry).clone();
-        let root = Dentry::from_raw(&root);
+    pub fn get_path(
+        self: &Arc<Dentry>,
+        context: &FsContext,
+        buffer: &mut dyn Buffer,
+    ) -> KResult<()> {
+        let mut dentry = self;
+        let root = &context.fsroot;
 
         let mut path = vec![];
 
-        while Arc::as_ptr(&dentry) != Arc::as_ptr(root.as_ref()) {
+        while Arc::as_ptr(dentry) != Arc::as_ptr(root) {
             if path.len() > 32 {
                 return Err(ELOOP);
             }
 
             path.push(dentry.name().clone());
-            dentry = dentry.parent().clone();
+            dentry = dentry.parent();
         }
 
-        const ERANGE: u32 = 34;
         buffer.fill(b"/")?.ok_or(ERANGE)?;
         for item in path.iter().rev().map(|name| name.as_ref()) {
             buffer.fill(item)?.ok_or(ERANGE)?;
@@ -325,9 +334,125 @@ pub extern "C" fn d_path(
         buffer.fill(&[0])?.ok_or(ERANGE)?;
 
         Ok(())
-    })() {
-        Ok(_) => 0,
-        Err(err) => -(err as i32),
+    }
+}
+
+impl Dentry {
+    /// Reads from the file behind this dentry into `buffer`, starting at `offset`.
+    ///
+    /// Dispatches on the inode mode: regular files go to the inode, block devices to
+    /// `BlockDevice::read_some`, character devices to `char_device_read` through a bounce
+    /// buffer. Returns the number of bytes placed into `buffer`.
+    ///
+    /// # Errors
+    ///
+    /// `EISDIR` for directories, `EINVAL` for any other unsupported file type, or whatever
+    /// error the inode or device reports.
+    pub fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        let inode = self.get_inode()?;
+
+        // Safety: Changing mode alone will have no effect on the file's contents
+        match inode.mode.load(Ordering::Relaxed) {
+            mode if s_isdir(mode) => Err(EISDIR),
+            mode if s_isreg(mode) => inode.read(buffer, offset),
+            mode if s_isblk(mode) => {
+                let device = BlockDevice::get(inode.devid()?)?;
+                Ok(device.read_some(offset, buffer)?.allow_partial())
+            }
+            mode if s_ischr(mode) => {
+                let devid = inode.devid()?;
+
+                // TODO!!!!!: change this
+                let mut temporary_buffer = [0u8; 256];
+
+                // Never request more bytes than the caller's buffer can still hold: anything
+                // beyond that would be consumed from the device and then dropped.
+                let free_space = buffer.total().saturating_sub(buffer.wrote());
+                let want = temporary_buffer.len().min(free_space);
+                if want == 0 {
+                    return Ok(0);
+                }
+
+                let ret = unsafe {
+                    bindings::fs::char_device_read(
+                        devid,
+                        temporary_buffer.as_mut_ptr() as *mut _,
+                        want,
+                        want,
+                    )
+                };
+
+                if ret < 0 {
+                    Err(-ret as u32)
+                } else {
+                    Ok(buffer
+                        .fill(&temporary_buffer[..ret as usize])?
+                        .allow_partial())
+                }
+            }
+            _ => Err(EINVAL),
+        }
+    }
+
+    /// Writes `buffer` to the file behind this dentry at `offset`.
+    ///
+    /// Regular files are written through the inode and character devices through
+    /// `char_device_write`. Returns the number of bytes written.
+    ///
+    /// # Errors
+    ///
+    /// `EISDIR` for directories, `EINVAL` for block devices (not implemented yet) and other
+    /// file types, or whatever error the inode or device reports.
+    pub fn write(&self, buffer: &[u8], offset: WriteOffset) -> KResult<usize> {
+        let inode = self.get_inode()?;
+
+        // Safety: Changing mode alone will have no effect on the file's contents
+        let mode = inode.mode.load(Ordering::Relaxed);
+
+        if s_isdir(mode) {
+            return Err(EISDIR);
+        }
+        if s_isreg(mode) {
+            return inode.write(buffer, offset);
+        }
+        if s_isblk(mode) {
+            return Err(EINVAL); // TODO
+        }
+        if !s_ischr(mode) {
+            return Err(EINVAL);
+        }
+
+        let devid = inode.devid()?;
+        let written = unsafe {
+            bindings::fs::char_device_write(devid, buffer.as_ptr() as *const _, buffer.len())
+        };
+
+        // A negative return value carries the negated errno.
+        if written >= 0 {
+            Ok(written as usize)
+        } else {
+            Err(-written as u32)
+        }
+    }
+
+    /// Iterates directory entries starting at `offset` by forwarding to the inode's
+    /// `do_readdir`.
+    ///
+    /// `callback` receives each entry's name bytes and inode number and returns a
+    /// `ControlFlow` value or an error. Fails if this dentry has no inode.
+    pub fn readdir<F>(&self, offset: usize, mut callback: F) -> KResult<usize>
+    where
+        F: FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
+    {
+        self.get_inode()?.do_readdir(offset, &mut callback)
+    }
+
+    /// Creates a directory at this dentry through the parent's inode.
+    ///
+    /// # Errors
+    ///
+    /// `EEXIST` if this dentry already has an inode. If the parent has no inode, the error
+    /// from `get_inode` is returned instead of panicking, matching `open_check`.
+    pub fn mkdir(&self, mode: Mode) -> KResult<()> {
+        if self.get_inode().is_ok() {
+            return Err(EEXIST);
+        }
+
+        self.parent.get_inode()?.mkdir(self, mode)
+    }
+
+    /// Fills `stat` for the requested `mask` by forwarding to the inode's `statx`.
+    /// Fails if this dentry has no inode.
+    pub fn statx(&self, stat: &mut statx, mask: u32) -> KResult<()> {
+        self.get_inode()?.statx(stat, mask)
+    }
+
+    /// Forwards to the inode's `truncate` with the given `size`.
+    /// Fails if this dentry has no inode.
+    pub fn truncate(&self, size: usize) -> KResult<()> {
+        self.get_inode()?.truncate(size)
+    }
+
+    /// Removes this dentry's file through the parent's inode.
+    ///
+    /// # Errors
+    ///
+    /// `ENOENT` if this dentry has no inode. If the parent has no inode, the error from
+    /// `get_inode` is returned instead of panicking, matching `open_check`.
+    pub fn unlink(self: &Arc<Self>) -> KResult<()> {
+        if self.get_inode().is_err() {
+            return Err(ENOENT);
+        }
+
+        self.parent.get_inode()?.unlink(self)
+    }
+
+    /// Creates a symbolic link at this dentry whose target is `link`, through the parent's
+    /// inode.
+    ///
+    /// # Errors
+    ///
+    /// `EEXIST` if this dentry already has an inode. If the parent has no inode, the error
+    /// from `get_inode` is returned instead of panicking, matching `open_check`.
+    pub fn symlink(self: &Arc<Self>, link: &[u8]) -> KResult<()> {
+        if self.get_inode().is_ok() {
+            return Err(EEXIST);
+        }
+
+        self.parent.get_inode()?.symlink(self, link)
+    }
+
+    /// Forwards to the inode's `readlink`, which writes into `buffer`.
+    /// Fails if this dentry has no inode.
+    pub fn readlink(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        self.get_inode()?.readlink(buffer)
+    }
+
+    /// Creates a device node at this dentry with `mode` and `devid`, through the parent's
+    /// inode.
+    ///
+    /// # Errors
+    ///
+    /// `EEXIST` if this dentry already has an inode. If the parent has no inode, the error
+    /// from `get_inode` is returned instead of panicking, matching `open_check`.
+    pub fn mknod(&self, mode: Mode, devid: DevId) -> KResult<()> {
+        if self.get_inode().is_ok() {
+            return Err(EEXIST);
+        }
+
+        self.parent.get_inode()?.mknod(self, mode, devid)
     }
 }
 

+ 4 - 301
src/kernel/vfs/ffi.rs

@@ -1,122 +1,6 @@
-use crate::{
-    io::{ByteBuffer, RawBuffer},
-    kernel::block::BlockDevice,
-    prelude::*,
-};
+use crate::io::RawBuffer;
 
-use core::{
-    ffi::{c_char, c_void},
-    sync::atomic::Ordering,
-};
-
-use alloc::sync::Arc;
-use bindings::{dev_t, ino_t, mode_t, statx};
-
-use crate::io::get_str_from_cstr;
-
-use super::{
-    bindings::{fs, EINVAL, EISDIR},
-    dentry::Dentry,
-    inode::Inode,
-    s_isblk, s_ischr, s_isdir, s_isreg, DevId,
-};
-
-fn into_slice<'a>(buf: *const u8, bufsize: &usize) -> &'a [u8] {
-    unsafe { core::slice::from_raw_parts(buf, *bufsize) }
-}
-
-fn into_mut_slice<'a>(buf: *mut u8, bufsize: &usize) -> &'a mut [u8] {
-    unsafe { core::slice::from_raw_parts_mut(buf, *bufsize) }
-}
-
-macro_rules! map_err_ffi {
-    ($error:expr) => {
-        match $error {
-            Ok(_) => 0,
-            Err(e) => -(e as i32),
-        }
-    };
-}
-
-#[no_mangle]
-pub extern "C" fn fs_mount(
-    mountpoint: *const Dentry, // borrowed
-    source: *const c_char,
-    mountpoint_str: *const c_char,
-    fstype: *const c_char,
-    flags: u64,
-    _data: *const c_void,
-) -> i32 {
-    let mountpoint = Dentry::from_raw(&mountpoint);
-
-    let source = get_str_from_cstr(source).unwrap();
-    let mountpoint_str = get_str_from_cstr(mountpoint_str).unwrap();
-    let fstype = get_str_from_cstr(fstype).unwrap();
-
-    // TODO: data
-    match super::mount::do_mount(&mountpoint, source, mountpoint_str, fstype, flags, &[]) {
-        Ok(_) => 0,
-        Err(e) => -(e as i32),
-    }
-}
-
-fn do_read(file: &Arc<dyn Inode>, buffer: &mut [u8], offset: usize) -> KResult<usize> {
-    // Safety: Changing mode alone will have no effect on the file's contents
-    match file.mode.load(Ordering::Relaxed) {
-        mode if s_isdir(mode) => Err(EISDIR),
-        mode if s_isreg(mode) => {
-            let mut buffer = ByteBuffer::new(buffer);
-            file.read(&mut buffer, offset)
-        }
-        mode if s_isblk(mode) => {
-            let mut buffer = ByteBuffer::new(buffer);
-            let device = BlockDevice::get(file.devid()?)?;
-
-            Ok(device.read_some(offset, &mut buffer)?.allow_partial())
-        }
-        mode if s_ischr(mode) => {
-            let devid = file.devid()?;
-
-            let ret = unsafe {
-                fs::char_device_read(
-                    devid,
-                    buffer.as_mut_ptr() as *mut _,
-                    buffer.len(),
-                    buffer.len(),
-                )
-            };
-
-            if ret < 0 {
-                Err(-ret as u32)
-            } else {
-                Ok(ret as usize)
-            }
-        }
-        _ => Err(EINVAL),
-    }
-}
-
-fn do_write(file: &Arc<dyn Inode>, buffer: &[u8], offset: usize) -> KResult<usize> {
-    // Safety: Changing mode alone will have no effect on the file's contents
-    match file.mode.load(Ordering::Relaxed) {
-        mode if s_isdir(mode) => Err(EISDIR),
-        mode if s_isreg(mode) => file.write(buffer, offset),
-        mode if s_isblk(mode) => Err(EINVAL), // TODO
-        mode if s_ischr(mode) => {
-            let devid = file.devid()?;
-
-            let ret =
-                unsafe { fs::char_device_write(devid, buffer.as_ptr() as *const _, buffer.len()) };
-
-            if ret < 0 {
-                Err(-ret as u32)
-            } else {
-                Ok(ret as usize)
-            }
-        }
-        _ => Err(EINVAL),
-    }
-}
+use super::{dentry::Dentry, inode::Inode};
 
 #[no_mangle]
 pub extern "C" fn fs_read(
@@ -127,193 +11,12 @@ pub extern "C" fn fs_read(
     n: usize,
 ) -> isize {
     let file = Dentry::from_raw(&file);
-    let file = file.get_inode().unwrap();
 
     let bufsize = bufsize.min(n);
-    let buffer = into_mut_slice(buf, &bufsize);
+    let mut buffer = RawBuffer::new_from_raw(buf, bufsize);
 
-    match do_read(&file, buffer, offset) {
+    match file.read(&mut buffer, offset) {
         Ok(n) => n as isize,
         Err(e) => -(e as isize),
     }
 }
-
-#[no_mangle]
-pub extern "C" fn fs_write(
-    file: *const Dentry, // borrowed
-    buf: *const u8,
-    offset: usize,
-    n: usize,
-) -> isize {
-    let file = Dentry::from_raw(&file);
-    let file = file.get_inode().unwrap();
-    let buffer = into_slice(buf, &n);
-
-    match do_write(&file, buffer, offset) {
-        Ok(n) => n as isize,
-        Err(e) => -(e as isize),
-    }
-}
-
-#[no_mangle]
-pub extern "C" fn fs_statx(
-    file: *const Dentry, // borrowed
-    stat: *mut statx,
-    mask: u32,
-) -> i32 {
-    map_err_ffi!((|| {
-        let file = Dentry::from_raw(&file);
-        let file = file.get_inode().unwrap();
-        let statx = unsafe { stat.as_mut() }.unwrap();
-
-        file.statx(statx, mask)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_truncate(
-    file: *const Dentry, // borrowed
-    size: usize,
-) -> i32 {
-    map_err_ffi!((|| {
-        let file = Dentry::from_raw(&file);
-        let file = file.get_inode().unwrap();
-        file.truncate(size)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_readlink(
-    file: *const Dentry, // borrowed
-    mut buf: *mut u8,
-    bufsize: usize,
-) -> i32 {
-    let file = Dentry::from_raw(&file);
-    let file = file.get_inode().unwrap();
-    let mut buffer = RawBuffer::new_from_raw(&mut buf, bufsize);
-
-    match file.readlink(&mut buffer) {
-        Ok(n) => n as i32,
-        Err(e) => -(e as i32),
-    }
-}
-
-#[no_mangle]
-pub extern "C" fn fs_creat(
-    at: *const Dentry, // borrowed
-    mode: mode_t,
-) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.creat(&at, mode as u32)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_mkdir(
-    at: *const Dentry, // borrowed
-    mode: mode_t,
-) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.mkdir(&at, mode as u32)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_mknod(
-    at: *const Dentry, // borrowed
-    mode: mode_t,
-    dev: dev_t,
-) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.mknod(&at, mode as u32, dev as DevId)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_symlink(
-    at: *const Dentry, // borrowed
-    target: *const c_char,
-) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.symlink(&at, get_str_from_cstr(target)?.as_bytes())
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_unlink(at: *const Dentry) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.unlink(&at)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn r_dentry_get_mode(dentry: *const Dentry) -> mode_t {
-    let dentry = Dentry::from_raw(&dentry);
-    dentry.get_inode().unwrap().mode.load(Ordering::Relaxed) as _
-}
-
-#[no_mangle]
-pub extern "C" fn r_dentry_get_size(dentry: *const Dentry) -> u64 {
-    let dentry = Dentry::from_raw(&dentry);
-    dentry.get_inode().unwrap().size.load(Ordering::Relaxed) as _
-}
-
-extern "C" {
-    fn call_callback(
-        callback: *const c_void,
-        filename: *const c_char,
-        filename_len: usize,
-        ino: ino_t,
-    ) -> i32;
-}
-
-#[no_mangle]
-pub extern "C" fn fs_readdir(
-    dentry: *const Dentry, // borrowed
-    offset: usize,
-    callback: *const c_void,
-) -> i64 {
-    let dentry = Dentry::from_raw(&dentry);
-    let dir = dentry.get_inode().unwrap();
-
-    let ret = dir.readdir(offset, &|filename, ino| {
-        let ret = unsafe {
-            call_callback(
-                callback,
-                filename.as_ptr() as *const c_char,
-                filename.len(),
-                ino,
-            )
-        };
-
-        match ret {
-            0 => Ok(()),
-            _ => Err(ret as u32),
-        }
-    });
-
-    match ret {
-        Ok(n) => n as i64,
-        Err(e) => -(e as i64),
-    }
-}

+ 551 - 0
src/kernel/vfs/file.rs

@@ -0,0 +1,551 @@
+use core::{
+    ffi::{c_int, c_ulong},
+    ops::ControlFlow,
+    ptr::NonNull,
+    sync::atomic::Ordering,
+};
+
+use crate::{
+    io::{Buffer, BufferFill, RawBuffer},
+    kernel::mem::{paging::Page, phys::PhysPtr},
+    prelude::*,
+    sync::condvar::CondVar,
+};
+
+use alloc::{collections::vec_deque::VecDeque, sync::Arc};
+use bindings::{
+    current_thread, kernel::tty::tty as TTY, EBADF, EFAULT, EINTR, EINVAL, ENOTDIR, ENOTTY,
+    EOVERFLOW, EPIPE, ESPIPE, SIGPIPE, S_IFMT,
+};
+
+use super::{
+    dentry::Dentry,
+    inode::{Mode, WriteOffset},
+    s_isblk, s_isreg,
+};
+
+/// An open file backed by a [`Dentry`], together with its access mode and cursor.
+pub struct InodeFile {
+    /// Reads are rejected with `EBADF` when this is `false`.
+    read: bool,
+    /// Writes are rejected with `EBADF` when this is `false`.
+    write: bool,
+    /// When set, writes are issued with `WriteOffset::End` instead of the cursor position.
+    append: bool,
+    /// Only the mode bits that cannot change are cached here to speed up file operations.
+    /// Specifically, the `S_IFMT` masked bits.
+    mode: Mode,
+    /// Current position; advanced by reads, writes and directory listings.
+    cursor: Mutex<usize>,
+    /// The directory entry all operations are forwarded to.
+    dentry: Arc<Dentry>,
+}
+
+/// Mutable pipe state; held behind the spin lock in [`Pipe`].
+pub struct PipeInner {
+    /// Bytes that have been written but not yet read.
+    buffer: VecDeque<u8>,
+    /// Set by `Pipe::close_read`, which runs when the read end is dropped.
+    read_closed: bool,
+    /// Set by `Pipe::close_write`, which runs when the write end is dropped.
+    write_closed: bool,
+}
+
+/// A byte pipe shared between a [`PipeReadEnd`] and a [`PipeWriteEnd`].
+pub struct Pipe {
+    inner: Spin<PipeInner>,
+    /// Readers wait here; notified after a write and when the write end closes.
+    cv_read: CondVar,
+    /// Writers wait here; notified after a read and when the read end closes.
+    cv_write: CondVar,
+}
+
+/// Reading side of a [`Pipe`]; dropping it calls `Pipe::close_read`.
+pub struct PipeReadEnd {
+    pipe: Arc<Pipe>,
+}
+
+/// Writing side of a [`Pipe`]; dropping it calls `Pipe::close_write`.
+pub struct PipeWriteEnd {
+    pipe: Arc<Pipe>,
+}
+
+/// A file that forwards reads, writes and ioctls to a `TTY` object from the bindings.
+pub struct TTYFile {
+    /// Non-null pointer to the TTY; checked for null in `TTYFile::new`.
+    tty: NonNull<TTY>,
+}
+
+/// An open file: one variant per kind of backing object.
+pub enum File {
+    /// Backed by a dentry/inode.
+    Inode(InodeFile),
+    /// Read end of a pipe.
+    PipeRead(PipeReadEnd),
+    /// Write end of a pipe.
+    PipeWrite(PipeWriteEnd),
+    /// Backed by a TTY.
+    TTY(TTYFile),
+}
+
+/// Target of a seek request.
+pub enum SeekOption {
+    /// Absolute position.
+    Set(usize),
+    /// Signed offset from the current cursor.
+    Current(isize),
+    /// Signed offset from the inode's current size.
+    End(isize),
+}
+
+/// Dropping the read end marks the pipe read-closed and wakes writers waiting on it.
+impl Drop for PipeReadEnd {
+    fn drop(&mut self) {
+        self.pipe.close_read();
+    }
+}
+
+/// Dropping the write end marks the pipe write-closed and wakes readers waiting on it.
+impl Drop for PipeWriteEnd {
+    fn drop(&mut self) {
+        self.pipe.close_write();
+    }
+}
+
+/// Raises `SIGPIPE` on the thread that is currently running.
+fn send_sigpipe_to_current() {
+    // SAFETY: `current_thread` is always valid, and `signal_list` is `Sync`.
+    unsafe {
+        let thread = current_thread.as_mut().unwrap();
+        thread.send_signal(SIGPIPE);
+    }
+}
+
+impl Pipe {
+    /// Upper bound on buffered bytes; writes of at most this size are atomic.
+    const PIPE_SIZE: usize = 4096;
+
+    /// Creates an empty pipe with both ends marked open.
+    pub fn new() -> Arc<Self> {
+        Arc::new(Self {
+            inner: Spin::new(PipeInner {
+                buffer: VecDeque::with_capacity(Self::PIPE_SIZE),
+                read_closed: false,
+                write_closed: false,
+            }),
+            cv_read: CondVar::new(),
+            cv_write: CondVar::new(),
+        })
+    }
+
+    /// Wraps this pipe in its two `File` ends.
+    ///
+    /// # Return
+    /// `(read_end, write_end)`
+    pub fn split(self: &Arc<Self>) -> (Arc<File>, Arc<File>) {
+        (
+            Arc::new(File::PipeRead(PipeReadEnd { pipe: self.clone() })),
+            Arc::new(File::PipeWrite(PipeWriteEnd { pipe: self.clone() })),
+        )
+    }
+
+    /// Marks the read side closed (idempotent) and wakes every blocked writer.
+    fn close_read(&self) {
+        let mut inner = self.inner.lock();
+        if inner.read_closed {
+            return;
+        }
+
+        inner.read_closed = true;
+        self.cv_write.notify_all();
+    }
+
+    /// Marks the write side closed (idempotent) and wakes every blocked reader.
+    fn close_write(&self) {
+        let mut inner = self.inner.lock();
+        if inner.write_closed {
+            return;
+        }
+
+        inner.write_closed = true;
+        self.cv_read.notify_all();
+    }
+
+    /// Moves buffered bytes into `buffer`, waiting while the pipe is empty and the write end
+    /// is still open.
+    ///
+    /// Returns the number of bytes transferred, or `EINTR` if the wait was interrupted.
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        let mut inner = self.inner.lock();
+
+        while !inner.write_closed && inner.buffer.is_empty() {
+            let interrupted = self.cv_read.wait(&mut inner, true);
+            if interrupted {
+                return Err(EINTR);
+            }
+        }
+
+        // The ring buffer may be split in two; copy both halves in order.
+        let (data1, data2) = inner.buffer.as_slices();
+        let nread = buffer.fill(data1)?.allow_partial() + buffer.fill(data2)?.allow_partial();
+        inner.buffer.drain(..nread);
+
+        self.cv_write.notify_all();
+        Ok(nread)
+    }
+
+    /// Writes all of `data` in one piece, waiting until the whole slice fits.
+    ///
+    /// Sends `SIGPIPE` and returns `EPIPE` if the read end is closed; returns `EINTR` if the
+    /// wait was interrupted.
+    fn write_atomic(&self, data: &[u8]) -> KResult<usize> {
+        let mut inner = self.inner.lock();
+
+        if inner.read_closed {
+            send_sigpipe_to_current();
+            return Err(EPIPE);
+        }
+
+        while inner.buffer.len() + data.len() > Self::PIPE_SIZE {
+            let interrupted = self.cv_write.wait(&mut inner, true);
+            if interrupted {
+                return Err(EINTR);
+            }
+
+            if inner.read_closed {
+                send_sigpipe_to_current();
+                return Err(EPIPE);
+            }
+        }
+
+        inner.buffer.extend(data);
+
+        self.cv_read.notify_all();
+        Ok(data.len())
+    }
+
+    /// Writes `data` piecewise, filling whatever space is free and waiting for more.
+    ///
+    /// Returns the number of bytes written. An interrupted wait yields the partial count if
+    /// anything was written, otherwise `EINTR`. A closed read end yields `SIGPIPE` + `EPIPE`.
+    fn write_non_atomic(&self, data: &[u8]) -> KResult<usize> {
+        let mut inner = self.inner.lock();
+
+        if inner.read_closed {
+            send_sigpipe_to_current();
+            return Err(EPIPE);
+        }
+
+        let mut remaining = data;
+        while !remaining.is_empty() {
+            // Limit by `PIPE_SIZE`, exactly like `write_atomic`. `VecDeque::capacity()` is
+            // only guaranteed to be *at least* the requested size, so using it here would
+            // let the two write paths disagree on how full the pipe may get.
+            let space = Self::PIPE_SIZE.saturating_sub(inner.buffer.len());
+
+            if space != 0 {
+                let to_write = remaining.len().min(space);
+                inner.buffer.extend(&remaining[..to_write]);
+                remaining = &remaining[to_write..];
+
+                self.cv_read.notify_all();
+            }
+
+            if remaining.is_empty() {
+                break;
+            }
+
+            let interrupted = self.cv_write.wait(&mut inner, true);
+            if interrupted {
+                // Report a short write if some bytes already went out.
+                if data.len() != remaining.len() {
+                    break;
+                }
+                return Err(EINTR);
+            }
+
+            if inner.read_closed {
+                send_sigpipe_to_current();
+                return Err(EPIPE);
+            }
+        }
+
+        Ok(data.len() - remaining.len())
+    }
+
+    /// Writes `data` to the pipe, choosing the atomic or piecewise path by size.
+    fn write(&self, data: &[u8]) -> KResult<usize> {
+        // Writes that are no larger than the pipe size are atomic.
+        if data.len() <= Self::PIPE_SIZE {
+            self.write_atomic(data)
+        } else {
+            self.write_non_atomic(data)
+        }
+    }
+}
+
+/// Fixed-size header of one record emitted by `InodeFile::getdents64`; the file name and a
+/// terminating `'\0'` are written right after it.
+#[derive(Copy, Clone, Debug)]
+#[repr(C, packed)]
+struct UserDirent64 {
+    /// Inode number
+    d_ino: u64,
+    /// Implementation defined. We ignore it
+    d_off: u64,
+    /// Length of this record
+    d_reclen: u16,
+    /// File type. Set to 0
+    d_type: u8,
+    /// Filename with a padding '\0'
+    d_name: [u8; 0],
+}
+
+/// Fixed-size header of one record emitted by `InodeFile::getdents`; the file name, a
+/// terminating `'\0'` and the file type byte are written right after it.
+///
+/// File type is at offset `d_reclen - 1`. Set it to 0
+#[derive(Copy, Clone, Debug)]
+#[repr(C, packed)]
+struct UserDirent {
+    /// Inode number
+    d_ino: u32,
+    /// Implementation defined. We ignore it
+    d_off: u32,
+    /// Length of this record
+    d_reclen: u16,
+    /// Filename with a padding '\0'
+    d_name: [u8; 0],
+}
+
+/// Returns `true` when `pending_signal()` of the current thread reports a non-zero value.
+fn has_pending_signal() -> bool {
+    // NOTE(review): relies on `current_thread` being non-null whenever this runs; confirm.
+    unsafe { current_thread.as_mut().unwrap().signals.pending_signal() != 0 }
+}
+
+impl InodeFile {
+    /// Wraps `dentry` in a `File::Inode` with the cursor at 0.
+    ///
+    /// `rwa` is `(read, write, append)`. Panics if `dentry` has no inode.
+    pub fn new(dentry: Arc<Dentry>, rwa: (bool, bool, bool)) -> Arc<File> {
+        // SAFETY: `dentry` used to create `InodeFile` is valid.
+        // SAFETY: `mode` should never change with respect to the `S_IFMT` fields.
+        let cached_mode = dentry
+            .get_inode()
+            .expect("`dentry` is invalid")
+            .mode
+            .load(Ordering::Relaxed)
+            & S_IFMT;
+
+        Arc::new(File::Inode(InodeFile {
+            dentry,
+            read: rwa.0,
+            write: rwa.1,
+            append: rwa.2,
+            mode: cached_mode,
+            cursor: Mutex::new(0),
+        }))
+    }
+
+    /// Moves the cursor and returns its new value.
+    ///
+    /// Returns `EOVERFLOW` when a relative offset under- or overflows `usize`.
+    fn seek(&self, option: SeekOption) -> KResult<usize> {
+        let mut cursor = self.cursor.lock();
+
+        let new_cursor = match option {
+            SeekOption::Current(off) => cursor.checked_add_signed(off).ok_or(EOVERFLOW)?,
+            SeekOption::Set(n) => n,
+            SeekOption::End(off) => {
+                let inode = self.dentry.get_inode()?;
+                let size = inode.size.load(Ordering::Relaxed) as usize;
+                size.checked_add_signed(off).ok_or(EOVERFLOW)?
+            }
+        };
+
+        *cursor = new_cursor;
+        Ok(new_cursor)
+    }
+
+    /// Writes `buffer` through the dentry and returns the number of bytes written.
+    ///
+    /// Returns `EBADF` when the file was not opened for writing.
+    fn write(&self, buffer: &[u8]) -> KResult<usize> {
+        if !self.write {
+            return Err(EBADF);
+        }
+
+        let mut cursor = self.cursor.lock();
+
+        // TODO!!!: use `UserBuffer`
+        if self.append {
+            // The cursor is handed to the callee here rather than advanced locally.
+            // NOTE(review): presumably `WriteOffset::End` stores the new end position in it.
+            let nwrote = self
+                .dentry
+                .write(buffer, WriteOffset::End(cursor.as_mut()))?;
+
+            Ok(nwrote)
+        } else {
+            let nwrote = self.dentry.write(buffer, WriteOffset::Position(*cursor))?;
+
+            *cursor += nwrote;
+            Ok(nwrote)
+        }
+    }
+
+    /// Reads from the cursor position into `buffer` and advances the cursor by the amount read.
+    ///
+    /// Returns `EBADF` when the file was not opened for reading.
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        if !self.read {
+            return Err(EBADF);
+        }
+
+        let mut cursor = self.cursor.lock();
+
+        let nread = self.dentry.read(buffer, *cursor)?;
+
+        *cursor += nread;
+        Ok(nread)
+    }
+
+    /// Emits `UserDirent64` records for entries starting at the cursor until `buffer` cannot
+    /// hold the next record, then advances the cursor by the value `readdir` returns.
+    fn getdents64(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        let mut cursor = self.cursor.lock();
+
+        let nread = self.dentry.readdir(*cursor, |filename, ino| {
+            // Filename length + 1 for padding '\0'
+            let real_record_len = core::mem::size_of::<UserDirent64>() + filename.len() + 1;
+
+            // Stop the listing instead of emitting a truncated record.
+            if buffer.available() < real_record_len {
+                return Ok(ControlFlow::Break(()));
+            }
+
+            let record = UserDirent64 {
+                d_ino: ino,
+                d_off: 0,
+                d_reclen: real_record_len as u16,
+                d_type: 0,
+                d_name: [0; 0],
+            };
+
+            buffer.copy(&record)?.ok_or(EFAULT)?;
+            buffer.fill(filename)?.ok_or(EFAULT)?;
+            buffer.fill(&[0])?.ok_or(EFAULT)?;
+
+            Ok(ControlFlow::Continue(()))
+        })?;
+
+        *cursor += nread;
+        Ok(())
+    }
+
+    /// Same as `getdents64`, but emits `UserDirent` records: the inode number is cast to
+    /// `u32` and a zero file-type byte follows the name's `'\0'`.
+    fn getdents(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        let mut cursor = self.cursor.lock();
+
+        let nread = self.dentry.readdir(*cursor, |filename, ino| {
+            // + 1 for filename length padding '\0', + 1 for d_type.
+            let real_record_len = core::mem::size_of::<UserDirent>() + filename.len() + 2;
+
+            // Stop the listing instead of emitting a truncated record.
+            if buffer.available() < real_record_len {
+                return Ok(ControlFlow::Break(()));
+            }
+
+            let record = UserDirent {
+                d_ino: ino as u32,
+                d_off: 0,
+                d_reclen: real_record_len as u16,
+                d_name: [0; 0],
+            };
+
+            buffer.copy(&record)?.ok_or(EFAULT)?;
+            buffer.fill(filename)?.ok_or(EFAULT)?;
+            buffer.fill(&[0, 0])?.ok_or(EFAULT)?;
+
+            Ok(ControlFlow::Continue(()))
+        })?;
+
+        *cursor += nread;
+        Ok(())
+    }
+}
+
+impl TTYFile {
+    /// Wraps a raw TTY pointer in a `File::TTY`. Panics if `tty` is null.
+    pub fn new(tty: *mut TTY) -> Arc<File> {
+        Arc::new(File::TTY(TTYFile {
+            tty: NonNull::new(tty).expect("`tty` is null"),
+        }))
+    }
+
+    /// Reads from the TTY into a temporary kernel buffer and copies the result to `buffer`.
+    ///
+    /// A negative return from the TTY is negated and reported as the error code.
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        // SAFETY: `tty` should always be valid.
+        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
+
+        // Bounce buffer sized from `buffer.total()`.
+        let mut c_buffer: Vec<u8> = vec![0; buffer.total()];
+
+        // SAFETY: `tty` points to a valid `TTY` instance.
+        let nread = unsafe {
+            tty.read(
+                c_buffer.as_mut_ptr() as *mut _,
+                c_buffer.len(),
+                c_buffer.len(),
+            )
+        };
+
+        match nread {
+            n if n < 0 => Err((-n) as u32),
+            0 => Ok(0),
+            n => Ok(buffer.fill(&c_buffer[..n as usize])?.allow_partial()),
+        }
+    }
+
+    /// Sends every byte of `buffer` to the TTY via `show_char` and reports the full length.
+    fn write(&self, buffer: &[u8]) -> KResult<usize> {
+        // SAFETY: `tty` should always be valid.
+        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
+
+        for &ch in buffer.iter() {
+            // SAFETY: `tty` points to a valid `TTY` instance.
+            unsafe { tty.show_char(ch as i32) };
+        }
+
+        Ok(buffer.len())
+    }
+
+    /// Forwards an ioctl to the TTY; `0` maps to `Ok(0)`, anything else to an error.
+    fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
+        // SAFETY: `tty` should always be valid.
+        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
+
+        // SAFETY: `tty` points to a valid `TTY` instance.
+        let result = unsafe { tty.ioctl(request as c_int, arg3 as c_ulong) };
+
+        match result {
+            0 => Ok(0),
+            // NOTE(review): assumes non-zero results are negative errno values; confirm.
+            _ => Err((-result) as u32),
+        }
+    }
+}
+
+impl File {
+    /// Reads into `buffer`; returns `EBADF` for files that cannot be read (pipe write ends).
+    pub fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        match self {
+            File::Inode(inode) => inode.read(buffer),
+            File::PipeRead(pipe) => pipe.pipe.read(buffer),
+            File::TTY(tty) => tty.read(buffer),
+            _ => Err(EBADF),
+        }
+    }
+
+    // TODO
+    // /// Read from the file into the given buffers.
+    // ///
+    // /// Reads are atomic, not intermingled with other reads or writes.
+    // pub fn readv<'r, 'i, I: Iterator<Item = &'i mut dyn Buffer>>(
+    //     &'r self,
+    //     buffers: I,
+    // ) -> KResult<usize> {
+    //     match self {
+    //         File::Inode(inode) => inode.readv(buffers),
+    //         File::PipeRead(pipe) => pipe.pipe.readv(buffers),
+    //         _ => Err(EBADF),
+    //     }
+    // }
+
+    /// Writes `buffer`; returns `EBADF` for files that cannot be written (pipe read ends).
+    pub fn write(&self, buffer: &[u8]) -> KResult<usize> {
+        match self {
+            File::Inode(inode) => inode.write(buffer),
+            File::PipeWrite(pipe) => pipe.pipe.write(buffer),
+            File::TTY(tty) => tty.write(buffer),
+            _ => Err(EBADF),
+        }
+    }
+
+    /// Repositions the cursor; pipes and TTYs are not seekable and return `ESPIPE`.
+    pub fn seek(&self, option: SeekOption) -> KResult<usize> {
+        match self {
+            File::Inode(inode) => inode.seek(option),
+            File::PipeRead(_) | File::PipeWrite(_) | File::TTY(_) => Err(ESPIPE),
+        }
+    }
+
+    /// Lists directory entries in the 32-bit record format; non-inode files return `ENOTDIR`.
+    pub fn getdents(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        match self {
+            File::Inode(inode) => inode.getdents(buffer),
+            _ => Err(ENOTDIR),
+        }
+    }
+
+    /// Lists directory entries in the 64-bit record format; non-inode files return `ENOTDIR`.
+    pub fn getdents64(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        match self {
+            File::Inode(inode) => inode.getdents64(buffer),
+            _ => Err(ENOTDIR),
+        }
+    }
+
+    /// Copies up to `count` bytes from this file to `dest_file` through a one-page buffer.
+    ///
+    /// The source must be an inode file whose cached mode is a regular file or block device,
+    /// otherwise `EINVAL` is returned. Returns the number of bytes written to `dest_file`.
+    /// A pending signal stops the copy: `EINTR` if nothing was copied, else the partial count.
+    pub fn sendfile(&self, dest_file: &Self, count: usize) -> KResult<usize> {
+        // Validate the source first so the error path never allocates a page.
+        match self {
+            File::Inode(file) if s_isblk(file.mode) || s_isreg(file.mode) => (),
+            _ => return Err(EINVAL),
+        }
+
+        let buffer_page = Page::alloc_one();
+
+        // TODO!!!: zero copy implementation with mmap
+        let mut tot = 0usize;
+        while tot < count {
+            if has_pending_signal() {
+                if tot == 0 {
+                    return Err(EINTR);
+                } else {
+                    return Ok(tot);
+                }
+            }
+
+            let batch_size = usize::min(count - tot, buffer_page.len());
+            let slice = buffer_page.as_cached().as_mut_slice::<u8>(batch_size);
+            let mut buffer = RawBuffer::new_from_slice(slice);
+
+            let nread = self.read(&mut buffer)?;
+
+            // End of the source.
+            if nread == 0 {
+                break;
+            }
+
+            // NOTE(review): a short write by `dest_file` drops the unwritten tail of this
+            // batch, because the source cursor has already advanced by `nread`.
+            tot += dest_file.write(&slice[..nread])?;
+        }
+
+        Ok(tot)
+    }
+
+    /// Forwards an ioctl to the TTY; every other file kind returns `ENOTTY`.
+    pub fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
+        match self {
+            File::TTY(tty) => tty.ioctl(request, arg3),
+            _ => Err(ENOTTY),
+        }
+    }
+}

+ 0 - 290
src/kernel/vfs/filearr.cc

@@ -1,290 +0,0 @@
-#include <set>
-
-#include <assert.h>
-
-#include <types/path.hpp>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/filearr.hpp>
-
-using namespace fs;
-
-using kernel::async::mutex, kernel::async::lock_guard;
-
-struct fditem {
-    int fd;
-    int flags;
-    std::shared_ptr<file> pfile;
-};
-
-struct fditem_comparator {
-    constexpr bool operator()(const fditem& lhs, const fditem& rhs) const {
-        return lhs.fd < rhs.fd;
-    }
-
-    constexpr bool operator()(int fd, const fditem& rhs) const { return fd < rhs.fd; }
-
-    constexpr bool operator()(const fditem& lhs, int fd) const { return lhs.fd < fd; }
-};
-
-// ALL METHODS SHOULD BE CALLED WITH LOCK HELD
-struct filearray::impl {
-    mutex mtx;
-
-    const fs_context* context;
-    std::set<fditem, fditem_comparator> arr;
-    int min_avail{};
-
-    int allocate_fd(int from);
-    void release_fd(int fd);
-    int next_fd();
-
-    int do_dup(const fditem& oldfile, int new_fd, int flags);
-    int place_new_file(std::shared_ptr<file> pfile, int flags);
-};
-
-int filearray::impl::allocate_fd(int from) {
-    if (from < min_avail)
-        from = min_avail;
-
-    if (from == min_avail) {
-        int nextfd = min_avail + 1;
-        auto iter = arr.find(nextfd);
-        while (iter && nextfd == iter->fd)
-            ++nextfd, ++iter;
-
-        int retval = min_avail;
-        min_avail = nextfd;
-        return retval;
-    }
-
-    int fd = from;
-    auto iter = arr.find(fd);
-    while (iter && fd == iter->fd)
-        ++fd, ++iter;
-
-    return fd;
-}
-
-void filearray::impl::release_fd(int fd) {
-    if (fd < min_avail)
-        min_avail = fd;
-}
-
-int filearray::impl::next_fd() {
-    return allocate_fd(min_avail);
-}
-
-int filearray::impl::do_dup(const fditem& oldfile, int new_fd, int flags) {
-    bool inserted;
-    std::tie(std::ignore, inserted) = arr.emplace(new_fd, flags, oldfile.pfile);
-    assert(inserted);
-
-    return new_fd;
-}
-
-int filearray::impl::place_new_file(std::shared_ptr<file> pfile, int flags) {
-    int fd = next_fd();
-
-    bool inserted;
-    std::tie(std::ignore, inserted) = arr.emplace(fd, std::move(flags), pfile);
-    assert(inserted);
-
-    return fd;
-}
-
-int filearray::dup(int old_fd) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(old_fd);
-    if (!iter)
-        return -EBADF;
-
-    int fd = pimpl->next_fd();
-    return pimpl->do_dup(*iter, fd, 0);
-}
-
-int filearray::dup(int old_fd, int new_fd, int flags) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter_old = pimpl->arr.find(old_fd);
-    if (!iter_old)
-        return -EBADF;
-
-    auto iter_new = pimpl->arr.find(new_fd);
-    if (iter_new) {
-        iter_new->pfile = iter_old->pfile;
-        iter_new->flags = flags;
-
-        return new_fd;
-    }
-
-    int fd = pimpl->allocate_fd(new_fd);
-    assert(fd == new_fd);
-    return pimpl->do_dup(*iter_old, fd, flags);
-}
-
-int filearray::dupfd(int fd, int min_fd, int flags) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(fd);
-    if (!iter)
-        return -EBADF;
-
-    int new_fd = pimpl->allocate_fd(min_fd);
-    return pimpl->do_dup(*iter, new_fd, flags);
-}
-
-int filearray::set_flags(int fd, int flags) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(fd);
-    if (!iter)
-        return -EBADF;
-
-    iter->flags |= flags;
-    return 0;
-}
-
-int filearray::close(int fd) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(fd);
-    if (!iter)
-        return -EBADF;
-
-    pimpl->release_fd(fd);
-    pimpl->arr.erase(iter);
-
-    return 0;
-}
-
-static inline std::pair<dentry_pointer, int> _open_file(const fs_context& context,
-                                                        const dentry_pointer& cwd,
-                                                        types::string_view filepath, int flags,
-                                                        mode_t mode) {
-    auto [dent, ret] = fs::open(context, cwd, filepath);
-    if (!dent)
-        return {nullptr, ret};
-
-    if (!(r_dentry_is_invalid(dent.get()))) {
-        if ((flags & O_CREAT) && (flags & O_EXCL))
-            return {nullptr, -EEXIST};
-        return {std::move(dent), 0};
-    }
-
-    if (!(flags & O_CREAT))
-        return {nullptr, -ENOENT};
-
-    // create file
-    if (int ret = fs_creat(dent.get(), mode); ret != 0)
-        return {nullptr, ret};
-
-    return {std::move(dent), 0};
-}
-
-// TODO: file opening permissions check
-int filearray::open(const dentry_pointer& cwd, types::string_view filepath, int flags,
-                    mode_t mode) {
-    auto [dent, ret] = _open_file(*pimpl->context, cwd, filepath, flags, mode);
-
-    assert(dent || ret != 0);
-    if (ret != 0)
-        return ret;
-
-    auto filemode = r_dentry_get_mode(dent.get());
-
-    int fdflag = (flags & O_CLOEXEC) ? FD_CLOEXEC : 0;
-
-    file::file_flags fflags;
-    fflags.read = !(flags & O_WRONLY);
-    fflags.write = (flags & (O_WRONLY | O_RDWR));
-    fflags.append = S_ISREG(filemode) && (flags & O_APPEND);
-
-    // check whether dentry is a file if O_DIRECTORY is set
-    if (flags & O_DIRECTORY) {
-        if (!S_ISDIR(filemode))
-            return -ENOTDIR;
-    } else {
-        if (S_ISDIR(filemode) && fflags.write)
-            return -EISDIR;
-    }
-
-    // truncate file
-    if (flags & O_TRUNC) {
-        if (fflags.write && S_ISREG(filemode)) {
-            auto ret = fs_truncate(dent.get(), 0);
-            if (ret != 0)
-                return ret;
-        }
-    }
-
-    lock_guard lck{pimpl->mtx};
-
-    return pimpl->place_new_file(std::make_shared<regular_file>(fflags, 0, d_get(dent)), fdflag);
-}
-
-int filearray::pipe(int (&pipefd)[2]) {
-    lock_guard lck{pimpl->mtx};
-
-    if (1) {
-        std::shared_ptr<fs::pipe> ppipe{new fs::pipe};
-
-        pipefd[0] =
-            pimpl->place_new_file(std::make_shared<fifo_file>(file::file_flags{1, 0, 0}, ppipe), 0);
-
-        pipefd[1] =
-            pimpl->place_new_file(std::make_shared<fifo_file>(file::file_flags{0, 1, 0}, ppipe), 0);
-    }
-
-    return 0;
-}
-
-filearray::filearray(std::shared_ptr<impl> ptr) : pimpl{ptr} {}
-
-filearray::filearray(const fs_context* context) : filearray{std::make_shared<impl>()} {
-    pimpl->context = context;
-}
-
-filearray filearray::copy() const {
-    lock_guard lck{pimpl->mtx};
-    filearray ret{pimpl->context};
-
-    ret.pimpl->min_avail = pimpl->min_avail;
-    ret.pimpl->arr = pimpl->arr;
-
-    return ret;
-}
-
-filearray filearray::share() const {
-    return filearray{pimpl};
-}
-
-void filearray::clear() {
-    pimpl.reset();
-}
-
-void filearray::onexec() {
-    lock_guard lck{pimpl->mtx};
-
-    for (auto iter = pimpl->arr.begin(); iter;) {
-        if (!(iter->flags & FD_CLOEXEC)) {
-            ++iter;
-            continue;
-        }
-        pimpl->release_fd(iter->fd);
-        iter = pimpl->arr.erase(iter);
-    }
-}
-
-file* filearray::operator[](int i) const {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(i);
-    if (!iter)
-        return nullptr;
-
-    return iter->pfile.get();
-}

+ 307 - 0
src/kernel/vfs/filearray.rs

@@ -0,0 +1,307 @@
+use core::sync::atomic::Ordering;
+
+use crate::{
+    kernel::vfs::{dentry::Dentry, file::Pipe, s_isdir, s_isreg},
+    path::Path,
+    prelude::*,
+};
+
+use alloc::{
+    collections::btree_map::{BTreeMap, Entry},
+    sync::Arc,
+};
+use bindings::{
+    current_process, kernel::tty::console, EBADF, EINVAL, EISDIR, ENOTDIR, FD_CLOEXEC, F_DUPFD,
+    F_DUPFD_CLOEXEC, F_GETFD, F_SETFD, O_APPEND, O_CLOEXEC, O_DIRECTORY, O_RDWR, O_TRUNC, O_WRONLY,
+};
+use itertools::{
+    FoldWhile::{Continue, Done},
+    Itertools,
+};
+
+use super::{
+    file::{File, InodeFile, TTYFile},
+    inode::Mode,
+    s_ischr, FsContext, Spin,
+};
+
+type FD = u32;
+
+/// One slot of the descriptor table: per-descriptor flags plus the shared open file.
+#[derive(Clone)]
+struct OpenFile {
+    /// File descriptor flags, only for `FD_CLOEXEC`.
+    flags: u64,
+    /// The open file; descriptors created by `dup` share the same `Arc`.
+    file: Arc<File>,
+}
+
+/// Lock-protected contents of a [`FileArray`].
+#[derive(Clone)]
+struct FileArrayInner {
+    /// Open descriptors keyed by descriptor number.
+    files: BTreeMap<FD, OpenFile>,
+    /// Hint used by descriptor allocation; reset to 0 by `close_all`.
+    /// NOTE(review): presumably the lowest free descriptor number; confirm in `allocate_fd`.
+    fd_min_avail: FD,
+}
+
+/// A process's file descriptor table, guarded by a spin lock.
+pub struct FileArray {
+    inner: Spin<FileArrayInner>,
+}
+
+impl OpenFile {
+    /// Whether this descriptor must be closed on `exec`.
+    ///
+    /// `flags` stores descriptor flags (`FD_CLOEXEC`), not open flags: `FileArray::open`
+    /// translates `O_CLOEXEC` into `FD_CLOEXEC` before storing it, so the check has to use
+    /// `FD_CLOEXEC` as well.
+    pub fn close_on_exec(&self) -> bool {
+        self.flags & FD_CLOEXEC as u64 != 0
+    }
+}
+
+/// Creates an empty file array and returns it as a raw `Arc` pointer.
+///
+/// The pointer comes from `Arc::into_raw()`; `r_filearray_drop` turns it back into an `Arc`.
+#[no_mangle]
+pub extern "C" fn r_filearray_new_for_init() -> *const FileArray {
+    Arc::into_raw(Arc::new(FileArray {
+        inner: Spin::new(FileArrayInner {
+            files: BTreeMap::new(),
+            fd_min_avail: 0,
+        }),
+    }))
+}
+
+/// Returns a new raw pointer to the *same* file array as `other` (the `Arc` is cloned).
+#[no_mangle]
+pub extern "C" fn r_filearray_new_shared(other: *const FileArray) -> *const FileArray {
+    let other = BorrowedArc::from_raw(other);
+
+    Arc::into_raw(FileArray::new_shared(&other))
+}
+
+/// Returns a raw pointer to a new file array built from a clone of `other`'s contents.
+#[no_mangle]
+pub extern "C" fn r_filearray_new_cloned(other: *const FileArray) -> *const FileArray {
+    let other = BorrowedArc::from_raw(other);
+
+    Arc::into_raw(FileArray::new_cloned(&other))
+}
+
+/// Gives back one reference that was handed out as a raw pointer by the functions above.
+#[no_mangle]
+pub extern "C" fn r_filearray_drop(other: *const FileArray) {
+    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
+    let reclaimed = unsafe { Arc::from_raw(other) };
+    drop(reclaimed);
+}
+
+impl FileArray {
+    /// Borrows the file array of the current process.
+    pub fn get_current<'lt>() -> BorrowedArc<'lt, Self> {
+        // SAFETY: `current_process` is always valid.
+        let current = unsafe { current_process.as_mut().unwrap() };
+        BorrowedArc::from_raw(current.files.m_handle as *const _)
+    }
+
+    /// Returns another handle to the same table; changes are visible through both.
+    pub fn new_shared(other: &Arc<Self>) -> Arc<Self> {
+        other.clone()
+    }
+
+    /// Returns an independent table built from a clone of `other`'s locked contents.
+    pub fn new_cloned(other: &Arc<Self>) -> Arc<Self> {
+        Arc::new(Self {
+            inner: Spin::clone(&other.inner),
+        })
+    }
+
+    /// Acquires the file array lock.
+    pub fn get(&self, fd: FD) -> Option<Arc<File>> {
+        self.inner.lock().get(fd)
+    }
+
+    /// Drops every descriptor and resets the allocation hint.
+    pub fn close_all(&self) {
+        let mut inner = self.inner.lock();
+        inner.fd_min_avail = 0;
+        inner.files.clear();
+    }
+
+    /// Closes `fd`; returns `EBADF` if it is not open.
+    pub fn close(&self, fd: FD) -> KResult<()> {
+        let mut inner = self.inner.lock();
+        inner.files.remove(&fd).ok_or(EBADF)?;
+        inner.release_fd(fd);
+        Ok(())
+    }
+
+    /// Closes every descriptor whose close-on-exec flag is set.
+    pub fn on_exec(&self) {
+        let mut inner = self.inner.lock();
+
+        // One pass over the map: drop the flagged entries and remember their numbers.
+        // `BTreeMap::retain` visits keys in ascending order, so the fds are released in the
+        // same order as before.
+        let mut closed_fds = Vec::new();
+        inner.files.retain(|&fd, ofile| {
+            let keep = !ofile.close_on_exec();
+            if !keep {
+                closed_fds.push(fd);
+            }
+            keep
+        });
+
+        closed_fds.into_iter().for_each(|fd| inner.release_fd(fd));
+    }
+}
+
+impl FileArray {
+    /// Duplicates `old_fd` onto the next free descriptor and returns the new number.
+    ///
+    /// The duplicate shares the open file but not the descriptor flags: it starts with no
+    /// flags set, i.e. close-on-exec is off, as POSIX `dup` specifies. Returns `EBADF` if
+    /// `old_fd` is not open.
+    pub fn dup(&self, old_fd: FD) -> KResult<FD> {
+        let mut inner = self.inner.lock();
+        let shared_file = inner.files.get(&old_fd).ok_or(EBADF)?.file.clone();
+
+        let new_fd = inner.next_fd();
+
+        // Descriptor flags are per-descriptor and must not be inherited.
+        inner.do_insert(new_fd, 0, shared_file);
+
+        Ok(new_fd)
+    }
+
+    /// Makes `new_fd` refer to the same open file as `old_fd`, with descriptor flags `flags`.
+    ///
+    /// If `new_fd` is already open its slot is overwritten in place; otherwise the number is
+    /// allocated and inserted. Returns `EBADF` if `old_fd` is not open.
+    pub fn dup_to(&self, old_fd: FD, new_fd: FD, flags: u64) -> KResult<FD> {
+        let mut inner = self.inner.lock();
+        let old_file = inner.files.get(&old_fd).ok_or(EBADF)?;
+
+        let new_file_data = old_file.file.clone();
+
+        match inner.files.entry(new_fd) {
+            Entry::Vacant(_) => {}
+            Entry::Occupied(entry) => {
+                // Replacing the `Arc` drops this slot's reference to the previous file.
+                let new_file = entry.into_mut();
+
+                new_file.flags = flags;
+                new_file.file = new_file_data;
+
+                return Ok(new_fd);
+            }
+        }
+
+        // The slot is vacant, so allocation is expected to hand back exactly `new_fd`.
+        assert_eq!(new_fd, inner.allocate_fd(new_fd));
+        inner.do_insert(new_fd, flags, new_file_data);
+
+        Ok(new_fd)
+    }
+
+    /// Create a pipe and install both of its ends in the table with no
+    /// descriptor flags set.
+    ///
+    /// # Return
+    /// `(read_fd, write_fd)`
+    pub fn pipe(&self) -> KResult<(FD, FD)> {
+        let mut table = self.inner.lock();
+
+        // Reserve both numbers first; the read side gets the lower one.
+        let fds = (table.next_fd(), table.next_fd());
+
+        let pipe = Pipe::new();
+        let (reader, writer) = pipe.split();
+        table.do_insert(fds.0, 0, reader);
+        table.do_insert(fds.1, 0, writer);
+
+        Ok(fds)
+    }
+
+    /// Open `path` relative to `fs_context` and install it on the lowest
+    /// available descriptor.
+    ///
+    /// `flags` are the `O_*` open flags: `O_CLOEXEC` becomes the `FD_CLOEXEC`
+    /// descriptor flag, `O_WRONLY`/`O_RDWR`/`O_APPEND` select the access mode
+    /// handed to the inode file, and `O_TRUNC` truncates a regular file that
+    /// is opened for writing.
+    ///
+    /// # Errors
+    /// * whatever `Dentry::open`, `open_check`, `get_inode`, `truncate` or
+    ///   `devid` report;
+    /// * `ENOTDIR` when `O_DIRECTORY` is set and the target is not a directory;
+    /// * `EISDIR` when a directory is opened for writing.
+    pub fn open(&self, fs_context: &FsContext, path: Path, flags: u32, mode: Mode) -> KResult<FD> {
+        let dentry = Dentry::open(fs_context, path, true)?;
+        dentry.open_check(flags, mode)?;
+
+        let fdflag = if flags & O_CLOEXEC != 0 { FD_CLOEXEC } else { 0 };
+        let can_read = flags & O_WRONLY == 0;
+        let can_write = flags & (O_WRONLY | O_RDWR) != 0;
+        let append = flags & O_APPEND != 0;
+
+        let inode = dentry.get_inode()?;
+        let filemode = inode.mode.load(Ordering::Relaxed);
+
+        if flags & O_DIRECTORY != 0 {
+            if !s_isdir(filemode) {
+                return Err(ENOTDIR);
+            }
+        } else if s_isdir(filemode) && can_write {
+            return Err(EISDIR);
+        }
+
+        if flags & O_TRUNC != 0 && can_write && s_isreg(filemode) {
+            inode.truncate(0)?;
+        }
+
+        // Finish all fallible work BEFORE reserving a descriptor: `next_fd`
+        // advances `fd_min_avail`, so bailing out with `?` after it would leave a
+        // descriptor number reserved with no entry behind it.
+        //
+        // Character device 0x0501 is served by the kernel console tty.
+        let is_console = s_ischr(filemode) && inode.devid()? == 0x0501;
+
+        let file: Arc<File> = if is_console {
+            // NOTE(review): `console` is a global defined outside this block; the
+            // invariant that makes reading it sound is not visible here.
+            TTYFile::new(unsafe { console })
+        } else {
+            InodeFile::new(dentry, (can_read, can_write, append))
+        };
+
+        let mut inner = self.inner.lock();
+        let fd = inner.next_fd();
+        inner.do_insert(fd, fdflag as u64, file);
+
+        Ok(fd)
+    }
+
+    /// Perform descriptor-level `fcntl` command `cmd` on `fd`.
+    ///
+    /// * `F_DUPFD` / `F_DUPFD_CLOEXEC`: duplicate `fd` onto the lowest available
+    ///   descriptor that is `>= arg` and return it. The duplicate is
+    ///   close-on-exec only for `F_DUPFD_CLOEXEC`; plain `F_DUPFD` clears the
+    ///   flag regardless of the source descriptor, as POSIX specifies.
+    /// * `F_GETFD`: return the descriptor flags.
+    /// * `F_SETFD`: replace the descriptor flags with `arg`, return 0.
+    ///
+    /// # Errors
+    /// `EBADF` when no entry exists for `fd`.
+    ///
+    /// # Panics
+    /// On any other `cmd` (`unimplemented!`).
+    pub fn fcntl(&self, fd: FD, cmd: u32, arg: usize) -> KResult<usize> {
+        let mut inner = self.inner.lock();
+        let ofile = inner.files.get_mut(&fd).ok_or(EBADF)?;
+
+        match cmd {
+            F_DUPFD | F_DUPFD_CLOEXEC => {
+                // Stored flags live in the `FD_*` namespace (see `open` and
+                // `F_GETFD`), so record `FD_CLOEXEC`, not `O_CLOEXEC`.
+                let flags = if cmd == F_DUPFD_CLOEXEC { FD_CLOEXEC } else { 0 };
+
+                let new_file_data = ofile.file.clone();
+                let new_fd = inner.allocate_fd(arg as FD);
+
+                inner.do_insert(new_fd, flags as u64, new_file_data);
+
+                Ok(new_fd as usize)
+            }
+            F_GETFD => Ok(ofile.flags as usize),
+            F_SETFD => {
+                ofile.flags = arg as u64;
+                Ok(0)
+            }
+            // NOTE(review): `cmd` is caller-supplied; an unknown command panics
+            // here instead of returning an error.
+            _ => unimplemented!("fcntl: cmd={}", cmd),
+        }
+    }
+}
+
+impl FileArrayInner {
+    /// Return a new handle to the file stored under `fd`, if any.
+    fn get(&mut self, fd: FD) -> Option<Arc<File>> {
+        let ofile = self.files.get(&fd)?;
+        Some(ofile.file.clone())
+    }
+
+    /// Return the first descriptor `>= from` that has no entry in `files`.
+    ///
+    /// Walks the occupied descriptors upwards from `from` and stops at the
+    /// first gap in the sequence.
+    fn find_available(&mut self, from: FD) -> FD {
+        self.files
+            .range(&from..)
+            .map(|(&occupied, _)| occupied)
+            .fold_while(from, |candidate, occupied| {
+                if occupied != candidate {
+                    // Gap found: `candidate` itself is free.
+                    Done(candidate)
+                } else {
+                    Continue(candidate + 1)
+                }
+            })
+            .into_inner()
+    }
+
+    /// Allocate a new file descriptor starting from `from`.
+    ///
+    /// Returned file descriptor should be used immediately.
+    ///
+    /// Pick a free descriptor that is `>= from` (and `>= fd_min_avail`).
+    ///
+    /// When the pick is `fd_min_avail` itself, `fd_min_avail` is advanced to the
+    /// next free descriptor after it. The caller is expected to insert an entry
+    /// for the returned descriptor right away; this function does not insert
+    /// anything.
+    fn allocate_fd(&mut self, from: FD) -> FD {
+        let start = FD::max(from, self.fd_min_avail);
+
+        if start != self.fd_min_avail {
+            // Above the lowest free slot: the cached minimum stays valid.
+            return self.find_available(start);
+        }
+
+        // Handing out the lowest free slot: look for its successor first.
+        let successor = self.find_available(start + 1);
+        let allocated = self.fd_min_avail;
+        self.fd_min_avail = successor;
+        allocated
+    }
+
+    /// Record that `fd` is free again by lowering `fd_min_avail` to it when it
+    /// is smaller than the current value.
+    fn release_fd(&mut self, fd: FD) {
+        self.fd_min_avail = FD::min(self.fd_min_avail, fd);
+    }
+
+    /// Allocate starting from `fd_min_avail`.
+    fn next_fd(&mut self) -> FD {
+        let lowest_free = self.fd_min_avail;
+        self.allocate_fd(lowest_free)
+    }
+
+    /// Insert a file description to the file array.
+    ///
+    /// Stores `file` with descriptor flags `flags` under `fd`.
+    ///
+    /// # Panics
+    /// If `fd` already had an entry. The insertion has already replaced the old
+    /// entry by the time the assertion fires.
+    fn do_insert(&mut self, fd: FD, flags: u64, file: Arc<File>) {
+        assert!(self.files.insert(fd, OpenFile { flags, file }).is_none());
+    }
+}

+ 13 - 10
src/kernel/vfs/inode.rs

@@ -5,12 +5,10 @@ use bindings::{
     S_IFMT,
 };
 use core::{
-    mem::MaybeUninit,
-    ptr::addr_of_mut,
-    sync::atomic::{AtomicU32, AtomicU64, Ordering},
+    mem::MaybeUninit, ops::ControlFlow, ptr::addr_of_mut, sync::atomic::{AtomicU32, AtomicU64, Ordering}
 };
 
-use super::{dentry::Dentry, s_isblk, s_ischr, vfs::Vfs, DevId, ReadDirCallback, TimeSpec};
+use super::{dentry::Dentry, s_isblk, s_ischr, vfs::Vfs, DevId, TimeSpec};
 use crate::{io::Buffer, prelude::*};
 
 pub type Ino = u64;
@@ -74,6 +72,11 @@ pub trait InodeInner:
     fn data_mut(&mut self) -> &mut InodeData;
 }
 
+/// Where an [`Inode::write`] call should place its data.
+pub enum WriteOffset<'end> {
+    /// Write at this explicit offset.
+    Position(usize),
+    /// NOTE(review): presumably "write at the end of the file", with the
+    /// referenced `usize` used to report the resulting position back to the
+    /// caller — confirm against the `write` implementations.
+    End(&'end mut usize),
+}
+
 #[allow(unused_variables)]
 pub trait Inode: Send + Sync + InodeInner {
     fn is_dir(&self) -> bool {
@@ -88,11 +91,11 @@ pub trait Inode: Send + Sync + InodeInner {
         Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn mkdir(&self, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
+    fn mkdir(&self, at: &Dentry, mode: Mode) -> KResult<()> {
         Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn mknod(&self, at: &Arc<Dentry>, mode: Mode, dev: DevId) -> KResult<()> {
+    fn mknod(&self, at: &Dentry, mode: Mode, dev: DevId) -> KResult<()> {
         Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
@@ -108,7 +111,7 @@ pub trait Inode: Send + Sync + InodeInner {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
-    fn write(&self, buffer: &[u8], offset: usize) -> KResult<usize> {
+    fn write(&self, buffer: &[u8], offset: WriteOffset) -> KResult<usize> {
         Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
@@ -124,10 +127,10 @@ pub trait Inode: Send + Sync + InodeInner {
         Err(if self.is_dir() { EISDIR } else { EPERM })
     }
 
-    fn readdir<'cb, 'r: 'cb>(
-        &'r self,
+    fn do_readdir(
+        &self,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
         Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }

+ 58 - 11
src/kernel/vfs/mod.rs

@@ -1,25 +1,20 @@
 use crate::prelude::*;
 
-use bindings::{dev_t, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFMT, S_IFREG};
-use inode::{Ino, Mode};
+use alloc::sync::Arc;
+use bindings::{current_process, dev_t, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFMT, S_IFREG};
+use dentry::Dentry;
+use inode::Mode;
 
 pub mod dentry;
 pub mod ffi;
+pub mod file;
+pub mod filearray;
 pub mod inode;
 pub mod mount;
 pub mod vfs;
 
 pub type DevId = dev_t;
 
-/// # Return
-///
-/// Return -1 if an error occurred
-///
-/// Return 0 if no more entry available
-///
-/// Otherwise, return bytes to be added to the offset
-pub type ReadDirCallback<'lt> = dyn Fn(&[u8], Ino) -> KResult<()> + 'lt;
-
 /// Whether the file-type bits of `mode` (those selected by `S_IFMT`) equal
 /// `S_IFREG`.
 pub fn s_isreg(mode: Mode) -> bool {
     let file_type = mode & S_IFMT;
     file_type == S_IFREG
 }
@@ -46,3 +41,55 @@ pub struct TimeSpec {
     pub sec: u64,
     pub nsec: u64,
 }
+
+/// Filesystem state attached to a process: its root, its current working
+/// directory and its umask.
+///
+/// Instances are handed across the FFI boundary as raw `Arc` pointers by the
+/// `r_fs_context_*` functions in this module.
+#[derive(Clone)]
+pub struct FsContext {
+    /// Root dentry of this context.
+    pub fsroot: Arc<Dentry>,
+    /// Current working directory, behind its own lock.
+    pub cwd: Spin<Arc<Dentry>>,
+    /// File mode creation mask, behind its own lock.
+    pub umask: Spin<Mode>,
+}
+
+impl FsContext {
+    /// Borrow the filesystem context of the current process.
+    ///
+    /// Reads `current_process.fs_context.m_handle` and reinterprets it as a
+    /// pointer to `FsContext` without taking a new reference.
+    ///
+    /// # Panics
+    /// If `current_process` is null.
+    pub fn get_current() -> BorrowedArc<'static, Self> {
+        // SAFETY: There should always be a current process.
+        let current = unsafe { current_process.as_ref().unwrap() };
+        // NOTE(review): assumes `m_handle` holds a pointer obtained from
+        // `Arc::into_raw` on an `Arc<FsContext>` (as produced by the
+        // `r_fs_context_new_*` functions) — confirm on the C++ side.
+        let ptr = current.fs_context.m_handle as *const _ as *const Self;
+
+        BorrowedArc::from_raw(ptr)
+    }
+}
+
+/// Release one reference to the `FsContext` behind `other`.
+///
+/// `other` must come from `Arc::into_raw()`; the `Arc` is rebuilt from it and
+/// dropped immediately.
+#[no_mangle]
+pub extern "C" fn r_fs_context_drop(other: *const FsContext) {
+    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
+    drop(unsafe { Arc::from_raw(other) });
+}
+
+/// Create a new, independent `FsContext` holding clones of the root, working
+/// directory and umask of `other`, and return it as a raw `Arc` pointer.
+#[no_mangle]
+pub extern "C" fn r_fs_context_new_cloned(other: *const FsContext) -> *const FsContext {
+    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
+    let source = BorrowedArc::from_raw(other);
+
+    // The derived `Clone` clones `fsroot`, `cwd` and `umask` field by field.
+    let copy = FsContext::clone(&source);
+
+    Arc::into_raw(Arc::new(copy))
+}
+
+/// Return another raw `Arc` pointer to the same `FsContext` as `other`,
+/// obtained by cloning the `Arc` rather than copying the context.
+#[no_mangle]
+pub extern "C" fn r_fs_context_new_shared(other: *const FsContext) -> *const FsContext {
+    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
+    let source = BorrowedArc::from_raw(other);
+
+    let shared = source.clone();
+    Arc::into_raw(shared)
+}
+
+/// Build the initial `FsContext`: root and working directory are both the
+/// kernel root dentry and the umask is `0o022`. Returned as a raw `Arc`
+/// pointer.
+#[no_mangle]
+pub extern "C" fn r_fs_context_new_for_init() -> *const FsContext {
+    let fsroot = Dentry::kernel_root_dentry();
+    let cwd = Spin::new(Dentry::kernel_root_dentry());
+    let umask = Spin::new(0o022);
+
+    let context = Arc::new(FsContext { fsroot, cwd, umask });
+    Arc::into_raw(context)
+}

+ 16 - 26
src/kernel/vfs/mount.rs

@@ -1,4 +1,4 @@
-use crate::prelude::*;
+use crate::{fs::tmpfs, prelude::*};
 
 use alloc::{
     collections::btree_map::{BTreeMap, Entry},
@@ -35,8 +35,7 @@ const MOUNT_FLAGS: [(u64, &str); 6] = [
 lazy_static! {
     static ref MOUNT_CREATORS: Spin<BTreeMap<String, Arc<dyn MountCreator>>> =
         Spin::new(BTreeMap::new());
-    static ref MOUNTS: Spin<Vec<(Arc<Dentry>, MountPointData)>> =
-        Spin::new(vec![]);
+    static ref MOUNTS: Spin<Vec<(Arc<Dentry>, MountPointData)>> = Spin::new(vec![]);
 }
 
 static mut ROOTFS: Option<Arc<Dentry>> = None;
@@ -47,11 +46,7 @@ pub struct Mount {
 }
 
 impl Mount {
-    pub fn new(
-        mp: &Dentry,
-        vfs: Arc<dyn Vfs>,
-        root_inode: Arc<dyn Inode>,
-    ) -> KResult<Self> {
+    pub fn new(mp: &Dentry, vfs: Arc<dyn Vfs>, root_inode: Arc<dyn Inode>) -> KResult<Self> {
         let root_dentry = Dentry::create(mp.parent().clone(), mp.name());
         root_dentry.save_dir(root_inode)?;
 
@@ -70,19 +65,10 @@ unsafe impl Send for Mount {}
 unsafe impl Sync for Mount {}
 
+/// Factory for [`Mount`]s of one filesystem type.
+///
+/// Creators are stored under a type name by `register_filesystem` and looked
+/// up by `do_mount`, which passes on its `source`, its (possibly adjusted)
+/// `flags` and the mount point dentry.
 pub trait MountCreator: Send + Sync {
-    fn create_mount(
-        &self,
-        source: &str,
-        flags: u64,
-        data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount>;
+    fn create_mount(&self, source: &str, flags: u64, mp: &Arc<Dentry>) -> KResult<Mount>;
 }
 
-pub fn register_filesystem(
-    fstype: &str,
-    creator: Arc<dyn MountCreator>,
-) -> KResult<()> {
+pub fn register_filesystem(fstype: &str, creator: Arc<dyn MountCreator>) -> KResult<()> {
     let mut creators = MOUNT_CREATORS.lock();
     match creators.entry(String::from(fstype)) {
         Entry::Occupied(_) => Err(EEXIST),
@@ -107,7 +93,6 @@ pub fn do_mount(
     mountpoint_str: &str,
     fstype: &str,
     flags: u64,
-    data: &[u8],
 ) -> KResult<()> {
     let mut flags = flags;
     if flags & MS_NOATIME == 0 {
@@ -126,7 +111,7 @@ pub fn do_mount(
         let creators = { MOUNT_CREATORS.lock() };
         creators.get(fstype).ok_or(ENODEV)?.clone()
     };
-    let mount = creator.create_mount(source, flags, data, mountpoint)?;
+    let mount = creator.create_mount(source, flags, mountpoint)?;
 
     let root_dentry = mount.root().clone();
 
@@ -177,7 +162,11 @@ pub fn dump_mounts(buffer: &mut dyn core::fmt::Write) {
     }
 }
 
-pub fn create_rootfs() {
+#[no_mangle]
+#[link_section = ".text.kinit"]
+pub extern "C" fn r_init_vfs() {
+    tmpfs::init();
+
     let source = String::from("rootfs");
     let fstype = String::from("tmpfs");
     let flags = MS_NOATIME;
@@ -187,7 +176,7 @@ pub fn create_rootfs() {
         let creator = creators.get(&fstype).ok_or(ENODEV).unwrap();
 
         creator
-            .create_mount(&source, flags, &[], dcache::_looped_droot())
+            .create_mount(&source, flags, dcache::_looped_droot())
             .unwrap()
     };
 
@@ -209,7 +198,8 @@ pub fn create_rootfs() {
         .push((dcache::_looped_droot().clone(), mpdata));
 }
 
-#[no_mangle]
-pub extern "C" fn r_get_root_dentry() -> *const Dentry {
-    unsafe { ROOTFS.as_ref().cloned().map(Arc::into_raw).unwrap() }
+impl Dentry {
+    /// Return a new handle to the dentry stored in `ROOTFS`.
+    ///
+    /// # Panics
+    /// If `ROOTFS` has not been set yet.
+    pub fn kernel_root_dentry() -> Arc<Dentry> {
+        // NOTE(review): `ROOTFS` is a `static mut`; this presumably relies on
+        // it being written only during early init — confirm.
+        let rootfs = unsafe { ROOTFS.as_ref() };
+        rootfs.cloned().unwrap()
+    }
 }

+ 4 - 0
src/kinit.cpp

@@ -61,6 +61,8 @@ static inline void set_uname() {
     strcpy(kernel::sys_utsname->domainname, "(none)");
 }
 
+extern "C" void r_init_vfs();
+
 SECTION(".text.kinit")
 void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn) {
     // call global constructors
@@ -80,6 +82,8 @@ void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn) {
 
     init_syscall_table();
 
+    r_init_vfs();
+
     init_scheduler(kernel_stack_pfn);
 }
 

+ 64 - 30
src/lib.rs

@@ -20,6 +20,14 @@ mod prelude;
 mod rcu;
 mod sync;
 
+use alloc::{ffi::CString, sync::Arc};
+use bindings::root::types::elf::{elf32_load, elf32_load_data};
+use kernel::vfs::{
+    dentry::Dentry,
+    mount::{do_mount, MS_NOATIME, MS_NODEV, MS_NOSUID, MS_RDONLY},
+    FsContext,
+};
+use path::Path;
 use prelude::*;
 
 #[panic_handler]
@@ -31,10 +39,7 @@ fn panic(info: &core::panic::PanicInfo) -> ! {
 
 extern "C" {
     fn _do_allocate(size: usize) -> *mut core::ffi::c_void;
-    fn _do_deallocate(
-        ptr: *mut core::ffi::c_void,
-        size: core::ffi::c_size_t,
-    ) -> i32;
+    fn _do_deallocate(ptr: *mut core::ffi::c_void, size: core::ffi::c_size_t) -> i32;
 }
 
 use core::alloc::{GlobalAlloc, Layout};
@@ -63,36 +68,65 @@ unsafe impl GlobalAlloc for Allocator {
 static ALLOCATOR: Allocator = Allocator {};
 
 #[no_mangle]
-pub extern "C" fn late_init_rust() {
+/// Late kernel initialisation: register drivers and filesystems, mount the
+/// fat32 volume on `/mnt` and load `/mnt/busybox` as the init program.
+///
+/// On return, `*out_sp` and `*out_ip` hold the `sp` and `ip` values that
+/// `elf32_load` stored in its load data. Both pointers are written
+/// unconditionally, so the caller must pass valid, writable pointers.
+///
+/// # Panics
+/// If `/mnt/` cannot be opened or created, the mount fails, or the init binary
+/// cannot be opened.
+pub extern "C" fn late_init_rust(out_sp: *mut usize, out_ip: *mut usize) {
     driver::e1000e::register_e1000e_driver();
     driver::ahci::register_ahci_driver();
 
-    fs::tmpfs::init();
     fs::procfs::init();
     fs::fat32::init();
 
-    kernel::vfs::mount::create_rootfs();
-}
+    // mount fat32 /mnt directory
+    let fs_context = FsContext::get_current();
+    let mnt_dir = Dentry::open(&fs_context, Path::new(b"/mnt/").unwrap(), true).unwrap();
+
+    mnt_dir.mkdir(0o755).unwrap();
+
+    do_mount(
+        &mnt_dir,
+        "/dev/sda",
+        "/mnt",
+        "fat32",
+        MS_RDONLY | MS_NOATIME | MS_NODEV | MS_NOSUID,
+    )
+    .unwrap();
+
+    let init = Dentry::open(&fs_context, Path::new(b"/mnt/busybox").unwrap(), true)
+        .expect("kernel panic: init not found!");
+
+    let argv = vec![
+        CString::new("/mnt/busybox").unwrap(),
+        CString::new("sh").unwrap(),
+        CString::new("/mnt/initsh").unwrap(),
+    ];
+
+    let envp = vec![
+        CString::new("LANG=C").unwrap(),
+        CString::new("HOME=/root").unwrap(),
+        CString::new("PATH=/mnt").unwrap(),
+        CString::new("PWD=/").unwrap(),
+    ];
 
-//
-// #[repr(C)]
-// #[allow(dead_code)]
-// struct Fp {
-//     fp: *const core::ffi::c_void,
-// }
-//
-// unsafe impl Sync for Fp {}
-//
-// #[allow(unused_macros)]
-// macro_rules! late_init {
-//     ($name:ident, $func:ident) => {
-//         #[used]
-//         #[link_section = ".late_init"]
-//         static $name: $crate::Fp = $crate::Fp {
-//             fp: $func as *const core::ffi::c_void,
-//         };
-//     };
-// }
-//
-// #[allow(unused_imports)]
-// pub(crate) use late_init;
+    // The pointer arrays borrow from `argv` / `envp`, which stay alive until
+    // the end of this function, i.e. past the `elf32_load` call below.
+    let argv_array = argv.iter().map(|x| x.as_ptr()).collect::<Vec<_>>();
+    let envp_array = envp.iter().map(|x| x.as_ptr()).collect::<Vec<_>>();
+
+    // load init
+    // NOTE(review): `Arc::into_raw(init)` hands a raw reference to the loader,
+    // which wraps it in `fs::dentry_pointer`; presumably that wrapper releases
+    // the reference — confirm, otherwise this leaks.
+    let mut load_data = elf32_load_data {
+        exec_dent: Arc::into_raw(init) as *mut _,
+        argv: argv_array.as_ptr(),
+        argv_count: argv_array.len(),
+        envp: envp_array.as_ptr(),
+        envp_count: envp_array.len(),
+        ip: 0,
+        sp: 0,
+    };
+
+    let result = unsafe { elf32_load(&mut load_data) };
+    // NOTE(review): if `println_fatal!` returns, execution continues and the
+    // `sp` / `ip` values written below may still be the initial zeros — confirm
+    // that a failed load cannot reach the caller.
+    if result != 0 {
+        println_fatal!("Failed to load init: {}", result);
+    }
+
+    unsafe {
+        *out_sp = load_data.sp;
+        *out_ip = load_data.ip;
+    }
+}

+ 3 - 1
src/rcu.rs

@@ -203,7 +203,9 @@ impl<T> RCUPointer<T> {
         }
     }
 
-    pub fn swap(&self, new: Option<Arc<T>>) -> Option<Arc<T>> {
+    /// # Safety
+    /// Caller must ensure that the pointer is freed after all readers are done.
+    pub unsafe fn swap(&self, new: Option<Arc<T>>) -> Option<Arc<T>> {
         let new = new
             .map(|arc| Arc::into_raw(arc) as *mut T)
             .unwrap_or(core::ptr::null_mut());

+ 1 - 1
src/sync/condvar.rs

@@ -102,7 +102,7 @@ impl CondVar {
 
         might_sleep!(1);
 
-        let has_signals = unsafe { schedule_now_preempt_disabled() };
+        let has_signals = unsafe { !schedule_now_preempt_disabled() };
 
         unsafe {
             guard.force_relock();

+ 10 - 1
src/sync/lock.rs

@@ -23,7 +23,16 @@ impl<Value, Strategy: LockStrategy> Lock<Value, Strategy> {
     }
 }
 
-impl<Value: Sized + Default, Strategy: LockStrategy> Default for Lock<Value, Strategy> {
+impl<Value: Clone, Strategy: LockStrategy> Clone for Lock<Value, Strategy> {
+    /// Build a new, independent lock with fresh strategy data around a clone
+    /// of the protected value; the value is read under `lock_shared()`.
+    fn clone(&self) -> Self {
+        let strategy_data = Strategy::data();
+        let snapshot = Value::clone(&self.lock_shared());
+
+        Self {
+            strategy_data,
+            value: UnsafeCell::new(snapshot),
+        }
+    }
+}
+
+impl<Value: Default, Strategy: LockStrategy> Default for Lock<Value, Strategy> {
     fn default() -> Self {
         Self {
             strategy_data: Strategy::data(),

+ 5 - 5
src/types/elf.cpp

@@ -29,7 +29,7 @@ static inline void __user_push_string32(uintptr_t* sp, const char* str) {
 }
 
 int types::elf::elf32_load(types::elf::elf32_load_data& d) {
-    auto& exec = d.exec_dent;
+    auto exec = fs::dentry_pointer{d.exec_dent};
     if (!exec)
         return -ENOENT;
 
@@ -143,12 +143,12 @@ int types::elf::elf32_load(types::elf::elf32_load_data& d) {
 
     // fill information block area
     std::vector<elf32_addr_t> args, envs;
-    for (const auto& env : d.envp) {
-        __user_push_string32(sp, env.c_str());
+    for (size_t i = 0; i < d.envp_count; ++i) {
+        __user_push_string32(sp, d.envp[i]);
         envs.push_back((uintptr_t)*sp);
     }
-    for (const auto& arg : d.argv) {
-        __user_push_string32(sp, arg.c_str());
+    for (size_t i = 0; i < d.argv_count; ++i) {
+        __user_push_string32(sp, d.argv[i]);
         args.push_back((uintptr_t)*sp);
     }