4 Komitmen f4128f04a0 ... 0235917d17

Pembuat SHA1 Pesan Tanggal
  greatbridf 0235917d17 Merge branch 'rust-sync' into vfs-refactor 1 bulan lalu
  greatbridf 7930b71762 feat: rewrite filearray with rust 1 bulan lalu
  greatbridf e35fd75836 fix(pipe_read): reads don't need to be atomic 1 bulan lalu
  greatbridf ae698248ee [partial] feat: add sync stuffs 1 bulan lalu
77 file diubah dengan 4520 penambahan dan 3424 penghapusan
  1. 1 0
      .clang-format
  2. 3 3
      .rustfmt.toml
  3. 0 4
      CMakeLists.txt
  4. 1 0
      build.rs
  5. 2 0
      gblibc/include/errno.h
  6. 1 1
      include/kernel/async/lock.hpp
  7. 1 1
      include/kernel/mem/mm_list.hpp
  8. 6 10
      include/kernel/mem/paging.hpp
  9. 15 15
      include/kernel/mem/vm_area.hpp
  10. 5 6
      include/kernel/process.hpp
  11. 12 18
      include/kernel/syscall.hpp
  12. 1 1
      include/kernel/task/thread.hpp
  13. 2 0
      include/kernel/tty.hpp
  14. 38 62
      include/kernel/vfs.hpp
  15. 0 2
      include/kernel/vfs/dentry.hpp
  16. 0 106
      include/kernel/vfs/file.hpp
  17. 0 51
      include/kernel/vfs/filearr.hpp
  18. 0 7
      include/kernel/vfs/vfsfwd.hpp
  19. 5 3
      include/types/elf.hpp
  20. 4 4
      src/boot.s
  21. 5 4
      src/driver/ahci/command.rs
  22. 35 38
      src/driver/ahci/control.rs
  23. 33 33
      src/driver/ahci/defs.rs
  24. 108 57
      src/driver/ahci/mod.rs
  25. 268 58
      src/driver/ahci/port.rs
  26. 23 11
      src/driver/e1000e.rs
  27. 137 148
      src/fs/fat32.rs
  28. 170 140
      src/fs/procfs.rs
  29. 207 243
      src/fs/tmpfs.rs
  30. 23 51
      src/io.rs
  31. 9 5
      src/kernel.ld
  32. 2 0
      src/kernel.rs
  33. 5 3
      src/kernel/async/lock.cc
  34. 2 1
      src/kernel/async/waitlist.cc
  35. 12 10
      src/kernel/block.rs
  36. 35 5
      src/kernel/console.rs
  37. 4 2
      src/kernel/interrupt.cpp
  38. 12 20
      src/kernel/interrupt.rs
  39. 11 14
      src/kernel/mem/mm_list.cc
  40. 45 8
      src/kernel/mem/paging.cc
  41. 20 10
      src/kernel/mem/slab.cc
  42. 97 73
      src/kernel/process.cpp
  43. 1 1
      src/kernel/signal.cpp
  44. 35 194
      src/kernel/syscall.cpp
  45. 128 0
      src/kernel/syscall.rs
  46. 363 0
      src/kernel/syscall/file_rw.rs
  47. 2 409
      src/kernel/syscall/fileops.cc
  48. 0 22
      src/kernel/syscall/mount.cc
  49. 3 68
      src/kernel/syscall/procops.cc
  50. 171 0
      src/kernel/syscall/procops.rs
  51. 31 12
      src/kernel/task/thread.cc
  52. 53 14
      src/kernel/tty.cpp
  53. 1 0
      src/kernel/user.rs
  54. 202 0
      src/kernel/user/dataflow.rs
  55. 3 253
      src/kernel/vfs.cpp
  56. 189 109
      src/kernel/vfs/dentry.rs
  57. 29 17
      src/kernel/vfs/dentry/dcache.rs
  58. 6 324
      src/kernel/vfs/ffi.rs
  59. 551 0
      src/kernel/vfs/file.rs
  60. 0 296
      src/kernel/vfs/filearr.cc
  61. 307 0
      src/kernel/vfs/filearray.rs
  62. 196 192
      src/kernel/vfs/inode.rs
  63. 58 11
      src/kernel/vfs/mod.rs
  64. 24 31
      src/kernel/vfs/mount.rs
  65. 2 3
      src/kernel/vfs/vfs.rs
  66. 4 0
      src/kinit.cpp
  67. 65 31
      src/lib.rs
  68. 12 11
      src/net/netdev.rs
  69. 34 145
      src/prelude.rs
  70. 34 32
      src/rcu.rs
  71. 82 3
      src/sync.rs
  72. 113 0
      src/sync/condvar.rs
  73. 154 0
      src/sync/lock.rs
  74. 157 0
      src/sync/semaphore.rs
  75. 105 0
      src/sync/spin.rs
  76. 33 0
      src/sync/strategy.rs
  77. 12 18
      src/types/elf.cpp

+ 1 - 0
.clang-format

@@ -6,6 +6,7 @@ AllowShortFunctionsOnASingleLine: Inline
 AllowShortIfStatementsOnASingleLine: Never
 AllowShortLoopsOnASingleLine: 'false'
 BreakConstructorInitializers: BeforeComma
+ColumnLimit: '100'
 FixNamespaceComments: 'true'
 IncludeBlocks: Regroup
 IndentWidth: '4'

+ 3 - 3
.rustfmt.toml

@@ -1,4 +1,4 @@
-max_width = 80
+max_width = 100
 hard_tabs = false
 tab_spaces = 4
 newline_style = "Auto"
@@ -10,8 +10,8 @@ struct_lit_width = 18
 struct_variant_width = 35
 array_width = 60
 chain_width = 60
-single_line_if_else_max_width = 50
-single_line_let_else_max_width = 50
+single_line_if_else_max_width = 60
+single_line_let_else_max_width = 60
 wrap_comments = false
 format_code_in_doc_comments = false
 doc_comment_code_block_width = 100

+ 0 - 4
CMakeLists.txt

@@ -49,7 +49,6 @@ set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
                         src/kernel/syscall.cpp
                         src/kernel/syscall/fileops.cc
                         src/kernel/syscall/infoops.cc
-                        src/kernel/syscall/mount.cc
                         src/kernel/syscall/procops.cc
                         src/kernel/mem/mm_list.cc
                         src/kernel/mem/paging.cc
@@ -64,7 +63,6 @@ set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
                         src/kernel/task/thread.cc
                         src/kernel/task/readyqueue.cc
                         src/kernel/user/thread_local.cc
-                        src/kernel/vfs/filearr.cc
                         src/kernel/signal.cpp
                         src/net/ethernet.cc
                         src/types/crc.cc
@@ -87,8 +85,6 @@ set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
                         include/kernel/utsname.hpp
                         include/kernel/vfs.hpp
                         include/kernel/vfs/dentry.hpp
-                        include/kernel/vfs/file.hpp
-                        include/kernel/vfs/filearr.hpp
                         include/kernel/vga.hpp
                         include/kernel/signal.hpp
                         include/kernel/task/forward.hpp

+ 1 - 0
build.rs

@@ -5,6 +5,7 @@ fn main() {
     let headers = [
         "include/kernel/process.hpp",
         "include/kernel/hw/pci.hpp",
+        "include/types/elf.hpp",
     ];
 
     let bindings = bindgen::Builder::default()

+ 2 - 0
gblibc/include/errno.h

@@ -30,7 +30,9 @@ extern int* __errno_location(void);
 #define ESPIPE 29
 #define EROFS 30
 #define EPIPE 32
+#define ERANGE 34
 #define ELOOP 40
+#define EOVERFLOW 75
 
 #ifdef __cplusplus
 }

+ 1 - 1
include/kernel/async/lock.hpp

@@ -8,7 +8,7 @@ namespace kernel::async {
 
 using spinlock_t = unsigned long volatile;
 using lock_context_t = unsigned long;
-using preempt_count_t = std::size_t;
+using preempt_count_t = ssize_t;
 
 void preempt_disable();
 void preempt_enable();

+ 1 - 1
include/kernel/mem/mm_list.hpp

@@ -46,7 +46,7 @@ class mm_list {
 
         unsigned long flags;
 
-        const fs::rust_inode_handle* file_inode;
+        fs::dentry_pointer file;
         // MUST BE aligned to 4kb boundary
         std::size_t file_offset;
     };

+ 6 - 10
include/kernel/mem/paging.hpp

@@ -6,6 +6,7 @@
 
 #include <stdint.h>
 
+#include <kernel/interrupt.hpp>
 #include <kernel/mem/paging_asm.h>
 #include <kernel/mem/phys.hpp>
 
@@ -27,10 +28,8 @@ constexpr int idx_p1(uintptr_t vaddr) noexcept {
     return (vaddr >> 12) & 0x1ff;
 }
 
-constexpr std::tuple<int, int, int, int, int> idx_all(
-    uintptr_t vaddr) noexcept {
-    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr),
-            idx_p1(vaddr)};
+constexpr std::tuple<int, int, int, int, int> idx_all(uintptr_t vaddr) noexcept {
+    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr), idx_p1(vaddr)};
 }
 
 // page frame number
@@ -74,9 +73,7 @@ class PSE {
 
     constexpr pfn_t pfn() const noexcept { return *m_ptrbase & ~PA_MASK; }
 
-    constexpr psattr_t attributes() const noexcept {
-        return *m_ptrbase & PA_MASK;
-    }
+    constexpr psattr_t attributes() const noexcept { return *m_ptrbase & PA_MASK; }
 
     constexpr PSE operator[](std::size_t nth) const noexcept {
         return PSE{m_ptrbase.phys() + 8 * nth};
@@ -135,7 +132,7 @@ constexpr unsigned long PAGE_FAULT_PK = 0x00000020;
 constexpr unsigned long PAGE_FAULT_SS = 0x00000040;
 constexpr unsigned long PAGE_FAULT_SGX = 0x00008000;
 
-void handle_page_fault(unsigned long err);
+void handle_page_fault(interrupt_stack* int_stack);
 
 class vaddr_range {
     std::size_t n;
@@ -156,8 +153,7 @@ class vaddr_range {
     bool is_privilege;
 
    public:
-    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end,
-                         bool is_privilege = false);
+    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool is_privilege = false);
     explicit vaddr_range(std::nullptr_t);
 
     vaddr_range begin() const noexcept;

+ 15 - 15
include/kernel/mem/vm_area.hpp

@@ -20,41 +20,41 @@ struct vm_area {
 
     unsigned long flags;
 
-    const fs::rust_inode_handle* mapped_file;
+    fs::dentry_pointer mapped_file;
     std::size_t file_offset;
 
     constexpr bool is_avail(uintptr_t ostart, uintptr_t oend) const noexcept {
         return (ostart >= end || oend <= start);
     }
 
-    constexpr bool operator<(const vm_area& rhs) const noexcept {
-        return end <= rhs.start;
-    }
-    constexpr bool operator<(uintptr_t rhs) const noexcept {
-        return end <= rhs;
-    }
-    friend constexpr bool operator<(uintptr_t lhs,
-                                    const vm_area& rhs) noexcept {
+    constexpr bool operator<(const vm_area& rhs) const noexcept { return end <= rhs.start; }
+    constexpr bool operator<(uintptr_t rhs) const noexcept { return end <= rhs; }
+    friend constexpr bool operator<(uintptr_t lhs, const vm_area& rhs) noexcept {
         return lhs < rhs.start;
     }
 
     constexpr vm_area(uintptr_t start, unsigned long flags, uintptr_t end,
-                      const fs::rust_inode_handle* mapped_file = nullptr,
-                      std::size_t offset = 0)
+                      fs::dentry_pointer mapped_file = nullptr, std::size_t offset = 0)
         : start{start}
         , end{end}
         , flags{flags}
-        , mapped_file{mapped_file}
+        , mapped_file{std::move(mapped_file)}
         , file_offset{offset} {}
 
     constexpr vm_area(uintptr_t start, unsigned long flags,
-                      const fs::rust_inode_handle* mapped_file = nullptr,
-                      std::size_t offset = 0)
+                      fs::dentry_pointer mapped_file = nullptr, std::size_t offset = 0)
         : start{start}
         , end{start}
         , flags{flags}
-        , mapped_file{mapped_file}
+        , mapped_file{std::move(mapped_file)}
         , file_offset{offset} {}
+
+    inline vm_area(const vm_area& other)
+        : start{other.start}
+        , end{other.end}
+        , flags{other.flags}
+        , mapped_file{d_get(other.mapped_file)}
+        , file_offset{other.file_offset} {}
 };
 
 } // namespace kernel::mem

+ 5 - 6
include/kernel/process.hpp

@@ -7,6 +7,7 @@
 #include <utility>
 
 #include <assert.h>
+#include <errno.h>
 #include <fcntl.h>
 #include <stdint.h>
 #include <sys/types.h>
@@ -27,7 +28,6 @@
 #include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/filearr.hpp>
 
 class process;
 
@@ -57,9 +57,8 @@ class process {
     std::list<wait_obj> waitprocs;
 
     process_attr attr{};
-    fs::filearray files;
-    fs::dentry_pointer cwd{};
-    mode_t umask{0022};
+    fs::rust_file_array files;
+    fs::rust_fs_context fs_context;
 
     pid_t pid{};
     pid_t ppid{};
@@ -67,7 +66,6 @@ class process {
     pid_t sid{};
 
     kernel::tty::tty* control_tty{};
-    struct fs::fs_context fs_context;
     std::set<pid_t> children;
 
    public:
@@ -164,7 +162,8 @@ class proclist final {
 
 void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn);
 /// @return true if returned normally, false if being interrupted
-bool schedule(void);
+bool schedule_now(void);
+bool schedule_now_preempt_disabled();
 void NORETURN schedule_noreturn(void);
 
 void NORETURN freeze(void);

+ 12 - 18
include/kernel/syscall.hpp

@@ -33,7 +33,6 @@ void handle_syscall64(int no, interrupt_stack* data, mmx_registers* mmxregs);
 namespace syscall {
     // in fileops.cc
     ssize_t do_write(int fd, const char __user* buf, size_t n);
-    ssize_t do_read(int fd, char __user* buf, size_t n);
     int do_close(int fd);
     int do_dup(int old_fd);
     int do_dup2(int old_fd, int new_fd);
@@ -42,19 +41,17 @@ namespace syscall {
     ssize_t do_getdents64(int fd, char __user* buf, size_t cnt);
     int do_open(const char __user* path, int flags, mode_t mode);
     int do_symlink(const char __user* target, const char __user* linkpath);
-    int do_readlink(const char __user* pathname, char __user* buf,
-                    size_t buf_size);
+    int do_readlink(const char __user* pathname, char __user* buf, size_t buf_size);
     int do_ioctl(int fd, unsigned long request, uintptr_t arg3);
     ssize_t do_readv(int fd, const iovec* iov, int iovcnt);
     ssize_t do_writev(int fd, const iovec* iov, int iovcnt);
     off_t do_lseek(int fd, off_t offset, int whence);
-    uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags,
-                            int fd, off_t pgoffset);
+    uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags, int fd,
+                            off_t pgoffset);
     int do_munmap(uintptr_t addr, size_t len);
-    ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset,
-                        size_t count);
-    int do_statx(int dirfd, const char __user* path, int flags,
-                 unsigned int mask, statx __user* statxbuf);
+    ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset, size_t count);
+    int do_statx(int dirfd, const char __user* path, int flags, unsigned int mask,
+                 statx __user* statxbuf);
     int do_fcntl(int fd, int cmd, unsigned long arg);
     int do_poll(pollfd __user* fds, nfds_t nfds, int timeout);
     int do_mknod(const char __user* pathname, mode_t mode, dev_t dev);
@@ -88,10 +85,9 @@ namespace syscall {
     int do_kill(pid_t pid, int sig);
     int do_tkill(pid_t pid, int sig);
     int do_rt_sigprocmask(int how, const kernel::sigmask_type __user* set,
-                          kernel::sigmask_type __user* oldset,
-                          size_t sigsetsize);
-    int do_rt_sigaction(int signum, const sigaction __user* act,
-                        sigaction __user* oldact, size_t sigsetsize);
+                          kernel::sigmask_type __user* oldset, size_t sigsetsize);
+    int do_rt_sigaction(int signum, const sigaction __user* act, sigaction __user* oldact,
+                        size_t sigsetsize);
     int do_newuname(new_utsname __user* buf);
 
     struct execve_retval {
@@ -100,14 +96,12 @@ namespace syscall {
         int status;
     };
 
-    execve_retval do_execve(const std::string& exec,
-                            const std::vector<std::string>& args,
+    execve_retval do_execve(const std::string& exec, const std::vector<std::string>& args,
                             const std::vector<std::string>& envs);
 
     // in mount.cc
-    int do_mount(const char __user* source, const char __user* target,
-                 const char __user* fstype, unsigned long flags,
-                 const void __user* _fsdata);
+    int do_mount(const char __user* source, const char __user* target, const char __user* fstype,
+                 unsigned long flags, const void __user* _fsdata);
 
     // in infoops.cc
     int do_clock_gettime(clockid_t clk_id, timespec __user* tp);

+ 1 - 1
include/kernel/task/thread.hpp

@@ -61,7 +61,7 @@ struct thread {
     int set_thread_area(user::user_desc* ptr);
     int load_thread_area32() const;
 
-    void set_attr(thd_attr_t new_attr);
+    void set_attr(thd_attr_t new_attr, bool forced = false);
 
     void send_signal(signal_list::signo_type signal);
 

+ 2 - 0
include/kernel/tty.hpp

@@ -45,6 +45,8 @@ class tty : public types::non_copyable {
     // TODO: formal poll support
     int poll();
 
+    int ioctl(int request, unsigned long arg3);
+
     constexpr void set_pgrp(pid_t pgid) { fg_pgroup = pgid; }
 
     constexpr pid_t get_pgrp(void) const { return fg_pgroup; }

+ 38 - 62
include/kernel/vfs.hpp

@@ -9,7 +9,6 @@
 
 #include <kernel/mem/paging.hpp>
 #include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/file.hpp>
 
 #define NODE_MAJOR(node) (((node) >> 8) & 0xFFU)
 #define NODE_MINOR(node) ((node) & 0xFFU)
@@ -31,71 +30,48 @@ struct chrdev_ops {
     chrdev_write write;
 };
 
-struct PACKED user_dirent {
-    ino_t d_ino;       // inode number
-    uint32_t d_off;    // ignored
-    uint16_t d_reclen; // length of this struct user_dirent
-    char d_name[1];    // file name with a padding zero
-    // uint8_t d_type; // file type, with offset of (d_reclen - 1)
-};
+int register_char_device(dev_t node, const chrdev_ops& ops);
+ssize_t char_device_read(dev_t node, char* buf, size_t buf_size, size_t n);
+ssize_t char_device_write(dev_t node, const char* buf, size_t n);
 
-struct PACKED user_dirent64 {
-    ino64_t d_ino;     // inode number
-    uint64_t d_off;    // implementation-defined field, ignored
-    uint16_t d_reclen; // length of this struct user_dirent
-    uint8_t d_type;    // file type, with offset of (d_reclen - 1)
-    char d_name[1];    // file name with a padding zero
-};
+class rust_file_array {
+   public:
+    struct handle;
+
+   private:
+    struct handle* m_handle;
+
+   public:
+    rust_file_array(struct handle* handle);
+    rust_file_array(const rust_file_array&) = delete;
+    ~rust_file_array();
+
+    constexpr rust_file_array(rust_file_array&& other) noexcept
+        : m_handle(std::exchange(other.m_handle, nullptr)) {}
 
-struct fs_context {
-    dentry_pointer root;
+    struct handle* get() const;
+    void drop();
 };
 
-int register_char_device(dev_t node, const chrdev_ops& ops);
-ssize_t char_device_read(dev_t node, char* buf, size_t buf_size, size_t n);
-ssize_t char_device_write(dev_t node, const char* buf, size_t n);
+class rust_fs_context {
+   public:
+    struct handle;
+
+   private:
+    struct handle* m_handle;
+
+   public:
+    rust_fs_context(struct handle* handle);
+    rust_fs_context(const rust_fs_context&) = delete;
+    ~rust_fs_context();
+
+    constexpr rust_fs_context(rust_fs_context&& other) noexcept
+        : m_handle(std::exchange(other.m_handle, nullptr)) {}
+
+    struct handle* get() const;
+    void drop();
+};
 
-extern "C" int fs_creat(struct dentry* at, mode_t mode);
-extern "C" int fs_mkdir(struct dentry* at, mode_t mode);
-extern "C" int fs_mknod(struct dentry* at, mode_t mode, dev_t sn);
-extern "C" int fs_unlink(struct dentry* at);
-extern "C" int fs_symlink(struct dentry* at, const char* target);
-
-extern "C" int fs_statx(const struct rust_inode_handle* inode,
-                        struct statx* stat, unsigned int mask);
-extern "C" int fs_readlink(const struct rust_inode_handle* inode, char* buf,
-                           size_t buf_size);
-extern "C" int fs_truncate(const struct rust_inode_handle* file, size_t size);
-extern "C" size_t fs_read(const struct rust_inode_handle* file, char* buf,
-                          size_t buf_size, size_t offset, size_t n);
-extern "C" size_t fs_write(const struct rust_inode_handle* file,
-                           const char* buf, size_t offset, size_t n);
-
-using readdir_callback_fn = std::function<int(const char*, size_t, ino_t)>;
-
-extern "C" ssize_t fs_readdir(const struct rust_inode_handle* file,
-                              size_t offset,
-                              const readdir_callback_fn* callback);
-
-extern "C" int fs_mount(dentry* mnt, const char* source,
-                        const char* mount_point, const char* fstype,
-                        unsigned long flags, const void* data);
-
-extern "C" mode_t r_get_inode_mode(struct rust_inode_handle* inode);
-extern "C" size_t r_get_inode_size(struct rust_inode_handle* inode);
-extern "C" bool r_dentry_is_directory(struct dentry* dentry);
-extern "C" bool r_dentry_is_invalid(struct dentry* dentry);
-
-// borrow from dentry->inode
-extern "C" struct rust_inode_handle* r_dentry_get_inode(struct dentry* dentry);
-extern "C" struct dentry* r_get_root_dentry();
-
-#define current_open(...) \
-    fs::open(current_process->fs_context, current_process->cwd, __VA_ARGS__)
-
-std::pair<dentry_pointer, int> open(const fs_context& context,
-                                    const dentry_pointer& cwd,
-                                    types::string_view path,
-                                    bool follow_symlinks = true);
+extern "C" size_t fs_read(struct dentry* file, char* buf, size_t buf_size, size_t offset, size_t n);
 
 } // namespace fs

+ 0 - 2
include/kernel/vfs/dentry.hpp

@@ -21,8 +21,6 @@ struct dentry_deleter {
 };
 
 using dentry_pointer = std::unique_ptr<struct dentry, dentry_deleter>;
-extern "C" int d_path(struct dentry* dentry, struct dentry* root,
-                      char* out_path, size_t buflen);
 dentry_pointer d_get(const dentry_pointer& dp);
 
 } // namespace fs

+ 0 - 106
include/kernel/vfs/file.hpp

@@ -1,106 +0,0 @@
-#pragma once
-
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/types.h>
-
-#include <types/buffer.hpp>
-#include <types/types.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-namespace fs {
-
-class pipe : public types::non_copyable {
-   private:
-    static constexpr size_t PIPE_SIZE = 4096;
-    static constexpr uint32_t READABLE = 1;
-    static constexpr uint32_t WRITABLE = 2;
-
-   private:
-    types::buffer buf;
-    uint32_t flags;
-    kernel::async::mutex mtx;
-
-    kernel::async::wait_list waitlist_r;
-    kernel::async::wait_list waitlist_w;
-
-   public:
-    pipe();
-
-    void close_read();
-    void close_write();
-
-    int write(const char* buf, size_t n);
-    int read(char* buf, size_t n);
-
-    constexpr bool is_readable() const { return flags & READABLE; }
-
-    constexpr bool is_writeable() const { return flags & WRITABLE; }
-};
-
-struct file {
-    struct file_flags {
-        uint32_t read : 1;
-        uint32_t write : 1;
-        uint32_t append : 1;
-    } flags{};
-
-    file(file_flags flags) : flags(flags) {}
-
-    virtual ~file() = default;
-
-    virtual ssize_t read(char* __user buf, size_t n) = 0;
-    virtual ssize_t do_write(const char* __user buf, size_t n) = 0;
-
-    virtual off_t seek(off_t n, int whence) {
-        return (void)n, (void)whence, -ESPIPE;
-    }
-
-    ssize_t write(const char* __user buf, size_t n) {
-        if (!flags.write)
-            return -EBADF;
-
-        if (flags.append) {
-            seek(0, SEEK_END);
-        }
-
-        return do_write(buf, n);
-    }
-
-    // regular files should override this method
-    virtual int getdents(char* __user buf, size_t cnt) {
-        return (void)buf, (void)cnt, -ENOTDIR;
-    }
-    virtual int getdents64(char* __user buf, size_t cnt) {
-        return (void)buf, (void)cnt, -ENOTDIR;
-    }
-};
-
-struct regular_file : public virtual file {
-    virtual ~regular_file() = default;
-    std::size_t cursor{};
-    struct rust_inode_handle* ind{};
-
-    regular_file(file_flags flags, size_t cursor, rust_inode_handle* ind);
-
-    virtual ssize_t read(char* __user buf, size_t n) override;
-    virtual ssize_t do_write(const char* __user buf, size_t n) override;
-    virtual off_t seek(off_t n, int whence) override;
-    virtual int getdents(char* __user buf, size_t cnt) override;
-    virtual int getdents64(char* __user buf, size_t cnt) override;
-};
-
-struct fifo_file : public virtual file {
-    virtual ~fifo_file() override;
-    std::shared_ptr<pipe> ppipe;
-
-    fifo_file(file_flags flags, std::shared_ptr<fs::pipe> ppipe);
-
-    virtual ssize_t read(char* __user buf, size_t n) override;
-    virtual ssize_t do_write(const char* __user buf, size_t n) override;
-};
-
-} // namespace fs

+ 0 - 51
include/kernel/vfs/filearr.hpp

@@ -1,51 +0,0 @@
-#pragma once
-
-#include "dentry.hpp"
-#include "file.hpp"
-
-#include <memory>
-
-#include <types/path.hpp>
-
-#include <kernel/vfs.hpp>
-
-namespace fs {
-
-class filearray {
-   private:
-    struct impl;
-    std::shared_ptr<impl> pimpl;
-    filearray(std::shared_ptr<impl>);
-
-   public:
-    filearray(const fs_context* ctx);
-    filearray(filearray&& other) = default;
-
-    filearray copy() const;
-    filearray share() const;
-
-    // dup old_fd to some random fd
-    int dup(int old_fd);
-
-    // dup old_fd to new_fd, close new_fd if it is already open
-    int dup(int old_fd, int new_fd, int flags);
-
-    // dup old_fd to the first available fd starting from min_fd
-    int dupfd(int fd, int min_fd, int flags);
-
-    fs::file* operator[](int i) const;
-    int set_flags(int fd, int flags);
-
-    int pipe(int (&pipefd)[2]);
-    int open(const dentry_pointer& cwd, types::string_view filepath, int flags,
-             mode_t mode);
-    int open(types::string_view filepath, int flags, mode_t mode);
-
-    int close(int fd);
-
-    // any call to member methods will be invalid after clear()
-    void clear();
-    void onexec();
-};
-
-} // namespace fs

+ 0 - 7
include/kernel/vfs/vfsfwd.hpp

@@ -6,13 +6,6 @@ namespace fs {
 struct dcache;
 struct dentry;
 
-// in file.hpp
-struct file;
-struct regular_file;
-struct fifo_file;
-
-class pipe;
-
 // in filearray.hpp
 class file_array;
 

+ 5 - 3
include/types/elf.hpp

@@ -148,9 +148,11 @@ struct PACKED elf32_section_header_entry {
 };
 
 struct elf32_load_data {
-    fs::dentry_pointer exec_dent;
-    const std::vector<std::string>& argv;
-    const std::vector<std::string>& envp;
+    struct dentry* exec_dent; // Owned
+    const char* const* argv;
+    size_t argv_count;
+    const char* const* envp;
+    size_t envp_count;
     uintptr_t ip;
     uintptr_t sp;
 };

+ 4 - 4
src/boot.s

@@ -128,8 +128,8 @@ start_32bit:
     # read kimage into memory
 	lea -16(%esp), %esp
     mov $KIMAGE_32K_COUNT, %ecx
-    mov $KERNEL_IMAGE_PADDR, 4(%esp) # destination address
-	mov $9, (%esp) # LBA
+    movl $KERNEL_IMAGE_PADDR, 4(%esp) # destination address
+	movl $9, (%esp) # LBA
 
 .Lread_kimage:
 	mov (%esp), %edi
@@ -139,8 +139,8 @@ start_32bit:
     call read_disk
 	mov %ebx, %ecx
 
-    add $0x8000, 4(%esp)
-	add $64, (%esp)
+    addl $0x8000, 4(%esp)
+	addl $64, (%esp)
 
     loop .Lread_kimage
 

+ 5 - 4
src/driver/ahci/command.rs

@@ -16,19 +16,20 @@ pub trait Command {
 }
 
 pub struct IdentifyCommand {
-    pages: [Page; 1],
+    page: Page,
 }
 
 impl IdentifyCommand {
     pub fn new() -> Self {
-        let page = Page::alloc_one();
-        Self { pages: [page] }
+        Self {
+            page: Page::alloc_one(),
+        }
     }
 }
 
 impl Command for IdentifyCommand {
     fn pages(&self) -> &[Page] {
-        &self.pages
+        core::slice::from_ref(&self.page)
     }
 
     fn lba(&self) -> u64 {

+ 35 - 38
src/driver/ahci/control.rs

@@ -1,9 +1,6 @@
-use crate::{
-    kernel::mem::phys::{NoCachePP, PhysPtr},
-    prelude::*,
-};
+use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
 
-use super::{vread, vwrite, GHC_IE};
+use super::{BitsIterator, GHC_IE};
 
 /// An `AdapterControl` is an HBA device Global Host Control block
 ///
@@ -12,7 +9,7 @@ use super::{vread, vwrite, GHC_IE};
 /// All reads and writes to this struct are volatile
 ///
 #[repr(C)]
-pub struct AdapterControl {
+struct AdapterControlData {
     capabilities: u32,
     global_host_control: u32,
     interrupt_status: u32,
@@ -29,50 +26,50 @@ pub struct AdapterControl {
     vendor: [u8; 96],
 }
 
+const CONTROL_CAP: usize = 0;
+const CONTROL_GHC: usize = 1;
+const CONTROL_IS: usize = 2;
+const CONTROL_PI: usize = 3;
+
+pub struct AdapterControl {
+    inner: *mut u32,
+}
+
+/// # Safety
+/// At the same time, exactly one instance of this struct may exist.
+unsafe impl Send for AdapterControl {}
+
 impl AdapterControl {
-    pub fn new<'lt>(addr: usize) -> &'lt mut Self {
-        NoCachePP::new(addr).as_mut()
+    pub fn new(addr: usize) -> Self {
+        Self {
+            inner: NoCachePP::new(addr).as_ptr(),
+        }
     }
 }
 
 impl AdapterControl {
-    pub fn enable_interrupts(&mut self) {
-        let ghc = vread(&self.global_host_control);
-        vwrite(&mut self.global_host_control, ghc | GHC_IE);
+    fn read(&self, off: usize) -> u32 {
+        unsafe { self.inner.offset(off as isize).read_volatile() }
     }
 
-    pub fn implemented_ports(&self) -> ImplementedPortsIter {
-        ImplementedPortsIter::new(vread(&self.ports_implemented))
+    fn write(&self, off: usize, value: u32) {
+        unsafe { self.inner.offset(off as isize).write_volatile(value) }
     }
-}
 
-pub struct ImplementedPortsIter {
-    ports: u32,
-    n: u32,
-}
-
-impl ImplementedPortsIter {
-    fn new(ports: u32) -> Self {
-        Self { ports, n: 0 }
+    pub fn enable_interrupts(&self) {
+        let ghc = self.read(CONTROL_GHC);
+        self.write(CONTROL_GHC, ghc | GHC_IE);
     }
-}
-
-impl Iterator for ImplementedPortsIter {
-    type Item = u32;
 
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.n == 32 {
-            return None;
-        }
+    pub fn implemented_ports(&self) -> BitsIterator {
+        BitsIterator::new(self.read(CONTROL_PI))
+    }
 
-        let have: bool = self.ports & 1 != 0;
-        self.ports >>= 1;
-        self.n += 1;
+    pub fn pending_interrupts(&self) -> BitsIterator {
+        BitsIterator::new(self.read(CONTROL_IS))
+    }
 
-        if have {
-            Some(self.n - 1)
-        } else {
-            self.next()
-        }
+    pub fn clear_interrupt(&self, no: u32) {
+        self.write(CONTROL_IS, 1 << no)
     }
 }

+ 33 - 33
src/driver/ahci/defs.rs

@@ -17,6 +17,33 @@ pub const PORT_CMD_FRE: u32 = 0x00000010;
 pub const PORT_CMD_FR: u32 = 0x00004000;
 pub const PORT_CMD_CR: u32 = 0x00008000;
 
+pub const PORT_IE_DHRE: u32 = 0x00000001;
+pub const PORT_IE_UFE: u32 = 0x00000010;
+pub const PORT_IE_INFE: u32 = 0x04000000;
+pub const PORT_IE_IFE: u32 = 0x08000000;
+pub const PORT_IE_HBDE: u32 = 0x10000000;
+pub const PORT_IE_IBFE: u32 = 0x20000000;
+pub const PORT_IE_TFEE: u32 = 0x40000000;
+
+pub const PORT_IE_DEFAULT: u32 = PORT_IE_DHRE
+    | PORT_IE_UFE
+    | PORT_IE_INFE
+    | PORT_IE_IFE
+    | PORT_IE_HBDE
+    | PORT_IE_IBFE
+    | PORT_IE_TFEE;
+
+pub const PORT_IS_DHRS: u32 = 0x00000001;
+pub const PORT_IS_UFS: u32 = 0x00000010;
+pub const PORT_IS_INFS: u32 = 0x04000000;
+pub const PORT_IS_IFS: u32 = 0x08000000;
+pub const PORT_IS_HBDS: u32 = 0x10000000;
+pub const PORT_IS_IBFS: u32 = 0x20000000;
+pub const PORT_IS_TFES: u32 = 0x40000000;
+
+pub const PORT_IS_ERROR: u32 =
+    PORT_IS_UFS | PORT_IS_INFS | PORT_IS_IFS | PORT_IS_HBDS | PORT_IS_IBFS;
+
 /// A `CommandHeader` is used to send commands to the HBA device
 ///
 /// # Access
@@ -29,47 +56,20 @@ pub struct CommandHeader {
     // [5]: ATAPI
     // [6]: Write
     // [7]: Prefetchable
-    first: u8,
+    pub first: u8,
 
     // [0]: Reset
     // [1]: BIST
     // [2]: Clear busy upon ok
     // [3]: Reserved
     // [4:7]: Port multiplier
-    second: u8,
-
-    prdt_length: u16,
-    bytes_transferred: u32,
-    command_table_base: u64,
-
-    _reserved: [u32; 4],
-}
-
-impl CommandHeader {
-    pub fn clear(&mut self) {
-        self.first = 0;
-        self.second = 0;
-        self.prdt_length = 0;
-        self.bytes_transferred = 0;
-        self.command_table_base = 0;
-        self._reserved = [0; 4];
-    }
-
-    pub fn setup(&mut self, cmdtable_base: u64, prdtlen: u16, write: bool) {
-        self.first = 0x05; // FIS type
-
-        if write {
-            self.first |= 0x40;
-        }
+    pub second: u8,
 
-        self.second = 0x04; // Clear busy upon ok
+    pub prdt_length: u16,
+    pub bytes_transferred: u32,
+    pub command_table_base: u64,
 
-        self.prdt_length = prdtlen;
-        self.bytes_transferred = 0;
-        self.command_table_base = cmdtable_base;
-
-        self._reserved = [0; 4];
-    }
+    pub _reserved: [u32; 4],
 }
 
 pub enum FisType {

+ 108 - 57
src/driver/ahci/mod.rs

@@ -1,9 +1,13 @@
 use crate::{
-    kernel::block::{make_device, BlockDevice},
+    fs::procfs,
+    kernel::{
+        block::{make_device, BlockDevice},
+        interrupt::register_irq_handler,
+    },
     prelude::*,
 };
 
-use alloc::sync::Arc;
+use alloc::{format, sync::Arc};
 use bindings::{
     kernel::hw::pci::{self, pci_device},
     EIO,
@@ -17,100 +21,149 @@ mod control;
 mod defs;
 mod port;
 
-fn vread<T: Sized + Copy>(refval: &T) -> T {
-    unsafe { core::ptr::read_volatile(refval) }
+pub struct BitsIterator {
+    data: u32,
+    n: u32,
 }
 
-fn vwrite<T: Sized + Copy>(refval: &mut T, val: T) {
-    unsafe { core::ptr::write_volatile(refval, val) }
+impl BitsIterator {
+    fn new(data: u32) -> Self {
+        Self { data, n: 0 }
+    }
 }
 
-fn spinwait_clear(refval: &u32, mask: u32) -> KResult<()> {
-    const SPINWAIT_MAX: usize = 1000;
+impl Iterator for BitsIterator {
+    type Item = u32;
 
-    let mut spins = 0;
-    while vread(refval) & mask != 0 {
-        if spins == SPINWAIT_MAX {
-            return Err(EIO);
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.n == 32 {
+            return None;
         }
 
-        spins += 1;
-    }
-
-    Ok(())
-}
-
-fn spinwait_set(refval: &u32, mask: u32) -> KResult<()> {
-    const SPINWAIT_MAX: usize = 1000;
+        let have: bool = self.data & 1 != 0;
+        self.data >>= 1;
+        self.n += 1;
 
-    let mut spins = 0;
-    while vread(refval) & mask != mask {
-        if spins == SPINWAIT_MAX {
-            return Err(EIO);
+        if have {
+            Some(self.n - 1)
+        } else {
+            self.next()
         }
-
-        spins += 1;
     }
+}
+
+/// Volatile load of a `T` through the raw pointer `refval`.
+///
+/// NOTE(review): this is a safe `fn` that dereferences a raw pointer; callers
+/// are trusted to pass a pointer that is valid for reads.
+fn vread<T: Sized + Copy>(refval: *const T) -> T {
+    unsafe { core::ptr::read_volatile(refval) }
+}
 
-    Ok(())
+/// Volatile store of `val` through the raw pointer `refval`.
+///
+/// NOTE(review): this is a safe `fn` that dereferences a raw pointer; callers
+/// are trusted to pass a pointer that is valid for writes.
+fn vwrite<T: Sized + Copy>(refval: *mut T, val: T) {
+    unsafe { core::ptr::write_volatile(refval, val) }
 }
 
-struct Device<'lt, 'port> {
+struct Device {
     control_base: usize,
-    control: &'lt mut AdapterControl,
+    control: AdapterControl,
     // TODO: impl Drop to free pci device
     pcidev: *mut pci_device,
-    ports: Vec<Option<Arc<Mutex<AdapterPort<'port>>>>>,
+    /// # Lock
+    /// Might be accessed from irq handler, use with `lock_irq()`
+    ports: Spin<[Option<Arc<AdapterPort>>; 32]>,
 }
 
-impl<'lt, 'port: 'static> Device<'lt, 'port> {
-    fn probe_ports(&mut self) -> KResult<()> {
-        for nport in self.control.implemented_ports() {
-            let mut port = AdapterPort::<'port>::new(self.control_base, nport);
+/// # Safety
+/// `pcidev` is never accessed from Rust code
+/// TODO!!!: place *mut pci_device in a safe wrapper
+unsafe impl Send for Device {}
+unsafe impl Sync for Device {}
 
+impl Device {
+    fn probe_ports(&self) -> KResult<()> {
+        for nport in self.control.implemented_ports() {
+            let port = Arc::new(AdapterPort::new(self.control_base, nport));
             if !port.status_ok() {
                 continue;
             }
 
-            port.init()?;
+            self.ports.lock_irq()[nport as usize] = Some(port.clone());
+            if let Err(e) = (|| -> KResult<()> {
+                port.init()?;
+
+                {
+                    let port = port.clone();
+                    let name = format!("ahci-p{}-stats", port.nport);
+                    procfs::populate_root(name.into_bytes().into(), move |buffer| {
+                        writeln!(buffer, "{:?}", port.stats.lock().as_ref()).map_err(|_| EIO)
+                    })?;
+                }
+
+                let port = BlockDevice::register_disk(
+                    make_device(8, nport * 16),
+                    2147483647, // TODO: get size from device
+                    port,
+                )?;
+
+                port.partprobe()?;
+
+                Ok(())
+            })() {
+                self.ports.lock_irq()[nport as usize] = None;
+                println_warn!("probe port {nport} failed with {e}");
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Dispatch a controller interrupt to every port that reports one.
+    ///
+    /// For each port returned by `control.pending_interrupts()` the port's
+    /// interrupt status is read. Error bits (`PORT_IS_ERROR`) are logged and
+    /// acknowledged; otherwise the completion bit (`PORT_IS_DHRS`) is
+    /// acknowledged and the port's own `handle_interrupt` retires the
+    /// finished command slots.
+    fn handle_interrupt(&self) {
+        // `self.ports` is taken with plain `lock()` because this function is
+        // itself the irq handler.
+        let ports = self.ports.lock();
+        for nport in self.control.pending_interrupts() {
+            let Some(port) = ports[nport as usize].as_ref() else {
+                println_warn!("port {nport} not found");
+                continue;
+            };
+
+            let status = vread(port.interrupt_status());
 
-            let port = Arc::new(Mutex::new(port));
+            if status & PORT_IS_ERROR != 0 {
+                println_warn!("port {nport} SATA error");
+
+                // Acknowledge the error bits and the controller-level pending
+                // bit; leaving them set keeps the same interrupt pending.
+                // NOTE(review): some status bits may additionally need the
+                // SATA error register cleared — confirm against the AHCI spec.
+                vwrite(port.interrupt_status(), status & PORT_IS_ERROR);
+                self.control.clear_interrupt(nport);
+                continue;
+            }
 
-            self.ports[nport as usize] = Some(port.clone());
+            debug_assert!(status & PORT_IS_DHRS != 0);
+            vwrite(port.interrupt_status(), PORT_IS_DHRS);
 
-            let port = BlockDevice::register_disk(
-                make_device(8, nport * 16),
-                2147483647, // TODO: get size from device
-                port,
-            )?;
+            self.control.clear_interrupt(nport);
 
-            port.partprobe()?;
+            port.handle_interrupt();
         }
-
-        Ok(())
     }
 }
 
-impl<'lt: 'static, 'port: 'static> Device<'lt, 'port> {
-    pub fn new(pcidev: *mut pci_device) -> KResult<Self> {
+impl Device {
+    pub fn new(pcidev: *mut pci_device) -> KResult<Arc<Self>> {
         let base = unsafe { *(*pcidev).header_type0() }.bars[PCI_REG_ABAR];
+        let irqno = unsafe { *(*pcidev).header_type0() }.interrupt_line;
 
         // use MMIO
         if base & 0xf != 0 {
             return Err(EIO);
         }
 
-        let mut ports = Vec::with_capacity(32);
-        ports.resize_with(32, || None);
-
-        let mut device = Device {
+        let device = Arc::new(Device {
             control_base: base as usize,
             control: AdapterControl::new(base as usize),
             pcidev,
-            ports,
-        };
+            ports: Spin::new([const { None }; 32]),
+        });
 
         device.control.enable_interrupts();
+
+        let device_irq = device.clone();
+        register_irq_handler(irqno as i32, move || device_irq.handle_interrupt())?;
+
         device.probe_ports()?;
 
         Ok(device)
@@ -123,15 +176,13 @@ unsafe extern "C" fn probe_device(pcidev: *mut pci_device) -> i32 {
             // TODO!!!: save device to pci_device
             Box::leak(Box::new(device));
             0
-        },
+        }
         Err(e) => -(e as i32),
     }
 }
 
 pub fn register_ahci_driver() {
-    let ret = unsafe {
-        pci::register_driver_r(VENDOR_INTEL, DEVICE_AHCI, Some(probe_device))
-    };
+    let ret = unsafe { pci::register_driver_r(VENDOR_INTEL, DEVICE_AHCI, Some(probe_device)) };
 
     assert_eq!(ret, 0);
 }

+ 268 - 58
src/driver/ahci/port.rs

@@ -1,4 +1,5 @@
-use bindings::EINVAL;
+use alloc::collections::vec_deque::VecDeque;
+use bindings::{EINVAL, EIO};
 
 use crate::prelude::*;
 
@@ -6,14 +7,29 @@ use crate::kernel::block::{BlockDeviceRequest, BlockRequestQueue};
 use crate::kernel::mem::paging::Page;
 
 use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
+use crate::sync::condvar::CondVar;
 
 use super::command::{Command, IdentifyCommand, ReadLBACommand};
 use super::{
-    spinwait_clear, vread, vwrite, CommandHeader, PRDTEntry, ReceivedFis,
-    ATA_DEV_BSY, ATA_DEV_DRQ, FISH2D, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE,
-    PORT_CMD_ST,
+    vread, vwrite, CommandHeader, PRDTEntry, FISH2D, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE,
+    PORT_CMD_ST, PORT_IE_DEFAULT,
 };
 
+/// Poll the `u32` behind `refval` (via `vread`) until every bit in `mask`
+/// reads as zero.
+///
+/// Returns `Err(EIO)` once the value has been sampled `SPINWAIT_MAX + 1`
+/// times without the masked bits clearing.
+fn spinwait_clear(refval: *const u32, mask: u32) -> KResult<()> {
+    const SPINWAIT_MAX: usize = 1000;
+
+    for _ in 0..=SPINWAIT_MAX {
+        if vread(refval) & mask == 0 {
+            return Ok(());
+        }
+    }
+
+    Err(EIO)
+}
+
 /// An `AdapterPort` is an HBA device in AHCI mode.
 ///
 /// # Access
@@ -49,92 +65,291 @@ pub struct AdapterPortData {
     vendor: [u32; 4],
 }
 
-pub struct AdapterPort<'lt> {
-    nport: u32,
-    data: &'lt mut AdapterPortData,
+/// Lifecycle state of one command slot, as tracked by `send_command` and the
+/// port's interrupt handler.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum SlotState {
+    /// Initial state; `send_command` also resets the slot to this before
+    /// releasing it.
+    Idle,
+    /// Set by `send_command` right before the command is issued.
+    Working,
+    /// Set when the command is observed complete, either by `send_command`
+    /// itself or by `AdapterPort::handle_interrupt`.
+    Finished,
+    /// Mapped to `EIO` by `send_command`.
+    /// NOTE(review): nothing in the visible code assigns this state yet.
+    Error,
+}
+
+struct CommandSlotInner {
+    state: SlotState,
+    /// # Usage
+    /// `cmdheader` might be used in irq handler. So in order to wait for
+    /// commands to finish, we should use `lock_irq` on `cmdheader`
+    cmdheader: *mut CommandHeader,
+}
+
+/// # Safety
+/// This is safe because the `cmdheader` is not shared between threads
+unsafe impl Send for CommandSlotInner {}
+
+impl CommandSlotInner {
+    /// Fill in the command header this slot points at.
+    ///
+    /// * `cmdtable_base` - stored into `command_table_base`
+    /// * `prdtlen` - stored into `prdt_length`
+    /// * `write` - when set, bit 6 (`0x40`, "Write") is OR-ed into `first`
+    ///
+    /// Panics if `cmdheader` is null.
+    pub fn setup(&mut self, cmdtable_base: u64, prdtlen: u16, write: bool) {
+        let header = unsafe { self.cmdheader.as_mut() }.unwrap();
+
+        // Low bits are 0x05 (the original comment calls this "FIS type").
+        header.first = if write { 0x05 | 0x40 } else { 0x05 };
+        header.second = 0x00;
+
+        header.prdt_length = prdtlen;
+        header.bytes_transferred = 0;
+        header.command_table_base = cmdtable_base;
+        header._reserved = [0; 4];
+    }
+}
+
+struct CommandSlot {
+    inner: Spin<CommandSlotInner>,
+    cv: CondVar,
+}
+
+impl CommandSlot {
+    /// Create an idle slot that drives the command header at `cmdheader`.
+    fn new(cmdheader: *mut CommandHeader) -> Self {
+        let inner = CommandSlotInner {
+            state: SlotState::Idle,
+            cmdheader,
+        };
+
+        Self {
+            inner: Spin::new(inner),
+            cv: CondVar::new(),
+        }
+    }
+}
+
+struct FreeList {
+    free: VecDeque<u32>,
+    working: VecDeque<u32>,
+}
+
+impl FreeList {
+    /// Start with slots `0..32` all free and nothing in flight.
+    fn new() -> Self {
+        let free: VecDeque<u32> = (0..32).collect();
+        let working = VecDeque::new();
+
+        Self { free, working }
+    }
+}
+
+/// Per-port counters; the probe code prints them with `{:?}` through a
+/// procfs entry.
+#[derive(Default, Debug)]
+pub struct AdapterPortStats {
+    /// Number of commands sent.
+    /// Incremented by `send_command` when a command ends in
+    /// `SlotState::Finished`.
+    cmd_sent: u64,
+
+    /// Number of transmission errors.
+    /// Incremented by `send_command` when a command ends in
+    /// `SlotState::Error`.
+    cmd_error: u64,
+
+    /// Number of interrupts fired.
+    /// NOTE(review): `AdapterPort::handle_interrupt` increments this once per
+    /// retired command slot, not once per interrupt.
+    int_fired: u64,
+}
+
+pub struct AdapterPort {
+    pub nport: u32,
+    regs: *mut (),
     page: Page,
-    cmdheaders: &'lt mut [CommandHeader; 32],
-    recv_fis: &'lt mut ReceivedFis,
+    slots: [CommandSlot; 32],
+    free_list: Spin<FreeList>,
+    free_list_cv: CondVar,
+
+    /// Statistics for this port
+    pub stats: Spin<AdapterPortStats>,
 }
 
-impl<'lt> AdapterPort<'lt> {
+/// # Safety
+/// NOTE(review): the original justification ("can be accessed by only one
+/// thread at the same time") does not match the visible usage: a port is
+/// shared through `Arc` between `send_command` callers and the irq handler.
+/// What the code relies on instead is that `free_list`, `stats` and every
+/// slot's `inner` sit behind `Spin`, and that `regs` is only dereferenced
+/// through `vread`/`vwrite` — confirm that this is sufficient.
+unsafe impl Send for AdapterPort {}
+unsafe impl Sync for AdapterPort {}
+
+impl AdapterPort {
     pub fn new(base: usize, nport: u32) -> Self {
         let page = Page::alloc_one();
+        let cmdheaders_start = page.as_cached().as_ptr::<CommandHeader>();
+
         Self {
             nport,
-            data: NoCachePP::new(base + 0x100 + 0x80 * nport as usize).as_mut(),
-            cmdheaders: page.as_cached().as_mut(),
-            recv_fis: page.as_cached().offset(0x400).as_mut(),
+            regs: NoCachePP::new(base + 0x100 + 0x80 * nport as usize).as_ptr(),
+            slots: core::array::from_fn(|index| {
+                CommandSlot::new(unsafe { cmdheaders_start.offset(index as isize) })
+            }),
+            free_list: Spin::new(FreeList::new()),
+            free_list_cv: CondVar::new(),
             page,
+            stats: Spin::default(),
         }
     }
 }
 
-impl<'lt> AdapterPort<'lt> {
+impl AdapterPort {
+    /// Raw pointer to the 64-bit command list base register at `regs + 0x00`.
+    /// `init` stores the physical address of `page` here.
+    fn command_list_base(&self) -> *mut u64 {
+        unsafe { self.regs.byte_offset(0x00).cast() }
+    }
+
+    /// Raw pointer to the 64-bit FIS base register at `regs + 0x08`.
+    /// `init` stores the physical address of `page` plus `0x400` here.
+    fn fis_base(&self) -> *mut u64 {
+        unsafe { self.regs.byte_offset(0x08).cast() }
+    }
+
+    /// Raw pointer to the 32-bit SATA status register at `regs + 0x28`;
+    /// read by `status_ok`.
+    fn sata_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x28).cast() }
+    }
+
+    /// Raw pointer to the 32-bit command/status register at `regs + 0x18`,
+    /// which carries the `PORT_CMD_*` bits used by `start_command` and
+    /// `stop_command`.
+    fn command_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x18).cast() }
+    }
+
+    /// Raw pointer to the 32-bit command issue register at `regs + 0x38`.
+    /// `send_command` writes `1 << slot` here and polls for that bit to clear.
+    fn command_issue(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x38).cast() }
+    }
+
+    /// Raw pointer to the 32-bit interrupt status register at `regs + 0x10`,
+    /// which carries the `PORT_IS_*` bits examined by the device irq handler.
+    pub fn interrupt_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x10).cast() }
+    }
+
+    /// Raw pointer to the 32-bit interrupt enable register at `regs + 0x14`.
+    /// `init` writes `PORT_IE_DEFAULT` here.
+    pub fn interrupt_enable(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x14).cast() }
+    }
+
     pub fn status_ok(&self) -> bool {
-        self.data.sata_status & 0xf == 0x3
+        vread(self.sata_status()) & 0xf == 0x3
     }
 
-    fn stop_command(&mut self) -> KResult<()> {
-        let cmd_status = vread(&self.data.command_status);
+    /// Take the next index off the free list, waiting on `free_list_cv`
+    /// for as long as the list is empty.
+    fn get_free_slot(&self) -> u32 {
+        let mut guard = self.free_list.lock_irq();
+
+        loop {
+            if let Some(index) = guard.free.pop_front() {
+                return index;
+            }
+
+            self.free_list_cv.wait(&mut guard, false);
+        }
+    }
+
+    /// Record `slot` as in flight so that `handle_interrupt` will look at it.
+    ///
+    /// Uses plain `lock()`; the only visible caller (`send_command`) already
+    /// holds the slot's `lock_irq()` guard at that point.
+    fn save_working(&self, slot: u32) {
+        self.free_list.lock().working.push_back(slot);
+    }
+
+    /// Return `slot` to the free list and wake one waiter blocked in
+    /// `get_free_slot`.
+    fn release_free_slot(&self, slot: u32) {
+        self.free_list.lock().free.push_back(slot);
+        self.free_list_cv.notify_one();
+    }
+
+    /// Retire every in-flight slot whose bit in the command issue register
+    /// has cleared.
+    ///
+    /// Each retired slot is marked `SlotState::Finished`, its waiters are
+    /// notified, and it is removed from `free_list.working`. Slots whose bit
+    /// is still set stay in the working list.
+    pub fn handle_interrupt(&self) {
+        // Snapshot of the issue register; a set bit means the slot is still busy.
+        let ci = vread(self.command_issue());
+
+        // no need to use `lock_irq()` inside interrupt handler
+        let mut free_list = self.free_list.lock();
+
+        // `retain` keeps entries for which the closure returns `true`.
+        free_list.working.retain(|&n| {
+            if ci & (1 << n) != 0 {
+                return true;
+            }
+
+            let slot = &self.slots[n as usize];
+
+            // TODO: check error
+            // Lock nesting here: free_list -> slot.inner -> stats.
+            let mut slot_inner = slot.inner.lock();
+            debug_assert_eq!(slot_inner.state, SlotState::Working);
+            slot_inner.state = SlotState::Finished;
+            slot.cv.notify_all();
+            self.stats.lock().int_fired += 1;
+
+            false
+        });
+    }
+
+    fn stop_command(&self) -> KResult<()> {
         vwrite(
-            &mut self.data.command_status,
-            cmd_status & !(PORT_CMD_ST | PORT_CMD_FRE),
+            self.command_status(),
+            vread(self.command_status()) & !(PORT_CMD_ST | PORT_CMD_FRE),
         );
 
-        spinwait_clear(&self.data.command_status, PORT_CMD_CR | PORT_CMD_FR)
+        spinwait_clear(self.command_status(), PORT_CMD_CR | PORT_CMD_FR)
     }
 
-    fn start_command(&mut self) -> KResult<()> {
-        spinwait_clear(&self.data.command_status, PORT_CMD_CR)?;
+    fn start_command(&self) -> KResult<()> {
+        spinwait_clear(self.command_status(), PORT_CMD_CR)?;
 
-        let cmd_status = vread(&self.data.command_status);
+        let cmd_status = vread(self.command_status());
         vwrite(
-            &mut self.data.command_status,
+            self.command_status(),
             cmd_status | PORT_CMD_ST | PORT_CMD_FRE,
         );
 
         Ok(())
     }
 
-    fn send_command(&mut self, cmd: &impl Command) -> KResult<()> {
-        let pages = cmd.pages();
-
-        // TODO: get an available command slot
-        let cmdslot = 0;
+    /// Issue `cmd` on a free command slot and wait for it to complete.
+    ///
+    /// Returns `EINVAL` if the command carries more pages than the command
+    /// table has PRDT entries, and `EIO` if the slot ends in
+    /// `SlotState::Error`.
+    ///
+    /// # Might Sleep
+    /// This function **might sleep**, so call it in a preemptible context
+    fn send_command(&self, cmd: &impl Command) -> KResult<()> {
+        might_sleep!();
 
+        let pages = cmd.pages();
+
+        // The PRDT laid out in the command table below has exactly 248
+        // entries; reject larger requests instead of indexing out of bounds.
+        if pages.len() > 248 {
+            return Err(EINVAL);
+        }
+
         let cmdtable_page = Page::alloc_one();
-        self.cmdheaders[cmdslot].clear();
-        self.cmdheaders[cmdslot].setup(
-            cmdtable_page.as_phys() as u64,
-            pages.len() as u16,
-            cmd.write(),
-        );
 
         let command_fis: &mut FISH2D = cmdtable_page.as_cached().as_mut();
         command_fis.setup(cmd.cmd(), cmd.lba(), cmd.count());
 
-        let prdt: &mut [PRDTEntry; 248] =
-            cmdtable_page.as_cached().offset(0x80).as_mut();
+        let prdt: &mut [PRDTEntry; 248] = cmdtable_page.as_cached().offset(0x80).as_mut();
 
         for (idx, page) in pages.iter().enumerate() {
             prdt[idx].setup(page);
         }
 
-        // clear received fis?
+        let slot_index = self.get_free_slot() as usize;
+        let slot_object = &self.slots[slot_index];
 
-        // wait until port is not busy
-        spinwait_clear(&self.data.task_file_data, ATA_DEV_BSY | ATA_DEV_DRQ)?;
+        let mut slot = slot_object.inner.lock_irq();
 
-        vwrite(&mut self.data.command_issue, 1 << cmdslot);
-        spinwait_clear(&self.data.command_issue, 1 << cmdslot)?;
+        slot.setup(
+            cmdtable_page.as_phys() as u64,
+            pages.len() as u16,
+            cmd.write(),
+        );
+        slot.state = SlotState::Working;
+
+        // should we clear received fis here?
+        debug_assert!(vread(self.command_issue()) & (1 << slot_index) == 0);
+        vwrite(self.command_issue(), 1 << slot_index);
+
+        // Fast path: the issue bit clears while we spin. Otherwise register
+        // the slot as in flight once and sleep until the irq handler moves it
+        // out of `Working`.
+        if spinwait_clear(self.command_issue(), 1 << slot_index).is_err() {
+            let mut saved = false;
+            while slot.state == SlotState::Working {
+                if !saved {
+                    saved = true;
+                    self.save_working(slot_index as u32);
+                }
+                slot_object.cv.wait(&mut slot, false);
+            }
+        } else {
+            // TODO: check error
+            slot.state = SlotState::Finished;
+        }
 
-        // TODO: check and wait interrupt
+        let state = slot.state;
+        slot.state = SlotState::Idle;
 
-        Ok(())
+        debug_assert_ne!(state, SlotState::Working);
+        self.release_free_slot(slot_index as u32);
+
+        match state {
+            SlotState::Finished => {
+                self.stats.lock().cmd_sent += 1;
+                Ok(())
+            }
+            SlotState::Error => {
+                self.stats.lock().cmd_error += 1;
+                Err(EIO)
+            }
+            _ => panic!("Invalid slot state"),
+        }
     }
 
-    fn identify(&mut self) -> KResult<()> {
+    fn identify(&self) -> KResult<()> {
         let cmd = IdentifyCommand::new();
 
         // TODO: check returned data
@@ -143,43 +358,38 @@ impl<'lt> AdapterPort<'lt> {
         Ok(())
     }
 
-    pub fn init(&mut self) -> KResult<()> {
+    pub fn init(&self) -> KResult<()> {
         self.stop_command()?;
 
-        // TODO: use interrupt
-        // this is the PxIE register, setting bits here will make
-        //      it generate corresponding interrupts in PxIS
-        //
-        // port->interrupt_enable = 1;
+        vwrite(self.interrupt_enable(), PORT_IE_DEFAULT);
 
-        vwrite(&mut self.data.command_list_base, self.page.as_phys() as u64);
-        vwrite(&mut self.data.fis_base, self.page.as_phys() as u64 + 0x400);
+        vwrite(self.command_list_base(), self.page.as_phys() as u64);
+        vwrite(self.fis_base(), self.page.as_phys() as u64 + 0x400);
 
         self.start_command()?;
 
         match self.identify() {
             Err(err) => {
                 self.stop_command()?;
-                return Err(err);
+                Err(err)
             }
             Ok(_) => Ok(()),
         }
     }
 }
 
-impl<'lt> BlockRequestQueue for AdapterPort<'lt> {
+impl BlockRequestQueue for AdapterPort {
     fn max_request_pages(&self) -> u64 {
         1024
     }
 
-    fn submit(&mut self, req: BlockDeviceRequest) -> KResult<()> {
+    fn submit(&self, req: BlockDeviceRequest) -> KResult<()> {
         // TODO: check disk size limit using newtype
         if req.count > 65535 {
             return Err(EINVAL);
         }
 
-        let command =
-            ReadLBACommand::new(req.buffer, req.sector, req.count as u16)?;
+        let command = ReadLBACommand::new(req.buffer, req.sector, req.count as u16)?;
 
         self.send_command(&command)
     }

+ 23 - 11
src/driver/e1000e.rs

@@ -1,3 +1,5 @@
+use crate::prelude::*;
+
 use crate::bindings::root::kernel::hw::pci;
 use crate::kernel::interrupt::register_irq_handler;
 use crate::kernel::mem::paging::copy_to_page;
@@ -56,6 +58,23 @@ fn test(val: u32, bit: u32) -> bool {
     (val & bit) == bit
 }
 
+struct PrintableBytes<'a>(&'a [u8]);
+
+impl core::fmt::Debug for PrintableBytes<'_> {
+    /// Dump the wrapped bytes as two-digit hex values, 16 per line, between
+    /// `PrintableBytes {` and `}`.
+    ///
+    /// Hex with zero padding matches the `{:02x}` packet dump this type
+    /// replaces; plain `{byte}` printed unpadded decimal.
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "PrintableBytes {{")?;
+        for chunk in self.0.chunks(16) {
+            for &byte in chunk {
+                write!(f, "{byte:02x} ")?;
+            }
+            writeln!(f)?;
+        }
+        write!(f, "}}")
+    }
+}
+
 impl netdev::Netdev for E1000eDev {
     fn mac(&self) -> netdev::Mac {
         self.mac
@@ -151,17 +170,10 @@ impl netdev::Netdev for E1000eDev {
                 )
             };
 
-            use crate::{dont_check, print, println};
-            dont_check!(println!("==== e1000e: received packet ===="));
-
-            for i in 0..len {
-                if i % 16 == 0 {
-                    dont_check!(println!());
-                }
-                dont_check!(print!("{:02x} ", data[i]));
-            }
-
-            dont_check!(println!("\n\n====  e1000e: end of packet  ===="));
+            println_debug!(
+                "e1000e: received {len} bytes, {:?}",
+                PrintableBytes(data)
+            );
             self.rx_tail = Some(next_tail);
         }
 

+ 137 - 148
src/fs/fat32.rs

@@ -1,4 +1,10 @@
-use alloc::{sync::Arc, vec::Vec};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+
+use alloc::{
+    collections::btree_map::BTreeMap,
+    sync::{Arc, Weak},
+    vec::Vec,
+};
 use bindings::{EINVAL, EIO, S_IFDIR, S_IFREG};
 
 use itertools::Itertools;
@@ -10,10 +16,10 @@ use crate::{
         mem::{paging::Page, phys::PhysPtr},
         vfs::{
             dentry::Dentry,
-            inode::{Ino, Inode, InodeCache, InodeOps},
+            inode::{define_struct_inode, Ino, Inode, InodeData},
             mount::{register_filesystem, Mount, MountCreator},
             vfs::Vfs,
-            DevId, ReadDirCallback,
+            DevId,
         },
     },
     prelude::*,
@@ -131,19 +137,35 @@ struct Bootsector {
     mbr_signature: u16,
 }
 
+impl_any!(FatFs);
 /// # Lock order
-/// 1. FatFs
 /// 2. FatTable
 /// 3. Inodes
 ///
 struct FatFs {
-    device: Arc<BlockDevice>,
-    icache: Mutex<InodeCache<FatFs>>,
     sectors_per_cluster: u8,
     rootdir_cluster: ClusterNo,
     data_start: u64,
-    fat: Mutex<Vec<ClusterNo>>,
-    volume_label: String,
+    volume_label: [u8; 11],
+
+    device: Arc<BlockDevice>,
+    fat: RwSemaphore<Vec<ClusterNo>>,
+    weak: Weak<FatFs>,
+    icache: BTreeMap<Ino, FatInode>,
+}
+
+/// `Vfs` implementation for the FAT filesystem.
+impl Vfs for FatFs {
+    /// I/O block size reported for this filesystem: a fixed 4096.
+    fn io_blksize(&self) -> usize {
+        4096
+    }
+
+    /// Device id of the block device backing this filesystem.
+    fn fs_devid(&self) -> DevId {
+        self.device.devid()
+    }
+
+    /// Always `true`: this filesystem reports itself as read-only.
+    fn is_read_only(&self) -> bool {
+        true
+    }
+}
 
 impl FatFs {
@@ -151,8 +173,7 @@ impl FatFs {
         let cluster = cluster - 2;
 
         let rq = BlockDeviceRequest {
-            sector: self.data_start as u64
-                + cluster as u64 * self.sectors_per_cluster as u64,
+            sector: self.data_start as u64 + cluster as u64 * self.sectors_per_cluster as u64,
             count: self.sectors_per_cluster as u64,
             buffer: core::slice::from_ref(buf),
         };
@@ -160,57 +181,34 @@ impl FatFs {
 
         Ok(())
     }
-}
-
-impl InodeCache<FatFs> {
-    fn get_or_alloc(
-        &mut self,
-        ino: Ino,
-        is_directory: bool,
-        size: u64,
-    ) -> KResult<Arc<Inode>> {
-        self.get(ino).map(|inode| Ok(inode)).unwrap_or_else(|| {
-            let nlink;
-            let mut mode = 0o777;
-
-            let ops: Box<dyn InodeOps>;
-
-            if is_directory {
-                nlink = 2;
-                mode |= S_IFDIR;
-                ops = Box::new(DirOps);
-            } else {
-                nlink = 1;
-                mode |= S_IFREG;
-                ops = Box::new(FileOps);
-            }
-
-            let mut inode = self.alloc(ino, ops);
-            let inode_mut = unsafe { Arc::get_mut_unchecked(&mut inode) };
-            let inode_idata = inode_mut.idata.get_mut();
 
-            inode_idata.mode = mode;
-            inode_idata.nlink = nlink;
-            inode_idata.size = size;
-
-            self.submit(&inode)?;
-
-            Ok(inode)
-        })
+    /// Return the cached inode for `ino`, or build a fresh `DirInode` /
+    /// `FileInode` (chosen by `is_directory`) of the given `size`.
+    ///
+    /// NOTE(review): the newly built inode is not inserted into `icache`
+    /// here, and nothing visible in this file populates `icache`, so every
+    /// call appears to allocate a new inode — confirm whether that is
+    /// intended.
+    fn get_or_alloc_inode(&self, ino: Ino, is_directory: bool, size: u32) -> Arc<dyn Inode> {
+        self.icache
+            .get(&ino)
+            .cloned()
+            .map(FatInode::unwrap)
+            .unwrap_or_else(|| {
+                if is_directory {
+                    DirInode::new(ino, self.weak.clone(), size)
+                } else {
+                    FileInode::new(ino, self.weak.clone(), size)
+                }
+            })
     }
 }
 
 impl FatFs {
-    pub fn create(device: DevId) -> KResult<(Arc<Self>, Arc<Inode>)> {
+    pub fn create(device: DevId) -> KResult<(Arc<Self>, Arc<dyn Inode>)> {
         let device = BlockDevice::get(device)?;
-        let mut fatfs_arc = Arc::new_cyclic(|weak| Self {
+        let mut fatfs_arc = Arc::new_cyclic(|weak: &Weak<FatFs>| Self {
             device,
-            icache: Mutex::new(InodeCache::new(weak.clone())),
             sectors_per_cluster: 0,
             rootdir_cluster: 0,
             data_start: 0,
-            fat: Mutex::new(Vec::new()),
-            volume_label: String::new(),
+            fat: RwSemaphore::new(Vec::new()),
+            weak: weak.clone(),
+            icache: BTreeMap::new(),
+            volume_label: [0; 11],
         });
 
         let fatfs = unsafe { Arc::get_mut_unchecked(&mut fatfs_arc) };
@@ -221,13 +219,13 @@ impl FatFs {
 
         fatfs.sectors_per_cluster = info.sectors_per_cluster;
         fatfs.rootdir_cluster = info.root_cluster;
-        fatfs.data_start = info.reserved_sectors as u64
-            + info.fat_copies as u64 * info.sectors_per_fat as u64;
+        fatfs.data_start =
+            info.reserved_sectors as u64 + info.fat_copies as u64 * info.sectors_per_fat as u64;
 
         let fat = fatfs.fat.get_mut();
+
         fat.resize(
-            512 * info.sectors_per_fat as usize
-                / core::mem::size_of::<ClusterNo>(),
+            512 * info.sectors_per_fat as usize / core::mem::size_of::<ClusterNo>(),
             0,
         );
 
@@ -242,51 +240,21 @@ impl FatFs {
             return Err(EIO);
         }
 
-        fatfs.volume_label = String::from(
-            str::from_utf8(&info.volume_label)
-                .map_err(|_| EINVAL)?
-                .trim_end_matches(char::from(' ')),
-        );
-
-        let root_dir_cluster_count =
-            ClusterIterator::new(&fat, fatfs.rootdir_cluster).count();
-
-        let root_inode = {
-            let icache = fatfs.icache.get_mut();
-
-            let mut inode =
-                icache.alloc(info.root_cluster as Ino, Box::new(DirOps));
-            let inode_mut = unsafe { Arc::get_mut_unchecked(&mut inode) };
-            let inode_idata = inode_mut.idata.get_mut();
-
-            inode_idata.mode = S_IFDIR | 0o777;
-            inode_idata.nlink = 2;
-            inode_idata.size = root_dir_cluster_count as u64
-                * info.sectors_per_cluster as u64
-                * 512;
+        info.volume_label
+            .iter()
+            .take_while(|&&c| c != ' ' as u8)
+            .take(11)
+            .enumerate()
+            .for_each(|(idx, c)| fatfs.volume_label[idx] = *c);
 
-            icache.submit(&inode)?;
-            inode
-        };
+        let root_dir_cluster_count = ClusterIterator::new(fat, fatfs.rootdir_cluster).count();
+        let root_dir_size = root_dir_cluster_count as u32 * info.sectors_per_cluster as u32 * 512;
+        let root_inode = DirInode::new(info.root_cluster as Ino, fatfs.weak.clone(), root_dir_size);
 
         Ok((fatfs_arc, root_inode))
     }
 }
 
-impl Vfs for FatFs {
-    fn io_blksize(&self) -> usize {
-        4096
-    }
-
-    fn fs_devid(&self) -> DevId {
-        self.device.devid()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
-
 struct ClusterIterator<'fat> {
     fat: &'fat [ClusterNo],
     cur: ClusterNo,
@@ -371,24 +339,47 @@ impl<'fat> Iterator for ClusterIterator<'fat> {
     }
 }
 
-struct FileOps;
-impl InodeOps for FileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+/// Value type of `FatFs::icache`: a shared handle to either a file inode or
+/// a directory inode.
+#[derive(Clone)]
+enum FatInode {
+    /// A regular-file inode.
+    File(Arc<FileInode>),
+    /// A directory inode.
+    Dir(Arc<DirInode>),
+}
+
+impl FatInode {
+    /// Erase the concrete inode type, yielding the inner `Arc` as
+    /// `Arc<dyn Inode>`. Never panics, despite the name.
+    fn unwrap(self) -> Arc<dyn Inode> {
+        match self {
+            FatInode::File(inode) => inode,
+            FatInode::Dir(inode) => inode,
+        }
     }
+}
 
-    fn read(
-        &self,
-        inode: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
-        let vfs = inode.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+define_struct_inode! {
+    struct FileInode;
+}
+
+impl FileInode {
+    /// Build a regular-file inode: `nlink` 1, mode `S_IFREG | 0o777`, and
+    /// the given `size`.
+    fn new(ino: Ino, weak: Weak<FatFs>, size: u32) -> Arc<Self> {
+        let idata = InodeData::new(ino, weak);
+        let this = Arc::new(Self { idata });
+
+        // The inode is still being initialized, so relaxed stores are used.
+        this.mode.store(S_IFREG | 0o777, Ordering::Relaxed);
+        this.nlink.store(1, Ordering::Relaxed);
+        this.size.store(u64::from(size), Ordering::Relaxed);
+
+        this
+    }
+}
+
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
-        let iter = ClusterIterator::new(&fat, inode.ino as ClusterNo)
-            .read(vfs, offset);
+        let iter = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).read(vfs, offset);
 
         for data in iter {
             if buffer.fill(data?)?.should_stop() {
@@ -400,23 +391,32 @@ impl InodeOps for FileOps {
     }
 }
 
-struct DirOps;
-impl InodeOps for DirOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+define_struct_inode! {
+    struct DirInode;
+}
+
+impl DirInode {
+    /// Build a directory inode: `nlink` 2, mode `S_IFDIR | 0o777`, and the
+    /// given `size`.
+    fn new(ino: Ino, weak: Weak<FatFs>, size: u32) -> Arc<Self> {
+        let idata = InodeData::new(ino, weak);
+        let this = Arc::new(Self { idata });
+
+        // The inode is still being initialized, so relaxed stores are used.
+        this.mode.store(S_IFDIR | 0o777, Ordering::Relaxed);
+        this.nlink.store(2, Ordering::Relaxed);
+        this.size.store(u64::from(size), Ordering::Relaxed);
+
+        this
     }
+}
 
-    fn lookup(
-        &self,
-        dir: &Inode,
-        dentry: &Arc<Dentry>,
-    ) -> KResult<Option<Arc<Inode>>> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+impl Inode for DirInode {
+    fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
-        let mut entries =
-            ClusterIterator::new(&fat, dir.ino as ClusterNo).dirs(vfs, 0);
+        let mut entries = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).dirs(vfs, 0);
 
         let entry = entries.find_map(|entry| {
             if entry.is_err() {
@@ -438,28 +438,27 @@ impl InodeOps for DirOps {
             Some(Ok(entry)) => {
                 let ino = entry.ino();
 
-                Ok(Some(vfs.icache.lock().get_or_alloc(
+                Ok(Some(vfs.get_or_alloc_inode(
                     ino,
                     entry.is_directory(),
-                    entry.size as u64,
-                )?))
+                    entry.size,
+                )))
             }
         }
     }
 
-    fn readdir<'cb, 'r: 'cb>(
-        &'r self,
-        dir: &'r Inode,
+    fn do_readdir(
+        &self,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
         const ENTRY_SIZE: usize = core::mem::size_of::<FatDirectoryEntry>();
         let cluster_iter =
-            ClusterIterator::new(&fat, dir.ino as ClusterNo).dirs(vfs, offset);
+            ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).dirs(vfs, offset);
 
         let mut nread = 0;
         for entry in cluster_iter {
@@ -473,13 +472,9 @@ impl InodeOps for DirOps {
             let ino = entry.ino();
             let name = entry.filename();
 
-            vfs.icache.lock().get_or_alloc(
-                ino,
-                entry.is_directory(),
-                entry.size as u64,
-            )?;
+            vfs.get_or_alloc_inode(ino, entry.is_directory(), entry.size);
 
-            if callback(name.as_ref(), ino).is_err() {
+            if callback(name.as_ref(), ino)?.is_break() {
                 break;
             }
 
@@ -493,13 +488,7 @@ impl InodeOps for DirOps {
 struct FatMountCreator;
 
 impl MountCreator for FatMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        _flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, _flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let (fatfs, root_inode) = FatFs::create(make_device(8, 1))?;
 
         Mount::new(mp, fatfs, root_inode)
@@ -507,5 +496,5 @@ impl MountCreator for FatMountCreator {
 }
 
 pub fn init() {
-    register_filesystem("fat32", Box::new(FatMountCreator)).unwrap();
+    register_filesystem("fat32", Arc::new(FatMountCreator)).unwrap();
 }

+ 170 - 140
src/fs/procfs.rs

@@ -1,7 +1,11 @@
-use core::sync::atomic::Ordering;
-
-use alloc::sync::{Arc, Weak};
+use alloc::{
+    collections::btree_map::BTreeMap,
+    sync::{Arc, Weak},
+};
 use bindings::{EACCES, ENOTDIR, S_IFDIR, S_IFREG};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use itertools::Itertools;
+use lazy_static::lazy_static;
 
 use crate::{
     io::Buffer,
@@ -9,13 +13,14 @@ use crate::{
         mem::paging::{Page, PageBuffer},
         vfs::{
             dentry::Dentry,
-            inode::{AtomicIno, Inode, InodeCache, InodeData, InodeOps},
+            inode::{define_struct_inode, AtomicIno, Ino, Inode, InodeData},
             mount::{dump_mounts, register_filesystem, Mount, MountCreator},
             vfs::Vfs,
-            DevId, ReadDirCallback,
+            DevId,
         },
     },
     prelude::*,
+    sync::Locked,
 };
 
 fn split_len_offset(data: &[u8], len: usize, offset: usize) -> Option<&[u8]> {
@@ -24,8 +29,6 @@ fn split_len_offset(data: &[u8], len: usize, offset: usize) -> Option<&[u8]> {
     real_data.split_at_checked(offset).map(|(_, data)| data)
 }
 
-pub struct ProcFsNode(Arc<Inode>);
-
 pub trait ProcFsFile: Send + Sync {
     fn can_read(&self) -> bool {
         false
@@ -44,21 +47,57 @@ pub trait ProcFsFile: Send + Sync {
     }
 }
 
-struct ProcFsFileOps {
-    file: Box<dyn ProcFsFile>,
+pub enum ProcFsNode {
+    File(Arc<FileInode>),
+    Dir(Arc<DirInode>),
 }
 
-impl InodeOps for ProcFsFileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl ProcFsNode {
+    fn unwrap(&self) -> Arc<dyn Inode> {
+        match self {
+            ProcFsNode::File(inode) => inode.clone(),
+            ProcFsNode::Dir(inode) => inode.clone(),
+        }
     }
 
-    fn read(
-        &self,
-        _: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
+    fn ino(&self) -> Ino {
+        match self {
+            ProcFsNode::File(inode) => inode.ino,
+            ProcFsNode::Dir(inode) => inode.ino,
+        }
+    }
+}
+
+define_struct_inode! {
+    struct FileInode {
+        file: Box<dyn ProcFsFile>,
+    }
+}
+
+impl FileInode {
+    pub fn new(ino: Ino, vfs: Weak<ProcFs>, file: Box<dyn ProcFsFile>) -> Arc<Self> {
+        let mut mode = S_IFREG;
+        if file.can_read() {
+            mode |= 0o444;
+        }
+        if file.can_write() {
+            mode |= 0o200;
+        }
+
+        let inode = Self {
+            idata: InodeData::new(ino, vfs),
+            file,
+        };
+
+        inode.idata.mode.store(mode, Ordering::Relaxed);
+        inode.idata.nlink.store(1, Ordering::Relaxed);
+
+        Arc::new(inode)
+    }
+}
+
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
         if !self.file.can_read() {
             return Err(EACCES);
         }
@@ -75,47 +114,56 @@ impl InodeOps for ProcFsFileOps {
     }
 }
 
-struct ProcFsDirectory {
-    entries: Mutex<Vec<(Arc<[u8]>, ProcFsNode)>>,
+define_struct_inode! {
+    struct DirInode {
+        entries: Locked<Vec<(Arc<[u8]>, ProcFsNode)>, ()>,
+    }
 }
 
-impl InodeOps for ProcFsDirectory {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl DirInode {
+    pub fn new(ino: Ino, vfs: Weak<ProcFs>) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, entries).write(Locked::new(vec![], rwsem));
+            addr_of_mut_field!(inode, mode).write((S_IFDIR | 0o755).into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
+}
 
-    fn lookup(
-        &self,
-        _: &Inode,
-        dentry: &Arc<Dentry>,
-    ) -> KResult<Option<Arc<Inode>>> {
-        Ok(self.entries.lock().iter().find_map(|(name, node)| {
-            name.as_ref()
-                .eq(dentry.name().as_ref())
-                .then(|| node.0.clone())
-        }))
+impl Inode for DirInode {
+    fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
+        let lock = self.rwsem.lock_shared();
+        Ok(self
+            .entries
+            .access(lock.as_ref())
+            .iter()
+            .find_map(|(name, node)| {
+                name.as_ref()
+                    .eq(dentry.name().as_ref())
+                    .then(|| node.unwrap())
+            }))
     }
 
-    fn readdir<'cb, 'r: 'cb>(
+    fn do_readdir(
         &self,
-        _: &Inode,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        Ok(self
-            .entries
-            .lock()
+        let lock = self.rwsem.lock_shared();
+        self.entries
+            .access(lock.as_ref())
             .iter()
             .skip(offset)
-            .take_while(|(name, ProcFsNode(inode))| {
-                callback(name, inode.ino).is_ok()
-            })
-            .count())
+            .map(|(name, node)| callback(name.as_ref(), node.ino()))
+            .take_while(|result| result.map_or(true, |flow| flow.is_continue()))
+            .take_while_inclusive(|result| result.is_ok())
+            .fold_ok(0, |acc, _| acc + 1)
     }
 }
 
+impl_any!(ProcFs);
 pub struct ProcFs {
-    root_node: Arc<Inode>,
+    root_node: Arc<DirInode>,
     next_ino: AtomicIno,
 }
 
@@ -128,38 +176,37 @@ impl Vfs for ProcFs {
         10
     }
 
-    fn as_any(&self) -> &dyn Any {
-        self
+    fn is_read_only(&self) -> bool {
+        false
     }
 }
 
-static mut GLOBAL_PROCFS: Option<Arc<ProcFs>> = None;
-static mut ICACHE: Option<InodeCache<ProcFs>> = None;
+lazy_static! {
+    static ref ICACHE: Spin<BTreeMap<Ino, ProcFsNode>> = Spin::new(BTreeMap::new());
+    static ref GLOBAL_PROCFS: Arc<ProcFs> = {
+        let fs: Arc<ProcFs> = Arc::new_cyclic(|weak: &Weak<ProcFs>| ProcFs {
+            root_node: DirInode::new(0, weak.clone()),
+            next_ino: AtomicIno::new(1),
+        });
 
-fn get_icache() -> &'static InodeCache<ProcFs> {
-    unsafe { ICACHE.as_ref().unwrap() }
+        fs
+    };
 }
 
 struct ProcFsMountCreator;
 
 impl ProcFsMountCreator {
     pub fn get() -> Arc<ProcFs> {
-        unsafe { GLOBAL_PROCFS.as_ref().cloned().unwrap() }
+        GLOBAL_PROCFS.clone()
     }
 
     pub fn get_weak() -> Weak<ProcFs> {
-        unsafe { GLOBAL_PROCFS.as_ref().map(Arc::downgrade).unwrap() }
+        Arc::downgrade(&GLOBAL_PROCFS)
     }
 }
 
 impl MountCreator for ProcFsMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        _flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, _flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let vfs = ProcFsMountCreator::get();
         let root_inode = vfs.root_node.clone();
         Mount::new(mp, vfs, root_inode)
@@ -170,77 +217,55 @@ pub fn root() -> ProcFsNode {
     let vfs = ProcFsMountCreator::get();
     let root = vfs.root_node.clone();
 
-    ProcFsNode(root)
+    ProcFsNode::Dir(root)
 }
 
 pub fn creat(
     parent: &ProcFsNode,
-    name: &Arc<[u8]>,
+    name: Arc<[u8]>,
     file: Box<dyn ProcFsFile>,
 ) -> KResult<ProcFsNode> {
-    let mut mode = S_IFREG;
-    if file.can_read() {
-        mode |= 0o444;
-    }
-    if file.can_write() {
-        mode |= 0o200;
-    }
-
-    let dir = parent
-        .0
-        .ops
-        .as_any()
-        .downcast_ref::<ProcFsDirectory>()
-        .ok_or(ENOTDIR)?;
+    let parent = match parent {
+        ProcFsNode::File(_) => return Err(ENOTDIR),
+        ProcFsNode::Dir(parent) => parent,
+    };
 
     let fs = ProcFsMountCreator::get();
-    let ino = fs.next_ino.fetch_add(1, Ordering::SeqCst);
+    let ino = fs.next_ino.fetch_add(1, Ordering::Relaxed);
 
-    let inode = get_icache().alloc(ino, Box::new(ProcFsFileOps { file }));
+    let inode = FileInode::new(ino, Arc::downgrade(&fs), file);
 
-    inode.idata.lock().mode = mode;
-    inode.idata.lock().nlink = 1;
-
-    dir.entries
-        .lock()
-        .push((name.clone(), ProcFsNode(inode.clone())));
+    {
+        let mut lock = parent.idata.rwsem.lock();
+        parent
+            .entries
+            .access_mut(lock.as_mut())
+            .push((name, ProcFsNode::File(inode.clone())));
+    }
 
-    Ok(ProcFsNode(inode))
+    Ok(ProcFsNode::File(inode))
 }
 
 pub fn mkdir(parent: &ProcFsNode, name: &[u8]) -> KResult<ProcFsNode> {
-    let dir = parent
-        .0
-        .ops
-        .as_any()
-        .downcast_ref::<ProcFsDirectory>()
-        .ok_or(ENOTDIR)?;
-
-    let ino = ProcFsMountCreator::get()
-        .next_ino
-        .fetch_add(1, Ordering::SeqCst);
-
-    let inode = get_icache().alloc(
-        ino,
-        Box::new(ProcFsDirectory {
-            entries: Mutex::new(vec![]),
-        }),
-    );
+    let parent = match parent {
+        ProcFsNode::File(_) => return Err(ENOTDIR),
+        ProcFsNode::Dir(parent) => parent,
+    };
 
-    {
-        let mut idata = inode.idata.lock();
-        idata.nlink = 2;
-        idata.mode = S_IFDIR | 0o755;
-    }
+    let fs = ProcFsMountCreator::get();
+    let ino = fs.next_ino.fetch_add(1, Ordering::Relaxed);
 
-    dir.entries
-        .lock()
-        .push((Arc::from(name), ProcFsNode(inode.clone())));
+    let inode = DirInode::new(ino, Arc::downgrade(&fs));
 
-    Ok(ProcFsNode(inode))
+    parent
+        .entries
+        .access_mut(inode.rwsem.lock().as_mut())
+        .push((Arc::from(name), ProcFsNode::Dir(inode.clone())));
+
+    Ok(ProcFsNode::Dir(inode))
 }
 
-struct DumpMountsFile {}
+struct DumpMountsFile;
 impl ProcFsFile for DumpMountsFile {
     fn can_read(&self) -> bool {
         true
@@ -254,43 +279,48 @@ impl ProcFsFile for DumpMountsFile {
 }
 
 pub fn init() {
-    let dir = ProcFsDirectory {
-        entries: Mutex::new(vec![]),
-    };
-
-    let fs: Arc<ProcFs> = Arc::new_cyclic(|weak: &Weak<ProcFs>| {
-        let root_node = Arc::new(Inode {
-            ino: 0,
-            vfs: weak.clone(),
-            idata: Mutex::new(InodeData::default()),
-            ops: Box::new(dir),
-        });
+    register_filesystem("procfs", Arc::new(ProcFsMountCreator)).unwrap();
 
-        ProcFs {
-            root_node,
-            next_ino: AtomicIno::new(1),
-        }
-    });
+    creat(
+        &root(),
+        Arc::from(b"mounts".as_slice()),
+        Box::new(DumpMountsFile),
+    )
+    .unwrap();
+}
 
-    {
-        let mut indata = fs.root_node.idata.lock();
-        indata.mode = S_IFDIR | 0o755;
-        indata.nlink = 1;
-    };
+pub struct GenericProcFsFile<ReadFn>
+where
+    ReadFn: Send + Sync + Fn(&mut PageBuffer) -> KResult<()>,
+{
+    read_fn: Option<ReadFn>,
+}
 
-    unsafe {
-        GLOBAL_PROCFS = Some(fs);
-        ICACHE = Some(InodeCache::new(ProcFsMountCreator::get_weak()));
-    };
+impl<ReadFn> ProcFsFile for GenericProcFsFile<ReadFn>
+where
+    ReadFn: Send + Sync + Fn(&mut PageBuffer) -> KResult<()>,
+{
+    fn can_read(&self) -> bool {
+        self.read_fn.is_some()
+    }
 
-    register_filesystem("procfs", Box::new(ProcFsMountCreator)).unwrap();
+    fn read(&self, buffer: &mut PageBuffer) -> KResult<usize> {
+        self.read_fn.as_ref().ok_or(EACCES)?(buffer).map(|_| buffer.len())
+    }
+}
 
+pub fn populate_root<F>(name: Arc<[u8]>, read_fn: F) -> KResult<()>
+where
+    F: Send + Sync + Fn(&mut PageBuffer) -> KResult<()> + 'static,
+{
     let root = root();
 
     creat(
         &root,
-        &Arc::from(b"mounts".as_slice()),
-        Box::new(DumpMountsFile {}),
+        name,
+        Box::new(GenericProcFsFile {
+            read_fn: Some(read_fn),
+        }),
     )
-    .unwrap();
+    .map(|_| ())
 }

+ 207 - 243
src/fs/tmpfs.rs

@@ -1,383 +1,347 @@
-use core::sync::atomic::Ordering;
+use alloc::sync::{Arc, Weak};
+use bindings::{EINVAL, EIO, EISDIR, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFREG};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use itertools::Itertools;
 
 use crate::{
     io::Buffer,
     kernel::vfs::{
-        dentry::Dentry,
-        inode::{AtomicIno, Ino, Inode, InodeCache, InodeOps, Mode},
+        dentry::{dcache, Dentry},
+        inode::{define_struct_inode, AtomicIno, Ino, Inode, Mode, WriteOffset},
         mount::{register_filesystem, Mount, MountCreator, MS_RDONLY},
         s_isblk, s_ischr,
         vfs::Vfs,
-        DevId, ReadDirCallback,
+        DevId,
     },
     prelude::*,
+    sync::Locked,
 };
 
-use alloc::sync::Arc;
-
-use bindings::{
-    EINVAL, EIO, EISDIR, EROFS, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFREG,
-};
-
-struct FileOps {
-    data: Mutex<Vec<u8>>,
+fn acquire(vfs: &Weak<dyn Vfs>) -> KResult<Arc<dyn Vfs>> {
+    vfs.upgrade().ok_or(EIO)
 }
 
-struct NodeOps {
-    devid: DevId,
+fn astmp(vfs: &Arc<dyn Vfs>) -> &TmpFs {
+    vfs.as_any()
+        .downcast_ref::<TmpFs>()
+        .expect("corrupted tmpfs data structure")
 }
 
-impl NodeOps {
-    fn new(devid: DevId) -> Self {
-        Self { devid }
+define_struct_inode! {
+    struct NodeInode {
+        devid: DevId,
     }
 }
 
-impl InodeOps for NodeOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl NodeInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode, devid: DevId) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, _| unsafe {
+            addr_of_mut_field!(inode, devid).write(devid);
+
+            addr_of_mut_field!(inode, mode).write(mode.into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
+}
 
-    fn devid(&self, _: &Inode) -> KResult<DevId> {
+impl Inode for NodeInode {
+    fn devid(&self) -> KResult<DevId> {
         Ok(self.devid)
     }
 }
 
-struct DirectoryOps {
-    entries: Mutex<Vec<(Arc<[u8]>, Ino)>>,
+define_struct_inode! {
+    struct DirectoryInode {
+        entries: Locked<Vec<(Arc<[u8]>, Ino)>, ()>,
+    }
 }
 
-impl DirectoryOps {
-    fn new() -> Self {
-        Self {
-            entries: Mutex::new(vec![]),
-        }
+impl DirectoryInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, entries)
+                .write(Locked::new(vec![(Arc::from(b".".as_slice()), ino)], rwsem));
+
+            addr_of_mut_field!(inode, size).write(1.into());
+            addr_of_mut_field!(inode, mode).write((S_IFDIR | (mode & 0o777)).into());
+            addr_of_mut_field!(inode, nlink).write(1.into()); // link from `.` to itself
+        })
     }
 
-    /// Locks the `inode.idata`
-    fn link(&self, dir: &Inode, file: &Inode, name: Arc<[u8]>) -> KResult<()> {
-        dir.idata.lock().size += 1;
-        self.entries.lock().push((name, file.ino));
+    fn link(&self, name: Arc<[u8]>, file: &dyn Inode, dlock: &mut ()) {
+        // SAFETY: Only `unlink` will do something based on `nlink` count
+        //         No need to synchronize here
+        file.nlink.fetch_add(1, Ordering::Relaxed);
 
-        file.idata.lock().nlink += 1;
+        // SAFETY: `rwsem` has done the synchronization
+        self.size.fetch_add(1, Ordering::Relaxed);
 
-        Ok(())
+        self.entries.access_mut(dlock).push((name, file.ino));
     }
 }
 
-impl InodeOps for DirectoryOps {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn readdir<'cb, 'r: 'cb>(
+impl Inode for DirectoryInode {
+    fn do_readdir(
         &self,
-        _: &Inode,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        Ok(self
-            .entries
-            .lock()
+        let lock = self.rwsem.lock_shared();
+        self.entries
+            .access(lock.as_ref())
             .iter()
             .skip(offset)
-            .take_while(|(name, ino)| callback(name, *ino).is_ok())
-            .count())
+            .map(|(name, ino)| callback(&name, *ino))
+            .take_while(|result| result.map_or(true, |flow| flow.is_continue()))
+            .take_while_inclusive(|result| result.is_ok())
+            .fold_ok(0, |acc, _| acc + 1)
     }
 
-    fn creat(&self, dir: &Inode, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn creat(&self, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut rwsem = self.rwsem.lock();
 
         let ino = vfs.assign_ino();
-        let file = vfs.icache.lock().alloc_file(ino, mode)?;
+        let file = FileInode::new(ino, self.vfs.clone(), mode);
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_reg(file)
     }
 
-    fn mknod(
-        &self,
-        dir: &Inode,
-        at: &Arc<Dentry>,
-        mode: Mode,
-        dev: DevId,
-    ) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
-
-        if vfs.readonly {
-            return Err(EROFS);
-        }
-
+    fn mknod(&self, at: &Dentry, mode: Mode, dev: DevId) -> KResult<()> {
         if !s_ischr(mode) && !s_isblk(mode) {
             return Err(EINVAL);
         }
 
-        let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-        let file = icache.alloc(ino, Box::new(NodeOps::new(dev)));
-        file.idata.lock().mode = mode & (0o777 | S_IFBLK | S_IFCHR);
-        icache.submit(&file)?;
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
+
+        let mut rwsem = self.rwsem.lock();
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        let ino = vfs.assign_ino();
+        let file = NodeInode::new(
+            ino,
+            self.vfs.clone(),
+            mode & (0o777 | S_IFBLK | S_IFCHR),
+            dev,
+        );
+
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_reg(file)
     }
 
-    fn symlink(
-        &self,
-        dir: &Inode,
-        at: &Arc<Dentry>,
-        target: &[u8],
-    ) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
-
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+    fn symlink(&self, at: &Arc<Dentry>, target: &[u8]) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
-        let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-
-        let target_len = target.len() as u64;
+        let mut rwsem = self.rwsem.lock();
 
-        let file =
-            icache.alloc(ino, Box::new(SymlinkOps::new(Arc::from(target))));
-        {
-            let mut idata = file.idata.lock();
-            idata.mode = S_IFLNK | 0o777;
-            idata.size = target_len;
-        }
-        icache.submit(&file)?;
+        let ino = vfs.assign_ino();
+        let file = SymlinkInode::new(ino, self.vfs.clone(), target.into());
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_symlink(file)
     }
 
-    fn mkdir(&self, dir: &Inode, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn mkdir(&self, at: &Dentry, mode: Mode) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut rwsem = self.rwsem.lock();
 
         let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-
-        let mut newdir_ops = DirectoryOps::new();
-        let entries = newdir_ops.entries.get_mut();
-        entries.push((Arc::from(b".".as_slice()), ino));
-        entries.push((Arc::from(b"..".as_slice()), dir.ino));
-
-        let newdir = icache.alloc(ino, Box::new(newdir_ops));
-        {
-            let mut newdir_idata = newdir.idata.lock();
-            newdir_idata.mode = S_IFDIR | (mode & 0o777);
-            newdir_idata.nlink = 1;
-            newdir_idata.size = 2;
-        }
-
-        icache.submit(&newdir)?;
-        dir.idata.lock().nlink += 1; // link from `newdir` to `dir`, (or parent)
+        let newdir = DirectoryInode::new(ino, self.vfs.clone(), mode);
 
-        self.link(dir, newdir.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), newdir.as_ref(), rwsem.as_mut());
         at.save_dir(newdir)
     }
 
-    fn unlink(&self, dir: &Inode, at: &Arc<Dentry>) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn unlink(&self, at: &Arc<Dentry>) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut dlock = self.rwsem.lock();
 
         let file = at.get_inode()?;
+        let _flock = file.rwsem.lock();
 
-        let mut file_idata = file.idata.lock();
-
-        if file_idata.mode & S_IFDIR != 0 {
+        // SAFETY: `flock` has done the synchronization
+        if file.mode.load(Ordering::Relaxed) & S_IFDIR != 0 {
             return Err(EISDIR);
         }
 
-        let mut self_idata = dir.idata.lock();
-        let mut entries = self.entries.lock();
-
-        let idx = entries
-            .iter()
-            .position(|(_, ino)| *ino == file.ino)
-            .expect("file not found in directory");
+        let entries = self.entries.access_mut(dlock.as_mut());
+        entries.retain(|(_, ino)| *ino != file.ino);
+
+        assert_eq!(
+            entries.len() as u64,
+            // SAFETY: `dlock` has done the synchronization
+            self.size.fetch_sub(1, Ordering::Relaxed) - 1
+        );
+
+        // SAFETY: `flock` has done the synchronization
+        let file_nlink = file.nlink.fetch_sub(1, Ordering::Relaxed) - 1;
+
+        if file_nlink == 0 {
+            // Remove the file inode from the inode cache
+            // The last reference to the inode is held by some dentry
+            // and will be released when the dentry is released
+            //
+            // TODO: Should we use some inode cache in tmpfs?
+            //
+            // vfs.icache.lock().retain(|ino, _| *ino != file.ino);
+        }
 
-        self_idata.size -= 1;
-        file_idata.nlink -= 1;
-        entries.remove(idx);
+        // Postpone the invalidation of the dentry and inode until the
+        // last reference to the dentry is released
+        //
+        // But we can remove it from the dentry cache immediately
+        // so later lookup will fail with ENOENT
+        dcache::d_remove(at);
 
-        at.invalidate()
+        Ok(())
     }
 }
 
-struct SymlinkOps {
-    target: Arc<[u8]>,
-}
-
-impl SymlinkOps {
-    fn new(target: Arc<[u8]>) -> Self {
-        Self { target }
+define_struct_inode! {
+    struct SymlinkInode {
+        target: Arc<[u8]>,
     }
 }
 
-impl InodeOps for SymlinkOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl SymlinkInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, target: Arc<[u8]>) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, _| unsafe {
+            let len = target.len();
+            addr_of_mut_field!(inode, target).write(target);
+
+            addr_of_mut_field!(inode, mode).write((S_IFLNK | 0o777).into());
+            addr_of_mut_field!(inode, size).write((len as u64).into());
+        })
     }
+}
 
-    fn readlink(&self, _: &Inode, buffer: &mut dyn Buffer) -> KResult<usize> {
+impl Inode for SymlinkInode {
+    fn readlink(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
         buffer
             .fill(self.target.as_ref())
             .map(|result| result.allow_partial())
     }
 }
 
-impl FileOps {
-    fn new() -> Self {
-        Self {
-            data: Mutex::new(vec![]),
-        }
+define_struct_inode! {
+    struct FileInode {
+        filedata: Locked<Vec<u8>, ()>,
     }
 }
 
-impl InodeOps for FileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl FileInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, filedata).write(Locked::new(vec![], rwsem));
+
+            addr_of_mut_field!(inode, mode).write((S_IFREG | (mode & 0o777)).into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
+}
 
-    fn read(
-        &self,
-        _: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
-        let data = self.data.lock();
-        let data = data.split_at_checked(offset).ok_or(EINVAL)?.1;
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let lock = self.rwsem.lock_shared();
 
-        buffer.fill(data).map(|result| result.allow_partial())
+        match self.filedata.access(lock.as_ref()).split_at_checked(offset) {
+            Some((_, data)) => buffer.fill(data).map(|result| result.allow_partial()),
+            None => Ok(0),
+        }
     }
 
-    fn write(
-        &self,
-        inode: &Inode,
-        buffer: &[u8],
-        offset: usize,
-    ) -> KResult<usize> {
-        let mut idata = inode.idata.lock();
-        let mut data = self.data.lock();
+    fn write(&self, buffer: &[u8], offset: WriteOffset) -> KResult<usize> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let mut lock = self.rwsem.lock();
+        let filedata = self.filedata.access_mut(lock.as_mut());
+
+        let offset = match offset {
+            WriteOffset::Position(offset) => offset,
+            // SAFETY: `lock` has done the synchronization
+            WriteOffset::End(end) => {
+                let size = self.size.load(Ordering::Relaxed) as usize;
+                *end = size + buffer.len();
+
+                size
+            }
+        };
 
-        if data.len() < offset + buffer.len() {
-            data.resize(offset + buffer.len(), 0);
+        if filedata.len() < offset + buffer.len() {
+            filedata.resize(offset + buffer.len(), 0);
         }
 
-        data[offset..offset + buffer.len()].copy_from_slice(&buffer);
-        idata.size = data.len() as u64;
+        filedata[offset..offset + buffer.len()].copy_from_slice(&buffer);
+
+        // SAFETY: `lock` has done the synchronization
+        self.size.store(filedata.len() as u64, Ordering::Relaxed);
 
         Ok(buffer.len())
     }
 
-    fn truncate(&self, inode: &Inode, length: usize) -> KResult<()> {
-        let mut idata = inode.idata.lock();
+    fn truncate(&self, length: usize) -> KResult<()> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let mut lock = self.rwsem.lock();
+        let filedata = self.filedata.access_mut(lock.as_mut());
 
-        idata.size = length as u64;
-        self.data.lock().resize(length, 0);
+        // SAFETY: `lock` has done the synchronization
+        self.size.store(length as u64, Ordering::Relaxed);
+        filedata.resize(length, 0);
 
         Ok(())
     }
 }
 
-/// # Lock order
-/// `vfs` -> `icache` -> `idata` -> `*ops`.`*data`
+impl_any!(TmpFs);
 struct TmpFs {
-    icache: Mutex<InodeCache<TmpFs>>,
     next_ino: AtomicIno,
     readonly: bool,
 }
 
-impl InodeCache<TmpFs> {
-    fn alloc_file(&mut self, ino: Ino, mode: Mode) -> KResult<Arc<Inode>> {
-        let file = self.alloc(ino, Box::new(FileOps::new()));
-        file.idata.lock().mode = S_IFREG | (mode & 0o777);
+impl Vfs for TmpFs {
+    fn io_blksize(&self) -> usize {
+        4096
+    }
 
-        self.submit(&file)?;
+    fn fs_devid(&self) -> DevId {
+        2
+    }
 
-        Ok(file)
+    fn is_read_only(&self) -> bool {
+        self.readonly
     }
 }
 
 impl TmpFs {
     fn assign_ino(&self) -> Ino {
-        self.next_ino.fetch_add(1, Ordering::SeqCst)
+        self.next_ino.fetch_add(1, Ordering::AcqRel)
     }
 
-    pub fn create(readonly: bool) -> KResult<(Arc<TmpFs>, Arc<Inode>)> {
-        let tmpfs = Arc::new_cyclic(|weak| Self {
-            icache: Mutex::new(InodeCache::new(weak.clone())),
+    pub fn create(readonly: bool) -> KResult<(Arc<dyn Vfs>, Arc<dyn Inode>)> {
+        let tmpfs = Arc::new(Self {
             next_ino: AtomicIno::new(1),
             readonly,
         });
 
-        let mut dir = DirectoryOps::new();
-        let entries = dir.entries.get_mut();
-        entries.push((Arc::from(b".".as_slice()), 0));
-        entries.push((Arc::from(b"..".as_slice()), 0));
-
-        let root_dir = {
-            let mut icache = tmpfs.icache.lock();
-            let root_dir = icache.alloc(0, Box::new(dir));
-            {
-                let mut idata = root_dir.idata.lock();
-
-                idata.mode = S_IFDIR | 0o755;
-                idata.nlink = 2;
-                idata.size = 2;
-            }
-
-            icache.submit(&root_dir)?;
-
-            root_dir
-        };
+        let weak = Arc::downgrade(&tmpfs);
+        let root_dir = DirectoryInode::new(0, weak, 0o755);
 
         Ok((tmpfs, root_dir))
     }
 }
 
-impl Vfs for TmpFs {
-    fn io_blksize(&self) -> usize {
-        4096
-    }
-
-    fn fs_devid(&self) -> DevId {
-        2
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
-
 struct TmpFsMountCreator;
 
 impl MountCreator for TmpFsMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let (fs, root_inode) = TmpFs::create(flags & MS_RDONLY != 0)?;
 
         Mount::new(mp, fs, root_inode)
@@ -385,5 +349,5 @@ impl MountCreator for TmpFsMountCreator {
 }
 
 pub fn init() {
-    register_filesystem("tmpfs", Box::new(TmpFsMountCreator)).unwrap();
+    register_filesystem("tmpfs", Arc::new(TmpFsMountCreator)).unwrap();
 }

+ 23 - 51
src/io.rs

@@ -2,7 +2,7 @@ use bindings::EFAULT;
 
 use crate::prelude::*;
 
-use core::{ffi::c_char, fmt::Write, mem::MaybeUninit};
+use core::{fmt::Write, mem::MaybeUninit};
 
 pub enum FillResult {
     Done(usize),
@@ -34,6 +34,24 @@ pub trait Buffer {
     fn total(&self) -> usize;
     fn wrote(&self) -> usize;
     fn fill(&mut self, data: &[u8]) -> KResult<FillResult>;
+
+    fn available(&self) -> usize { // default method: remaining room, computed as total() minus wrote()
+        self.total() - self.wrote() // NOTE(review): underflows if an implementor ever reports wrote() > total() — confirm that invariant holds
+    }
+}
+
+pub trait BufferFill<T: Copy> { // extension trait: write one `Copy` value into a buffer
+    fn copy(&mut self, object: &T) -> KResult<FillResult>; // result type is the same as Buffer::fill's
+}
+
+impl<T: Copy, B: Buffer + ?Sized> BufferFill<T> for B { // blanket impl: every Buffer, sized or not, gains copy() for any Copy type
+    fn copy(&mut self, object: &T) -> KResult<FillResult> {
+        let ptr = object as *const T as *const u8; // reinterpret the object as a byte pointer
+        let len = core::mem::size_of::<T>(); // number of bytes handed to fill()
+
+        // SAFETY: `object` is a valid object.
+        self.fill(unsafe { core::slice::from_raw_parts(ptr, len) }) // NOTE(review): if T contains padding, this byte slice covers uninitialised memory — confirm callers only use padding-free types
+    }
 }
 
 pub struct UninitBuffer<'lt, T: Copy + Sized> {
@@ -49,10 +67,7 @@ impl<'lt, T: Copy + Sized> UninitBuffer<'lt, T> {
         Self {
             data,
             buffer: RawBuffer::new_from_slice(unsafe {
-                core::slice::from_raw_parts_mut(
-                    ptr as *mut u8,
-                    core::mem::size_of::<T>(),
-                )
+                core::slice::from_raw_parts_mut(ptr as *mut u8, core::mem::size_of::<T>())
             }),
         }
     }
@@ -106,9 +121,9 @@ impl<'lt> RawBuffer<'lt> {
         }
     }
 
-    pub fn new_from_raw(buf: &'lt mut *mut u8, tot: usize) -> Self {
+    pub fn new_from_raw(buf: *mut u8, tot: usize) -> Self {
         Self {
-            buf: *buf,
+            buf,
             tot,
             cur: 0,
             _phantom: core::marker::PhantomData,
@@ -136,11 +151,7 @@ impl<'lt> RawBuffer<'lt> {
             n if n == 0 => Ok(FillResult::Full),
             n if n < data.len() => {
                 unsafe {
-                    core::ptr::copy_nonoverlapping(
-                        data.as_ptr(),
-                        self.buf.add(self.count()),
-                        n,
-                    );
+                    core::ptr::copy_nonoverlapping(data.as_ptr(), self.buf.add(self.count()), n);
                 }
                 self.cur += n;
                 Ok(FillResult::Partial(n))
@@ -227,42 +238,3 @@ impl Write for RawBuffer<'_> {
         }
     }
 }
-
-pub fn get_str_from_cstr<'a>(cstr: *const c_char) -> KResult<&'a str> {
-    if cstr.is_null() {
-        return Err(EFAULT);
-    }
-
-    let cstr = unsafe { core::ffi::CStr::from_ptr::<'a>(cstr) };
-    cstr.to_str().map_err(|_| EFAULT)
-}
-
-/// Copy data from src to dst, starting from offset, and copy at most count bytes.
-///
-/// # Return
-///
-/// The number of bytes copied.
-pub fn copy_offset_count(
-    src: &[u8],
-    dst: &mut [u8],
-    offset: usize,
-    count: usize,
-) -> usize {
-    if offset >= src.len() {
-        return 0;
-    }
-
-    let count = {
-        let count = count.min(dst.len());
-
-        if offset + count > src.len() {
-            src.len() - offset
-        } else {
-            count
-        }
-    };
-
-    dst[..count].copy_from_slice(&src[offset..offset + count]);
-
-    count
-}

+ 9 - 5
src/kernel.ld

@@ -61,6 +61,11 @@ SECTIONS
         KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
         end_ctors = .;
 
+        . = ALIGN(16);
+        START_SYSCALL_HANDLERS = .;
+        KEEP(*(.syscall_handlers));
+        END_SYSCALL_HANDLERS = .;
+
         . = ALIGN(16);
         *(.data.kinit)
 
@@ -91,15 +96,13 @@ SECTIONS
 
         . = ALIGN(16);
         KMOD_LOADERS_START = .;
-
         KEEP(*(.kmods));
         QUAD(0);
 
         . = ALIGN(16);
-        late_init_start = .;
-        KEEP(*(.late_init));
-        QUAD(0);
-        late_init_end = .;
+        FIX_START = .;
+        KEEP(*(.fix));
+        FIX_END = .;
 
         . = ALIGN(16);
 
@@ -122,6 +125,7 @@ SECTIONS
         *(.got)
         *(.got.plt)
 
+        . = . + 4;
         . = ALIGN(0x1000) - 4;
         LONG(KERNEL_MAGIC);
 

+ 2 - 0
src/kernel.rs

@@ -2,4 +2,6 @@ pub mod block;
 pub mod console;
 pub mod interrupt;
 pub mod mem;
+pub mod syscall;
+pub mod user;
 pub mod vfs;

+ 5 - 3
src/kernel/async/lock.cc

@@ -52,24 +52,26 @@ static inline void _restore_interrupt_state(lock_context_t context) {
 // TODO: mark as _per_cpu
 static inline preempt_count_t& _preempt_count() {
     static preempt_count_t _preempt_count;
-    assert(!(_preempt_count & 0x80000000));
+    assert(_preempt_count >= 0); // NOTE(review): trivially true if preempt_count_t is unsigned (the old check tested bit 31) — confirm the type is signed
     return _preempt_count;
 }
 
 void preempt_disable() {
     ++_preempt_count();
+    asm volatile("" : : : "memory"); // compiler-only barrier: stops the compiler moving later memory accesses above the increment
 }
 
 void preempt_enable() {
+    asm volatile("" : : : "memory"); // compiler-only barrier: stops the compiler moving earlier memory accesses below the decrement
     --_preempt_count();
 }
 
 extern "C" void r_preempt_disable() {
-    ++_preempt_count();
+    preempt_disable(); // C-linkage wrapper (presumably for the Rust side); delegating picks up the barrier in preempt_disable()
 }
 
 extern "C" void r_preempt_enable() {
-    --_preempt_count();
+    preempt_enable(); // C-linkage wrapper (presumably for the Rust side); delegating picks up the barrier in preempt_enable()
 }
 
 preempt_count_t preempt_count() {

+ 2 - 1
src/kernel/async/waitlist.cc

@@ -8,13 +8,14 @@
 using namespace kernel::async;
 
 bool wait_list::wait(mutex& lock) {
+    preempt_disable();
     this->subscribe();
 
     auto* curthd = current_thread;
     curthd->set_attr(kernel::task::thread::ISLEEP);
 
     lock.unlock();
-    bool has_signals = schedule();
+    bool has_signals = schedule_now_preempt_disabled();
     lock.lock();
 
     m_subscribers.erase(curthd);

+ 12 - 10
src/kernel/block.rs

@@ -11,7 +11,7 @@ use alloc::{
 };
 use bindings::{EEXIST, EINVAL, EIO, ENOENT};
 
-use crate::KResult;
+use lazy_static::lazy_static;
 
 use super::{
     mem::{paging::Page, phys::PhysPtr},
@@ -27,18 +27,18 @@ pub trait BlockRequestQueue: Send + Sync {
     ///
     fn max_request_pages(&self) -> u64;
 
-    fn submit(&mut self, req: BlockDeviceRequest) -> KResult<()>;
+    fn submit(&self, req: BlockDeviceRequest) -> KResult<()>;
 }
 
 struct BlockDeviceDisk {
-    queue: Arc<Mutex<dyn BlockRequestQueue>>,
+    queue: Arc<dyn BlockRequestQueue>, // shared request queue; no outer Mutex because BlockRequestQueue::submit takes &self
 }
 
 struct BlockDevicePartition {
     disk_dev: DevId,
     offset: u64,
 
-    queue: Arc<Mutex<dyn BlockRequestQueue>>,
+    queue: Arc<dyn BlockRequestQueue>, // request queue used for this partition's requests; no outer Mutex because submit() takes &self
 }
 
 enum BlockDeviceType {
@@ -74,8 +74,10 @@ impl Ord for BlockDevice {
     }
 }
 
-static BLOCK_DEVICE_LIST: Mutex<BTreeMap<DevId, Arc<BlockDevice>>> =
-    Mutex::new(BTreeMap::new());
+lazy_static! {
+    static ref BLOCK_DEVICE_LIST: Spin<BTreeMap<DevId, Arc<BlockDevice>>> = // map from DevId to block device, guarded by a Spin lock
+        Spin::new(BTreeMap::new()); // starts empty; lazy_static builds it on first access
+}
 
 #[derive(Debug, Clone, Copy)]
 #[repr(C)]
@@ -100,9 +102,9 @@ impl BlockDevice {
     pub fn register_disk(
         devid: DevId,
         size: u64,
-        queue: Arc<Mutex<dyn BlockRequestQueue>>,
+        queue: Arc<dyn BlockRequestQueue>,
     ) -> KResult<Arc<Self>> {
-        let max_pages = queue.lock().max_request_pages();
+        let max_pages = queue.max_request_pages();
         let device = Arc::new(Self {
             devid,
             size,
@@ -199,10 +201,10 @@ impl BlockDevice {
         }
 
         match self.dev_type {
-            BlockDeviceType::Disk(ref disk) => disk.queue.lock().submit(req),
+            BlockDeviceType::Disk(ref disk) => disk.queue.submit(req),
             BlockDeviceType::Partition(ref part) => {
                 req.sector += part.offset;
-                part.queue.lock().submit(req)
+                part.queue.submit(req)
             }
         }
     }

+ 35 - 5
src/kernel/console.rs

@@ -1,6 +1,8 @@
 use crate::prelude::*;
 
-pub struct Console {}
+use lazy_static::lazy_static;
+
+pub struct Console;
 
 impl Write for Console {
     fn write_str(&mut self, s: &str) -> core::fmt::Result {
@@ -19,11 +21,13 @@ impl Write for Console {
 }
 
 #[doc(hidden)]
-pub fn _print(args: core::fmt::Arguments) -> core::fmt::Result {
-    CONSOLE.lock().write_fmt(args)
+pub fn _print(args: core::fmt::Arguments) { // formats `args` onto the global console; no longer returns the fmt result
+    dont_check!(CONSOLE.lock_irq().write_fmt(args)) // dont_check! presumably discards the write result, and lock_irq() presumably also masks interrupts — confirm both
 }
 
-pub static CONSOLE: spin::Mutex<Console> = spin::Mutex::new(Console {});
+lazy_static! {
+    pub static ref CONSOLE: Spin<Console> = Spin::new(Console {}); // global console behind a Spin lock; lazy_static builds it on first access
+}
 
 macro_rules! print {
     ($($arg:tt)*) => {
@@ -40,4 +44,30 @@ macro_rules! println {
     };
 }
 
-pub(crate) use {print, println};
+macro_rules! println_warn { // println! with the fixed prefix "[kernel: warn] "
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel: warn] {}", format_args!($($arg)*)) // the caller's format string and args are rendered via format_args! and placed after the prefix
+    };
+}
+
+macro_rules! println_debug { // println! with the fixed prefix "[kernel:debug] "
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel:debug] {}", format_args!($($arg)*)) // the caller's format string and args are rendered via format_args! and placed after the prefix
+    };
+}
+
+macro_rules! println_info { // println! with the fixed prefix "[kernel: info] "
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel: info] {}", format_args!($($arg)*)) // the caller's format string and args are rendered via format_args! and placed after the prefix
+    };
+}
+
+macro_rules! println_fatal { // println! with the fixed prefix "[kernel:fatal] "; it only prints and does not halt
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel:fatal] {}", format_args!($($arg)*)) // the caller's format string and args are rendered via format_args! and placed after the prefix
+    };
+}
+
+pub(crate) use {
+    print, println, println_debug, println_fatal, println_info, println_warn,
+};

+ 4 - 2
src/kernel/interrupt.cpp

@@ -1,3 +1,4 @@
+#include "kernel/async/lock.hpp"
 #include <list>
 #include <vector>
 
@@ -75,7 +76,8 @@ void kernel::kinit::init_interrupt() {
     // TODO: move this to timer driver
     kernel::irq::register_handler(0, []() {
         kernel::hw::timer::inc_tick();
-        schedule();
+        if (async::preempt_count() == 0)
+            schedule_now();
     });
 
     port_pic1_command = 0x11; // edge trigger mode
@@ -110,7 +112,7 @@ static inline void fault_handler(interrupt_stack* context, mmx_registers*) {
                 kill_current(SIGILL); // noreturn
         } break;
         case 14: {
-            kernel::mem::paging::handle_page_fault(context->error_code);
+            kernel::mem::paging::handle_page_fault(context);
             return;
         } break;
     }

+ 12 - 20
src/kernel/interrupt.rs

@@ -2,43 +2,35 @@ use alloc::boxed::Box;
 use alloc::vec;
 use alloc::vec::Vec;
 
+use lazy_static::lazy_static;
+
 use crate::bindings::root::EINVAL;
+use crate::Spin;
 
-static mut IRQ_HANDLERS: spin::Mutex<[Option<Vec<Box<dyn Fn()>>>; 16]> =
-    spin::Mutex::new([const { None }; 16]);
+lazy_static! {
+    static ref IRQ_HANDLERS: Spin<[Vec<Box<dyn Fn() + Send>>; 16]> = // one handler list per IRQ number 0..=15; no longer `static mut`, so no unsafe access
+        Spin::new(core::array::from_fn(|_| vec![])); // every slot starts as an empty Vec instead of None
+}
 
 #[no_mangle]
 pub extern "C" fn irq_handler_rust(irqno: core::ffi::c_int) {
     assert!(irqno >= 0 && irqno < 16);
 
-    let handlers = unsafe { IRQ_HANDLERS.lock() };
+    let handlers = IRQ_HANDLERS.lock(); // plain lock() here versus lock_irq() in register_irq_handler — presumably this runs with interrupts already disabled; confirm
 
-    match handlers[irqno as usize] {
-        Some(ref handlers) => {
-            for handler in handlers {
-                handler();
-            }
-        }
-        None => {}
+    for handler in handlers[irqno as usize].iter() { // runs every handler registered for this IRQ, in push order, while the lock is held
+        handler(); // NOTE(review): a handler that calls register_irq_handler would try to re-take this lock — confirm none does
     }
 }
 
 pub fn register_irq_handler<F>(irqno: i32, handler: F) -> Result<(), u32>
 where
-    F: Fn() + 'static,
+    F: Fn() + Send + 'static, // Send is required because the handler is stored in the shared IRQ_HANDLERS table
 {
     if irqno < 0 || irqno >= 16 {
         return Err(EINVAL);
     }
 
-    let mut handlers = unsafe { IRQ_HANDLERS.lock() };
-
-    match handlers[irqno as usize] {
-        Some(ref mut handlers) => handlers.push(Box::new(handler)),
-        None => {
-            handlers[irqno as usize].replace(vec![Box::new(handler)]);
-        }
-    }
-
+    IRQ_HANDLERS.lock_irq()[irqno as usize].push(Box::new(handler)); // appends to that IRQ's list; the guard is a temporary dropped at the end of this statement
     Ok(())
 }

+ 11 - 14
src/kernel/mem/mm_list.cc

@@ -1,4 +1,5 @@
 #include <assert.h>
+#include <errno.h>
 #include <stdint.h>
 
 #include <kernel/mem/mm_list.hpp>
@@ -16,8 +17,7 @@ static inline void __invalidate_all_tlb() {
         : "rax", "memory");
 }
 
-static inline void __dealloc_page_table_all(paging::pfn_t pt, int depth,
-                                            int from, int to) {
+static inline void __dealloc_page_table_all(paging::pfn_t pt, int depth, int from, int to) {
     using namespace paging;
 
     if (depth > 1) {
@@ -138,8 +138,7 @@ int mm_list::register_brk(uintptr_t addr) {
         return -ENOMEM;
 
     bool inserted;
-    std::tie(m_brk, inserted) =
-        m_areas.emplace(addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
+    std::tie(m_brk, inserted) = m_areas.emplace(addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
 
     assert(inserted);
     return 0;
@@ -186,8 +185,8 @@ mm_list::iterator mm_list::split(iterator area, uintptr_t addr) {
     auto new_end = area->end;
     area->end = addr;
 
-    auto [iter, inserted] = m_areas.emplace(addr, area->flags, new_end,
-                                            area->mapped_file, new_file_offset);
+    auto [iter, inserted] =
+        m_areas.emplace(addr, area->flags, new_end, d_get(area->mapped_file), new_file_offset);
 
     assert(inserted);
     return iter;
@@ -217,8 +216,7 @@ int mm_list::unmap(iterator area, bool should_invalidate_tlb) {
     return 0;
 }
 
-int mm_list::unmap(uintptr_t start, std::size_t length,
-                   bool should_invalidate_tlb) {
+int mm_list::unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb) {
     // standard says that addr and len MUST be
     // page-aligned or the call is invalid
     if (start & 0xfff)
@@ -279,7 +277,7 @@ int mm_list::unmap(uintptr_t start, std::size_t length,
 int mm_list::mmap(const map_args& args) {
     auto& vaddr = args.vaddr;
     auto& length = args.length;
-    auto& finode = args.file_inode;
+    auto& file = args.file;
     auto& foff = args.file_offset;
     auto& flags = args.flags;
 
@@ -298,10 +296,10 @@ int mm_list::mmap(const map_args& args) {
         attributes |= PA_NXE;
 
     if (flags & MM_MAPPED) {
-        assert(finode);
+        assert(file);
 
-        auto [area, inserted] = m_areas.emplace(
-            vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, finode, foff);
+        auto [area, inserted] =
+            m_areas.emplace(vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, d_get(file), foff);
         assert(inserted);
 
         attributes |= PA_MMAPPED_PAGE;
@@ -310,8 +308,7 @@ int mm_list::mmap(const map_args& args) {
     } else if (flags & MM_ANONYMOUS) {
         // private mapping of zero-filled pages
         // TODO: shared mapping
-        auto [area, inserted] =
-            m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
+        auto [area, inserted] = m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
         assert(inserted);
 
         attributes |= PA_ANONYMOUS_PAGE;

+ 45 - 8
src/kernel/mem/paging.cc

@@ -96,8 +96,7 @@ static inline page* _create_zone(pfn_t pfn, unsigned order) {
 }
 
 // call with zone_lock held
-static inline void _split_zone(page* zone, unsigned order,
-                               unsigned target_order) {
+static inline void _split_zone(page* zone, unsigned order, unsigned target_order) {
     while (order > target_order) {
         pfn_t pfn = page_to_pfn(zone);
         _create_zone(buddy(pfn, order - 1), order - 1);
@@ -271,10 +270,34 @@ void kernel::mem::paging::increase_refcount(page* pg) {
     pg->refcount++;
 }
 
-void kernel::mem::paging::handle_page_fault(unsigned long err) {
+struct fix_entry { // one record of the FIX_START..FIX_END table scanned by page_fault_fix()
+    uint64_t start; // first instruction address covered by this entry
+    uint64_t length; // size of the covered range; the range is [start, start + length)
+    uint64_t jump_address; // value written to the faulting frame's v_rip when the fault lies in the range
+    uint64_t type; // not read by page_fault_fix() in this code — presumably distinguishes load/store fixups; confirm
+};
+
+extern "C" fix_entry FIX_START[], FIX_END[]; // table bounds; the kernel.ld hunk defines these symbols around the .fix input sections
+bool page_fault_fix(interrupt_stack* int_stack) { // redirects a faulting instruction to its registered fixup; returns true if an entry matched
+    // TODO: type load
+
+    // type store
+    for (fix_entry* fix = FIX_START; fix < FIX_END; fix++) { // linear scan; the first matching entry wins
+        if (int_stack->v_rip >= fix->start && int_stack->v_rip < fix->start + fix->length) { // faulting v_rip lies inside [start, start + length)
+            int_stack->v_rip = fix->jump_address; // rewrite the saved v_rip — presumably so execution resumes at the fixup on return; confirm
+            return true;
+        }
+    }
+
+    return false; // no entry covers the faulting address; the caller decides what to do
+}
+
+void kernel::mem::paging::handle_page_fault(interrupt_stack* int_stack) {
     using namespace kernel::mem;
     using namespace paging;
 
+    auto err = int_stack->error_code;
+
     uintptr_t vaddr;
     asm volatile("mov %%cr2, %0" : "=g"(vaddr) : :);
     auto& mms = current_process->mms;
@@ -285,7 +308,11 @@ void kernel::mem::paging::handle_page_fault(unsigned long err) {
         if (err & PAGE_FAULT_U)
             kill_current(SIGSEGV);
 
-        __page_fault_die(vaddr);
+        if (!page_fault_fix(int_stack)) {
+            __page_fault_die(vaddr);
+        } else {
+            return;
+        }
     }
 
     // user access to a present page caused the fault
@@ -313,8 +340,13 @@ void kernel::mem::paging::handle_page_fault(unsigned long err) {
     bool mmapped = mm_area->flags & MM_MAPPED;
     assert(!mmapped || mm_area->mapped_file);
 
-    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]]
-        __page_fault_die(vaddr);
+    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]] {
+        if (!page_fault_fix(int_stack)) {
+            __page_fault_die(vaddr);
+        } else {
+            return;
+        }
+    }
 
     pfn_t pfn = pe.pfn();
     auto attr = pe.attributes();
@@ -358,8 +390,13 @@ void kernel::mem::paging::handle_page_fault(unsigned long err) {
         size_t offset = (vaddr & ~0xfff) - mm_area->start;
         char* data = physaddr<char>{pfn};
 
-        int n = fs_read(mm_area->mapped_file, data, 4096,
-                        mm_area->file_offset + offset, 4096);
+        int n = fs::fs_read(mm_area->mapped_file.get(), data, 4096, mm_area->file_offset + offset,
+                            4096);
+
+        if (n < 0) {
+            kill_current(SIGBUS);
+            return;
+        }
 
         // TODO: send SIGBUS if offset is greater than real size
         if (n != 4096)

+ 20 - 10
src/kernel/mem/slab.cc

@@ -4,6 +4,7 @@
 
 #include <types/list.hpp>
 
+#include <kernel/async/lock.hpp>
 #include <kernel/mem/paging.hpp>
 #include <kernel/mem/slab.hpp>
 
@@ -12,6 +13,8 @@ using namespace types::list;
 
 constexpr std::size_t SLAB_PAGE_SIZE = 0x1000; // 4K
 
+kernel::async::mutex slab_lock;
+
 std::ptrdiff_t _slab_data_start_offset(std::size_t size) {
     return (sizeof(slab_head) + size - 1) & ~(size - 1);
 }
@@ -67,6 +70,8 @@ void _slab_add_page(slab_cache* cache) {
 }
 
 void* kernel::mem::slab_alloc(slab_cache* cache) {
+    async::lock_guard_irq lock(slab_lock);
+
     slab_head* slab = cache->slabs_partial;
     if (!slab) {                 // no partial slabs, try to get an empty slab
         if (!cache->slabs_empty) // no empty slabs, create a new one
@@ -88,24 +93,29 @@ void* kernel::mem::slab_alloc(slab_cache* cache) {
 }
 
 void kernel::mem::slab_free(void* ptr) {
+    async::lock_guard_irq lock(slab_lock); // guards the slab lists; presumably released when `lock` leaves scope, including the early return below — confirm
+
     slab_head* slab = (slab_head*)((uintptr_t)ptr & ~(SLAB_PAGE_SIZE - 1));
 
     *(void**)ptr = slab->free;
     slab->free = ptr;
     slab->free_count++;
 
-    if (slab->free_count == _slab_max_count(slab->obj_size)) {
-        auto* cache = slab->cache;
-        slab_head** head = nullptr;
+    auto max_count = _slab_max_count(slab->obj_size); // number of objects one slab of this object size holds
 
-        if (cache->slabs_full == slab) {
-            head = &cache->slabs_full;
-        } else {
-            head = &cache->slabs_partial;
-        }
+    if (max_count == 1) { // single-object slab: freeing its only object takes it straight from full to empty
+        list_remove(&slab->cache->slabs_full, slab);
+        list_insert(&slab->cache->slabs_empty, slab);
+        return; // must stop here: free_count == 1 == max_count, so both checks below would also fire and list_remove() the slab from lists it is not on
+    }
+    if (slab->free_count == 1) { // first object freed from a full slab: move it to the partial list
+        list_remove(&slab->cache->slabs_full, slab);
+        list_insert(&slab->cache->slabs_partial, slab);
+    }
 
-        list_remove(head, slab);
-        list_insert(&cache->slabs_empty, slab);
+    if (slab->free_count == max_count) { // every object is free again: move the slab from partial to empty
+        list_remove(&slab->cache->slabs_partial, slab);
+        list_insert(&slab->cache->slabs_empty, slab);
     }
 }
 

+ 97 - 73
src/kernel/process.cpp

@@ -21,25 +21,67 @@
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
 
+extern "C" fs::rust_file_array::handle* r_filearray_new_for_init();
+extern "C" fs::rust_fs_context::handle* r_fs_context_new_for_init();
+extern "C" fs::rust_file_array::handle* r_filearray_new_cloned(
+    struct fs::rust_file_array::handle* other);
+extern "C" fs::rust_fs_context::handle* r_fs_context_new_cloned(
+    struct fs::rust_fs_context::handle* other);
+extern "C" void r_filearray_drop(struct fs::rust_file_array::handle* other);
+extern "C" void r_fs_context_drop(struct fs::rust_fs_context::handle* other);
+
+fs::rust_fs_context::rust_fs_context(rust_fs_context::handle* handle) : m_handle(handle) {} // stores the handle; the destructor passes it to r_fs_context_drop
+fs::rust_file_array::rust_file_array(rust_file_array::handle* handle) : m_handle(handle) {} // stores the handle; the destructor passes it to r_filearray_drop
+
+fs::rust_fs_context::~rust_fs_context() { // NOTE(review): a copy of this wrapper would drop the same handle twice — confirm copying is disabled in the class declaration
+    drop(); // no-op if drop() was already called explicitly
+}
+
+fs::rust_file_array::~rust_file_array() { // NOTE(review): a copy of this wrapper would drop the same handle twice — confirm copying is disabled in the class declaration
+    drop(); // no-op if drop() was already called explicitly
+}
+
+void fs::rust_fs_context::drop() { // releases the handle at most once; safe to call repeatedly
+    if (m_handle) {
+        r_fs_context_drop(m_handle);
+        m_handle = nullptr; // cleared so a later drop() or the destructor does nothing
+    }
+}
+
+void fs::rust_file_array::drop() { // releases the handle at most once; safe to call repeatedly
+    if (m_handle) {
+        r_filearray_drop(m_handle);
+        m_handle = nullptr; // cleared so a later drop() or the destructor does nothing
+    }
+}
+
+fs::rust_fs_context::handle* fs::rust_fs_context::get() const { // returns the raw handle without giving up ownership
+    assert(m_handle); // calling get() after drop() is treated as a bug
+    return m_handle;
+}
+
+fs::rust_file_array::handle* fs::rust_file_array::get() const { // returns the raw handle without giving up ownership
+    assert(m_handle); // calling get() after drop() is treated as a bug
+    return m_handle;
+}
+
 process::process(const process& parent, pid_t pid)
     : mms{parent.mms}
     , attr{parent.attr}
-    , files{parent.files.copy()}
-    , umask{parent.umask}
+    , files{r_filearray_new_cloned(parent.files.get())} // new file-array handle cloned from the parent's; get() asserts the parent's handle is still live
+    , fs_context{r_fs_context_new_cloned(parent.fs_context.get())} // new fs-context handle cloned from the parent's; umask and cwd are no longer copied here as separate members
     , pid{pid}
     , ppid{parent.pid}
     , pgid{parent.pgid}
     , sid{parent.sid}
-    , control_tty{parent.control_tty} {
-    assert(parent.cwd);
-    cwd = fs::d_get(parent.cwd);
-
-    assert(parent.fs_context.root);
-    fs_context.root = fs::d_get(parent.fs_context.root);
-}
+    , control_tty{parent.control_tty} {} // empty body: the cwd/root dentry copies were removed along with those members' use here
 
 process::process(pid_t pid, pid_t ppid)
-    : attr{.system = true}, files{&fs_context}, pid{pid}, ppid{ppid} {
+    : attr{.system = true}
+    , files{r_filearray_new_for_init()}
+    , fs_context{r_fs_context_new_for_init()}
+    , pid{pid}
+    , ppid{ppid} {
     bool inserted;
     std::tie(std::ignore, inserted) = thds.emplace("", pid);
     assert(inserted);
@@ -85,6 +127,9 @@ proclist::proclist() {
     auto thd = init.thds.begin();
     thd->name.assign("[kernel init]");
 
+    init.attr.system = 0;
+    thd->attr &= ~kernel::task::thread::SYSTEM;
+
     current_process = &init;
     current_thread = &thd;
 
@@ -134,15 +179,14 @@ void proclist::kill(pid_t pid, int exit_code) {
     //       files should only be closed when this is the last thread
     //
     // write back mmap'ped files and close them
-    proc.files.clear();
+    proc.files.drop();
+
+    // free fs_context
+    proc.fs_context.drop();
 
     // unmap all user memory areas
     proc.mms.clear();
 
-    // free cwd and fs_context dentry
-    proc.cwd.reset();
-    proc.fs_context.root.reset();
-
     // make child processes orphans (children of init)
     this->make_children_orphans(pid);
 
@@ -191,16 +235,14 @@ static void release_kinit() {
     extern uintptr_t volatile KINIT_START_ADDR, KINIT_END_ADDR, KINIT_PAGES;
 
     std::size_t pages = KINIT_PAGES;
-    auto range =
-        vaddr_range{KERNEL_PML4, KINIT_START_ADDR, KINIT_END_ADDR, true};
+    auto range = vaddr_range{KERNEL_PML4, KINIT_START_ADDR, KINIT_END_ADDR, true};
     for (auto pte : range)
         pte.clear();
 
     create_zone(KERNEL_IMAGE_PADDR, KERNEL_IMAGE_PADDR + 0x1000 * pages);
 }
 
-extern "C" void (*const late_init_start[])();
-extern "C" void late_init_rust();
+extern "C" void late_init_rust(uintptr_t* out_sp, uintptr_t* out_ip);
 
 void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn) {
     kernel::mem::paging::free_pages(kernel_stack_pfn, 9);
@@ -208,58 +250,15 @@ void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn) {
 
     kernel::kmod::load_internal_modules();
 
-    late_init_rust();
+    uintptr_t sp, ip;
+    late_init_rust(&sp, &ip);
 
     asm volatile("sti");
 
-    current_process->fs_context.root = fs::r_get_root_dentry();
-    current_process->cwd = fs::r_get_root_dentry();
-
     // ------------------------------------------
     // interrupt enabled
     // ------------------------------------------
 
-    for (auto* init = late_init_start; *init; ++init)
-        (*init)();
-
-    const auto& context = current_process->fs_context;
-
-    // mount fat32 /mnt directory
-    // TODO: parse kernel parameters
-    if (1) {
-        auto [mnt, status] = fs::open(context, context.root, "/mnt");
-        assert(mnt && status == -ENOENT);
-
-        if (int ret = fs::fs_mkdir(mnt.get(), 0755); 1)
-            assert(ret == 0);
-
-        int ret = fs::fs_mount(mnt.get(), "/dev/sda", "/mnt", "fat32",
-                               MS_RDONLY | MS_NOATIME | MS_NODEV | MS_NOSUID,
-                               "ro,nodev");
-
-        assert(ret == 0);
-    }
-
-    current_process->attr.system = 0;
-    current_thread->attr &= ~kernel::task::thread::SYSTEM;
-
-    types::elf::elf32_load_data d{
-        .exec_dent{},
-        .argv{"/mnt/busybox", "sh", "/mnt/initsh"},
-        .envp{"LANG=C", "HOME=/root", "PATH=/mnt", "PWD=/"},
-        .ip{},
-        .sp{}};
-
-    auto [exec, ret] = fs::open(context, context.root.get(), d.argv[0]);
-    if (!exec || ret) {
-        kmsg("kernel panic: init not found!");
-        freeze();
-    }
-
-    d.exec_dent = std::move(exec);
-    if (int ret = types::elf::elf32_load(d); 1)
-        assert(ret == 0);
-
     int ds = 0x33, cs = 0x2b;
 
     asm volatile(
@@ -277,7 +276,7 @@ void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn) {
 
         "iretq\n"
         :
-        : "g"(ds), "g"(cs), "g"(d.sp), "g"(d.ip)
+        : "g"(ds), "g"(cs), "g"(sp), "g"(ip)
         : "eax", "memory");
 
     freeze();
@@ -315,8 +314,7 @@ void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn) {
         "%=:\n"
         "ud2"
         :
-        : "a"(current_thread->kstack.sp), "c"(_kernel_init),
-          "g"(kernel_stack_pfn)
+        : "a"(current_thread->kstack.sp), "c"(_kernel_init), "g"(kernel_stack_pfn)
         : "memory");
 
     freeze();
@@ -327,9 +325,14 @@ extern "C" void asm_ctx_switch(uintptr_t* curr_sp, uintptr_t* next_sp);
 extern "C" void after_ctx_switch() {
     current_thread->kstack.load_interrupt_stack();
     current_thread->load_thread_area32();
+
+    kernel::async::preempt_enable(); // decrements the preempt count; per the comment in do_schedule(), this is the preempt_enable() that asm_ctx_switch "implies"
 }
 
-bool _schedule() {
+// call this with preempt_count == 1
+// after this function returns, preempt_count will be 0
+static bool do_schedule() {
+    asm volatile("" : : : "memory");
     auto* next_thd = kernel::task::dispatcher::next();
 
     if (current_thread != next_thd) {
@@ -342,21 +345,41 @@ bool _schedule() {
         auto* curr_thd = current_thread;
         current_thread = next_thd;
 
+        // this implies preempt_enable()
         asm_ctx_switch(&curr_thd->kstack.sp, &next_thd->kstack.sp);
+    } else {
+        kernel::async::preempt_enable();
     }
 
     return current_thread->signals.pending_signal() == 0;
 }
 
-bool schedule() {
-    if (kernel::async::preempt_count() != 0)
-        return true;
+static inline void check_preempt_count(kernel::async::preempt_count_t n) { // asserts that the current preempt count equals n, logging both values first on mismatch
+    if (kernel::async::preempt_count() != n) [[unlikely]] {
+        kmsgf(
+            "[kernel:fatal] trying to call schedule_now() with preempt count "
+            "%d, expected %d", // NOTE(review): the text names schedule_now() even when reached from schedule_now_preempt_disabled(); %d assumes preempt_count_t is int-compatible — confirm
+            kernel::async::preempt_count(), n);
+        assert(kernel::async::preempt_count() == n); // re-checks the same condition, so it is expected to fail after the message is logged
+    }
+}
+
+bool schedule_now() { // entry point for callers whose preempt count is 0
+    check_preempt_count(0); // asserts the count is 0 on entry
+    kernel::async::preempt_disable(); // raise the count to 1, the state do_schedule() documents as its precondition
+    bool result = do_schedule(); // do_schedule() is documented to leave the count at 0 again
+    return result; // do_schedule() returns true when pending_signal() == 0
+}
 
-    return _schedule();
+// call this with preempt_count == 1
+bool schedule_now_preempt_disabled() { // variant for callers that already called preempt_disable() once themselves
+    check_preempt_count(1); // asserts the count is exactly 1 on entry
+    return do_schedule(); // do_schedule() is documented to leave the count at 0; returns true when pending_signal() == 0
+}
 
 void NORETURN schedule_noreturn(void) {
-    _schedule();
+    schedule_now(); // goes through schedule_now(), so the preempt count must be 0 here or check_preempt_count asserts
+    kmsgf("[kernel:fatal] an schedule_noreturn() DOES return"); // only reached if the scheduler switches back to this thread, which the caller treats as fatal
     freeze();
 }
 
@@ -365,6 +388,7 @@ void NORETURN freeze(void) {
         asm volatile("cli\n\thlt");
 }
 
+// TODO!!!: make sure we call this after having done all clean up works
 void NORETURN kill_current(int signo) {
     procs->kill(current_process->pid, (signo + 128) << 8 | (signo & 0xff));
     schedule_noreturn();

+ 1 - 1
src/kernel/signal.cpp

@@ -42,7 +42,7 @@ static void stop_process(int signal) {
     parent.waitlist.notify_all();
 
     while (true) {
-        if (schedule())
+        if (schedule_now())
             break;
     }
 

+ 35 - 194
src/kernel/syscall.cpp

@@ -1,8 +1,6 @@
 #include <assert.h>
 #include <bits/alltypes.h>
-#include <bits/ioctl.h>
 #include <errno.h>
-#include <fcntl.h>
 #include <poll.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -84,20 +82,16 @@
 #define _DEFINE_SYSCALL32_END_PARAMS6(type, name, ...) name __VA_OPT__(, void)
 
 #define _DEFINE_SYSCALL32_END(name, ...) \
-    kernel::syscall::do_##name(          \
-        __VA_OPT__(_DEFINE_SYSCALL32_END_PARAMS1(__VA_ARGS__)))
-
-#define DEFINE_SYSCALL32_TO(name, to, ...)                      \
-    static uint32_t _syscall32_##name(interrupt_stack* data,    \
-                                      mmx_registers* mmxregs) { \
-        (void)data, (void)mmxregs;                              \
-        __VA_OPT__(_DEFINE_SYSCALL32_ARGS1(__VA_ARGS__);)       \
-        return (uint32_t)(uintptr_t)_DEFINE_SYSCALL32_END(      \
-            to __VA_OPT__(, __VA_ARGS__));                      \
+    kernel::syscall::do_##name(__VA_OPT__(_DEFINE_SYSCALL32_END_PARAMS1(__VA_ARGS__)))
+
+#define DEFINE_SYSCALL32_TO(sname, to, ...)                                              \
+    static uint32_t _syscall32_##sname(interrupt_stack* data, mmx_registers* mmxregs) {  \
+        (void)data, (void)mmxregs;                                                       \
+        __VA_OPT__(_DEFINE_SYSCALL32_ARGS1(__VA_ARGS__);)                                \
+        return (uint32_t)(uintptr_t)_DEFINE_SYSCALL32_END(to __VA_OPT__(, __VA_ARGS__)); \
     }
 
-#define DEFINE_SYSCALL32(name, ...) \
-    DEFINE_SYSCALL32_TO(name, name __VA_OPT__(, ) __VA_ARGS__)
+#define DEFINE_SYSCALL32(name, ...) DEFINE_SYSCALL32_TO(name, name __VA_OPT__(, ) __VA_ARGS__)
 
 #define DEFINE_SYSCALL32_NORETURN(name, ...)                                 \
     [[noreturn]] static uint32_t _syscall32_##name(interrupt_stack* data,    \
@@ -122,50 +116,18 @@ static inline void not_implemented(const char* pos, int line) {
     current_thread->send_signal(SIGSYS);
 }
 
-DEFINE_SYSCALL32(write, int, fd, const char __user*, buf, size_t, n)
-DEFINE_SYSCALL32(read, int, fd, char __user*, buf, size_t, n)
-DEFINE_SYSCALL32(close, int, fd)
-DEFINE_SYSCALL32(dup, int, old_fd)
-DEFINE_SYSCALL32(dup2, int, old_fd, int, new_fd)
-DEFINE_SYSCALL32(pipe, int __user*, pipefd)
-DEFINE_SYSCALL32(getdents, int, fd, char __user*, buf, size_t, cnt)
-DEFINE_SYSCALL32(getdents64, int, fd, char __user*, buf, size_t, cnt)
-DEFINE_SYSCALL32(open, const char __user*, path, int, flags, mode_t, mode)
-DEFINE_SYSCALL32(chdir, const char __user*, path)
-DEFINE_SYSCALL32(symlink, const char __user*, target, const char __user*,
-                 linkpath)
-DEFINE_SYSCALL32(readlink, const char __user*, pathname, char __user*, buf,
-                 size_t, buf_size)
-DEFINE_SYSCALL32(ioctl, int, fd, unsigned long, request, uintptr_t, arg3)
 DEFINE_SYSCALL32(munmap, uintptr_t, addr, size_t, len)
 DEFINE_SYSCALL32(poll, pollfd __user*, fds, nfds_t, nfds, int, timeout)
-DEFINE_SYSCALL32(mknod, const char __user*, pathname, mode_t, mode, dev_t, dev)
-DEFINE_SYSCALL32(access, const char __user*, pathname, int, mode)
-DEFINE_SYSCALL32(unlink, const char __user*, pathname)
-DEFINE_SYSCALL32(truncate, const char __user*, pathname, long, length)
-DEFINE_SYSCALL32(mkdir, const char __user*, pathname, mode_t, mode)
 DEFINE_SYSCALL32(socket, int, domain, int, type, int, protocol)
-DEFINE_SYSCALL32_TO(fcntl64, fcntl, int, fd, int, cmd, unsigned long, arg)
-
-DEFINE_SYSCALL32_TO(sendfile64, sendfile, int, out_fd, int, in_fd,
-                    off_t __user*, offset, size_t, count)
-
-DEFINE_SYSCALL32(statx, int, dirfd, const char __user*, path, int, flags,
-                 unsigned int, mask, statx __user*, statxbuf)
 
-DEFINE_SYSCALL32(mmap_pgoff, uintptr_t, addr, size_t, len, int, prot, int,
-                 flags, int, fd, off_t, pgoffset)
-
-DEFINE_SYSCALL32(mount, const char __user*, source, const char __user*, target,
-                 const char __user*, fstype, unsigned long, flags,
-                 const void __user*, _fsdata)
+DEFINE_SYSCALL32(mmap_pgoff, uintptr_t, addr, size_t, len, int, prot, int, flags, int, fd, off_t,
+                 pgoffset)
 
 DEFINE_SYSCALL32(waitpid, pid_t, waitpid, int __user*, arg1, int, options)
 DEFINE_SYSCALL32(getsid, pid_t, pid)
 DEFINE_SYSCALL32(setsid)
 DEFINE_SYSCALL32(getpgid, pid_t, pid)
 DEFINE_SYSCALL32(setpgid, pid_t, pid, pid_t, pgid)
-DEFINE_SYSCALL32(getcwd, char __user*, buf, size_t, buf_size)
 DEFINE_SYSCALL32(getpid)
 DEFINE_SYSCALL32(getppid)
 DEFINE_SYSCALL32(getuid)
@@ -179,32 +141,26 @@ DEFINE_SYSCALL32(set_tid_address, int __user*, tidptr)
 DEFINE_SYSCALL32(prctl, int, option, uintptr_t, arg2)
 DEFINE_SYSCALL32(arch_prctl, int, option, uintptr_t, arg2)
 DEFINE_SYSCALL32(brk, uintptr_t, addr)
-DEFINE_SYSCALL32(umask, mode_t, mask)
 DEFINE_SYSCALL32(kill, pid_t, pid, int, sig)
 DEFINE_SYSCALL32(tkill, pid_t, tid, int, sig)
-DEFINE_SYSCALL32(rt_sigprocmask, int, how, const kernel::sigmask_type __user*,
-                 set, kernel::sigmask_type __user*, oldset, size_t, sigsetsize)
-DEFINE_SYSCALL32(rt_sigaction, int, signum, const kernel::sigaction __user*,
-                 act, kernel::sigaction __user*, oldact, size_t, sigsetsize)
+DEFINE_SYSCALL32(rt_sigprocmask, int, how, const kernel::sigmask_type __user*, set,
+                 kernel::sigmask_type __user*, oldset, size_t, sigsetsize)
+DEFINE_SYSCALL32(rt_sigaction, int, signum, const kernel::sigaction __user*, act,
+                 kernel::sigaction __user*, oldact, size_t, sigsetsize)
 DEFINE_SYSCALL32(newuname, new_utsname __user*, buf)
 
 DEFINE_SYSCALL32_NORETURN(exit, int, status)
 
 DEFINE_SYSCALL32(gettimeofday, timeval __user*, tv, void __user*, tz)
-DEFINE_SYSCALL32_TO(clock_gettime64, clock_gettime, clockid_t, clk_id,
-                    timespec __user*, tp)
+DEFINE_SYSCALL32_TO(clock_gettime64, clock_gettime, clockid_t, clk_id, timespec __user*, tp)
 
 extern "C" void NORETURN ISR_stub_restore();
 static uint32_t _syscall32_fork(interrupt_stack* data, mmx_registers* mmxregs) {
     auto& newproc = procs->copy_from(*current_process);
-    auto [iter_newthd, inserted] =
-        newproc.thds.emplace(*current_thread, newproc.pid);
+    auto [iter_newthd, inserted] = newproc.thds.emplace(*current_thread, newproc.pid);
     assert(inserted);
     auto* newthd = &*iter_newthd;
 
-    kernel::async::preempt_disable();
-    kernel::task::dispatcher::enqueue(newthd);
-
     auto newthd_prev_sp = newthd->kstack.sp;
     assert(!(newthd_prev_sp & 0xf));
 
@@ -230,112 +186,16 @@ static uint32_t _syscall32_fork(interrupt_stack* data, mmx_registers* mmxregs) {
     newthd->kstack.pushq(0);              // 0 for alignment
     newthd->kstack.pushq(newthd_prev_sp); // previous sp
 
-    kernel::async::preempt_enable();
+    kernel::task::dispatcher::enqueue(newthd);
     return newproc.pid;
 }
 
-static uint32_t _syscall32_llseek(interrupt_stack* data, mmx_registers*) {
-    SYSCALL32_ARG1(unsigned int, fd);
-    SYSCALL32_ARG2(unsigned long, offset_high);
-    SYSCALL32_ARG3(unsigned long, offset_low);
-    SYSCALL32_ARG4(off_t __user*, result);
-    SYSCALL32_ARG5(unsigned int, whence);
-
-    if (!result)
-        return -EFAULT;
-
-    off_t offset = offset_low | (offset_high << 32);
-
-    auto ret = kernel::syscall::do_lseek(fd, offset, whence);
-    if (ret < 0)
-        return ret;
-
-    // TODO: copy_to_user
-    *result = ret;
-
-    return 0;
-}
-
-static uint32_t _syscall32_readv(interrupt_stack* data, mmx_registers*) {
-    SYSCALL32_ARG1(int, fd);
-    SYSCALL32_ARG2(const types::iovec32 __user*, _iov);
-    SYSCALL32_ARG3(int, iovcnt);
-
-    // TODO: use copy_from_user
-    if (!_iov)
-        return -EFAULT;
-
-    std::vector<iovec> iov(iovcnt);
-    for (int i = 0; i < iovcnt; ++i) {
-        // TODO: check access right
-        uintptr_t base = _iov[i].iov_base;
-        iov[i].iov_base = (void*)base;
-        iov[i].iov_len = _iov[i].iov_len;
-    }
-
-    return kernel::syscall::do_readv(fd, iov.data(), iovcnt);
-}
-
-static uint32_t _syscall32_writev(interrupt_stack* data, mmx_registers*) {
-    SYSCALL32_ARG1(int, fd);
-    SYSCALL32_ARG2(const types::iovec32 __user*, _iov);
-    SYSCALL32_ARG3(int, iovcnt);
-
-    // TODO: use copy_from_user
-    if (!_iov)
-        return -EFAULT;
-
-    std::vector<iovec> iov(iovcnt);
-    for (int i = 0; i < iovcnt; ++i) {
-        // TODO: check access right
-        uintptr_t base = _iov[i].iov_base;
-        iov[i].iov_base = (void*)base;
-        iov[i].iov_len = _iov[i].iov_len;
-    }
-
-    return kernel::syscall::do_writev(fd, iov.data(), iovcnt);
-}
-
-[[noreturn]] static uint32_t _syscall32_exit_group(interrupt_stack* data,
-                                                   mmx_registers* mmxregs) {
+// Handler for exit_group: it never returns and simply forwards both arguments
+// to _syscall32_exit.
+[[noreturn]] static uint32_t _syscall32_exit_group(interrupt_stack* data, mmx_registers* mmxregs) {
     // we implement exit_group as exit for now
     _syscall32_exit(data, mmxregs);
 }
 
-static uint32_t _syscall32_execve(interrupt_stack* data, mmx_registers*) {
-    SYSCALL32_ARG1(const char __user*, exec);
-    SYSCALL32_ARG2(const uint32_t __user*, argv);
-    SYSCALL32_ARG3(const uint32_t __user*, envp);
-
-    if (!exec || !argv || !envp)
-        return -EFAULT;
-
-    std::vector<std::string> args, envs;
-
-    // TODO: use copy_from_user
-    while (*argv) {
-        uintptr_t addr = *(argv++);
-        args.push_back((char __user*)addr);
-    }
-
-    while (*envp) {
-        uintptr_t addr = *(envp++);
-        envs.push_back((char __user*)addr);
-    }
-
-    auto retval = kernel::syscall::do_execve(exec, args, envs);
-
-    if (retval.status == 0) {
-        // TODO: switch cs ans ss
-        data->v_rip = retval.ip;
-        data->rsp = retval.sp;
-    }
-
-    return retval.status;
-}
-
-static uint32_t _syscall32_wait4(interrupt_stack* data,
-                                 mmx_registers* mmxregs) {
+static uint32_t _syscall32_wait4(interrupt_stack* data, mmx_registers* mmxregs) {
     SYSCALL32_ARG4(void __user*, rusage);
 
     // TODO: getrusage
@@ -345,8 +205,7 @@ static uint32_t _syscall32_wait4(interrupt_stack* data,
     return _syscall32_waitpid(data, mmxregs);
 }
 
-void kernel::handle_syscall32(int no, interrupt_stack* data,
-                              mmx_registers* mmxregs) {
+void kernel::handle_syscall32(int no, interrupt_stack* data, mmx_registers* mmxregs) {
     if (no >= SYSCALL_HANDLERS_SIZE || !syscall_handlers[no].handler) {
         kmsgf("[kernel] syscall %d(%x) isn't implemented", no, no);
         NOT_IMPLEMENTED;
@@ -356,7 +215,7 @@ void kernel::handle_syscall32(int no, interrupt_stack* data,
         return;
     }
 
-    // kmsgf_debug("[kernel:debug] (pid\t%d) %s()", current_process->pid,
+    // kmsgf_debug("[kernel:debug] (pid\t%d) %s() => {{", current_process->pid,
     // syscall_handlers[no].name);
 
     asm volatile("sti");
@@ -370,76 +229,58 @@ void kernel::handle_syscall32(int no, interrupt_stack* data,
     data->regs.r14 = 0;
     data->regs.r15 = 0;
 
+    // kmsgf_debug("[kernel:debug] }} => %x", data->regs.rax);
+
     if (current_thread->signals.pending_signal())
         current_thread->signals.handle(data, mmxregs);
 }
 
-#define REGISTER_SYSCALL_HANDLER(no, _name)              \
-    syscall_handlers[(no)].handler = _syscall32_##_name; \
-    syscall_handlers[(no)].name = #_name;
+#define REGISTER_SYSCALL_HANDLER(no, _name) register_syscall_handler(no, _syscall32_##_name, #_name)
+
+// Installs `handler` and its printable `name` in slot `no` of the syscall
+// table. Exposed with C linkage so that it can be called from outside this
+// translation unit as well as through REGISTER_SYSCALL_HANDLER.
+//
+// `name` is stored as a pointer, not copied, so it has to stay valid for as
+// long as the table is in use.
+extern "C" void register_syscall_handler(uint32_t no,
+                                         uint32_t (*handler)(interrupt_stack*, mmx_registers*),
+                                         const char* name) {
+    // The table has SYSCALL_HANDLERS_SIZE slots (the same bound that
+    // handle_syscall32 checks); refuse to write past its end.
+    assert(no < SYSCALL_HANDLERS_SIZE);
+
+    syscall_handlers[no].handler = handler;
+    syscall_handlers[no].name = name;
+}
+
+extern "C" void r_register_syscall();
 
 SECTION(".text.kinit")
 void kernel::init_syscall_table() {
-    // 32bit syscalls
     REGISTER_SYSCALL_HANDLER(0x01, exit);
     REGISTER_SYSCALL_HANDLER(0x02, fork);
-    REGISTER_SYSCALL_HANDLER(0x03, read);
-    REGISTER_SYSCALL_HANDLER(0x04, write);
-    REGISTER_SYSCALL_HANDLER(0x05, open);
-    REGISTER_SYSCALL_HANDLER(0x06, close);
     REGISTER_SYSCALL_HANDLER(0x07, waitpid);
-    REGISTER_SYSCALL_HANDLER(0x0a, unlink);
-    REGISTER_SYSCALL_HANDLER(0x0b, execve);
-    REGISTER_SYSCALL_HANDLER(0x0c, chdir);
-    REGISTER_SYSCALL_HANDLER(0x0e, mknod);
     REGISTER_SYSCALL_HANDLER(0x14, getpid);
-    REGISTER_SYSCALL_HANDLER(0x15, mount);
-    REGISTER_SYSCALL_HANDLER(0x21, access);
     REGISTER_SYSCALL_HANDLER(0x25, kill);
-    REGISTER_SYSCALL_HANDLER(0x27, mkdir);
-    REGISTER_SYSCALL_HANDLER(0x29, dup);
-    REGISTER_SYSCALL_HANDLER(0x2a, pipe);
     REGISTER_SYSCALL_HANDLER(0x2d, brk);
     REGISTER_SYSCALL_HANDLER(0x2f, getgid);
-    REGISTER_SYSCALL_HANDLER(0x36, ioctl);
     REGISTER_SYSCALL_HANDLER(0x39, setpgid);
-    REGISTER_SYSCALL_HANDLER(0x3c, umask);
-    REGISTER_SYSCALL_HANDLER(0x3f, dup2);
     REGISTER_SYSCALL_HANDLER(0x40, getppid);
     REGISTER_SYSCALL_HANDLER(0x42, setsid);
     REGISTER_SYSCALL_HANDLER(0x4e, gettimeofday);
-    REGISTER_SYSCALL_HANDLER(0x53, symlink);
-    REGISTER_SYSCALL_HANDLER(0x55, readlink);
     REGISTER_SYSCALL_HANDLER(0x5b, munmap);
-    REGISTER_SYSCALL_HANDLER(0x5c, truncate);
     REGISTER_SYSCALL_HANDLER(0x72, wait4);
     REGISTER_SYSCALL_HANDLER(0x7a, newuname);
     REGISTER_SYSCALL_HANDLER(0x84, getpgid);
-    REGISTER_SYSCALL_HANDLER(0x8c, llseek);
-    REGISTER_SYSCALL_HANDLER(0x8d, getdents);
-    REGISTER_SYSCALL_HANDLER(0x91, readv);
-    REGISTER_SYSCALL_HANDLER(0x92, writev);
     REGISTER_SYSCALL_HANDLER(0x93, getsid);
     REGISTER_SYSCALL_HANDLER(0xa8, poll);
     REGISTER_SYSCALL_HANDLER(0xac, prctl);
     REGISTER_SYSCALL_HANDLER(0xae, rt_sigaction);
     REGISTER_SYSCALL_HANDLER(0xaf, rt_sigprocmask);
-    REGISTER_SYSCALL_HANDLER(0xb7, getcwd);
     REGISTER_SYSCALL_HANDLER(0xc0, mmap_pgoff);
     REGISTER_SYSCALL_HANDLER(0xc7, getuid);
     REGISTER_SYSCALL_HANDLER(0xc8, getgid32);
     REGISTER_SYSCALL_HANDLER(0xc9, geteuid);
     REGISTER_SYSCALL_HANDLER(0xca, geteuid32);
-    REGISTER_SYSCALL_HANDLER(0xdc, getdents64);
-    REGISTER_SYSCALL_HANDLER(0xdd, fcntl64);
     REGISTER_SYSCALL_HANDLER(0xe0, gettid);
     REGISTER_SYSCALL_HANDLER(0xee, tkill);
-    REGISTER_SYSCALL_HANDLER(0xef, sendfile64);
     REGISTER_SYSCALL_HANDLER(0xf3, set_thread_area);
     REGISTER_SYSCALL_HANDLER(0xfc, exit_group);
     REGISTER_SYSCALL_HANDLER(0x102, set_tid_address);
     REGISTER_SYSCALL_HANDLER(0x167, socket);
-    REGISTER_SYSCALL_HANDLER(0x17f, statx);
     REGISTER_SYSCALL_HANDLER(0x180, arch_prctl);
     REGISTER_SYSCALL_HANDLER(0x193, clock_gettime64);
+
+    r_register_syscall();
 }

+ 128 - 0
src/kernel/syscall.rs

@@ -0,0 +1,128 @@
+use crate::bindings::root::{interrupt_stack, mmx_registers};
+
+mod file_rw;
+mod procops;
+
+/// Converts the success value of a syscall implementation into the `u32`
+/// that the generated handler returns.
+pub(self) trait MapReturnValue {
+    fn map(self) -> u32;
+}
+
+/// A syscall with nothing to return reports `0`.
+impl MapReturnValue for () {
+    fn map(self) -> u32 {
+        0
+    }
+}
+
+/// `u32` results are passed through unchanged.
+impl MapReturnValue for u32 {
+    fn map(self) -> u32 {
+        self
+    }
+}
+
+/// `usize` results are converted with `as`, i.e. truncated to the low 32 bits
+/// when `usize` is wider than `u32`.
+impl MapReturnValue for usize {
+    fn map(self) -> u32 {
+        self as u32
+    }
+}
+
+/// Expands to an expression that decodes the arguments of a 32-bit syscall
+/// from the saved registers of `$int_stack` and invokes `$handler` with them.
+///
+/// There is one arm per arity, from one to six arguments (no zero-argument
+/// arm). Arguments are taken, in order, from `rbx`, `rcx`, `rdx`, `rsi`,
+/// `rdi` and `rbp`, and each register value is converted to the declared
+/// type with a plain `as` cast.
+///
+/// The handler's `Ok` value goes through `MapReturnValue::map`; an `Err`
+/// value is cast to `i32`, negated and reinterpreted as `u32`.
+macro_rules! syscall32_call {
+    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        match $handler($arg1) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty, $arg2:ident: $argt2:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        match $handler($arg1, $arg2) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty, $arg2:ident: $argt2:ty, $arg3:ident: $argt3:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
+        match $handler($arg1, $arg2, $arg3) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident,
+     $arg1:ident: $argt1:ty,
+     $arg2:ident: $argt2:ty,
+     $arg3:ident: $argt3:ty,
+     $arg4:ident: $argt4:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
+        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
+        match $handler($arg1, $arg2, $arg3, $arg4) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident,
+     $arg1:ident: $argt1:ty,
+     $arg2:ident: $argt2:ty,
+     $arg3:ident: $argt3:ty,
+     $arg4:ident: $argt4:ty,
+     $arg5:ident: $argt5:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
+        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
+        let $arg5: $argt5 = $int_stack.regs.rdi as $argt5;
+        match $handler($arg1, $arg2, $arg3, $arg4, $arg5) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+    ($int_stack:ident, $handler:ident,
+     $arg1:ident: $argt1:ty,
+     $arg2:ident: $argt2:ty,
+     $arg3:ident: $argt3:ty,
+     $arg4:ident: $argt4:ty,
+     $arg5:ident: $argt5:ty,
+     $arg6:ident: $argt6:ty) => {{
+        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
+        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
+        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
+        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
+        let $arg5: $argt5 = $int_stack.regs.rdi as $argt5;
+        let $arg6: $argt6 = $int_stack.regs.rbp as $argt6;
+        match $handler($arg1, $arg2, $arg3, $arg4, $arg5, $arg6) {
+            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
+            Err(err) => (-(err as i32)) as u32,
+        }
+    }};
+}
+
+/// Defines `$name`, an `unsafe extern "C"` function with the
+/// `(interrupt_stack*, mmx_registers*) -> u32` shape expected by
+/// `register_syscall_handler`, that decodes the listed arguments with
+/// `syscall32_call!` and forwards them to `$handler`.
+///
+/// The generated function panics (via `unwrap`) if `int_stack` is null; the
+/// `mmx_registers` pointer is ignored.
+macro_rules! define_syscall32 {
+    ($name:ident, $handler:ident, $($arg:ident: $argt:ty),*) => {
+        unsafe extern "C" fn $name(
+            int_stack: *mut $crate::bindings::root::interrupt_stack,
+            _mmxregs: *mut $crate::bindings::root::mmx_registers) -> u32 {
+            let int_stack = int_stack.as_mut().unwrap();
+            $crate::kernel::syscall::syscall32_call!(int_stack, $handler, $($arg: $argt),*)
+        }
+    };
+}
+
+pub(self) use {define_syscall32, syscall32_call};
+
+extern "C" {
+    /// Foreign function that records `handler` and `name` for syscall number
+    /// `no`. `name` is passed as a raw C string pointer, so callers hand in
+    /// NUL-terminated byte strings.
+    fn register_syscall_handler(
+        no: u32,
+        handler: unsafe extern "C" fn(*mut interrupt_stack, *mut mmx_registers) -> u32,
+        name: *const i8,
+    );
+}
+
+/// Unmangled entry point that registers every syscall implemented in the
+/// `file_rw` and `procops` submodules by calling their `register` functions.
+#[no_mangle]
+pub unsafe extern "C" fn r_register_syscall() {
+    file_rw::register();
+    procops::register();
+}

+ 363 - 0
src/kernel/syscall/file_rw.rs

@@ -0,0 +1,363 @@
+use core::mem::MaybeUninit;
+
+use bindings::{
+    statx, AT_FDCWD, AT_STATX_SYNC_AS_STAT, AT_STATX_SYNC_TYPE, AT_SYMLINK_NOFOLLOW, EBADF, EFAULT,
+    EINVAL, ENOENT, SEEK_CUR, SEEK_END, SEEK_SET, S_IFBLK, S_IFCHR,
+};
+
+use crate::{
+    io::{Buffer, BufferFill},
+    kernel::{
+        user::dataflow::{CheckedUserPointer, UserBuffer, UserString},
+        vfs::{dentry::Dentry, file::SeekOption, filearray::FileArray, FsContext},
+    },
+    path::Path,
+    prelude::*,
+};
+
+use super::{define_syscall32, register_syscall_handler};
+
+/// Reads from the file open at `fd` into the user buffer `buffer` of
+/// `bufsize` bytes and returns whatever the file's `read` reports.
+///
+/// Fails with the error from `UserBuffer::new` if the buffer is rejected, or
+/// with `EBADF` if `fd` is not in the current file array.
+fn do_read(fd: u32, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let mut dest = UserBuffer::new(buffer, bufsize)?;
+
+    let open_files = FileArray::get_current();
+    let file = open_files.get(fd).ok_or(EBADF)?;
+    file.read(&mut dest)
+}
+
+/// Writes `count` bytes starting at the user address `buffer` to the file
+/// open at `fd` and returns whatever the file's `write` reports.
+///
+/// The user range is validated through `CheckedUserPointer` (the same wrapper
+/// `do_writev` uses) rather than being turned into a slice directly from the
+/// raw, caller-controlled pointer.
+///
+/// Fails with `EBADF` if `fd` is not in the current file array, or with the
+/// error from `CheckedUserPointer::new` if the range is rejected.
+fn do_write(fd: u32, buffer: *const u8, count: usize) -> KResult<usize> {
+    let files = FileArray::get_current();
+    let file = files.get(fd).ok_or(EBADF)?;
+
+    // A zero-length write never dereferences `buffer`, so hand the file an
+    // empty slice without looking at the pointer at all.
+    if count == 0 {
+        return file.write(&[]);
+    }
+
+    let data = CheckedUserPointer::new(buffer, count)?;
+    file.write(data.as_slice())
+}
+
+/// Opens the file named by the user string `path` with `flags`, after
+/// clearing the bits of the current umask from `mode`, and returns the value
+/// produced by `FileArray::open`.
+fn do_open(path: *const u8, flags: u32, mode: u32) -> KResult<u32> {
+    let user_path = UserString::new(path)?;
+    let path = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let files = FileArray::get_current();
+    let context = FsContext::get_current();
+
+    // The umask lock is only held for the duration of this statement.
+    let umask = *context.umask.lock();
+
+    files.open(&context, path, flags, mode & !umask)
+}
+
+/// Closes `fd` in the current file array.
+fn do_close(fd: u32) -> KResult<()> {
+    FileArray::get_current().close(fd)
+}
+
+/// Duplicates `fd` in the current file array and returns the result of
+/// `FileArray::dup`.
+fn do_dup(fd: u32) -> KResult<u32> {
+    FileArray::get_current().dup(fd)
+}
+
+/// Duplicates `old_fd` onto `new_fd` in the current file array, passing `0`
+/// as the flags argument of `FileArray::dup_to`.
+fn do_dup2(old_fd: u32, new_fd: u32) -> KResult<u32> {
+    FileArray::get_current().dup_to(old_fd, new_fd, 0)
+}
+
+/// Creates a pipe in the current file array and stores the pair
+/// `[read_fd, write_fd]` into the user array at `pipe_fd`.
+///
+/// The user buffer is set up before the pipe is created; a short copy is
+/// reported as `EFAULT`.
+fn do_pipe(pipe_fd: *mut [u32; 2]) -> KResult<()> {
+    let out_len = core::mem::size_of::<[u32; 2]>();
+    let mut out = UserBuffer::new(pipe_fd as *mut u8, out_len)?;
+
+    let files = FileArray::get_current();
+    let (read_fd, write_fd) = files.pipe()?;
+
+    let fds = [read_fd, write_fd];
+    out.copy(&fds)?.ok_or(EFAULT)
+}
+
+/// Fills the user buffer with directory entries of the file open at `fd`
+/// using the file's `getdents`, and returns the number of bytes the buffer
+/// reports as written.
+fn do_getdents(fd: u32, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let mut out = UserBuffer::new(buffer, bufsize)?;
+
+    let files = FileArray::get_current();
+    let dir = files.get(fd).ok_or(EBADF)?;
+    dir.getdents(&mut out)?;
+
+    Ok(out.wrote())
+}
+
+/// Fills the user buffer with directory entries of the file open at `fd`
+/// using the file's `getdents64`, and returns the number of bytes the buffer
+/// reports as written.
+fn do_getdents64(fd: u32, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let mut out = UserBuffer::new(buffer, bufsize)?;
+
+    let files = FileArray::get_current();
+    let dir = files.get(fd).ok_or(EBADF)?;
+    dir.getdents64(&mut out)?;
+
+    Ok(out.wrote())
+}
+
+/// Looks up `path` and copies the `statx` record for it, restricted by
+/// `mask`, into the user buffer `buffer`.
+///
+/// Only `AT_STATX_SYNC_AS_STAT` and `dirfd == AT_FDCWD` are supported; any
+/// other combination hits `unimplemented!` and panics. The final path
+/// component is followed unless `AT_SYMLINK_NOFOLLOW` is set in `flags`.
+fn do_statx(dirfd: u32, path: *const u8, flags: u32, mask: u32, buffer: *mut u8) -> KResult<()> {
+    let sync_type = flags & AT_STATX_SYNC_TYPE;
+    if sync_type != AT_STATX_SYNC_AS_STAT {
+        unimplemented!("AT_STATX_SYNC_TYPE={:x}", flags & AT_STATX_SYNC_TYPE);
+    }
+
+    if dirfd != AT_FDCWD as u32 {
+        unimplemented!("dirfd={}", dirfd);
+    }
+
+    let user_path = UserString::new(path)?;
+    let path = Path::new(user_path.as_cstr().to_bytes())?;
+    let mut out = UserBuffer::new(buffer, core::mem::size_of::<statx>())?;
+
+    let follow_symlink = (flags & AT_SYMLINK_NOFOLLOW) != AT_SYMLINK_NOFOLLOW;
+    let dentry = Dentry::open(&FsContext::get_current(), path, follow_symlink)?;
+
+    // Start from an all-zero record so unfilled fields are zero.
+    let mut record: statx = unsafe { MaybeUninit::zeroed().assume_init() };
+    dentry.statx(&mut record, mask)?;
+
+    out.copy(&record)?.ok_or(EFAULT)
+}
+
+/// Creates a directory at the user-supplied `pathname`. The mode passed to
+/// `Dentry::mkdir` is `mode` with the umask bits cleared, limited to the
+/// low nine permission bits.
+fn do_mkdir(pathname: *const u8, mode: u32) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let path = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let context = FsContext::get_current();
+    let umask = *context.umask.lock();
+    let effective_mode = mode & !umask & 0o777;
+
+    Dentry::open(&context, path, true)?.mkdir(effective_mode)
+}
+
+/// Truncates the file at the user-supplied `pathname` to `length` via
+/// `Dentry::truncate`.
+fn do_truncate(pathname: *const u8, length: usize) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let path = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let target = Dentry::open(&FsContext::get_current(), path, true)?;
+    target.truncate(length)
+}
+
+/// Unlinks the entry at the user-supplied `pathname`. The dentry is opened
+/// with the third argument of `Dentry::open` set to `false`.
+fn do_unlink(pathname: *const u8) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let path = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let victim = Dentry::open(&FsContext::get_current(), path, false)?;
+    victim.unlink()
+}
+
+/// Creates a symbolic link at `linkpath` whose contents are the bytes of the
+/// user string `target`.
+///
+/// Both user strings are read before the link path is resolved.
+fn do_symlink(target: *const u8, linkpath: *const u8) -> KResult<()> {
+    let link_target = UserString::new(target)?;
+    let user_linkpath = UserString::new(linkpath)?;
+    let link_location = Path::new(user_linkpath.as_cstr().to_bytes())?;
+
+    let link = Dentry::open(&FsContext::get_current(), link_location, false)?;
+    link.symlink(link_target.as_cstr().to_bytes())
+}
+
+/// Creates a device node for `dev` at the user-supplied `pathname`.
+///
+/// The mode handed to `Dentry::mknod` keeps only the permission bits allowed
+/// by the umask (within `0o777`) plus the `S_IFBLK` / `S_IFCHR` type bits.
+fn do_mknod(pathname: *const u8, mode: u32, dev: u32) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let path = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let context = FsContext::get_current();
+    let permitted_bits = !*context.umask.lock() & 0o777;
+    let type_bits = S_IFBLK | S_IFCHR;
+    let effective_mode = mode & (permitted_bits | type_bits);
+
+    Dentry::open(&context, path, true)?.mknod(effective_mode, dev)
+}
+
+/// Reads the link at the user-supplied `pathname` into the user buffer and
+/// returns the value reported by `Dentry::readlink`.
+///
+/// The path is resolved before the output buffer is set up.
+fn do_readlink(pathname: *const u8, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let user_path = UserString::new(pathname)?;
+    let path = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let link = Dentry::open(&FsContext::get_current(), path, false)?;
+
+    let mut out = UserBuffer::new(buffer, bufsize)?;
+    link.readlink(&mut out)
+}
+
+/// Repositions the offset of the file open at `fd` and writes the resulting
+/// position, as a `u64`, to the user address `result`.
+///
+/// The 64-bit offset is assembled from `offset_high` (upper 32 bits) and
+/// `offset_low` (lower 32 bits). `whence` must be one of `SEEK_SET`,
+/// `SEEK_CUR` or `SEEK_END`; anything else yields `EINVAL`.
+fn do_llseek(
+    fd: u32,
+    offset_high: u32,
+    offset_low: u32,
+    result: *mut u64,
+    whence: u32,
+) -> KResult<()> {
+    let mut out = UserBuffer::new(result as *mut u8, core::mem::size_of::<u64>())?;
+
+    let files = FileArray::get_current();
+    let file = files.get(fd).ok_or(EBADF)?;
+
+    let offset = (u64::from(offset_high) << 32) | u64::from(offset_low);
+
+    let request = match whence {
+        SEEK_SET => SeekOption::Set(offset as usize),
+        SEEK_CUR => SeekOption::Current(offset as isize),
+        SEEK_END => SeekOption::End(offset as isize),
+        _ => return Err(EINVAL),
+    };
+    let new_offset = file.seek(request)? as u64;
+
+    out.copy(&new_offset)?.ok_or(EFAULT)
+}
+
+/// One scatter/gather entry as laid out in the memory of a 32-bit caller:
+/// two consecutive `u32` fields. `do_readv` / `do_writev` copy an array of
+/// these from user memory.
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+struct IoVec32 {
+    // User address of the segment (cast to a pointer by the users of this type).
+    base: u32,
+    // Length of the segment in bytes.
+    len: u32,
+}
+
+/// Scatter read: reads from the file open at `fd` into each of the `iovcnt`
+/// user segments described by the `IoVec32` array at `iov_user`, in order,
+/// and returns the total number of bytes read.
+///
+/// Zero-length segments are skipped (matching `do_writev`); they do not end
+/// the transfer. Reading stops early at the first segment that is not filled
+/// completely.
+///
+/// Fails with `EBADF` for an unknown `fd`, with `EINVAL` when `iovcnt`
+/// exceeds `IOV_MAX`, and with the underlying error when the vector or one
+/// of its segments is rejected.
+fn do_readv(fd: u32, iov_user: *const u8, iovcnt: u32) -> KResult<usize> {
+    // Upper bound on the vector length, as on Linux. Without it a caller
+    // could make the kernel allocate `iovcnt * 8` bytes for any `u32`.
+    const IOV_MAX: u32 = 1024;
+
+    let files = FileArray::get_current();
+    let file = files.get(fd).ok_or(EBADF)?;
+
+    if iovcnt > IOV_MAX {
+        return Err(EINVAL);
+    }
+
+    let iov_user =
+        CheckedUserPointer::new(iov_user, iovcnt as usize * core::mem::size_of::<IoVec32>())?;
+    let mut iov_user_copied = vec![IoVec32::default(); iovcnt as usize];
+
+    // Snapshot the vector so later user-side changes cannot affect us.
+    iov_user.read(
+        iov_user_copied.as_mut_ptr() as *mut (),
+        iov_user_copied.len() * core::mem::size_of::<IoVec32>(),
+    )?;
+
+    // `filter`, not `take_while`: an empty segment in the middle of the
+    // vector must not hide the segments that follow it.
+    let iov_buffers = iov_user_copied
+        .into_iter()
+        .filter(|iov| iov.len != 0)
+        .map(|iov| UserBuffer::new(iov.base as *mut u8, iov.len as usize))
+        .collect::<KResult<Vec<_>>>()?;
+
+    let mut tot = 0usize;
+    for mut buffer in iov_buffers.into_iter() {
+        // TODO!!!: `readv`
+        let nread = file.read(&mut buffer)?;
+        tot += nread;
+
+        // Short read: the file has no more data for now.
+        if nread == 0 || nread != buffer.total() {
+            break;
+        }
+    }
+
+    Ok(tot)
+}
+
+/// Gather write: writes each of the `iovcnt` user segments described by the
+/// `IoVec32` array at `iov_user` to the file open at `fd`, in order, and
+/// returns the total number of bytes written.
+///
+/// Zero-length segments are skipped. Writing stops early at the first
+/// segment that is not written completely.
+///
+/// Fails with `EBADF` for an unknown `fd`, with `EINVAL` when `iovcnt`
+/// exceeds `IOV_MAX`, and with the underlying error when the vector or one
+/// of its segments is rejected.
+fn do_writev(fd: u32, iov_user: *const u8, iovcnt: u32) -> KResult<usize> {
+    // Upper bound on the vector length, as on Linux. Without it a caller
+    // could make the kernel allocate `iovcnt * 8` bytes for any `u32`.
+    const IOV_MAX: u32 = 1024;
+
+    let files = FileArray::get_current();
+    let file = files.get(fd).ok_or(EBADF)?;
+
+    if iovcnt > IOV_MAX {
+        return Err(EINVAL);
+    }
+
+    let iov_user =
+        CheckedUserPointer::new(iov_user, iovcnt as usize * core::mem::size_of::<IoVec32>())?;
+    let mut iov_user_copied = vec![IoVec32::default(); iovcnt as usize];
+
+    // Snapshot the vector so later user-side changes cannot affect us.
+    iov_user.read(
+        iov_user_copied.as_mut_ptr() as *mut (),
+        iov_user_copied.len() * core::mem::size_of::<IoVec32>(),
+    )?;
+
+    let iov_blocks = iov_user_copied
+        .into_iter()
+        .filter(|iov| iov.len != 0)
+        .map(|iov| CheckedUserPointer::new(iov.base as *mut u8, iov.len as usize))
+        .collect::<KResult<Vec<_>>>()?;
+
+    let mut tot = 0usize;
+    for block in iov_blocks.into_iter() {
+        // TODO!!!: atomic `writev`
+        // TODO!!!!!: copy from user
+        let slice = block.as_slice();
+        let nwritten = file.write(slice)?;
+        tot += nwritten;
+
+        // Short write: stop instead of writing later segments out of order.
+        if nwritten == 0 || nwritten != slice.len() {
+            break;
+        }
+    }
+
+    Ok(tot)
+}
+
+/// Existence check for the user-supplied `pathname`: succeeds when the
+/// resolved dentry is valid and fails with `ENOENT` otherwise.
+///
+/// The requested `_mode` is currently ignored.
+fn do_access(pathname: *const u8, _mode: u32) -> KResult<()> {
+    let user_path = UserString::new(pathname)?;
+    let path = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let entry = Dentry::open(&FsContext::get_current(), path, true)?;
+
+    // TODO: check permission
+    // match mode {
+    //     F_OK => todo!(),
+    //     R_OK => todo!(),
+    //     W_OK => todo!(),
+    //     X_OK => todo!(),
+    //     _ => Err(EINVAL),
+    // }
+    if entry.is_valid() {
+        Ok(())
+    } else {
+        Err(ENOENT)
+    }
+}
+
+/// Transfers up to `count` bytes from the file open at `in_fd` to the file
+/// open at `out_fd` through the input file's `sendfile`.
+///
+/// Both descriptors are looked up first (`EBADF` on failure). A non-null
+/// `offset` is not supported and hits `unimplemented!`.
+fn do_sendfile64(out_fd: u32, in_fd: u32, offset: *mut u8, count: usize) -> KResult<usize> {
+    let files = FileArray::get_current();
+    let source = files.get(in_fd).ok_or(EBADF)?;
+    let sink = files.get(out_fd).ok_or(EBADF)?;
+
+    if offset.is_null() {
+        source.sendfile(&sink, count)
+    } else {
+        unimplemented!("sendfile64 with offset");
+    }
+}
+
+/// Forwards `request` and `arg3` to the `ioctl` of the file open at `fd`;
+/// fails with `EBADF` if `fd` is not in the current file array.
+fn do_ioctl(fd: u32, request: usize, arg3: usize) -> KResult<usize> {
+    let open_files = FileArray::get_current();
+
+    open_files.get(fd).ok_or(EBADF)?.ioctl(request, arg3)
+}
+
+/// Forwards the request to `FileArray::fcntl` on the current file array.
+fn do_fcntl64(fd: u32, cmd: u32, arg: usize) -> KResult<usize> {
+    let open_files = FileArray::get_current();
+    open_files.fcntl(fd, cmd, arg)
+}
+
+// C-ABI entry points, one per syscall implemented in this file. Each
+// `sys_*` function decodes its arguments from the saved registers in the
+// order listed here and forwards them to the matching `do_*` function.
+define_syscall32!(sys_read, do_read, fd: u32, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_write, do_write, fd: u32, buffer: *const u8, count: usize);
+define_syscall32!(sys_open, do_open, path: *const u8, flags: u32, mode: u32);
+define_syscall32!(sys_close, do_close, fd: u32);
+define_syscall32!(sys_dup, do_dup, fd: u32);
+define_syscall32!(sys_dup2, do_dup2, old_fd: u32, new_fd: u32);
+define_syscall32!(sys_pipe, do_pipe, pipe_fd: *mut [u32; 2]);
+define_syscall32!(sys_getdents, do_getdents, fd: u32, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_getdents64, do_getdents64, fd: u32, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_statx, do_statx, fd: u32, path: *const u8, flags: u32, mask: u32, buffer: *mut u8);
+define_syscall32!(sys_mkdir, do_mkdir, pathname: *const u8, mode: u32);
+define_syscall32!(sys_truncate, do_truncate, pathname: *const u8, length: usize);
+define_syscall32!(sys_unlink, do_unlink, pathname: *const u8);
+define_syscall32!(sys_symlink, do_symlink, target: *const u8, linkpath: *const u8);
+define_syscall32!(sys_readlink, do_readlink, pathname: *const u8, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_llseek, do_llseek, fd: u32, offset_high: u32, offset_low: u32, result: *mut u64, whence: u32);
+define_syscall32!(sys_mknod, do_mknod, pathname: *const u8, mode: u32, dev: u32);
+define_syscall32!(sys_readv, do_readv, fd: u32, iov_user: *const u8, iovcnt: u32);
+define_syscall32!(sys_writev, do_writev, fd: u32, iov_user: *const u8, iovcnt: u32);
+define_syscall32!(sys_access, do_access, pathname: *const u8, mode: u32);
+define_syscall32!(sys_sendfile64, do_sendfile64, out_fd: u32, in_fd: u32, offset: *mut u8, count: usize);
+define_syscall32!(sys_ioctl, do_ioctl, fd: u32, request: usize, arg3: usize);
+define_syscall32!(sys_fcntl64, do_fcntl64, fd: u32, cmd: u32, arg: usize);
+
+/// Registers every `sys_*` handler of this file with the syscall table.
+///
+/// Each call passes the syscall number, the handler and a NUL-terminated
+/// name (the byte-string literals end in `\0` because the callee takes a C
+/// string pointer). The numbers are presumably the i386 Linux syscall
+/// numbers; verify against the ABI table before adding entries.
+pub(super) unsafe fn register() {
+    register_syscall_handler(0x03, sys_read, b"read\0".as_ptr() as *const _);
+    register_syscall_handler(0x04, sys_write, b"write\0".as_ptr() as *const _);
+    register_syscall_handler(0x05, sys_open, b"open\0".as_ptr() as *const _);
+    register_syscall_handler(0x06, sys_close, b"close\0".as_ptr() as *const _);
+    register_syscall_handler(0x0a, sys_unlink, b"unlink\0".as_ptr() as *const _);
+    register_syscall_handler(0x0e, sys_mknod, b"mknod\0".as_ptr() as *const _);
+    register_syscall_handler(0x21, sys_access, b"access\0".as_ptr() as *const _);
+    register_syscall_handler(0x27, sys_mkdir, b"mkdir\0".as_ptr() as *const _);
+    register_syscall_handler(0x29, sys_dup, b"dup\0".as_ptr() as *const _);
+    register_syscall_handler(0x2a, sys_pipe, b"pipe\0".as_ptr() as *const _);
+    register_syscall_handler(0x36, sys_ioctl, b"ioctl\0".as_ptr() as *const _);
+    register_syscall_handler(0x3f, sys_dup2, b"dup2\0".as_ptr() as *const _);
+    register_syscall_handler(0x53, sys_symlink, b"symlink\0".as_ptr() as *const _);
+    register_syscall_handler(0x55, sys_readlink, b"readlink\0".as_ptr() as *const _);
+    register_syscall_handler(0x5c, sys_truncate, b"truncate\0".as_ptr() as *const _);
+    register_syscall_handler(0x8c, sys_llseek, b"llseek\0".as_ptr() as *const _);
+    register_syscall_handler(0x8d, sys_getdents, b"getdents\0".as_ptr() as *const _);
+    register_syscall_handler(0x91, sys_readv, b"readv\0".as_ptr() as *const _);
+    register_syscall_handler(0x92, sys_writev, b"writev\0".as_ptr() as *const _);
+    register_syscall_handler(0xdc, sys_getdents64, b"getdents64\0".as_ptr() as *const _);
+    register_syscall_handler(0xdd, sys_fcntl64, b"fcntl64\0".as_ptr() as *const _);
+    register_syscall_handler(0xef, sys_sendfile64, b"sendfile64\0".as_ptr() as *const _);
+    register_syscall_handler(0x17f, sys_statx, b"statx\0".as_ptr() as *const _);
+}

+ 2 - 409
src/kernel/syscall/fileops.cc

@@ -1,4 +1,3 @@
-#include <bits/ioctl.h>
 #include <errno.h>
 #include <poll.h>
 #include <sys/mman.h>
@@ -22,247 +21,8 @@ static inline void not_implemented(const char* pos, int line) {
     current_thread->send_signal(SIGSYS);
 }
 
-ssize_t kernel::syscall::do_write(int fd, const char __user* buf, size_t n) {
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    return file->write(buf, n);
-}
-
-ssize_t kernel::syscall::do_read(int fd, char __user* buf, size_t n) {
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    return file->read(buf, n);
-}
-
-int kernel::syscall::do_close(int fd) {
-    current_process->files.close(fd);
-    return 0;
-}
-
-int kernel::syscall::do_dup(int old_fd) {
-    return current_process->files.dup(old_fd);
-}
-
-int kernel::syscall::do_dup2(int old_fd, int new_fd) {
-    return current_process->files.dup(old_fd, new_fd, 0);
-}
-
-int kernel::syscall::do_pipe(int __user* pipefd) {
-    // TODO: use copy_from_user and copy_to_user
-    return current_process->files.pipe(*(int(*)[2])pipefd);
-}
-
-ssize_t kernel::syscall::do_getdents(int fd, char __user* buf, size_t cnt) {
-    auto* dir = current_process->files[fd];
-    if (!dir)
-        return -EBADF;
-
-    return dir->getdents(buf, cnt);
-}
-
-ssize_t kernel::syscall::do_getdents64(int fd, char __user* buf, size_t cnt) {
-    auto* dir = current_process->files[fd];
-    if (!dir)
-        return -EBADF;
-
-    return dir->getdents64(buf, cnt);
-}
-
-int kernel::syscall::do_open(const char __user* path, int flags, mode_t mode) {
-    mode &= ~current_process->umask;
-
-    // TODO: use copy_from_user
-    return current_process->files.open(current_process->cwd, path, flags,
-                                       mode);
-}
-
-int kernel::syscall::do_symlink(const char __user* target,
-                                const char __user* linkpath) {
-    // TODO: use copy_from_user
-    auto [dent, status] = current_open(linkpath, false);
-    if (!dent)
-        return status;
-
-    if (status == 0)
-        return -EEXIST;
-
-    assert(status == -ENOENT);
-    return fs::fs_symlink(dent.get(), target);
-}
-
-int kernel::syscall::do_readlink(const char __user* pathname, char __user* buf,
-                                 size_t buf_size) {
-    // TODO: use copy_from_user
-    auto [dent, status] = current_open(pathname, false);
-
-    if (!dent || status)
-        return status;
-
-    if (buf_size & (1ull << 63))
-        return -EINVAL;
-
-    // TODO: use copy_to_user
-    return fs_readlink(fs::r_dentry_get_inode(dent.get()), buf, buf_size);
-}
-
-int kernel::syscall::do_ioctl(int fd, unsigned long request, uintptr_t arg3) {
-    // TODO: check fd type and get tty* from fd
-    //
-    //       we use a trick for now, check whether
-    //       the file that fd points to is a pipe or
-    //       not. and we suppose that stdin will be
-    //       either a tty or a pipe.
-    auto* file = current_process->files[fd];
-    // TODO!!!: check whether the file is a tty or not
-    if (!file) // || !S_ISCHR(file->mode))
-        return -ENOTTY;
-
-    switch (request) {
-        case TIOCGPGRP: {
-            auto* pgid = (pid_t __user*)arg3;
-            auto* ctrl_tty = current_process->control_tty;
-
-            if (!ctrl_tty)
-                return -ENOTTY;
-
-            // TODO: copy_to_user
-            *pgid = ctrl_tty->get_pgrp();
-            break;
-        }
-        case TIOCSPGRP: {
-            // TODO: copy_from_user
-            auto pgid = *(const pid_t __user*)arg3;
-            auto* ctrl_tty = current_process->control_tty;
-
-            if (!ctrl_tty)
-                return -ENOTTY;
-
-            ctrl_tty->set_pgrp(pgid);
-            break;
-        }
-        case TIOCGWINSZ: {
-            auto* ws = (winsize __user*)arg3;
-            // TODO: copy_to_user
-            ws->ws_col = 80;
-            ws->ws_row = 10;
-            break;
-        }
-        case TCGETS: {
-            auto* argp = (struct termios __user*)arg3;
-
-            auto* ctrl_tty = current_process->control_tty;
-            if (!ctrl_tty)
-                return -EINVAL;
-
-            // TODO: use copy_to_user
-            memcpy(argp, &ctrl_tty->termio, sizeof(ctrl_tty->termio));
-
-            break;
-        }
-        case TCSETS: {
-            auto* argp = (const struct termios __user*)arg3;
-
-            auto* ctrl_tty = current_process->control_tty;
-            if (!ctrl_tty)
-                return -EINVAL;
-
-            // TODO: use copy_from_user
-            memcpy(&ctrl_tty->termio, argp, sizeof(ctrl_tty->termio));
-
-            break;
-        }
-        default:
-            kmsgf("[error] the ioctl() function %x is not implemented",
-                  request);
-            return -EINVAL;
-    }
-
-    return 0;
-}
-
-ssize_t kernel::syscall::do_readv(int fd, const iovec* iov, int iovcnt) {
-    auto* file = current_process->files[fd];
-
-    if (!file)
-        return -EBADF;
-
-    // TODO: fix fake EOF
-    ssize_t totn = 0;
-    for (int i = 0; i < iovcnt; ++i) {
-        auto* base = (char*)iov[i].iov_base;
-        auto len = iov[i].iov_len;
-
-        if (len == 0)
-            break;
-
-        if (len < 0)
-            return -EINVAL;
-
-        if (!base)
-            return -EFAULT;
-
-        ssize_t ret = file->read(base, len);
-
-        if (ret < 0)
-            return ret;
-
-        if (ret == 0)
-            break;
-
-        totn += ret;
-
-        if ((size_t)ret != iov[i].iov_len)
-            break;
-    }
-
-    return totn;
-}
-
-// TODO: this operation SHOULD be atomic
-ssize_t kernel::syscall::do_writev(int fd, const iovec* iov, int iovcnt) {
-    auto* file = current_process->files[fd];
-
-    if (!file)
-        return -EBADF;
-
-    ssize_t totn = 0;
-    for (int i = 0; i < iovcnt; ++i) {
-        auto* base = (const char*)iov[i].iov_base;
-        auto len = iov[i].iov_len;
-
-        if (len == 0)
-            continue;
-
-        if (len < 0)
-            return -EINVAL;
-
-        if (!base)
-            return -EFAULT;
-
-        ssize_t ret = file->write(base, len);
-
-        if (ret < 0)
-            return ret;
-        totn += ret;
-    }
-
-    return totn;
-}
-
-off_t kernel::syscall::do_lseek(int fd, off_t offset, int whence) {
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    return file->seek(offset, whence);
-}
-
-uintptr_t kernel::syscall::do_mmap_pgoff(uintptr_t addr, size_t len, int prot,
-                                         int flags, int fd, off_t pgoffset) {
+uintptr_t kernel::syscall::do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags, int fd,
+                                         off_t pgoffset) {
     if (addr & 0xfff)
         return -EINVAL;
     if (len == 0)
@@ -328,149 +88,6 @@ int kernel::syscall::do_munmap(uintptr_t addr, size_t len) {
     return current_process->mms.unmap(addr, len, true);
 }
 
-ssize_t kernel::syscall::do_sendfile(int out_fd, int in_fd,
-                                     off_t __user* offset, size_t count) {
-    auto* out_file = current_process->files[out_fd];
-    auto* in_file = current_process->files[in_fd];
-
-    if (!out_file || !in_file)
-        return -EBADF;
-
-    // TODO: check whether in_fd supports mmapping
-    // TODO!!!: figure a way to recover this
-    // if (!S_ISREG(in_file->mode) && !S_ISBLK(in_file->mode))
-    return -EINVAL;
-
-    if (offset) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    constexpr size_t bufsize = 4096;
-    std::vector<char> buf(bufsize);
-    size_t totn = 0;
-    while (totn < count) {
-        if (current_thread->signals.pending_signal() != 0)
-            return (totn == 0) ? -EINTR : totn;
-
-        size_t n = std::min(count - totn, bufsize);
-        ssize_t ret = in_file->read(buf.data(), n);
-        if (ret < 0)
-            return ret;
-        if (ret == 0)
-            break;
-        ret = out_file->write(buf.data(), ret);
-        if (ret < 0)
-            return ret;
-        totn += ret;
-    }
-
-    return totn;
-}
-
-int kernel::syscall::do_statx(int dirfd, const char __user* path, int flags,
-                              unsigned int mask, statx __user* statxbuf) {
-    // AT_STATX_SYNC_AS_STAT is the default value
-    if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_SYNC_AS_STAT) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    if (dirfd != AT_FDCWD) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    auto [dent, status] = current_open(path, !(flags & AT_SYMLINK_NOFOLLOW));
-    if (!dent || status)
-        return status;
-
-    // TODO: copy to user
-    return fs_statx(fs::r_dentry_get_inode(dent.get()), statxbuf, mask);
-}
-
-int kernel::syscall::do_fcntl(int fd, int cmd, unsigned long arg) {
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    switch (cmd) {
-        case F_SETFD:
-            return current_process->files.set_flags(fd, arg);
-        case F_DUPFD:
-        case F_DUPFD_CLOEXEC: {
-            return current_process->files.dupfd(fd, arg, FD_CLOEXEC);
-        }
-        default:
-            NOT_IMPLEMENTED;
-            return -EINVAL;
-    }
-}
-
-int kernel::syscall::do_mkdir(const char __user* pathname, mode_t mode) {
-    mode &= (~current_process->umask & 0777);
-
-    // TODO: use copy_from_user
-    auto [dent, status] = current_open(pathname);
-    if (!dent)
-        return status;
-
-    if (status == 0)
-        return -EEXIST;
-
-    assert(status == -ENOENT);
-    return fs::fs_mkdir(dent.get(), mode);
-}
-
-int kernel::syscall::do_truncate(const char __user* pathname, long length) {
-    auto [dent, status] = current_open(pathname);
-    if (!dent || status)
-        return status;
-
-    return fs_truncate(fs::r_dentry_get_inode(dent.get()), length);
-}
-
-int kernel::syscall::do_unlink(const char __user* pathname) {
-    auto [dent, status] = current_open(pathname, false);
-
-    if (!dent || status)
-        return status;
-
-    return fs::fs_unlink(dent.get());
-}
-
-int kernel::syscall::do_access(const char __user* pathname, int mode) {
-    auto [dent, status] = current_open(pathname);
-    if (!dent || status)
-        return status;
-
-    switch (mode) {
-        case F_OK:
-            return 0;
-        case R_OK:
-        case W_OK:
-        case X_OK:
-            // TODO: check privilege
-            return 0;
-        default:
-            return -EINVAL;
-    }
-}
-
-int kernel::syscall::do_mknod(const char __user* pathname, mode_t mode,
-                              dev_t dev) {
-    mode &= S_IFMT | (~current_process->umask & 0777);
-    auto [dent, status] = current_open(pathname);
-    if (!dent)
-        return status;
-
-    if (status == 0)
-        return -EEXIST;
-
-    assert(status == -ENOENT);
-    return fs::fs_mknod(dent.get(), mode, dev);
-}
-
 int kernel::syscall::do_poll(pollfd __user* fds, nfds_t nfds, int timeout) {
     if (nfds == 0)
         return 0;
@@ -511,27 +128,3 @@ int kernel::syscall::do_poll(pollfd __user* fds, nfds_t nfds, int timeout) {
 int kernel::syscall::do_socket(int domain, int type, int protocol) {
     return -EINVAL;
 }
-
-/* TODO: implement vfs_stat(stat*)
-int do_stat(const char __user* pathname, stat __user* buf)
-{
-    auto* dent = fs::vfs_open(*current_process->root,
-        types::make_path(pathname, current_process->pwd));
-
-    if (!dent)
-        return -ENOENT;
-
-    return fs::vfs_stat(dent, buf);
-}
-*/
-
-/* TODO: implement vfs_stat(stat*)
-int do_fstat(int fd, stat __user* buf)
-{
-    auto* file = current_process->files[fd];
-    if (!file)
-        return -EBADF;
-
-    return fs::vfs_stat(file, buf);
-}
-*/

+ 0 - 22
src/kernel/syscall/mount.cc

@@ -1,22 +0,0 @@
-#include <errno.h>
-
-#include <types/path.hpp>
-
-#include <kernel/process.hpp>
-#include <kernel/syscall.hpp>
-#include <kernel/vfs.hpp>
-
-int kernel::syscall::do_mount(const char __user* source,
-                              const char __user* target,
-                              const char __user* fstype, unsigned long flags,
-                              const void __user* _fsdata) {
-    if (!fstype)
-        return -EINVAL;
-
-    // TODO: use copy_from_user
-    auto [mountpoint, status] = current_open(target);
-    if (!mountpoint || status)
-        return status;
-
-    return fs::fs_mount(mountpoint.get(), source, target, fstype, flags, _fsdata);
-}

+ 3 - 68
src/kernel/syscall/procops.cc

@@ -28,55 +28,6 @@ static inline void not_implemented(const char* pos, int line) {
     current_thread->send_signal(SIGSYS);
 }
 
-int kernel::syscall::do_chdir(const char __user* path) {
-    // TODO: use copy_from_user
-    auto [dir, ret] = current_open(path);
-    if (!dir || ret)
-        return ret;
-
-    if (!fs::r_dentry_is_directory(dir.get()))
-        return -ENOTDIR;
-
-    current_process->cwd = std::move(dir);
-    return 0;
-}
-
-execve_retval kernel::syscall::do_execve(const std::string& exec,
-                                         const std::vector<std::string>& args,
-                                         const std::vector<std::string>& envs) {
-    auto [dent, ret] = current_open(exec);
-
-    if (ret)
-        return {0, 0, ret};
-
-    types::elf::elf32_load_data d{
-        .exec_dent{std::move(dent)},
-        .argv{args},
-        .envp{envs},
-        .ip{},
-        .sp{},
-    };
-
-    current_process->files.onexec();
-
-    async::preempt_disable();
-
-    // TODO: set cs and ss to compatibility mode
-    if (int ret = types::elf::elf32_load(d); ret != 0) {
-        async::preempt_enable();
-
-        if (ret == types::elf::ELF_LOAD_FAIL_NORETURN)
-            kill_current(SIGSEGV);
-
-        return {0, 0, ret};
-    }
-
-    current_thread->signals.on_exec();
-    async::preempt_enable();
-
-    return {d.ip, d.sp, 0};
-}
-
 int kernel::syscall::do_exit(int status) {
     // TODO: terminating a thread only
     assert(current_process->thds.size() == 1);
@@ -135,12 +86,6 @@ int kernel::syscall::do_waitpid(pid_t waitpid, int __user* arg1, int options) {
     return -EINVAL;
 }
 
-int kernel::syscall::do_getcwd(char __user* buf, size_t buf_size) {
-    // TODO: use copy_to_user
-    return fs::d_path(current_process->cwd.get(),
-                      current_process->fs_context.root.get(), buf, buf_size);
-}
-
 pid_t kernel::syscall::do_setsid() {
     if (current_process->pid == current_process->pgid)
         return -EPERM;
@@ -246,13 +191,6 @@ int kernel::syscall::do_arch_prctl(int option, uintptr_t arg2) {
     return 0;
 }
 
-int kernel::syscall::do_umask(mode_t mask) {
-    mode_t old = current_process->umask;
-    current_process->umask = mask;
-
-    return old;
-}
-
 int kernel::syscall::do_kill(pid_t pid, int sig) {
     auto [pproc, found] = procs->try_find(pid);
     if (!found)
@@ -291,8 +229,7 @@ int kernel::syscall::do_tkill(pid_t tid, int sig) {
 }
 
 int kernel::syscall::do_rt_sigprocmask(int how, const sigmask_type __user* set,
-                                       sigmask_type __user* oldset,
-                                       size_t sigsetsize) {
+                                       sigmask_type __user* oldset, size_t sigsetsize) {
     if (sigsetsize != sizeof(sigmask_type))
         return -EINVAL;
 
@@ -322,13 +259,11 @@ int kernel::syscall::do_rt_sigprocmask(int how, const sigmask_type __user* set,
 }
 
 int kernel::syscall::do_rt_sigaction(int signum, const sigaction __user* act,
-                                     sigaction __user* oldact,
-                                     size_t sigsetsize) {
+                                     sigaction __user* oldact, size_t sigsetsize) {
     if (sigsetsize != sizeof(sigmask_type))
         return -EINVAL;
 
-    if (!kernel::signal_list::check_valid(signum) || signum == SIGKILL ||
-        signum == SIGSTOP)
+    if (!kernel::signal_list::check_valid(signum) || signum == SIGKILL || signum == SIGSTOP)
         return -EINVAL;
 
     // TODO: use copy_to_user

+ 171 - 0
src/kernel/syscall/procops.rs

@@ -0,0 +1,171 @@
+use core::ffi::CStr;
+
+use alloc::borrow::ToOwned;
+use alloc::ffi::CString;
+use alloc::sync::Arc;
+use bindings::types::elf::{elf32_load, elf32_load_data, ELF_LOAD_FAIL_NORETURN};
+use bindings::{
+    current_process, current_thread, interrupt_stack, kill_current, mmx_registers, EFAULT, EINVAL,
+    ENOENT, ENOTDIR, SIGSEGV,
+};
+
+use crate::io::Buffer;
+use crate::kernel::user::dataflow::UserString;
+use crate::kernel::vfs::dentry::Dentry;
+use crate::kernel::vfs::filearray::FileArray;
+use crate::path::Path;
+use crate::{kernel::user::dataflow::UserBuffer, prelude::*};
+
+use crate::kernel::vfs::{self, FsContext};
+
+use super::{define_syscall32, register_syscall_handler};
+
+/// Install a new file-mode creation mask for the current filesystem context.
+///
+/// Only the low nine permission bits of `mask` are stored. Returns the mask
+/// that was in effect before the call.
+fn do_umask(mask: u32) -> KResult<u32> {
+    let fs_context = FsContext::get_current();
+    let mut stored_mask = fs_context.umask.lock();
+
+    let previous = core::mem::replace(&mut *stored_mask, mask & 0o777);
+    Ok(previous)
+}
+
+/// Write the path of the current working directory into the user buffer
+/// `buffer` of `bufsize` bytes.
+///
+/// Returns the number of bytes the buffer reports as written.
+fn do_getcwd(buffer: *mut u8, bufsize: usize) -> KResult<usize> {
+    let fs_context = FsContext::get_current();
+    let mut user_buffer = UserBuffer::new(buffer, bufsize)?;
+
+    // The cwd lock is held only for the duration of this statement.
+    fs_context
+        .cwd
+        .lock()
+        .get_path(&fs_context, &mut user_buffer)?;
+
+    let written = user_buffer.wrote();
+    Ok(written)
+}
+
+/// Change the current working directory to the one named by the
+/// NUL-terminated user string `path`.
+///
+/// Fails with `ENOENT` when the opened dentry is not valid and with `ENOTDIR`
+/// when it is not a directory; errors from reading the user string, parsing
+/// the path or opening the dentry are propagated unchanged.
+fn do_chdir(path: *const u8) -> KResult<()> {
+    let fs_context = FsContext::get_current();
+    let user_path = UserString::new(path)?;
+    let parsed_path = Path::new(user_path.as_cstr().to_bytes())?;
+
+    let target = Dentry::open(&fs_context, parsed_path, true)?;
+    if !target.is_valid() {
+        Err(ENOENT)
+    } else if !target.is_directory() {
+        Err(ENOTDIR)
+    } else {
+        *fs_context.cwd.lock() = target;
+        Ok(())
+    }
+}
+
+/// Mount the filesystem `fstype` from `source` onto `target`.
+///
+/// All three pointers are NUL-terminated user strings. Fails with `ENOENT`
+/// when the opened mountpoint dentry is not valid and with `EINVAL` when any
+/// of the strings is not valid UTF-8.
+fn do_mount(source: *const u8, target: *const u8, fstype: *const u8, flags: usize) -> KResult<()> {
+    let user_source = UserString::new(source)?;
+    let user_target = UserString::new(target)?;
+    let user_fstype = UserString::new(fstype)?;
+
+    let fs_context = FsContext::get_current();
+    let target_path = Path::new(user_target.as_cstr().to_bytes())?;
+    let mountpoint = Dentry::open(&fs_context, target_path, true)?;
+    if !mountpoint.is_valid() {
+        return Err(ENOENT);
+    }
+
+    // The UTF-8 conversions happen only after the mountpoint is resolved, in
+    // source / target / fstype order.
+    let source_name = user_source.as_cstr().to_str().map_err(|_| EINVAL)?;
+    let target_name = user_target.as_cstr().to_str().map_err(|_| EINVAL)?;
+    let fstype_name = user_fstype.as_cstr().to_str().map_err(|_| EINVAL)?;
+
+    vfs::mount::do_mount(
+        &mountpoint,
+        source_name,
+        target_name,
+        fstype_name,
+        flags as u64,
+    )
+}
+
+/// Load the ELF executable at path `exec` into the current process.
+///
+/// `argv` and `envp` are handed to `elf32_load` as arrays of C string
+/// pointers. The current process's file array is notified via `on_exec`
+/// before the image is loaded. When the loader reports
+/// `ELF_LOAD_FAIL_NORETURN`, `kill_current(SIGSEGV)` is invoked.
+///
+/// # Return
+/// `(ip, sp)` on success; otherwise `ENOENT` for an invalid dentry, or the
+/// negated loader status as the error code.
+fn do_execve(exec: &[u8], argv: &[CString], envp: &[CString]) -> KResult<(usize, usize)> {
+    let fs_context = FsContext::get_current();
+    let executable = Dentry::open(&fs_context, Path::new(exec)?, true)?;
+    if !executable.is_valid() {
+        return Err(ENOENT);
+    }
+
+    // These pointer tables borrow from `argv` / `envp` and must stay alive
+    // until `elf32_load` has returned.
+    let argv_ptrs = argv.iter().map(|arg| arg.as_ptr()).collect::<Vec<_>>();
+    let envp_ptrs = envp.iter().map(|env| env.as_ptr()).collect::<Vec<_>>();
+
+    // NOTE(review): `Arc::into_raw` hands the dentry reference to the loader;
+    // who releases it is not visible here -- confirm.
+    let mut load_data = elf32_load_data {
+        exec_dent: Arc::into_raw(executable) as *mut _,
+        argv: argv_ptrs.as_ptr(),
+        argv_count: argv_ptrs.len(),
+        envp: envp_ptrs.as_ptr(),
+        envp_count: envp_ptrs.len(),
+        ip: 0,
+        sp: 0,
+    };
+
+    BorrowedArc::<FileArray>::from_raw(
+        unsafe { current_process.as_mut() }.unwrap().files.m_handle as *const _,
+    )
+    .on_exec();
+
+    let status = unsafe { elf32_load(&mut load_data) };
+    if status == 0 {
+        unsafe { current_thread.as_mut().unwrap().signals.on_exec() };
+        return Ok((load_data.ip, load_data.sp));
+    }
+
+    if status == ELF_LOAD_FAIL_NORETURN {
+        unsafe { kill_current(SIGSEGV as i32) }
+    }
+    Err(-status as u32)
+}
+
+/// Raw `execve` syscall entry point.
+///
+/// Takes the executable path, `argv` and `envp` from `rbx`, `rcx` and `rdx`
+/// of the saved registers, runs `do_execve`, and on success stores the new
+/// instruction and stack pointers into the interrupt stack. Returns 0 on
+/// success, otherwise the negated error code converted to `u32`.
+unsafe extern "C" fn sys_execve(
+    int_stack: *mut interrupt_stack,
+    _mmxregs: *mut mmx_registers,
+) -> u32 {
+    let outcome = (|| -> KResult<()> {
+        let exec_ptr = int_stack.as_mut().unwrap().regs.rbx as *const u8;
+        let exec_path = UserString::new(exec_ptr)?;
+
+        // TODO!!!!!: copy from user
+        let mut argv_cursor = int_stack.as_mut().unwrap().regs.rcx as *const u32;
+        let mut envp_cursor = int_stack.as_mut().unwrap().regs.rdx as *const u32;
+
+        if argv_cursor.is_null() || envp_cursor.is_null() {
+            return Err(EFAULT);
+        }
+
+        // Both arrays hold 32-bit string pointers and end with a zero entry.
+        let mut argv_vec = Vec::new();
+        while argv_cursor.read() != 0 {
+            let arg = CStr::from_ptr(argv_cursor.read() as *const i8);
+            argv_vec.push(arg.to_owned());
+            argv_cursor = argv_cursor.add(1);
+        }
+
+        let mut envp_vec = Vec::new();
+        while envp_cursor.read() != 0 {
+            let env = CStr::from_ptr(envp_cursor.read() as *const i8);
+            envp_vec.push(env.to_owned());
+            envp_cursor = envp_cursor.add(1);
+        }
+
+        let (ip, sp) = do_execve(exec_path.as_cstr().to_bytes(), &argv_vec, &envp_vec)?;
+
+        // Redirect the return-to-user frame to the freshly loaded image.
+        int_stack.as_mut().unwrap().v_rip = ip;
+        int_stack.as_mut().unwrap().rsp = sp;
+        Ok(())
+    })();
+
+    match outcome {
+        Ok(_) => 0,
+        Err(err) => -(err as i32) as u32,
+    }
+}
+
+define_syscall32!(sys_chdir, do_chdir, path: *const u8);
+define_syscall32!(sys_umask, do_umask, mask: u32);
+define_syscall32!(sys_mount, do_mount, source: *const u8, target: *const u8, fstype: *const u8, flags: usize);
+define_syscall32!(sys_getcwd, do_getcwd, buffer: *mut u8, bufsize: usize);
+
+/// Register the process-related syscall handlers defined in this module
+/// under their syscall numbers, each with a NUL-terminated name.
+pub(super) unsafe fn register() {
+    register_syscall_handler(0x0b, sys_execve, b"execve\0".as_ptr().cast());
+    register_syscall_handler(0x0c, sys_chdir, b"chdir\0".as_ptr().cast());
+    register_syscall_handler(0x15, sys_mount, b"mount\0".as_ptr().cast());
+    register_syscall_handler(0x3c, sys_umask, b"umask\0".as_ptr().cast());
+    register_syscall_handler(0xb7, sys_getcwd, b"getcwd\0".as_ptr().cast());
+}

+ 31 - 12
src/kernel/task/thread.cc

@@ -12,7 +12,7 @@
 #include <kernel/task/readyqueue.hpp>
 #include <kernel/task/thread.hpp>
 
-constexpr std::size_t KERNEL_STACK_ORDER = 3; // 2^3 * 4096 = 32KB
+constexpr std::size_t KERNEL_STACK_ORDER = 7; // 2^7 * 4096 = 512KB
 
 using namespace kernel::task;
 using namespace kernel::mem;
@@ -28,14 +28,10 @@ struct PACKED tss64_t {
 };
 constexpr physaddr<tss64_t> tss{0x00000070};
 
-thread::thread(std::string name, pid_t owner)
-    : owner{owner}, attr{READY | SYSTEM}, name{name} {}
+thread::thread(std::string name, pid_t owner) : owner{owner}, attr{READY | SYSTEM}, name{name} {}
 
 thread::thread(const thread& val, pid_t owner)
-    : owner{owner}
-    , attr{val.attr}
-    , name{val.name}
-    , tls_desc32{val.tls_desc32} {}
+    : owner{owner}, attr{val.attr}, name{val.name}, tls_desc32{val.tls_desc32} {}
 
 tid_t thread::tid() const {
     return (tid_t)kstack.pfn;
@@ -50,8 +46,7 @@ bool thread::operator==(const thread& rhs) const {
 }
 
 static inline uintptr_t __stack_bottom(pfn_t pfn) {
-    return (uintptr_t)(void*)kernel::mem::physaddr<void>{
-        pfn + (1 << KERNEL_STACK_ORDER) * 0x1000};
+    return (uintptr_t)(void*)kernel::mem::physaddr<void>{pfn + (1 << KERNEL_STACK_ORDER) * 0x1000};
 }
 
 thread::kernel_stack::kernel_stack() {
@@ -91,21 +86,39 @@ void thread::kernel_stack::load_interrupt_stack() const {
     tss->rsp[0] = sp;
 }
 
-void thread::set_attr(thd_attr_t new_attr) {
+// TODO!!!: change of attribute should acquire dispatcher lock
+//          to prevent inconsistency of tasks in ready queue
+void thread::set_attr(thd_attr_t new_attr, bool forced) {
+    // TODO!!!: rewrite this with state machine based method to prevent
+    // inconsistency and random transition among states
+    if (attr & USLEEP && (new_attr != READY) && (new_attr != USLEEP)) {
+        kmsgf(
+            "[kernel:warn] trying to change thread state of %d from USLEEP to "
+            "%x, might be "
+            "doing something dumb.",
+            this->owner, new_attr);
+
+        return;
+    }
+
     switch (new_attr) {
         case SYSTEM:
             attr |= SYSTEM;
             break;
         case READY:
             if (attr & ZOMBIE) {
-                kmsgf("[kernel:warn] zombie process pid%d tries to wake up",
-                      owner);
+                kmsgf("[kernel:warn] zombie process pid%d tries to wake up", owner);
                 break;
             }
 
             if (attr & READY)
                 break;
 
+            if (!forced && attr & USLEEP) {
+                kmsgf("[kernel:warn] trying to wake up %d from USLEEP", this->owner);
+                break;
+            }
+
             attr &= SYSTEM;
             attr |= READY;
 
@@ -115,6 +128,12 @@ void thread::set_attr(thd_attr_t new_attr) {
             attr &= SYSTEM;
             attr |= ISLEEP;
 
+            dispatcher::dequeue(this);
+            break;
+        case USLEEP:
+            attr &= SYSTEM;
+            attr |= USLEEP;
+
             dispatcher::dequeue(this);
             break;
         case STOPPED:

+ 53 - 14
src/kernel/tty.cpp

@@ -1,5 +1,6 @@
 #include <algorithm>
 
+#include <bits/ioctl.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <termios.h>
@@ -12,17 +13,12 @@
 
 #define CTRL(key) ((key)-0x40)
 
-#define TERMIOS_ISET(termios, option) \
-    ((option) == ((termios).c_iflag & (option)))
-#define TERMIOS_OSET(termios, option) \
-    ((option) == ((termios).c_oflag & (option)))
-#define TERMIOS_CSET(termios, option) \
-    ((option) == ((termios).c_cflag & (option)))
-#define TERMIOS_LSET(termios, option) \
-    ((option) == ((termios).c_lflag & (option)))
+#define TERMIOS_ISET(termios, option) ((option) == ((termios).c_iflag & (option)))
+#define TERMIOS_OSET(termios, option) ((option) == ((termios).c_oflag & (option)))
+#define TERMIOS_CSET(termios, option) ((option) == ((termios).c_cflag & (option)))
+#define TERMIOS_LSET(termios, option) ((option) == ((termios).c_lflag & (option)))
 
-#define TERMIOS_TESTCC(c, termios, cc) \
-    ((c != 0xff) && (c == ((termios).c_cc[cc])))
+#define TERMIOS_TESTCC(c, termios, cc) ((c != 0xff) && (c == ((termios).c_cc[cc])))
 
 using namespace kernel::tty;
 
@@ -71,6 +67,48 @@ int tty::poll() {
     return 1;
 }
 
+// Handle a terminal ioctl `request` on this tty. `arg3` is the raw third
+// argument and is interpreted, per request, as a user pointer.
+// Returns 0 on success and -EINVAL for a request that is not implemented.
+int tty::ioctl(int request, unsigned long arg3) {
+    switch (request) {
+        case TIOCGPGRP: {
+            // TODO: copy_to_user
+            auto* out_pgid = reinterpret_cast<pid_t __user*>(arg3);
+            *out_pgid = get_pgrp();
+            return 0;
+        }
+        case TIOCSPGRP: {
+            // TODO: copy_from_user
+            const auto new_pgid = *reinterpret_cast<const pid_t __user*>(arg3);
+            set_pgrp(new_pgid);
+            return 0;
+        }
+        case TIOCGWINSZ: {
+            // TODO: copy_to_user
+            // Only the column and row counts are reported; they are fixed.
+            auto* window = reinterpret_cast<winsize __user*>(arg3);
+            window->ws_col = 80;
+            window->ws_row = 40;
+            return 0;
+        }
+        case TCGETS: {
+            // TODO: use copy_to_user
+            auto* user_termios = reinterpret_cast<struct termios __user*>(arg3);
+            memcpy(user_termios, &termio, sizeof(termio));
+            return 0;
+        }
+        case TCSETS: {
+            // TODO: use copy_from_user
+            auto* user_termios = reinterpret_cast<const struct termios __user*>(arg3);
+            memcpy(&termio, user_termios, sizeof(termio));
+            return 0;
+        }
+        default:
+            break;
+    }
+
+    kmsgf("[kernel:error] ioctl(%x, %x) is not implemented", request, arg3);
+    return -EINVAL;
+}
+
 ssize_t tty::read(char* buf, size_t buf_size, size_t n) {
     n = std::max(buf_size, n);
     size_t orig_n = n;
@@ -144,8 +182,7 @@ void tty::_real_commit_char(int c) {
         case '\n':
             buf.put(c);
 
-            if (TERMIOS_LSET(this->termio, ECHONL) ||
-                TERMIOS_LSET(this->termio, ECHO))
+            if (TERMIOS_LSET(this->termio, ECHONL) || TERMIOS_LSET(this->termio, ECHO))
                 this->_echo_char(c);
 
             // if ICANON is set, we notify all waiting processes
@@ -171,8 +208,7 @@ void tty::_real_commit_char(int c) {
 void tty::_echo_char(int c) {
     // ECHOCTL
     do {
-        if (c < 0 || c >= 32 ||
-            !TERMIOS_LSET(this->termio, ECHO | ECHOCTL | IEXTEN))
+        if (c < 0 || c >= 32 || !TERMIOS_LSET(this->termio, ECHO | ECHOCTL | IEXTEN))
             break;
 
         if (c == '\t' || c == '\n' || c == CTRL('Q') || c == CTRL('S'))
@@ -187,6 +223,9 @@ void tty::_echo_char(int c) {
     this->show_char(c);
 }
 
+// TODO!!!: this function is racy as it accesses this->buf without
+//          acquiring this->mtx_buf or doing any synchronization
+//
 // do some ignore and remapping work
 // real commit operation is in _real_commit_char()
 void tty::commit_char(int c) {

+ 1 - 0
src/kernel/user.rs

@@ -0,0 +1 @@
+pub mod dataflow;

+ 202 - 0
src/kernel/user/dataflow.rs

@@ -0,0 +1,202 @@
+use core::{arch::asm, ffi::CStr};
+
+use bindings::{EFAULT, EINVAL};
+
+use crate::{
+    io::{Buffer, FillResult},
+    prelude::*,
+};
+
+/// A raw pointer plus length that has passed the range check in
+/// `CheckedUserPointer::new` (non-null, end address within the user limit).
+pub struct CheckedUserPointer {
+    // Start address of the range.
+    ptr: *const u8,
+    // Length of the range in bytes.
+    len: usize,
+}
+
+/// A writable user-space buffer that tracks how many bytes have been filled.
+pub struct UserBuffer<'lt> {
+    // Range-checked destination pointer.
+    ptr: CheckedUserPointer,
+    // Total capacity in bytes.
+    size: usize,
+    // Number of bytes written so far; the next write starts at this offset.
+    cur: usize,
+    _phantom: core::marker::PhantomData<&'lt ()>,
+}
+
+/// A NUL-terminated string located in user space.
+pub struct UserString<'lt> {
+    // Range-checked pointer to the first byte of the string.
+    ptr: CheckedUserPointer,
+    // String length in bytes, not counting the terminating NUL.
+    len: usize,
+    _phantom: core::marker::PhantomData<&'lt ()>,
+}
+
+impl CheckedUserPointer {
+    /// Accept `ptr`/`len` only if the pointer is non-null and the end address
+    /// `ptr + len` neither overflows nor exceeds the user address limit.
+    ///
+    /// Every rejected case yields `EFAULT`.
+    pub fn new(ptr: *const u8, len: usize) -> KResult<Self> {
+        const USER_MAX_ADDR: usize = 0x7ff_fff_fff_fff;
+
+        if ptr.is_null() {
+            return Err(EFAULT);
+        }
+
+        match (ptr as usize).checked_add(len) {
+            Some(end) if end <= USER_MAX_ADDR => Ok(Self { ptr, len }),
+            _ => Err(EFAULT),
+        }
+    }
+
+    /// The start address reinterpreted as a mutable pointer to `T`.
+    pub fn get_mut<T>(&self) -> *mut T {
+        self.ptr as *mut T
+    }
+
+    /// The start address reinterpreted as a const pointer to `T`.
+    pub fn get_const<T>(&self) -> *const T {
+        self.ptr as *const T
+    }
+
+    /// View the whole range as a byte slice.
+    pub fn as_slice(&self) -> &[u8] {
+        // SAFETY: relies on the range check done in `new`.
+        // NOTE(review): `new` only validates the address range, not that the
+        // memory is mapped -- confirm callers can tolerate a fault here.
+        unsafe { core::slice::from_raw_parts(self.ptr, self.len) }
+    }
+
+    /// Copy `total` bytes from the start of the range into `buffer`.
+    ///
+    /// Returns `EINVAL` if `total` exceeds the checked length and `EFAULT` if
+    /// the copy ends with bytes still left to transfer.
+    pub fn read(&self, buffer: *mut (), total: usize) -> KResult<()> {
+        if total > self.len {
+            return Err(EINVAL);
+        }
+
+        // `rcx` holds the remaining byte count after `rep movsb`. The `.fix`
+        // record is presumably consumed by the kernel's fault handler to
+        // resume at label 3 -- TODO confirm.
+        let error_bytes: usize;
+        unsafe {
+            asm!(
+                "2:",
+                "rep movsb",
+                "3:",
+                "nop",
+                ".pushsection .fix",
+                ".align 32",
+                ".quad 2b",  // instruction address
+                ".quad 3b - 2b",  // instruction length
+                ".quad 3b",  // fix jump address
+                ".quad 0x3", // type: load
+                ".popsection",
+                inout("rcx") total => error_bytes,
+                inout("rsi") self.ptr => _,
+                inout("rdi") buffer => _,
+            )
+        }
+
+        if error_bytes == 0 {
+            Ok(())
+        } else {
+            Err(EFAULT)
+        }
+    }
+}
+
+impl UserBuffer<'_> {
+    /// Wrap the user range `ptr..ptr + size` as an empty, fillable buffer.
+    ///
+    /// Fails with whatever `CheckedUserPointer::new` reports for the range.
+    pub fn new(ptr: *mut u8, size: usize) -> KResult<Self> {
+        CheckedUserPointer::new(ptr, size).map(|ptr| Self {
+            ptr,
+            size,
+            cur: 0,
+            _phantom: core::marker::PhantomData,
+        })
+    }
+
+    /// Number of bytes that can still be written.
+    fn remaining(&self) -> usize {
+        let capacity = self.size;
+        capacity - self.cur
+    }
+}
+
+impl<'lt> Buffer for UserBuffer<'lt> {
+    /// Total capacity of the buffer in bytes.
+    fn total(&self) -> usize {
+        self.size
+    }
+
+    /// Number of bytes written so far.
+    fn wrote(&self) -> usize {
+        self.cur
+    }
+
+    /// Append as much of `data` as fits at the current write offset.
+    ///
+    /// Returns `FillResult::Full` when no space is left, `EFAULT` when the
+    /// copy ends with bytes still left to transfer, and otherwise
+    /// `FillResult::Done(n)` with the number of bytes copied.
+    fn fill(&mut self, data: &[u8]) -> KResult<FillResult> {
+        let available = self.remaining();
+        if available == 0 {
+            return Ok(FillResult::Full);
+        }
+
+        // Truncate the input to the space that is left.
+        let chunk = &data[..data.len().min(available)];
+
+        // TODO: align to 8 bytes when doing copy for performance
+        let error_bytes: usize;
+        unsafe {
+            asm!(
+                "2:",
+                "rep movsb",
+                "3:",
+                "nop",
+                ".pushsection .fix",
+                ".align 32",
+                ".quad 2b",  // instruction address
+                ".quad 3b - 2b",  // instruction length
+                ".quad 3b",  // fix jump address
+                ".quad 0x1", // type: store
+                ".popsection",
+                inout("rcx") chunk.len() => error_bytes,
+                inout("rsi") chunk.as_ptr() => _,
+                inout("rdi") self.ptr.get_mut::<u8>().offset(self.cur as isize) => _,
+            )
+        };
+
+        if error_bytes != 0 {
+            return Err(EFAULT);
+        }
+
+        // The write offset only advances after a complete copy.
+        let copied = chunk.len();
+        self.cur += copied;
+        Ok(FillResult::Done(copied))
+    }
+}
+
+impl<'lt> UserString<'lt> {
+    /// Locate the terminating NUL of the user string starting at `ptr`.
+    ///
+    /// At most `MAX_LEN` bytes are scanned. Returns `EFAULT` when the range
+    /// check fails or when the scan ends with its counter at zero, i.e. no
+    /// NUL was found within `MAX_LEN` bytes.
+    /// NOTE(review): the fault path presumably also leaves the counter at
+    /// zero via the `.fix` record -- confirm against the fault handler.
+    pub fn new(ptr: *const u8) -> KResult<Self> {
+        const MAX_LEN: usize = 4096;
+        // TODO
+        let ptr = CheckedUserPointer::new(ptr, MAX_LEN)?;
+
+        // `rcx` counts down from MAX_LEN; on exit through `jz` it still holds
+        // the number of bytes that were not consumed, including the NUL.
+        let result: usize;
+        unsafe {
+            asm!(
+                "2:",
+                "mov al, byte ptr [rdx]",
+                "4:",
+                "test al, al",
+                "jz 3f",
+                "add rdx, 1",
+                "loop 2b",
+                "3:",
+                "nop",
+                ".pushsection .fix",
+                ".align 32",
+                ".quad 2b",  // instruction address
+                ".quad 4b - 2b",  // instruction length
+                ".quad 3b",  // fix jump address
+                ".quad 0x2", // type: string
+                ".popsection",
+                // `rdx` is advanced by the loop, so it must be declared as
+                // modified rather than as a pure input.
+                inout("rdx") ptr.get_const::<u8>() => _,
+                inout("rcx") MAX_LEN => result,
+                // `al` is used as scratch; mark the containing register as
+                // clobbered so the compiler keeps no live value in it.
+                out("rax") _,
+            )
+        };
+
+        if result == 0 {
+            Err(EFAULT)
+        } else {
+            Ok(Self {
+                ptr,
+                len: MAX_LEN - result,
+                _phantom: core::marker::PhantomData,
+            })
+        }
+    }
+
+    /// Borrow the string, including its terminating NUL, as a `CStr`.
+    pub fn as_cstr(&self) -> &'lt CStr {
+        // `len` excludes the NUL found by `new`, hence the `+ 1`.
+        unsafe {
+            CStr::from_bytes_with_nul_unchecked(core::slice::from_raw_parts(
+                self.ptr.get_const(),
+                self.len + 1,
+            ))
+        }
+    }
+}

+ 3 - 253
src/kernel/vfs.cpp

@@ -17,139 +17,6 @@
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
 
-fs::regular_file::regular_file(file_flags flags, size_t cursor,
-                               struct rust_inode_handle* ind)
-    : file(flags), cursor(cursor), ind(ind) {}
-
-ssize_t fs::regular_file::read(char* __user buf, size_t n) {
-    if (!flags.read)
-        return -EBADF;
-
-    // TODO: copy to user function !IMPORTANT
-    ssize_t n_wrote = fs_read(ind, buf, n, cursor, n);
-    if (n_wrote >= 0)
-        cursor += n_wrote;
-
-    return n_wrote;
-}
-
-ssize_t fs::regular_file::do_write(const char* __user buf, size_t n) {
-    // TODO: check privilege of user ptr
-    ssize_t n_wrote = fs_write(ind, buf, cursor, n);
-    if (n_wrote >= 0)
-        cursor += n_wrote;
-
-    return n_wrote;
-}
-
-off_t fs::regular_file::seek(off_t n, int whence) {
-    size_t ind_size = r_get_inode_size(ind);
-    size_t pos;
-    switch (whence) {
-        case SEEK_SET:
-            pos = n;
-            break;
-        case SEEK_CUR:
-            pos = cursor + n;
-            break;
-        case SEEK_END:
-            pos = ind_size + n;
-            break;
-        default:
-            return -EINVAL;
-    }
-
-    if (pos > ind_size)
-        return -EINVAL;
-
-    cursor = pos;
-
-    return cursor;
-}
-
-int fs::regular_file::getdents(char* __user buf, size_t cnt) {
-    size_t orig_cnt = cnt;
-    auto callback = readdir_callback_fn(
-        [&buf, &cnt](const char* fn, size_t fnlen, ino_t ino) {
-            size_t reclen = sizeof(fs::user_dirent) + 1 + fnlen;
-            if (cnt < reclen)
-                return -EFAULT;
-
-            auto* dirp = (fs::user_dirent*)buf;
-            dirp->d_ino = ino;
-            dirp->d_reclen = reclen;
-            // TODO: show offset
-            // dirp->d_off = 0;
-            // TODO: use copy_to_user
-            memcpy(dirp->d_name, fn, fnlen);
-            buf[reclen - 2] = 0;
-            buf[reclen - 1] = 0;
-
-            buf += reclen;
-            cnt -= reclen;
-            return 0;
-        });
-
-    int nread = fs_readdir(ind, cursor, &callback);
-
-    if (nread > 0)
-        cursor += nread;
-
-    return orig_cnt - cnt;
-}
-
-int fs::regular_file::getdents64(char* __user buf, size_t cnt) {
-    size_t orig_cnt = cnt;
-    auto callback = readdir_callback_fn(
-        [&buf, &cnt](const char* fn, size_t fnlen, ino_t ino) {
-            size_t reclen = sizeof(fs::user_dirent64) + fnlen;
-            if (cnt < reclen)
-                return -EFAULT;
-
-            auto* dirp = (fs::user_dirent64*)buf;
-            dirp->d_ino = ino;
-            dirp->d_off = 114514;
-            dirp->d_reclen = reclen;
-            dirp->d_type = 0;
-            // TODO: use copy_to_user
-            memcpy(dirp->d_name, fn, fnlen);
-            buf[reclen - 1] = 0;
-
-            buf += reclen;
-            cnt -= reclen;
-            return 0;
-        });
-
-    int nread = fs_readdir(ind, cursor, &callback);
-
-    if (nread > 0)
-        cursor += nread;
-
-    return orig_cnt - cnt;
-}
-
-fs::fifo_file::fifo_file(file_flags flags, std::shared_ptr<fs::pipe> ppipe)
-    : file(flags), ppipe(ppipe) {}
-
-ssize_t fs::fifo_file::read(char* __user buf, size_t n) {
-    if (!flags.read)
-        return -EBADF;
-
-    return ppipe->read(buf, n);
-}
-
-ssize_t fs::fifo_file::do_write(const char* __user buf, size_t n) {
-    return ppipe->write(buf, n);
-}
-
-fs::fifo_file::~fifo_file() {
-    assert(flags.read ^ flags.write);
-    if (flags.read)
-        ppipe->close_read();
-    else
-        ppipe->close_write();
-}
-
 static fs::chrdev_ops** chrdevs[256];
 
 int fs::register_char_device(dev_t node, const fs::chrdev_ops& ops) {
@@ -194,126 +61,6 @@ ssize_t fs::char_device_write(dev_t node, const char* buf, size_t n) {
     return write(buf, n);
 }
 
-fs::pipe::pipe(void) : buf{PIPE_SIZE}, flags{READABLE | WRITABLE} {}
-
-void fs::pipe::close_read(void) {
-    kernel::async::lock_guard lck{mtx};
-    flags &= (~READABLE);
-    waitlist_w.notify_all();
-}
-
-void fs::pipe::close_write(void) {
-    kernel::async::lock_guard lck{mtx};
-    flags &= (~WRITABLE);
-    waitlist_r.notify_all();
-}
-
-int fs::pipe::write(const char* buf, size_t n) {
-    // TODO: check privilege
-    // TODO: check EPIPE
-    kernel::async::lock_guard lck{mtx};
-
-    if (!is_readable()) {
-        current_thread->send_signal(SIGPIPE);
-        return -EPIPE;
-    }
-
-    if (n <= PIPE_SIZE) {
-        while (this->buf.avail() < n) {
-            bool interrupted = waitlist_w.wait(mtx);
-            if (interrupted)
-                return -EINTR;
-
-            if (!is_readable()) {
-                current_thread->send_signal(SIGPIPE);
-                return -EPIPE;
-            }
-        }
-
-        for (size_t i = 0; i < n; ++i)
-            this->buf.put(*(buf++));
-
-        waitlist_r.notify_all();
-
-        return n;
-    }
-
-    size_t orig_n = n;
-    while (true) {
-        bool write = false;
-        while (n && !this->buf.full()) {
-            --n, this->buf.put(*(buf++));
-            write = true;
-        }
-
-        if (write)
-            waitlist_r.notify_all();
-
-        if (n == 0)
-            break;
-
-        bool interrupted = waitlist_w.wait(mtx);
-        if (interrupted)
-            return -EINTR;
-
-        if (!is_readable()) {
-            current_thread->send_signal(SIGPIPE);
-            return -EPIPE;
-        }
-    }
-
-    return orig_n - n;
-}
-
-int fs::pipe::read(char* buf, size_t n) {
-    // TODO: check privilege
-    kernel::async::lock_guard lck{mtx};
-    size_t orig_n = n;
-
-    if (n <= PIPE_SIZE || this->buf.empty()) {
-        while (is_writeable() && this->buf.size() < n) {
-            bool interrupted = waitlist_r.wait(mtx);
-            if (interrupted)
-                return -EINTR;
-
-            if (n > PIPE_SIZE)
-                break;
-        }
-    }
-
-    while (!this->buf.empty() && n)
-        --n, *(buf++) = this->buf.get();
-
-    waitlist_w.notify_all();
-    return orig_n - n;
-}
-
-extern "C" int call_callback(const fs::readdir_callback_fn* func,
-                             const char* filename, size_t fnlen, ino_t ino) {
-    return (*func)(filename, fnlen, ino);
-}
-
-extern "C" struct dentry* dentry_open(struct dentry* context_root,
-                                      struct dentry* cwd, const char* path,
-                                      size_t path_length, bool follow);
-
-std::pair<fs::dentry_pointer, int> fs::open(const fs::fs_context& context,
-                                            const fs::dentry_pointer& cwd,
-                                            types::string_view path,
-                                            bool follow_symlinks) {
-    auto result = dentry_open(context.root.get(), cwd.get(), path.data(),
-                              path.size(), follow_symlinks);
-    auto result_int = reinterpret_cast<intptr_t>(result);
-
-    if (result_int > -128)
-        return {nullptr, result_int};
-
-    if (fs::r_dentry_is_invalid(result))
-        return {result, -ENOENT};
-
-    return {result, 0};
-}
-
 extern "C" void r_dput(struct dentry* dentry);
 extern "C" struct dentry* r_dget(struct dentry* dentry);
 
@@ -323,5 +70,8 @@ void fs::dentry_deleter::operator()(struct dentry* dentry) const {
 }
 
 fs::dentry_pointer fs::d_get(const dentry_pointer& dp) {
+    if (!dp)  // an empty pointer yields an empty result and never reaches r_dget()
+        return nullptr;
+
     return dentry_pointer{r_dget(dp.get())};  // wrap the pointer returned by r_dget()
 }

+ 189 - 109
src/kernel/vfs/dentry.rs

@@ -2,27 +2,38 @@ pub mod dcache;
 
 use core::{
     hash::{BuildHasher, BuildHasherDefault, Hasher},
-    sync::atomic::AtomicPtr,
+    ops::ControlFlow,
+    sync::atomic::{AtomicPtr, Ordering},
 };
 
 use crate::{
     hash::KernelHasher,
-    io::{ByteBuffer, RawBuffer},
+    io::{Buffer, ByteBuffer},
+    kernel::block::BlockDevice,
     path::{Path, PathComponent},
     prelude::*,
     rcu::{RCUNode, RCUPointer},
 };
 
 use alloc::sync::Arc;
-use bindings::{EINVAL, ELOOP, ENOENT, ENOTDIR};
+use bindings::{statx, EEXIST, EINVAL, EISDIR, ELOOP, ENOENT, ENOTDIR, ERANGE, O_CREAT, O_EXCL};
 
-use super::inode::Inode;
+use super::{
+    inode::{Ino, Inode, Mode, WriteOffset},
+    s_isblk, s_ischr, s_isdir, s_isreg, DevId, FsContext,
+};
 
 struct DentryData {
-    inode: Arc<Inode>,
+    inode: Arc<dyn Inode>,
     flags: u64,
 }
 
+/// # Safety
+///
+/// We wrap `Dentry` in `Arc` to ensure that the `Dentry` is not dropped while it is still in use.
+///
+/// Since a `Dentry` is created and marked as live(some data is saved to it), it keeps alive until
+/// the last reference is dropped.
 pub struct Dentry {
     // Const after insertion into dcache
     parent: Arc<Dentry>,
@@ -125,39 +136,36 @@ impl Dentry {
         Arc::as_ptr(&self.parent)
     }
 
-    fn save_data(&self, inode: Arc<Inode>, flags: u64) -> KResult<()> {
+    fn save_data(&self, inode: Arc<dyn Inode>, flags: u64) -> KResult<()> {
         let new = DentryData { inode, flags };
 
-        let old = self.data.swap(Some(Arc::new(new)));
+        // TODO!!!: We don't actually need to use `RCUPointer` here
+        // Safety: this function may only be called from `create`-like functions which requires the
+        // superblock's write locks to be held, so only one creation can happen at a time and we
+        // can't get a reference to the old data.
+        let old = unsafe { self.data.swap(Some(Arc::new(new))) };
         assert!(old.is_none());
 
         Ok(())
     }
 
-    pub fn save_reg(&self, file: Arc<Inode>) -> KResult<()> {
+    pub fn save_reg(&self, file: Arc<dyn Inode>) -> KResult<()> {
         self.save_data(file, D_REGULAR)
     }
 
-    pub fn save_symlink(&self, link: Arc<Inode>) -> KResult<()> {
+    pub fn save_symlink(&self, link: Arc<dyn Inode>) -> KResult<()> {
         self.save_data(link, D_SYMLINK)
     }
 
-    pub fn save_dir(&self, dir: Arc<Inode>) -> KResult<()> {
+    pub fn save_dir(&self, dir: Arc<dyn Inode>) -> KResult<()> {
         self.save_data(dir, D_DIRECTORY)
     }
 
-    pub fn invalidate(&self) -> KResult<()> {
-        let old = self.data.swap(None);
-        assert!(old.is_some());
-
-        Ok(())
-    }
-
-    pub fn get_inode(&self) -> KResult<Arc<Inode>> {
+    pub fn get_inode(&self) -> KResult<Arc<dyn Inode>> {
         self.data
             .load()
             .as_ref()
-            .ok_or(EINVAL)
+            .ok_or(ENOENT)
             .map(|data| data.inode.clone())
     }
 
@@ -173,11 +181,30 @@ impl Dentry {
         data.as_ref()
             .map_or(false, |data| data.flags & D_DIRECTORY != 0)
     }
-}
 
-#[repr(C)]
-pub struct FsContext {
-    root: *const Dentry,
+    pub fn is_valid(&self) -> bool {
+        self.data.load().is_some()
+    }
+
+    /// Check whether this dentry may be opened with `flags`, creating the file when asked to.
+    ///
+    /// * dentry has data and both `O_CREAT` and `O_EXCL` are set: `Err(EEXIST)`
+    /// * dentry has data otherwise: `Ok(())`
+    /// * dentry has no data and `O_CREAT` is not set: `Err(ENOENT)`
+    /// * dentry has no data and `O_CREAT` is set: the parent inode's `creat` is called with
+    ///   `mode` and its result is returned
+    pub fn open_check(self: &Arc<Self>, flags: u32, mode: Mode) -> KResult<()> {
+        // Kept bound for the whole function, exactly like the lookup it replaces.
+        let data = self.data.load();
+        let wants_create = flags & O_CREAT != 0;
+        let exclusive = flags & O_EXCL != 0;
+
+        match (data.is_some(), wants_create) {
+            (true, true) if exclusive => Err(EEXIST),
+            (true, _) => Ok(()),
+            (false, false) => Err(ENOENT),
+            (false, true) => {
+                let parent_inode = self.parent().get_inode()?;
+                parent_inode.creat(self, mode as u32)
+            }
+        }
+    }
 }
 
 impl Dentry {
@@ -200,16 +227,10 @@ impl Dentry {
                 let mut buffer = [0u8; 256];
                 let mut buffer = ByteBuffer::new(&mut buffer);
 
-                data.inode.readlink(&data.inode, &mut buffer)?;
+                data.inode.readlink(&mut buffer)?;
                 let path = Path::new(buffer.data())?;
 
-                let dentry = Self::open_recursive(
-                    context,
-                    &dentry.parent,
-                    path,
-                    true,
-                    nrecur + 1,
-                )?;
+                let dentry = Self::open_recursive(context, &dentry.parent, path, true, nrecur + 1)?;
 
                 Self::resolve_directory(context, dentry, nrecur + 1)
             }
@@ -217,7 +238,7 @@ impl Dentry {
         }
     }
 
-    fn open_recursive(
+    pub fn open_recursive(
         context: &FsContext,
         cwd: &Arc<Self>,
         path: Path,
@@ -231,13 +252,11 @@ impl Dentry {
         }
 
         let mut cwd = if path.is_absolute() {
-            Dentry::from_raw(&context.root).clone()
+            context.fsroot.clone()
         } else {
             cwd.clone()
         };
 
-        let root_dentry = Dentry::from_raw(&context.root);
-
         for item in path.iter() {
             if let PathComponent::TrailingEmpty = item {
                 if cwd.data.load().as_ref().is_none() {
@@ -250,12 +269,8 @@ impl Dentry {
             match item {
                 PathComponent::TrailingEmpty | PathComponent::Current => {} // pass
                 PathComponent::Parent => {
-                    if !cwd.hash_eq(root_dentry.as_ref()) {
-                        cwd = Self::resolve_directory(
-                            context,
-                            cwd.parent.clone(),
-                            nrecur,
-                        )?;
+                    if !cwd.hash_eq(&context.fsroot) {
+                        cwd = Self::resolve_directory(context, cwd.parent.clone(), nrecur)?;
                     }
                     continue;
                 }
@@ -275,76 +290,41 @@ impl Dentry {
                     let mut buffer = [0u8; 256];
                     let mut buffer = ByteBuffer::new(&mut buffer);
 
-                    data.inode.readlink(&data.inode, &mut buffer)?;
+                    data.inode.readlink(&mut buffer)?;
                     let path = Path::new(buffer.data())?;
 
-                    cwd = Self::open_recursive(
-                        context,
-                        &cwd.parent,
-                        path,
-                        true,
-                        nrecur + 1,
-                    )?;
+                    cwd = Self::open_recursive(context, &cwd.parent, path, true, nrecur + 1)?;
                 }
             }
         }
 
         Ok(cwd)
     }
-}
 
-#[no_mangle]
-pub extern "C" fn dentry_open(
-    context_root: *const Dentry,
-    cwd: *const Dentry, // borrowed
-    path: *const u8,
-    path_len: usize,
-    follow: bool,
-) -> *const Dentry {
-    match (|| -> KResult<Arc<Dentry>> {
-        let path =
-            Path::new(unsafe { core::slice::from_raw_parts(path, path_len) })?;
-
-        let context = FsContext { root: context_root };
-
-        Dentry::open_recursive(
-            &context,
-            Dentry::from_raw(&cwd).as_ref(),
-            path,
-            follow,
-            0,
-        )
-    })() {
-        Ok(dentry) => Arc::into_raw(dentry),
-        Err(err) => (-(err as i32) as usize) as *const Dentry,
+    pub fn open(context: &FsContext, path: Path, follow_symlinks: bool) -> KResult<Arc<Self>> {
+        let cwd = context.cwd.lock().clone();
+        Dentry::open_recursive(context, &cwd, path, follow_symlinks, 0)
     }
-}
 
-#[no_mangle]
-pub extern "C" fn d_path(
-    dentry: *const Dentry,
-    root: *const Dentry,
-    mut buffer: *mut u8,
-    bufsize: usize,
-) -> i32 {
-    let mut buffer = RawBuffer::new_from_raw(&mut buffer, bufsize);
-
-    match (|| {
-        let mut dentry = Dentry::from_raw(&dentry).clone();
-        let root = Dentry::from_raw(&root);
+    pub fn get_path(
+        self: &Arc<Dentry>,
+        context: &FsContext,
+        buffer: &mut dyn Buffer,
+    ) -> KResult<()> {
+        let mut dentry = self;
+        let root = &context.fsroot;
 
         let mut path = vec![];
 
-        while Arc::as_ptr(&dentry) != Arc::as_ptr(root.as_ref()) {
+        while Arc::as_ptr(dentry) != Arc::as_ptr(root) {
             if path.len() > 32 {
                 return Err(ELOOP);
             }
 
             path.push(dentry.name().clone());
-            dentry = dentry.parent().clone();
+            dentry = dentry.parent();
         }
 
-        const ERANGE: u32 = 34;
         buffer.fill(b"/")?.ok_or(ERANGE)?;
         for item in path.iter().rev().map(|name| name.as_ref()) {
             buffer.fill(item)?.ok_or(ERANGE)?;
@@ -354,9 +334,125 @@ pub extern "C" fn d_path(
         buffer.fill(&[0])?.ok_or(ERANGE)?;
 
         Ok(())
-    })() {
-        Ok(_) => 0,
-        Err(err) => -(err as i32),
+    }
+}
+
+impl Dentry {
+    /// Read from the file behind this dentry into `buffer`, starting at `offset`.
+    ///
+    /// The inode's mode selects the path: directories fail with `EISDIR`, regular files go to
+    /// the inode's `read`, block devices to `BlockDevice::read_some`, character devices to
+    /// `char_device_read`; any other mode fails with `EINVAL`. A dentry without an inode fails
+    /// with the error from `get_inode`.
+    pub fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        let inode = self.get_inode()?;
+
+        // Safety: Changing mode alone will have no effect on the file's contents
+        match inode.mode.load(Ordering::Relaxed) {
+            mode if s_isdir(mode) => Err(EISDIR),
+            mode if s_isreg(mode) => inode.read(buffer, offset),
+            mode if s_isblk(mode) => {
+                let device = BlockDevice::get(inode.devid()?)?;
+                Ok(device.read_some(offset, buffer)?.allow_partial())
+            }
+            mode if s_ischr(mode) => {
+                let devid = inode.devid()?;
+
+                // TODO!!!!!: change this
+                // NOTE(review): up to 256 bytes are requested from the device no matter how much
+                // room `buffer` has; if `fill` accepts fewer, the rest looks to be dropped -
+                // confirm against the `Buffer` contract.
+                let mut temporary_buffer = [0u8; 256];
+
+                let ret = unsafe {
+                    bindings::fs::char_device_read(
+                        devid,
+                        temporary_buffer.as_mut_ptr() as *mut _,
+                        temporary_buffer.len(),
+                        temporary_buffer.len(),
+                    )
+                };
+
+                // A negative return value is a negated errno.
+                if ret < 0 {
+                    Err(-ret as u32)
+                } else {
+                    Ok(buffer
+                        .fill(&temporary_buffer[..ret as usize])?
+                        .allow_partial())
+                }
+            }
+            _ => Err(EINVAL),
+        }
+    }
+
+    /// Write `buffer` to the file behind this dentry at `offset`.
+    ///
+    /// Directories fail with `EISDIR`, regular files go to the inode's `write`, character
+    /// devices to `char_device_write`; block devices and every other mode fail with `EINVAL`.
+    pub fn write(&self, buffer: &[u8], offset: WriteOffset) -> KResult<usize> {
+        let inode = self.get_inode()?;
+        // Safety: Changing mode alone will have no effect on the file's contents
+        match inode.mode.load(Ordering::Relaxed) {
+            mode if s_isdir(mode) => Err(EISDIR),
+            mode if s_isreg(mode) => inode.write(buffer, offset),
+            mode if s_isblk(mode) => Err(EINVAL), // TODO
+            mode if s_ischr(mode) => {
+                let devid = inode.devid()?;
+
+                let ret = unsafe {
+                    bindings::fs::char_device_write(
+                        devid,
+                        buffer.as_ptr() as *const _,
+                        buffer.len(),
+                    )
+                };
+
+                // A negative return value is a negated errno.
+                if ret < 0 {
+                    Err(-ret as u32)
+                } else {
+                    Ok(ret as usize)
+                }
+            }
+            _ => Err(EINVAL),
+        }
+    }
+
+    /// Forward to the inode's `do_readdir`, passing `offset` and `callback` through.
+    pub fn readdir<F>(&self, offset: usize, mut callback: F) -> KResult<usize>
+    where
+        F: FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
+    {
+        self.get_inode()?.do_readdir(offset, &mut callback)
+    }
+
+    /// Create a directory at this dentry through the parent inode.
+    ///
+    /// Fails with `EEXIST` when the dentry already has an inode. A parent without an inode is
+    /// reported as an error instead of panicking.
+    pub fn mkdir(&self, mode: Mode) -> KResult<()> {
+        if self.get_inode().is_ok() {
+            Err(EEXIST)
+        } else {
+            self.parent.get_inode()?.mkdir(self, mode)
+        }
+    }
+
+    /// Forward to the inode's `statx`.
+    pub fn statx(&self, stat: &mut statx, mask: u32) -> KResult<()> {
+        self.get_inode()?.statx(stat, mask)
+    }
+
+    /// Forward to the inode's `truncate`.
+    pub fn truncate(&self, size: usize) -> KResult<()> {
+        self.get_inode()?.truncate(size)
+    }
+
+    /// Unlink this dentry through the parent inode.
+    ///
+    /// Fails with `ENOENT` when the dentry has no inode. A parent without an inode is reported
+    /// as an error instead of panicking.
+    pub fn unlink(self: &Arc<Self>) -> KResult<()> {
+        if self.get_inode().is_err() {
+            Err(ENOENT)
+        } else {
+            self.parent.get_inode()?.unlink(self)
+        }
+    }
+
+    /// Create a symbolic link at this dentry with target `link`, through the parent inode.
+    ///
+    /// Fails with `EEXIST` when the dentry already has an inode. A parent without an inode is
+    /// reported as an error instead of panicking.
+    pub fn symlink(self: &Arc<Self>, link: &[u8]) -> KResult<()> {
+        if self.get_inode().is_ok() {
+            Err(EEXIST)
+        } else {
+            self.parent.get_inode()?.symlink(self, link)
+        }
+    }
+
+    /// Forward to the inode's `readlink`.
+    pub fn readlink(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        self.get_inode()?.readlink(buffer)
+    }
+
+    /// Create a device node at this dentry through the parent inode.
+    ///
+    /// Fails with `EEXIST` when the dentry already has an inode. A parent without an inode is
+    /// reported as an error instead of panicking.
+    pub fn mknod(&self, mode: Mode, devid: DevId) -> KResult<()> {
+        if self.get_inode().is_ok() {
+            Err(EEXIST)
+        } else {
+            self.parent.get_inode()?.mknod(self, mode, devid)
+        }
     }
 }
 
@@ -375,22 +471,6 @@ pub extern "C" fn r_dput(dentry: *const Dentry) {
     unsafe { Arc::from_raw(dentry) };
 }
 
-#[no_mangle]
-pub extern "C" fn r_dentry_get_inode(dentry: *const Dentry) -> *const Inode {
-    let dentry = Dentry::from_raw(&dentry);
-
-    match dentry.get_inode() {
-        Ok(inode) => Arc::into_raw(inode),
-        Err(err) => {
-            dont_check!(println!(
-                "[kernel:warn] r_dentry_get_inode: {:?}",
-                err
-            ));
-            core::ptr::null()
-        }
-    }
-}
-
 #[no_mangle]
 pub extern "C" fn r_dentry_is_directory(dentry: *const Dentry) -> bool {
     let dentry = Dentry::from_raw(&dentry);

+ 29 - 17
src/kernel/vfs/dentry/dcache.rs

@@ -1,4 +1,7 @@
-use core::{mem::MaybeUninit, sync::atomic::AtomicPtr};
+use core::{
+    mem::MaybeUninit,
+    sync::atomic::{AtomicPtr, Ordering},
+};
 
 use alloc::sync::Arc;
 use bindings::ENOENT;
@@ -15,23 +18,21 @@ use lazy_static::lazy_static;
 
 const DCACHE_HASH_BITS: u32 = 8;
 
-static DCACHE: [RCUList<Dentry>; 1 << DCACHE_HASH_BITS] =
-    [const { RCUList::new() }; 1 << DCACHE_HASH_BITS];
-
 lazy_static! {
+    static ref DCACHE: [RCUList<Dentry>; 1 << DCACHE_HASH_BITS] =
+        core::array::from_fn(|_| RCUList::new());
     static ref DROOT: Arc<Dentry> = {
         let dentry = Arc::new_uninit();
         let fake_parent = unsafe { dentry.clone().assume_init() };
 
-        unsafe { &mut *(Arc::as_ptr(&dentry) as *mut MaybeUninit<Dentry>) }
-            .write(Dentry {
-                parent: fake_parent,
-                name: b"[root]".as_slice().into(),
-                hash: 0,
-                prev: AtomicPtr::default(),
-                next: AtomicPtr::default(),
-                data: RCUPointer::empty(),
-            });
+        unsafe { &mut *(Arc::as_ptr(&dentry) as *mut MaybeUninit<Dentry>) }.write(Dentry {
+            parent: fake_parent,
+            name: b"[root]".as_slice().into(),
+            hash: 0,
+            prev: AtomicPtr::default(),
+            next: AtomicPtr::default(),
+            data: RCUPointer::empty(),
+        });
 
         unsafe { dentry.assume_init() }
     };
@@ -50,6 +51,7 @@ pub fn d_iter_for(hash: u64) -> RCUIterator<'static, Dentry> {
     d_hinted(hash).iter()
 }
 
+/// Add the dentry to the dcache
 pub fn d_add(dentry: &Arc<Dentry>) {
     d_hinted(dentry.hash).insert(dentry.clone());
 }
@@ -60,26 +62,36 @@ pub fn d_find_fast(dentry: &Arc<Dentry>) -> Option<Arc<Dentry>> {
         .map(|dentry| dentry.clone())
 }
 
+/// Call `lookup()` on the parent inode to try find if the dentry points to a valid inode
+///
 /// Silently fail without any side effects
 pub fn d_try_revalidate(dentry: &Arc<Dentry>) {
     (|| -> KResult<()> {
         let parent = dentry.parent().get_inode()?;
-        let inode = parent.lookup(&parent, dentry)?.ok_or(ENOENT)?;
+        let inode = parent.lookup(dentry)?.ok_or(ENOENT)?;
 
         d_save(dentry, inode)
     })()
     .unwrap_or_default();
 }
 
-pub fn d_save(dentry: &Arc<Dentry>, inode: Arc<Inode>) -> KResult<()> {
-    let mode = inode.idata.lock().mode;
-    match mode {
+/// Save the inode to the dentry.
+///
+/// Dentry flags will be determined by the inode's mode.
+pub fn d_save(dentry: &Arc<Dentry>, inode: Arc<dyn Inode>) -> KResult<()> {
+    match inode.mode.load(Ordering::Acquire) {
         mode if s_isdir(mode) => dentry.save_dir(inode),
         mode if s_islnk(mode) => dentry.save_symlink(inode),
         _ => dentry.save_reg(inode),
     }
 }
 
+/// Replace the old dentry with the new one in the dcache
 pub fn d_replace(old: &Arc<Dentry>, new: Arc<Dentry>) {
     d_hinted(old.hash).replace(old, new);
 }
+
+/// Remove the dentry from the dcache so that later d_find_fast will fail
+pub fn d_remove(dentry: &Arc<Dentry>) {
+    d_hinted(dentry.hash).remove(&dentry);
+}

+ 6 - 324
src/kernel/vfs/ffi.rs

@@ -1,340 +1,22 @@
-use crate::{
-    io::{ByteBuffer, RawBuffer},
-    kernel::block::BlockDevice,
-    prelude::*,
-};
+use crate::io::RawBuffer;
 
-use core::ffi::{c_char, c_void};
-
-use alloc::sync::Arc;
-use bindings::{dev_t, ino_t, mode_t, statx};
-
-use crate::io::get_str_from_cstr;
-
-use super::{
-    bindings::{fs, EINVAL, EISDIR},
-    dentry::Dentry,
-    inode::Inode,
-    s_isblk, s_ischr, s_isdir, s_isreg, DevId,
-};
-
-fn into_slice<'a>(buf: *const u8, bufsize: &usize) -> &'a [u8] {
-    unsafe { core::slice::from_raw_parts(buf, *bufsize) }
-}
-
-fn into_mut_slice<'a>(buf: *mut u8, bufsize: &usize) -> &'a mut [u8] {
-    unsafe { core::slice::from_raw_parts_mut(buf, *bufsize) }
-}
-
-macro_rules! map_err_ffi {
-    ($error:expr) => {
-        match $error {
-            Ok(_) => 0,
-            Err(e) => -(e as i32),
-        }
-    };
-}
-
-#[no_mangle]
-pub extern "C" fn fs_mount(
-    mountpoint: *const Dentry, // borrowed
-    source: *const c_char,
-    mountpoint_str: *const c_char,
-    fstype: *const c_char,
-    flags: u64,
-    _data: *const c_void,
-) -> i32 {
-    let mountpoint = Dentry::from_raw(&mountpoint);
-
-    let source = get_str_from_cstr(source).unwrap();
-    let mountpoint_str = get_str_from_cstr(mountpoint_str).unwrap();
-    let fstype = get_str_from_cstr(fstype).unwrap();
-
-    // TODO: data
-    match super::mount::do_mount(
-        &mountpoint,
-        source,
-        mountpoint_str,
-        fstype,
-        flags,
-        &[],
-    ) {
-        Ok(_) => 0,
-        Err(e) => -(e as i32),
-    }
-}
-
-fn do_read(
-    file: &Arc<Inode>,
-    buffer: &mut [u8],
-    offset: usize,
-) -> KResult<usize> {
-    let mode = { file.idata.lock().mode };
-
-    match mode {
-        mode if s_isdir(mode) => Err(EISDIR),
-        mode if s_isreg(mode) => {
-            let mut buffer = ByteBuffer::new(buffer);
-            file.read(file, &mut buffer, offset)
-        }
-        mode if s_isblk(mode) => {
-            let mut buffer = ByteBuffer::new(buffer);
-            let device = BlockDevice::get(file.devid(file)?)?;
-
-            Ok(device.read_some(offset, &mut buffer)?.allow_partial())
-        }
-        mode if s_ischr(mode) => {
-            let devid = file.devid(file)?;
-
-            let ret = unsafe {
-                fs::char_device_read(
-                    devid,
-                    buffer.as_mut_ptr() as *mut _,
-                    buffer.len(),
-                    buffer.len(),
-                )
-            };
-
-            if ret < 0 {
-                Err(-ret as u32)
-            } else {
-                Ok(ret as usize)
-            }
-        }
-        _ => Err(EINVAL),
-    }
-}
-
-fn do_write(file: &Arc<Inode>, buffer: &[u8], offset: usize) -> KResult<usize> {
-    let mode = file.idata.lock().mode;
-
-    match mode {
-        mode if s_isdir(mode) => Err(EISDIR),
-        mode if s_isreg(mode) => file.write(file, buffer, offset),
-        mode if s_isblk(mode) => Err(EINVAL), // TODO
-        mode if s_ischr(mode) => {
-            let devid = file.devid(file)?;
-
-            let ret = unsafe {
-                fs::char_device_write(
-                    devid,
-                    buffer.as_ptr() as *const _,
-                    buffer.len(),
-                )
-            };
-
-            if ret < 0 {
-                Err(-ret as u32)
-            } else {
-                Ok(ret as usize)
-            }
-        }
-        _ => Err(EINVAL),
-    }
-}
-
-fn inode_from_raw<'lt>(file: &'lt mut *const Inode) -> BorrowedArc<'lt, Inode> {
-    BorrowedArc::new(file)
-}
+use super::{dentry::Dentry, inode::Inode};
 
 #[no_mangle]
 pub extern "C" fn fs_read(
-    mut file: *const Inode, // borrowed
+    file: *const Dentry, // borrowed
     buf: *mut u8,
     bufsize: usize,
     offset: usize,
     n: usize,
 ) -> isize {
-    let file = inode_from_raw(&mut file);
+    let file = Dentry::from_raw(&file);
 
     let bufsize = bufsize.min(n);
-    let buffer = into_mut_slice(buf, &bufsize);
+    let mut buffer = RawBuffer::new_from_raw(buf, bufsize);
 
-    match do_read(&file, buffer, offset) {
+    match file.read(&mut buffer, offset) {
         Ok(n) => n as isize,
         Err(e) => -(e as isize),
     }
 }
-
-#[no_mangle]
-pub extern "C" fn fs_write(
-    mut file: *const Inode, // borrowed
-    buf: *const u8,
-    offset: usize,
-    n: usize,
-) -> isize {
-    let file = inode_from_raw(&mut file);
-    let buffer = into_slice(buf, &n);
-
-    match do_write(&file, buffer, offset) {
-        Ok(n) => n as isize,
-        Err(e) => -(e as isize),
-    }
-}
-
-#[no_mangle]
-pub extern "C" fn fs_statx(
-    mut file: *const Inode, // borrowed
-    stat: *mut statx,
-    mask: u32,
-) -> i32 {
-    map_err_ffi!((|| {
-        let file = inode_from_raw(&mut file);
-        let statx = unsafe { stat.as_mut() }.unwrap();
-
-        file.statx(file.as_ref(), statx, mask)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_truncate(
-    mut file: *const Inode, // borrowed
-    size: usize,
-) -> i32 {
-    map_err_ffi!((|| {
-        let file = inode_from_raw(&mut file);
-        file.truncate(file.as_ref(), size)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_readlink(
-    mut file: *const Inode, // borrowed
-    mut buf: *mut u8,
-    bufsize: usize,
-) -> i32 {
-    let file = inode_from_raw(&mut file);
-    let mut buffer = RawBuffer::new_from_raw(&mut buf, bufsize);
-
-    match file.readlink(file.as_ref(), &mut buffer) {
-        Ok(n) => n as i32,
-        Err(e) => -(e as i32),
-    }
-}
-
-#[no_mangle]
-pub extern "C" fn fs_creat(
-    at: *const Dentry, // borrowed
-    mode: mode_t,
-) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.creat(inode.as_ref(), &at, mode as u32)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_mkdir(
-    at: *const Dentry, // borrowed
-    mode: mode_t,
-) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.mkdir(inode.as_ref(), &at, mode as u32)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_mknod(
-    at: *const Dentry, // borrowed
-    mode: mode_t,
-    dev: dev_t,
-) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.mknod(inode.as_ref(), &at, mode as u32, dev as DevId)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_symlink(
-    at: *const Dentry, // borrowed
-    target: *const c_char,
-) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.symlink(
-            inode.as_ref(),
-            &at,
-            get_str_from_cstr(target)?.as_bytes(),
-        )
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn fs_unlink(at: *const Dentry) -> i32 {
-    map_err_ffi!((|| {
-        let at = Dentry::from_raw(&at);
-        let parent = at.parent();
-        let inode = parent.get_inode()?;
-
-        inode.unlink(inode.as_ref(), &at)
-    })())
-}
-
-#[no_mangle]
-pub extern "C" fn r_get_inode_mode(mut inode: *const Inode) -> mode_t {
-    let inode = inode_from_raw(&mut inode);
-    let idata = inode.idata.lock();
-
-    idata.mode as _
-}
-
-#[no_mangle]
-pub extern "C" fn r_get_inode_size(mut inode: *const Inode) -> u64 {
-    let inode = inode_from_raw(&mut inode);
-    let idata = inode.idata.lock();
-
-    idata.size
-}
-
-extern "C" {
-    fn call_callback(
-        callback: *const c_void,
-        filename: *const c_char,
-        filename_len: usize,
-        ino: ino_t,
-    ) -> i32;
-}
-
-#[no_mangle]
-pub extern "C" fn fs_readdir(
-    mut file: *const Inode, // borrowed
-    offset: usize,
-    callback: *const c_void,
-) -> i64 {
-    let inode = inode_from_raw(&mut file);
-
-    let ret = inode.readdir(inode.as_ref(), offset, &|filename, ino| {
-        let ret = unsafe {
-            call_callback(
-                callback,
-                filename.as_ptr() as *const c_char,
-                filename.len(),
-                ino,
-            )
-        };
-
-        match ret {
-            0 => Ok(()),
-            _ => Err(ret as u32),
-        }
-    });
-
-    match ret {
-        Ok(n) => n as i64,
-        Err(e) => -(e as i64),
-    }
-}

+ 551 - 0
src/kernel/vfs/file.rs

@@ -0,0 +1,551 @@
+use core::{
+    ffi::{c_int, c_ulong},
+    ops::ControlFlow,
+    ptr::NonNull,
+    sync::atomic::Ordering,
+};
+
+use crate::{
+    io::{Buffer, BufferFill, RawBuffer},
+    kernel::mem::{paging::Page, phys::PhysPtr},
+    prelude::*,
+    sync::condvar::CondVar,
+};
+
+use alloc::{collections::vec_deque::VecDeque, sync::Arc};
+use bindings::{
+    current_thread, kernel::tty::tty as TTY, EBADF, EFAULT, EINTR, EINVAL, ENOTDIR, ENOTTY,
+    EOVERFLOW, EPIPE, ESPIPE, SIGPIPE, S_IFMT,
+};
+
+use super::{
+    dentry::Dentry,
+    inode::{Mode, WriteOffset},
+    s_isblk, s_isreg,
+};
+
+/// An open file that is backed by a dentry (and, through it, an inode).
+pub struct InodeFile {
+    /// Reads through this handle are rejected with `EBADF` when unset.
+    read: bool,
+    /// Writes through this handle are rejected with `EBADF` when unset.
+    write: bool,
+    /// When set, writes are issued with `WriteOffset::End` instead of at the cursor.
+    append: bool,
+    /// Only the mode bits that can never change are cached here to speed up file operations.
+    /// Specifically, the `S_IFMT` masked bits.
+    mode: Mode,
+    /// Current file position, shared by read, write, seek and the getdents calls.
+    cursor: Mutex<usize>,
+    /// Dentry every operation is forwarded to.
+    dentry: Arc<Dentry>,
+}
+
+/// Mutable pipe state, kept behind the `Spin` lock in `Pipe`.
+pub struct PipeInner {
+    /// Bytes that have been written but not yet read.
+    buffer: VecDeque<u8>,
+    /// Set by `Pipe::close_read`; writers fail with `EPIPE` once it is set.
+    read_closed: bool,
+    /// Set by `Pipe::close_write`; readers stop waiting for data once it is set.
+    write_closed: bool,
+}
+
+/// A byte pipe shared by one read end and one write end.
+pub struct Pipe {
+    /// Buffer and end-closed flags.
+    inner: Spin<PipeInner>,
+    /// Readers wait here; notified when data is written or the write end closes.
+    cv_read: CondVar,
+    /// Writers wait here; notified when data is read or the read end closes.
+    cv_write: CondVar,
+}
+
+/// Read end of a pipe. Dropping it marks the pipe's read side as closed.
+pub struct PipeReadEnd {
+    pipe: Arc<Pipe>,
+}
+
+/// Write end of a pipe. Dropping it marks the pipe's write side as closed.
+pub struct PipeWriteEnd {
+    pipe: Arc<Pipe>,
+}
+
+/// An open file that forwards to a `TTY` object from the bindings.
+pub struct TTYFile {
+    /// Non-null pointer to the TTY. The file does not free it.
+    // NOTE(review): the TTY is assumed to outlive every `TTYFile` — confirm with the owner.
+    tty: NonNull<TTY>,
+}
+
+/// An open file description. Each operation on `File` dispatches on the variant.
+pub enum File {
+    /// Dentry-backed file (regular file, directory, block device, ...).
+    Inode(InodeFile),
+    /// Read end of a pipe.
+    PipeRead(PipeReadEnd),
+    /// Write end of a pipe.
+    PipeWrite(PipeWriteEnd),
+    /// Terminal device.
+    TTY(TTYFile),
+}
+
+/// Target of a seek request.
+pub enum SeekOption {
+    /// Absolute position.
+    Set(usize),
+    /// Signed offset from the current cursor.
+    Current(isize),
+    /// Signed offset from the inode size.
+    End(isize),
+}
+
+impl Drop for PipeReadEnd {
+    /// Mark the read side closed so that blocked writers are woken up.
+    fn drop(&mut self) {
+        self.pipe.close_read();
+    }
+}
+
+impl Drop for PipeWriteEnd {
+    /// Mark the write side closed so that blocked readers are woken up.
+    fn drop(&mut self) {
+        self.pipe.close_write();
+    }
+}
+
+/// Send `SIGPIPE` to the thread that is currently running.
+///
+/// Panics if `current_thread` is null.
+fn send_sigpipe_to_current() {
+    // Safety: current_thread is always valid.
+    let current = unsafe { current_thread.as_mut().unwrap() };
+
+    // Safety: `signal_list` is `Sync`
+    unsafe { current.send_signal(SIGPIPE) };
+}
+
+impl Pipe {
+    /// Maximum number of buffered bytes. Writes of at most this size are atomic.
+    const PIPE_SIZE: usize = 4096;
+
+    /// Create a pipe with an empty buffer and both ends open.
+    pub fn new() -> Arc<Self> {
+        Arc::new(Self {
+            inner: Spin::new(PipeInner {
+                buffer: VecDeque::with_capacity(Self::PIPE_SIZE),
+                read_closed: false,
+                write_closed: false,
+            }),
+            cv_read: CondVar::new(),
+            cv_write: CondVar::new(),
+        })
+    }
+
+    /// Wrap the pipe into its two `File` objects.
+    ///
+    /// # Return
+    /// `(read_end, write_end)`
+    pub fn split(self: &Arc<Self>) -> (Arc<File>, Arc<File>) {
+        (
+            Arc::new(File::PipeRead(PipeReadEnd { pipe: self.clone() })),
+            Arc::new(File::PipeWrite(PipeWriteEnd { pipe: self.clone() })),
+        )
+    }
+
+    /// Mark the read side closed (idempotent) and wake every waiting writer.
+    fn close_read(&self) {
+        let mut inner = self.inner.lock();
+        if inner.read_closed {
+            return;
+        }
+
+        inner.read_closed = true;
+        self.cv_write.notify_all();
+    }
+
+    /// Mark the write side closed (idempotent) and wake every waiting reader.
+    fn close_write(&self) {
+        let mut inner = self.inner.lock();
+        if inner.write_closed {
+            return;
+        }
+
+        inner.write_closed = true;
+        self.cv_read.notify_all();
+    }
+
+    /// Move buffered bytes into `buffer` and return how many were moved.
+    ///
+    /// Waits while the pipe is empty and the write end is still open. Returns `Ok(0)` when
+    /// the pipe is empty and the write end is closed, and `Err(EINTR)` when the wait is
+    /// interrupted.
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        let mut inner = self.inner.lock();
+
+        while !inner.write_closed && inner.buffer.is_empty() {
+            let interrupted = self.cv_read.wait(&mut inner, true);
+            if interrupted {
+                return Err(EINTR);
+            }
+        }
+
+        // The ring buffer may be split into two contiguous slices.
+        let (data1, data2) = inner.buffer.as_slices();
+        let nread = buffer.fill(data1)?.allow_partial() + buffer.fill(data2)?.allow_partial();
+        inner.buffer.drain(..nread);
+
+        // Space was freed; let blocked writers retry.
+        self.cv_write.notify_all();
+        Ok(nread)
+    }
+
+    /// Write all of `data` in one piece, waiting until the buffer has room for it.
+    ///
+    /// Must only be called with `data.len() <= PIPE_SIZE`. Fails with `EPIPE` (after sending
+    /// `SIGPIPE` to the caller) when the read end is closed, and with `EINTR` when the wait
+    /// is interrupted.
+    fn write_atomic(&self, data: &[u8]) -> KResult<usize> {
+        let mut inner = self.inner.lock();
+
+        if inner.read_closed {
+            send_sigpipe_to_current();
+            return Err(EPIPE);
+        }
+
+        while inner.buffer.len() + data.len() > Self::PIPE_SIZE {
+            let interrupted = self.cv_write.wait(&mut inner, true);
+            if interrupted {
+                return Err(EINTR);
+            }
+
+            if inner.read_closed {
+                send_sigpipe_to_current();
+                return Err(EPIPE);
+            }
+        }
+
+        inner.buffer.extend(data);
+
+        self.cv_read.notify_all();
+        Ok(data.len())
+    }
+
+    /// Write `data` piecewise, filling whatever room is available and waiting for more.
+    ///
+    /// Returns the number of bytes written. An interrupted wait yields the partial count if
+    /// anything was written and `Err(EINTR)` otherwise. A closed read end yields `EPIPE`.
+    fn write_non_atomic(&self, data: &[u8]) -> KResult<usize> {
+        let mut inner = self.inner.lock();
+
+        if inner.read_closed {
+            send_sigpipe_to_current();
+            return Err(EPIPE);
+        }
+
+        let mut remaining = data;
+        while !remaining.is_empty() {
+            // Bound by `PIPE_SIZE`, not `capacity()`: the deque may have allocated more than
+            // was requested, and `write_atomic` accounts against `PIPE_SIZE` as well.
+            let space = Self::PIPE_SIZE.saturating_sub(inner.buffer.len());
+
+            if space != 0 {
+                let to_write = remaining.len().min(space);
+                inner.buffer.extend(&remaining[..to_write]);
+                remaining = &remaining[to_write..];
+
+                self.cv_read.notify_all();
+            }
+
+            if remaining.is_empty() {
+                break;
+            }
+
+            let interrupted = self.cv_write.wait(&mut inner, true);
+            if interrupted {
+                // Report a short write if some data already went out.
+                if data.len() != remaining.len() {
+                    break;
+                }
+                return Err(EINTR);
+            }
+
+            if inner.read_closed {
+                send_sigpipe_to_current();
+                return Err(EPIPE);
+            }
+        }
+
+        Ok(data.len() - remaining.len())
+    }
+
+    /// Write `data` to the pipe, choosing the atomic path whenever it fits.
+    fn write(&self, data: &[u8]) -> KResult<usize> {
+        // Writes that are not larger than the pipe size are atomic.
+        if data.len() <= Self::PIPE_SIZE {
+            self.write_atomic(data)
+        } else {
+            self.write_non_atomic(data)
+        }
+    }
+}
+
+/// Fixed-size header of one record written by `InodeFile::getdents64`.
+/// The file name and its terminating `'\0'` follow the header directly.
+#[derive(Copy, Clone, Debug)]
+#[repr(C, packed)]
+struct UserDirent64 {
+    /// Inode number
+    d_ino: u64,
+    /// Implementation defined. We ignore it
+    d_off: u64,
+    /// Length of this record
+    d_reclen: u16,
+    /// File type. Set to 0
+    d_type: u8,
+    /// Filename with a padding '\0'
+    d_name: [u8; 0],
+}
+
+/// Fixed-size header of one record written by `InodeFile::getdents`.
+///
+/// File type is at offset `d_reclen - 1`. Set it to 0
+#[derive(Copy, Clone, Debug)]
+#[repr(C, packed)]
+struct UserDirent {
+    /// Inode number
+    d_ino: u32,
+    /// Implementation defined. We ignore it
+    d_off: u32,
+    /// Length of this record
+    d_reclen: u16,
+    /// Filename with a padding '\0'
+    d_name: [u8; 0],
+}
+
+/// Whether the current thread reports a non-zero pending signal.
+///
+/// Panics if `current_thread` is null.
+fn has_pending_signal() -> bool {
+    unsafe { current_thread.as_mut().unwrap().signals.pending_signal() != 0 }
+}
+
+impl InodeFile {
+    /// Create a `File::Inode` for `dentry`; `rwa` is `(read, write, append)`.
+    ///
+    /// Panics with "`dentry` is invalid" when the dentry has no inode.
+    pub fn new(dentry: Arc<Dentry>, rwa: (bool, bool, bool)) -> Arc<File> {
+        // SAFETY: `dentry` used to create `InodeFile` is valid.
+        // SAFETY: `mode` should never change with respect to the `S_IFMT` fields.
+        let cached_mode = dentry
+            .get_inode()
+            .expect("`dentry` is invalid")
+            .mode
+            .load(Ordering::Relaxed)
+            & S_IFMT;
+
+        Arc::new(File::Inode(InodeFile {
+            dentry,
+            read: rwa.0,
+            write: rwa.1,
+            append: rwa.2,
+            mode: cached_mode,
+            cursor: Mutex::new(0),
+        }))
+    }
+
+    /// Move the cursor and return its new value.
+    ///
+    /// Fails with `EOVERFLOW` when applying a signed offset would under- or overflow.
+    fn seek(&self, option: SeekOption) -> KResult<usize> {
+        let mut cursor = self.cursor.lock();
+
+        let new_cursor = match option {
+            SeekOption::Current(off) => cursor.checked_add_signed(off).ok_or(EOVERFLOW)?,
+            SeekOption::Set(n) => n,
+            SeekOption::End(off) => {
+                let inode = self.dentry.get_inode()?;
+                let size = inode.size.load(Ordering::Relaxed) as usize;
+                size.checked_add_signed(off).ok_or(EOVERFLOW)?
+            }
+        };
+
+        *cursor = new_cursor;
+        Ok(new_cursor)
+    }
+
+    /// Write `buffer` through the dentry and return the number of bytes written.
+    ///
+    /// Fails with `EBADF` when the file was not opened for writing.
+    fn write(&self, buffer: &[u8]) -> KResult<usize> {
+        if !self.write {
+            return Err(EBADF);
+        }
+
+        let mut cursor = self.cursor.lock();
+
+        // TODO!!!: use `UserBuffer`
+        if self.append {
+            // The cursor is handed over mutably.
+            // NOTE(review): presumably the callee stores the new end position in it — confirm.
+            let nwrote = self
+                .dentry
+                .write(buffer, WriteOffset::End(cursor.as_mut()))?;
+
+            Ok(nwrote)
+        } else {
+            let nwrote = self.dentry.write(buffer, WriteOffset::Position(*cursor))?;
+
+            *cursor += nwrote;
+            Ok(nwrote)
+        }
+    }
+
+    /// Read from the cursor position into `buffer` and advance the cursor by the amount read.
+    ///
+    /// Fails with `EBADF` when the file was not opened for reading.
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        if !self.read {
+            return Err(EBADF);
+        }
+
+        let mut cursor = self.cursor.lock();
+
+        let nread = self.dentry.read(buffer, *cursor)?;
+
+        *cursor += nread;
+        Ok(nread)
+    }
+
+    /// Emit directory entries as `UserDirent64` records until `buffer` cannot hold the next one.
+    ///
+    /// The cursor is advanced by the value `readdir` returns.
+    fn getdents64(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        let mut cursor = self.cursor.lock();
+
+        let nread = self.dentry.readdir(*cursor, |filename, ino| {
+            // Filename length + 1 for padding '\0'
+            let real_record_len = core::mem::size_of::<UserDirent64>() + filename.len() + 1;
+
+            // Stop the iteration instead of emitting a truncated record.
+            if buffer.available() < real_record_len {
+                return Ok(ControlFlow::Break(()));
+            }
+
+            let record = UserDirent64 {
+                d_ino: ino,
+                d_off: 0,
+                d_reclen: real_record_len as u16,
+                d_type: 0,
+                d_name: [0; 0],
+            };
+
+            // Header, then the name, then the terminating '\0'.
+            buffer.copy(&record)?.ok_or(EFAULT)?;
+            buffer.fill(filename)?.ok_or(EFAULT)?;
+            buffer.fill(&[0])?.ok_or(EFAULT)?;
+
+            Ok(ControlFlow::Continue(()))
+        })?;
+
+        *cursor += nread;
+        Ok(())
+    }
+
+    /// Emit directory entries as `UserDirent` records until `buffer` cannot hold the next one.
+    ///
+    /// The cursor is advanced by the value `readdir` returns.
+    fn getdents(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        let mut cursor = self.cursor.lock();
+
+        let nread = self.dentry.readdir(*cursor, |filename, ino| {
+            // + 1 for filename length padding '\0', + 1 for d_type.
+            let real_record_len = core::mem::size_of::<UserDirent>() + filename.len() + 2;
+
+            // Stop the iteration instead of emitting a truncated record.
+            if buffer.available() < real_record_len {
+                return Ok(ControlFlow::Break(()));
+            }
+
+            let record = UserDirent {
+                d_ino: ino as u32,
+                d_off: 0,
+                d_reclen: real_record_len as u16,
+                d_name: [0; 0],
+            };
+
+            // Header, then the name, then '\0' followed by a zero `d_type` byte.
+            buffer.copy(&record)?.ok_or(EFAULT)?;
+            buffer.fill(filename)?.ok_or(EFAULT)?;
+            buffer.fill(&[0, 0])?.ok_or(EFAULT)?;
+
+            Ok(ControlFlow::Continue(()))
+        })?;
+
+        *cursor += nread;
+        Ok(())
+    }
+}
+
+impl TTYFile {
+    /// Wrap a raw TTY pointer into a `File::TTY`.
+    ///
+    /// Panics with "`tty` is null" when `tty` is null.
+    pub fn new(tty: *mut TTY) -> Arc<File> {
+        Arc::new(File::TTY(TTYFile {
+            tty: NonNull::new(tty).expect("`tty` is null"),
+        }))
+    }
+
+    /// Read from the TTY into a temporary kernel buffer, then copy the result to `buffer`.
+    ///
+    /// A negative return value from the TTY is negated and reported as the error code.
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        // SAFETY: `tty` should always valid.
+        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
+
+        // Bounce buffer sized to the destination's total size.
+        let mut c_buffer: Vec<u8> = vec![0; buffer.total()];
+
+        // SAFETY: `tty` points to a valid `TTY` instance.
+        let nread = unsafe {
+            tty.read(
+                c_buffer.as_mut_ptr() as *mut _,
+                c_buffer.len(),
+                c_buffer.len(),
+            )
+        };
+
+        match nread {
+            n if n < 0 => Err((-n) as u32),
+            0 => Ok(0),
+            n => Ok(buffer.fill(&c_buffer[..n as usize])?.allow_partial()),
+        }
+    }
+
+    /// Send every byte of `buffer` to the TTY with `show_char`; always reports the full length.
+    fn write(&self, buffer: &[u8]) -> KResult<usize> {
+        // SAFETY: `tty` should always valid.
+        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
+
+        for &ch in buffer.iter() {
+            // SAFETY: `tty` points to a valid `TTY` instance.
+            unsafe { tty.show_char(ch as i32) };
+        }
+
+        Ok(buffer.len())
+    }
+
+    /// Forward an ioctl request to the TTY.
+    ///
+    /// A zero result maps to `Ok(0)`; any other result is negated and returned as the error.
+    // NOTE(review): this treats a positive result as an error too — confirm `tty.ioctl`
+    // only ever returns 0 or a negative errno.
+    fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
+        // SAFETY: `tty` should always valid.
+        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
+
+        // SAFETY: `tty` points to a valid `TTY` instance.
+        let result = unsafe { tty.ioctl(request as c_int, arg3 as c_ulong) };
+
+        match result {
+            0 => Ok(0),
+            _ => Err((-result) as u32),
+        }
+    }
+}
+
+impl File {
+    /// Read into `buffer`. Write-only pipe ends fail with `EBADF`.
+    pub fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        match self {
+            File::Inode(inode) => inode.read(buffer),
+            File::PipeRead(pipe) => pipe.pipe.read(buffer),
+            File::TTY(tty) => tty.read(buffer),
+            _ => Err(EBADF),
+        }
+    }
+
+    // TODO
+    // /// Read from the file into the given buffers.
+    // ///
+    // /// Reads are atomic, not intermingled with other reads or writes.
+    // pub fn readv<'r, 'i, I: Iterator<Item = &'i mut dyn Buffer>>(
+    //     &'r self,
+    //     buffers: I,
+    // ) -> KResult<usize> {
+    //     match self {
+    //         File::Inode(inode) => inode.readv(buffers),
+    //         File::PipeRead(pipe) => pipe.pipe.readv(buffers),
+    //         _ => Err(EBADF),
+    //     }
+    // }
+
+    /// Write `buffer`. Read-only pipe ends fail with `EBADF`.
+    pub fn write(&self, buffer: &[u8]) -> KResult<usize> {
+        match self {
+            File::Inode(inode) => inode.write(buffer),
+            File::PipeWrite(pipe) => pipe.pipe.write(buffer),
+            File::TTY(tty) => tty.write(buffer),
+            _ => Err(EBADF),
+        }
+    }
+
+    /// Reposition the cursor. Pipes and TTYs are not seekable and fail with `ESPIPE`.
+    pub fn seek(&self, option: SeekOption) -> KResult<usize> {
+        match self {
+            File::Inode(inode) => inode.seek(option),
+            File::PipeRead(_) | File::PipeWrite(_) | File::TTY(_) => Err(ESPIPE),
+        }
+    }
+
+    /// Fill `buffer` with 32-bit directory records. Non-inode files fail with `ENOTDIR`.
+    pub fn getdents(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        match self {
+            File::Inode(inode) => inode.getdents(buffer),
+            _ => Err(ENOTDIR),
+        }
+    }
+
+    /// Fill `buffer` with 64-bit directory records. Non-inode files fail with `ENOTDIR`.
+    pub fn getdents64(&self, buffer: &mut dyn Buffer) -> KResult<()> {
+        match self {
+            File::Inode(inode) => inode.getdents64(buffer),
+            _ => Err(ENOTDIR),
+        }
+    }
+
+    /// Copy up to `count` bytes from this file to `dest_file`, one page at a time.
+    ///
+    /// Only regular files and block devices are accepted as the source; anything else fails
+    /// with `EINVAL`. Returns the number of bytes written to `dest_file`. A pending signal
+    /// ends the copy early: `EINTR` if nothing was copied yet, the partial count otherwise.
+    pub fn sendfile(&self, dest_file: &Self, count: usize) -> KResult<usize> {
+        // Validate the source first so the error path does not allocate a page.
+        match self {
+            File::Inode(file) if s_isblk(file.mode) || s_isreg(file.mode) => (),
+            _ => return Err(EINVAL),
+        }
+
+        let buffer_page = Page::alloc_one();
+
+        // TODO!!!: zero copy implementation with mmap
+        let mut tot = 0usize;
+        while tot < count {
+            if has_pending_signal() {
+                if tot == 0 {
+                    return Err(EINTR);
+                } else {
+                    return Ok(tot);
+                }
+            }
+
+            let batch_size = usize::min(count - tot, buffer_page.len());
+            let slice = buffer_page.as_cached().as_mut_slice::<u8>(batch_size);
+            let mut buffer = RawBuffer::new_from_slice(slice);
+
+            let nread = self.read(&mut buffer)?;
+
+            // End of the source file.
+            if nread == 0 {
+                break;
+            }
+
+            tot += dest_file.write(&slice[..nread])?;
+        }
+
+        Ok(tot)
+    }
+
+    /// Forward an ioctl to the TTY. Every other file kind fails with `ENOTTY`.
+    pub fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
+        match self {
+            File::TTY(tty) => tty.ioctl(request, arg3),
+            _ => Err(ENOTTY),
+        }
+    }
+}

+ 0 - 296
src/kernel/vfs/filearr.cc

@@ -1,296 +0,0 @@
-#include <set>
-
-#include <assert.h>
-
-#include <types/path.hpp>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/filearr.hpp>
-
-using namespace fs;
-
-using kernel::async::mutex, kernel::async::lock_guard;
-
-struct fditem {
-    int fd;
-    int flags;
-    std::shared_ptr<file> pfile;
-};
-
-struct fditem_comparator {
-    constexpr bool operator()(const fditem& lhs, const fditem& rhs) const {
-        return lhs.fd < rhs.fd;
-    }
-
-    constexpr bool operator()(int fd, const fditem& rhs) const {
-        return fd < rhs.fd;
-    }
-
-    constexpr bool operator()(const fditem& lhs, int fd) const {
-        return lhs.fd < fd;
-    }
-};
-
-// ALL METHODS SHOULD BE CALLED WITH LOCK HELD
-struct filearray::impl {
-    mutex mtx;
-
-    const fs_context* context;
-    std::set<fditem, fditem_comparator> arr;
-    int min_avail{};
-
-    int allocate_fd(int from);
-    void release_fd(int fd);
-    int next_fd();
-
-    int do_dup(const fditem& oldfile, int new_fd, int flags);
-    int place_new_file(std::shared_ptr<file> pfile, int flags);
-};
-
-int filearray::impl::allocate_fd(int from) {
-    if (from < min_avail)
-        from = min_avail;
-
-    if (from == min_avail) {
-        int nextfd = min_avail + 1;
-        auto iter = arr.find(nextfd);
-        while (iter && nextfd == iter->fd)
-            ++nextfd, ++iter;
-
-        int retval = min_avail;
-        min_avail = nextfd;
-        return retval;
-    }
-
-    int fd = from;
-    auto iter = arr.find(fd);
-    while (iter && fd == iter->fd)
-        ++fd, ++iter;
-
-    return fd;
-}
-
-void filearray::impl::release_fd(int fd) {
-    if (fd < min_avail)
-        min_avail = fd;
-}
-
-int filearray::impl::next_fd() {
-    return allocate_fd(min_avail);
-}
-
-int filearray::impl::do_dup(const fditem& oldfile, int new_fd, int flags) {
-    bool inserted;
-    std::tie(std::ignore, inserted) = arr.emplace(new_fd, flags, oldfile.pfile);
-    assert(inserted);
-
-    return new_fd;
-}
-
-int filearray::impl::place_new_file(std::shared_ptr<file> pfile, int flags) {
-    int fd = next_fd();
-
-    bool inserted;
-    std::tie(std::ignore, inserted) = arr.emplace(fd, std::move(flags), pfile);
-    assert(inserted);
-
-    return fd;
-}
-
-int filearray::dup(int old_fd) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(old_fd);
-    if (!iter)
-        return -EBADF;
-
-    int fd = pimpl->next_fd();
-    return pimpl->do_dup(*iter, fd, 0);
-}
-
-int filearray::dup(int old_fd, int new_fd, int flags) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter_old = pimpl->arr.find(old_fd);
-    if (!iter_old)
-        return -EBADF;
-
-    auto iter_new = pimpl->arr.find(new_fd);
-    if (iter_new) {
-        iter_new->pfile = iter_old->pfile;
-        iter_new->flags = flags;
-
-        return new_fd;
-    }
-
-    int fd = pimpl->allocate_fd(new_fd);
-    assert(fd == new_fd);
-    return pimpl->do_dup(*iter_old, fd, flags);
-}
-
-int filearray::dupfd(int fd, int min_fd, int flags) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(fd);
-    if (!iter)
-        return -EBADF;
-
-    int new_fd = pimpl->allocate_fd(min_fd);
-    return pimpl->do_dup(*iter, new_fd, flags);
-}
-
-int filearray::set_flags(int fd, int flags) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(fd);
-    if (!iter)
-        return -EBADF;
-
-    iter->flags |= flags;
-    return 0;
-}
-
-int filearray::close(int fd) {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(fd);
-    if (!iter)
-        return -EBADF;
-
-    pimpl->release_fd(fd);
-    pimpl->arr.erase(iter);
-
-    return 0;
-}
-
-static inline std::pair<dentry_pointer, int> _open_file(
-    const fs_context& context, const dentry_pointer& cwd,
-    types::string_view filepath, int flags, mode_t mode) {
-    auto [dent, ret] = fs::open(context, cwd, filepath);
-    if (!dent)
-        return {nullptr, ret};
-
-    if (!(r_dentry_is_invalid(dent.get()))) {
-        if ((flags & O_CREAT) && (flags & O_EXCL))
-            return {nullptr, -EEXIST};
-        return {std::move(dent), 0};
-    }
-
-    if (!(flags & O_CREAT))
-        return {nullptr, -ENOENT};
-
-    // create file
-    if (int ret = fs_creat(dent.get(), mode); ret != 0)
-        return {nullptr, ret};
-
-    return {std::move(dent), 0};
-}
-
-// TODO: file opening permissions check
-int filearray::open(const dentry_pointer& cwd, types::string_view filepath,
-                    int flags, mode_t mode) {
-    lock_guard lck{pimpl->mtx};
-
-    auto [dent, ret] = _open_file(*pimpl->context, cwd, filepath, flags, mode);
-
-    assert(dent || ret != 0);
-    if (ret != 0)
-        return ret;
-
-    auto inode = r_dentry_get_inode(dent.get());
-    auto filemode = r_get_inode_mode(inode);
-
-    int fdflag = (flags & O_CLOEXEC) ? FD_CLOEXEC : 0;
-
-    file::file_flags fflags;
-    fflags.read = !(flags & O_WRONLY);
-    fflags.write = (flags & (O_WRONLY | O_RDWR));
-    fflags.append = S_ISREG(filemode) && (flags & O_APPEND);
-
-    // check whether dentry is a file if O_DIRECTORY is set
-    if (flags & O_DIRECTORY) {
-        if (!S_ISDIR(filemode))
-            return -ENOTDIR;
-    } else {
-        if (S_ISDIR(filemode) && fflags.write)
-            return -EISDIR;
-    }
-
-    // truncate file
-    if (flags & O_TRUNC) {
-        if (fflags.write && S_ISREG(filemode)) {
-            auto ret = fs_truncate(inode, 0);
-            if (ret != 0)
-                return ret;
-        }
-    }
-
-    return pimpl->place_new_file(
-        std::make_shared<regular_file>(fflags, 0, inode), fdflag);
-}
-
-int filearray::pipe(int (&pipefd)[2]) {
-    lock_guard lck{pimpl->mtx};
-
-    if (1) {
-        std::shared_ptr<fs::pipe> ppipe{new fs::pipe};
-
-        pipefd[0] = pimpl->place_new_file(
-            std::make_shared<fifo_file>(file::file_flags{1, 0, 0}, ppipe), 0);
-
-        pipefd[1] = pimpl->place_new_file(
-            std::make_shared<fifo_file>(file::file_flags{0, 1, 0}, ppipe), 0);
-    }
-
-    return 0;
-}
-
-filearray::filearray(std::shared_ptr<impl> ptr) : pimpl{ptr} {}
-
-filearray::filearray(const fs_context* context)
-    : filearray{std::make_shared<impl>()} {
-    pimpl->context = context;
-}
-
-filearray filearray::copy() const {
-    lock_guard lck{pimpl->mtx};
-    filearray ret{pimpl->context};
-
-    ret.pimpl->min_avail = pimpl->min_avail;
-    ret.pimpl->arr = pimpl->arr;
-
-    return ret;
-}
-
-filearray filearray::share() const {
-    return filearray{pimpl};
-}
-
-void filearray::clear() {
-    pimpl.reset();
-}
-
-void filearray::onexec() {
-    lock_guard lck{pimpl->mtx};
-
-    for (auto iter = pimpl->arr.begin(); iter;) {
-        if (!(iter->flags & FD_CLOEXEC)) {
-            ++iter;
-            continue;
-        }
-        pimpl->release_fd(iter->fd);
-        iter = pimpl->arr.erase(iter);
-    }
-}
-
-file* filearray::operator[](int i) const {
-    lock_guard lck{pimpl->mtx};
-
-    auto iter = pimpl->arr.find(i);
-    if (!iter)
-        return nullptr;
-
-    return iter->pfile.get();
-}

+ 307 - 0
src/kernel/vfs/filearray.rs

@@ -0,0 +1,307 @@
+use core::sync::atomic::Ordering;
+
+use crate::{
+    kernel::vfs::{dentry::Dentry, file::Pipe, s_isdir, s_isreg},
+    path::Path,
+    prelude::*,
+};
+
+use alloc::{
+    collections::btree_map::{BTreeMap, Entry},
+    sync::Arc,
+};
+use bindings::{
+    current_process, kernel::tty::console, EBADF, EINVAL, EISDIR, ENOTDIR, FD_CLOEXEC, F_DUPFD,
+    F_DUPFD_CLOEXEC, F_GETFD, F_SETFD, O_APPEND, O_CLOEXEC, O_DIRECTORY, O_RDWR, O_TRUNC, O_WRONLY,
+};
+use itertools::{
+    FoldWhile::{Continue, Done},
+    Itertools,
+};
+
+use super::{
+    file::{File, InodeFile, TTYFile},
+    inode::Mode,
+    s_ischr, FsContext, Spin,
+};
+
+type FD = u32;
+
+/// One slot of the file array: a shared file plus its per-descriptor flags.
+#[derive(Clone)]
+struct OpenFile {
+    /// File descriptor flags, only for `FD_CLOEXEC`.
+    flags: u64,
+    /// The open file; cloning the slot shares it.
+    file: Arc<File>,
+}
+
+/// Lock-protected contents of a `FileArray`.
+#[derive(Clone)]
+struct FileArrayInner {
+    /// Open files keyed by descriptor number.
+    files: BTreeMap<FD, OpenFile>,
+    /// Descriptor handed out by `next_fd`; lowered by `release_fd` when a smaller one is freed.
+    fd_min_avail: FD,
+}
+
+/// Table of open file descriptors, guarded by a spin lock.
+pub struct FileArray {
+    inner: Spin<FileArrayInner>,
+}
+
+impl OpenFile {
+    /// Whether this descriptor must be closed on exec.
+    ///
+    /// The flag reaches `flags` as `FD_CLOEXEC` from `open` and `F_SETFD`, and as `O_CLOEXEC`
+    /// from the `fcntl` dup commands, so either bit marks the descriptor as close-on-exec.
+    pub fn close_on_exec(&self) -> bool {
+        self.flags & (FD_CLOEXEC | O_CLOEXEC) as u64 != 0
+    }
+}
+
+/// Create an empty file array and hand ownership to the caller as a raw `Arc` pointer.
+#[no_mangle]
+pub extern "C" fn r_filearray_new_for_init() -> *const FileArray {
+    let empty = FileArrayInner {
+        files: BTreeMap::new(),
+        fd_min_avail: 0,
+    };
+    let array = Arc::new(FileArray {
+        inner: Spin::new(empty),
+    });
+
+    Arc::into_raw(array)
+}
+
+/// Return a new owning pointer to the same file array as `other`.
+#[no_mangle]
+pub extern "C" fn r_filearray_new_shared(other: *const FileArray) -> *const FileArray {
+    let borrowed = BorrowedArc::from_raw(other);
+    let shared = FileArray::new_shared(&borrowed);
+
+    Arc::into_raw(shared)
+}
+
+/// Return an owning pointer to an independent copy of the file array `other`.
+#[no_mangle]
+pub extern "C" fn r_filearray_new_cloned(other: *const FileArray) -> *const FileArray {
+    let borrowed = BorrowedArc::from_raw(other);
+    let cloned = FileArray::new_cloned(&borrowed);
+
+    Arc::into_raw(cloned)
+}
+
+/// Release one owning reference that was produced by the `r_filearray_new_*` functions.
+#[no_mangle]
+pub extern "C" fn r_filearray_drop(other: *const FileArray) {
+    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
+    let owned = unsafe { Arc::from_raw(other) };
+    drop(owned);
+}
+
+impl FileArray {
+    /// Borrow the file array of the current process.
+    ///
+    /// Panics if `current_process` is null.
+    pub fn get_current<'lt>() -> BorrowedArc<'lt, Self> {
+        // SAFETY: `current_process` is always valid.
+        let current = unsafe { current_process.as_mut().unwrap() };
+        BorrowedArc::from_raw(current.files.m_handle as *const _)
+    }
+
+    /// Share `other`: the result refers to the very same table.
+    pub fn new_shared(other: &Arc<Self>) -> Arc<Self> {
+        other.clone()
+    }
+
+    /// Copy `other` into a new, independent table. The files themselves stay shared.
+    pub fn new_cloned(other: &Arc<Self>) -> Arc<Self> {
+        Arc::new(Self {
+            inner: Spin::clone(&other.inner),
+        })
+    }
+
+    /// Acquires the file array lock.
+    pub fn get(&self, fd: FD) -> Option<Arc<File>> {
+        self.inner.lock().get(fd)
+    }
+
+    /// Drop every descriptor and reset the allocation hint.
+    pub fn close_all(&self) {
+        let mut inner = self.inner.lock();
+        inner.fd_min_avail = 0;
+        inner.files.clear();
+    }
+
+    /// Close `fd`. Fails with `EBADF` when it is not open.
+    pub fn close(&self, fd: FD) -> KResult<()> {
+        let mut inner = self.inner.lock();
+        inner.files.remove(&fd).ok_or(EBADF)?;
+        inner.release_fd(fd);
+        Ok(())
+    }
+
+    /// Close every descriptor that is marked close-on-exec.
+    pub fn on_exec(&self) -> () {
+        let mut inner = self.inner.lock();
+
+        // Single pass: drop the marked entries and remember the smallest descriptor that was
+        // freed. Releasing only that one is enough because `release_fd` merely lowers the hint.
+        let mut lowest_closed: Option<FD> = None;
+        inner.files.retain(|&fd, ofile| {
+            let keep = !ofile.close_on_exec();
+            if !keep {
+                lowest_closed = Some(lowest_closed.map_or(fd, |lowest| lowest.min(fd)));
+            }
+            keep
+        });
+
+        if let Some(fd) = lowest_closed {
+            inner.release_fd(fd);
+        }
+    }
+}
+
+impl FileArray {
+    /// Duplicate `old_fd` onto the lowest available descriptor.
+    ///
+    /// The new descriptor starts with no descriptor flags: close-on-exec is not inherited.
+    /// Fails with `EBADF` when `old_fd` is not open.
+    pub fn dup(&self, old_fd: FD) -> KResult<FD> {
+        let mut inner = self.inner.lock();
+        let old_file = inner.files.get(&old_fd).ok_or(EBADF)?;
+
+        let new_file_data = old_file.file.clone();
+        let new_fd = inner.next_fd();
+
+        inner.do_insert(new_fd, 0, new_file_data);
+
+        Ok(new_fd)
+    }
+
+    /// Make `new_fd` refer to the same file as `old_fd`, with descriptor flags `flags`.
+    ///
+    /// A file that is already open at `new_fd` is replaced. Fails with `EBADF` when `old_fd`
+    /// is not open.
+    pub fn dup_to(&self, old_fd: FD, new_fd: FD, flags: u64) -> KResult<FD> {
+        let mut inner = self.inner.lock();
+        let old_file = inner.files.get(&old_fd).ok_or(EBADF)?;
+
+        let new_file_data = old_file.file.clone();
+
+        match inner.files.entry(new_fd) {
+            Entry::Vacant(_) => {}
+            Entry::Occupied(entry) => {
+                let new_file = entry.into_mut();
+
+                new_file.flags = flags;
+                new_file.file = new_file_data;
+
+                return Ok(new_fd);
+            }
+        }
+
+        // `new_fd` is free, so the allocator must hand out exactly that number.
+        assert_eq!(new_fd, inner.allocate_fd(new_fd));
+        inner.do_insert(new_fd, flags, new_file_data);
+
+        Ok(new_fd)
+    }
+
+    /// Create a pipe and install both ends.
+    ///
+    /// # Return
+    /// `(read_fd, write_fd)`
+    pub fn pipe(&self) -> KResult<(FD, FD)> {
+        let mut inner = self.inner.lock();
+
+        let read_fd = inner.next_fd();
+        let write_fd = inner.next_fd();
+
+        let pipe = Pipe::new();
+        let (read_end, write_end) = pipe.split();
+        inner.do_insert(read_fd, 0, read_end);
+        inner.do_insert(write_fd, 0, write_end);
+
+        Ok((read_fd, write_fd))
+    }
+
+    /// Open `path` and install the resulting file on the lowest available descriptor.
+    ///
+    /// Fails with `ENOTDIR` when `O_DIRECTORY` is given for a non-directory and with `EISDIR`
+    /// when a directory is opened for writing. Errors from the dentry and inode operations
+    /// are passed through.
+    pub fn open(&self, fs_context: &FsContext, path: Path, flags: u32, mode: Mode) -> KResult<FD> {
+        let dentry = Dentry::open(fs_context, path, true)?;
+        dentry.open_check(flags, mode)?;
+
+        let fdflag = if flags & O_CLOEXEC != 0 { FD_CLOEXEC } else { 0 };
+        let can_read = flags & O_WRONLY == 0;
+        let can_write = flags & (O_WRONLY | O_RDWR) != 0;
+        let append = flags & O_APPEND != 0;
+
+        let inode = dentry.get_inode()?;
+        let filemode = inode.mode.load(Ordering::Relaxed);
+
+        if flags & O_DIRECTORY != 0 {
+            if !s_isdir(filemode) {
+                return Err(ENOTDIR);
+            }
+        } else {
+            if s_isdir(filemode) && can_write {
+                return Err(EISDIR);
+            }
+        }
+
+        // Only regular files opened for writing are truncated.
+        if flags & O_TRUNC != 0 {
+            if can_write && s_isreg(filemode) {
+                inode.truncate(0)?;
+            }
+        }
+
+        let mut inner = self.inner.lock();
+        let fd = inner.next_fd();
+
+        // NOTE(review): 0x0501 looks like the device id of the console TTY — confirm.
+        if s_ischr(filemode) && inode.devid()? == 0x0501 {
+            inner.do_insert(fd, fdflag as u64, TTYFile::new(unsafe { console }));
+        } else {
+            inner.do_insert(
+                fd,
+                fdflag as u64,
+                InodeFile::new(dentry, (can_read, can_write, append)),
+            );
+        }
+
+        Ok(fd)
+    }
+
+    /// Descriptor-level `fcntl`.
+    ///
+    /// * `F_DUPFD` / `F_DUPFD_CLOEXEC`: duplicate `fd` onto the lowest free descriptor that is
+    ///   not below `arg`. Only `F_DUPFD_CLOEXEC` marks the copy close-on-exec.
+    /// * `F_GETFD`: return `FD_CLOEXEC` or 0.
+    /// * `F_SETFD`: replace the descriptor flags with `arg`.
+    ///
+    /// Fails with `EBADF` when `fd` is not open and with `EINVAL` for any other command.
+    pub fn fcntl(&self, fd: FD, cmd: u32, arg: usize) -> KResult<usize> {
+        let mut inner = self.inner.lock();
+        let ofile = inner.files.get_mut(&fd).ok_or(EBADF)?;
+
+        match cmd {
+            F_DUPFD | F_DUPFD_CLOEXEC => {
+                // The source's close-on-exec state is deliberately not inherited. The flag
+                // is stored as `O_CLOEXEC` because `OpenFile::close_on_exec` tests that bit.
+                let flags = if cmd == F_DUPFD_CLOEXEC { O_CLOEXEC } else { 0 };
+
+                let new_file_data = ofile.file.clone();
+                let new_fd = inner.allocate_fd(arg as FD);
+
+                inner.do_insert(new_fd, flags as u64, new_file_data);
+
+                Ok(new_fd as usize)
+            }
+            F_GETFD => {
+                // Report the flag as `FD_CLOEXEC` no matter which bit it was stored under.
+                let cloexec = ofile.flags & (FD_CLOEXEC | O_CLOEXEC) as u64 != 0;
+                Ok(if cloexec { FD_CLOEXEC as usize } else { 0 })
+            }
+            F_SETFD => {
+                ofile.flags = arg as u64;
+                Ok(0)
+            }
+            // An unknown command comes from user space; it must not panic the kernel.
+            _ => Err(EINVAL),
+        }
+    }
+}
+
+impl FileArrayInner {
+    /// Look up `fd` and return a new handle to its file, if it is open.
+    fn get(&mut self, fd: FD) -> Option<Arc<File>> {
+        let slot = self.files.get(&fd)?;
+        Some(slot.file.clone())
+    }
+
+    /// Return the first descriptor at or after `from` that has no entry in `files`.
+    fn find_available(&mut self, from: FD) -> FD {
+        let mut candidate = from;
+
+        // Walk the occupied descriptors upwards until the first gap.
+        for (&occupied, _) in self.files.range(from..) {
+            if occupied != candidate {
+                break;
+            }
+            candidate += 1;
+        }
+
+        candidate
+    }
+
+    /// Allocate a new file descriptor starting from `from`.
+    ///
+    /// Returned file descriptor should be used immediately.
+    ///
+    fn allocate_fd(&mut self, from: FD) -> FD {
+        let start = FD::max(from, self.fd_min_avail);
+
+        if start != self.fd_min_avail {
+            return self.find_available(start);
+        }
+
+        // The hint itself is handed out, so it moves to the next free descriptor.
+        let allocated = self.fd_min_avail;
+        self.fd_min_avail = self.find_available(start + 1);
+        allocated
+    }
+
+    /// Lower the allocation hint when `fd` is below it.
+    fn release_fd(&mut self, fd: FD) {
+        self.fd_min_avail = FD::min(self.fd_min_avail, fd);
+    }
+
+    /// Allocate the lowest available descriptor.
+    fn next_fd(&mut self) -> FD {
+        let hint = self.fd_min_avail;
+        self.allocate_fd(hint)
+    }
+
+    /// Insert a file description to the file array. Panics when `fd` is already occupied.
+    fn do_insert(&mut self, fd: FD, flags: u64, file: Arc<File>) {
+        let previous = self.files.insert(fd, OpenFile { flags, file });
+        assert!(previous.is_none());
+    }
+}

+ 196 - 192
src/kernel/vfs/inode.rs

@@ -1,236 +1,188 @@
-use core::{ops::Deref, sync::atomic::AtomicU64};
-
-use alloc::{
-    collections::btree_map::{BTreeMap, Entry},
-    sync::{Arc, Weak},
-};
+use alloc::sync::{Arc, Weak};
 use bindings::{
-    statx, EEXIST, EINVAL, EIO, EISDIR, ENOTDIR, EPERM, STATX_ATIME,
-    STATX_BLOCKS, STATX_CTIME, STATX_GID, STATX_INO, STATX_MODE, STATX_MTIME,
-    STATX_NLINK, STATX_SIZE, STATX_TYPE, STATX_UID, S_IFDIR, S_IFMT,
+    statx, EINVAL, EISDIR, ENOTDIR, EPERM, STATX_ATIME, STATX_BLOCKS, STATX_CTIME, STATX_GID,
+    STATX_INO, STATX_MODE, STATX_MTIME, STATX_NLINK, STATX_SIZE, STATX_TYPE, STATX_UID, S_IFDIR,
+    S_IFMT,
 };
-
-use super::{
-    dentry::Dentry, s_isblk, s_ischr, vfs::Vfs, DevId, ReadDirCallback,
-    TimeSpec,
+use core::{
+    mem::MaybeUninit, ops::ControlFlow, ptr::addr_of_mut, sync::atomic::{AtomicU32, AtomicU64, Ordering}
 };
+
+use super::{dentry::Dentry, s_isblk, s_ischr, vfs::Vfs, DevId, TimeSpec};
 use crate::{io::Buffer, prelude::*};
 
 pub type Ino = u64;
 pub type AtomicIno = AtomicU64;
+#[allow(dead_code)]
 pub type ISize = u64;
+pub type AtomicISize = AtomicU64;
+#[allow(dead_code)]
 pub type Nlink = u64;
+pub type AtomicNlink = AtomicU64;
+#[allow(dead_code)]
 pub type Uid = u32;
+pub type AtomicUid = AtomicU32;
+#[allow(dead_code)]
 pub type Gid = u32;
+pub type AtomicGid = AtomicU32;
 pub type Mode = u32;
+pub type AtomicMode = AtomicU32;
 
-#[repr(C)]
-#[derive(Default)]
+/// Metadata record embedded in every inode type.
+///
+/// Scalar attributes are atomics and each timestamp sits behind its own
+/// `Spin` lock.
 pub struct InodeData {
-    pub size: ISize,
-    pub nlink: Nlink,
+    /// Inode number; `Inode::statx` reports it as `stx_ino`.
+    pub ino: Ino,
+    /// File size; `Inode::statx` reports it as `stx_size`.
+    pub size: AtomicISize,
+    /// Link count; `Inode::statx` reports it as `stx_nlink`.
+    pub nlink: AtomicNlink,
 
-    pub uid: Uid,
-    pub gid: Gid,
-    pub mode: Mode,
+    /// Owner user id.
+    pub uid: AtomicUid,
+    /// Owner group id.
+    pub gid: AtomicGid,
+    /// File type (`S_IFMT` bits) together with the remaining mode bits.
+    pub mode: AtomicMode,
 
-    pub atime: TimeSpec,
-    pub mtime: TimeSpec,
-    pub ctime: TimeSpec,
-}
+    /// Timestamp reported as `stx_atime`.
+    pub atime: Spin<TimeSpec>,
+    /// Timestamp reported as `stx_ctime`.
+    pub ctime: Spin<TimeSpec>,
+    /// Timestamp reported as `stx_mtime`.
+    pub mtime: Spin<TimeSpec>,
+
+    // NOTE(review): presumably serialises operations on the inode's
+    // contents; the locking protocol is not visible here.
+    pub rwsem: RwSemaphore<()>,
 
-pub struct Inode {
-    pub ino: Ino,
+    /// Weak reference to the owning filesystem; upgraded in `Inode::statx`.
     pub vfs: Weak<dyn Vfs>,
+}
 
-    pub idata: Mutex<InodeData>,
-    pub ops: Box<dyn InodeOps>,
+impl InodeData {
+    /// Build the metadata record for inode number `ino` owned by `vfs`.
+    ///
+    /// Every timestamp starts as `TimeSpec::default()`; size, link count,
+    /// owner ids and mode start at the default value of their atomic type.
+    pub fn new(ino: Ino, vfs: Weak<dyn Vfs>) -> Self {
+        let fresh_timestamp = || Spin::new(TimeSpec::default());
+
+        Self {
+            ino,
+            size: AtomicISize::default(),
+            nlink: AtomicNlink::default(),
+            uid: AtomicUid::default(),
+            gid: AtomicGid::default(),
+            mode: AtomicMode::default(),
+            atime: fresh_timestamp(),
+            ctime: fresh_timestamp(),
+            mtime: fresh_timestamp(),
+            rwsem: RwSemaphore::new(()),
+            vfs,
+        }
+    }
 }
 
-impl Deref for Inode {
-    type Target = dyn InodeOps;
+/// Access to the `InodeData` embedded in a concrete inode type.
+///
+/// Implementors must also dereference to `InodeData`, which lets the default
+/// methods of `Inode` read fields such as `self.mode` directly. The
+/// `define_struct_inode!` macro generates an implementation of this trait.
+#[allow(dead_code)]
+pub trait InodeInner:
+    Send + Sync + core::ops::Deref<Target = InodeData> + core::ops::DerefMut
+{
+    /// Shared view of the embedded metadata.
+    fn data(&self) -> &InodeData;
+    /// Exclusive view of the embedded metadata.
+    fn data_mut(&mut self) -> &mut InodeData;
+}
 
-    fn deref(&self) -> &Self::Target {
-        self.ops.as_ref()
-    }
+/// Where an `Inode::write` call should place its data.
+// NOTE(review): the variant meanings below are inferred from the names and
+// payload types only — confirm against the filesystem implementations.
+pub enum WriteOffset<'end> {
+    /// Presumably: write at this absolute offset.
+    Position(usize),
+    /// Presumably: append at the end of the file; the mutable reference looks
+    /// like an out-parameter for the resulting offset — TODO confirm.
+    End(&'end mut usize),
 }
 
 #[allow(unused_variables)]
-pub trait InodeOps: Send + Sync {
-    fn as_any(&self) -> &dyn Any;
+pub trait Inode: Send + Sync + InodeInner {
+    /// Report whether this inode is a directory.
+    ///
+    /// The file type is a multi-bit field of the mode, so it is isolated with
+    /// `S_IFMT` and compared for equality. Testing the `S_IFDIR` bit on its
+    /// own would also match other types whose code contains that bit (with
+    /// the conventional Unix encoding: block devices and sockets).
+    fn is_dir(&self) -> bool {
+        (self.mode.load(Ordering::SeqCst) & S_IFMT) == S_IFDIR
+    }
 
-    fn lookup(
-        &self,
-        dir: &Inode,
-        dentry: &Arc<Dentry>,
-    ) -> KResult<Option<Arc<Inode>>> {
-        if dir.idata.lock().mode & S_IFDIR == 0 {
-            Err(ENOTDIR)
-        } else {
-            Err(EPERM)
-        }
+    fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
+        Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn creat(&self, dir: &Inode, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
-        if dir.idata.lock().mode & S_IFDIR == 0 {
-            Err(ENOTDIR)
-        } else {
-            Err(EPERM)
-        }
+    fn creat(&self, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
+        Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn mkdir(&self, dir: &Inode, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
-        if dir.idata.lock().mode & S_IFDIR == 0 {
-            Err(ENOTDIR)
-        } else {
-            Err(EPERM)
-        }
+    fn mkdir(&self, at: &Dentry, mode: Mode) -> KResult<()> {
+        Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn mknod(
-        &self,
-        dir: &Inode,
-        at: &Arc<Dentry>,
-        mode: Mode,
-        dev: DevId,
-    ) -> KResult<()> {
-        if dir.idata.lock().mode & S_IFDIR == 0 {
-            Err(ENOTDIR)
-        } else {
-            Err(EPERM)
-        }
+    fn mknod(&self, at: &Dentry, mode: Mode, dev: DevId) -> KResult<()> {
+        Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn unlink(&self, dir: &Inode, at: &Arc<Dentry>) -> KResult<()> {
-        if dir.idata.lock().mode & S_IFDIR == 0 {
-            Err(ENOTDIR)
-        } else {
-            Err(EPERM)
-        }
+    fn unlink(&self, at: &Arc<Dentry>) -> KResult<()> {
+        Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn symlink(
-        &self,
-        dir: &Inode,
-        at: &Arc<Dentry>,
-        target: &[u8],
-    ) -> KResult<()> {
-        if dir.idata.lock().mode & S_IFDIR == 0 {
-            Err(ENOTDIR)
-        } else {
-            Err(EPERM)
-        }
+    fn symlink(&self, at: &Arc<Dentry>, target: &[u8]) -> KResult<()> {
+        Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn read(
-        &self,
-        inode: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
-        if inode.idata.lock().mode & S_IFDIR != 0 {
-            Err(EISDIR)
-        } else {
-            Err(EINVAL)
-        }
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
-    fn write(
-        &self,
-        inode: &Inode,
-        buffer: &[u8],
-        offset: usize,
-    ) -> KResult<usize> {
-        if inode.idata.lock().mode & S_IFDIR != 0 {
-            Err(EISDIR)
-        } else {
-            Err(EINVAL)
-        }
+    fn write(&self, buffer: &[u8], offset: WriteOffset) -> KResult<usize> {
+        Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
-    fn devid(&self, inode: &Inode) -> KResult<DevId> {
-        if inode.idata.lock().mode & S_IFDIR != 0 {
-            Err(EISDIR)
-        } else {
-            Err(EINVAL)
-        }
+    fn devid(&self) -> KResult<DevId> {
+        Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
-    fn readlink(
-        &self,
-        inode: &Inode,
-        buffer: &mut dyn Buffer,
-    ) -> KResult<usize> {
-        Err(EINVAL)
+    fn readlink(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        Err(if self.is_dir() { EISDIR } else { EINVAL })
     }
 
-    fn truncate(&self, inode: &Inode, length: usize) -> KResult<()> {
-        if inode.idata.lock().mode & S_IFDIR != 0 {
-            Err(EISDIR)
-        } else {
-            Err(EPERM)
-        }
+    fn truncate(&self, length: usize) -> KResult<()> {
+        Err(if self.is_dir() { EISDIR } else { EPERM })
     }
 
-    fn readdir<'cb, 'r: 'cb>(
-        &'r self,
-        inode: &'r Inode,
+    fn do_readdir(
+        &self,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        if inode.idata.lock().mode & S_IFDIR == 0 {
-            Err(ENOTDIR)
-        } else {
-            Err(EPERM)
-        }
+        Err(if !self.is_dir() { ENOTDIR } else { EPERM })
     }
 
-    fn statx(&self, inode: &Inode, stat: &mut statx, mask: u32) -> KResult<()> {
-        let (fsdev, io_blksize) = {
-            let vfs = inode.vfs.upgrade().ok_or(EIO)?;
-            (vfs.fs_devid(), vfs.io_blksize())
-        };
-        let devid = self.devid(inode);
+    fn statx(&self, stat: &mut statx, mask: u32) -> KResult<()> {
+        // Safety: ffi should have checked reference
+        let vfs = self.vfs.upgrade().expect("Vfs is dropped");
 
-        let idata = inode.idata.lock();
+        let size = self.size.load(Ordering::Relaxed);
+        let mode = self.mode.load(Ordering::Relaxed);
 
         if mask & STATX_NLINK != 0 {
-            stat.stx_nlink = idata.nlink as _;
+            stat.stx_nlink = self.nlink.load(Ordering::Acquire) as _;
             stat.stx_mask |= STATX_NLINK;
         }
 
         if mask & STATX_ATIME != 0 {
-            stat.stx_atime.tv_nsec = idata.atime.nsec as _;
-            stat.stx_atime.tv_sec = idata.atime.sec as _;
+            let atime = self.atime.lock();
+            stat.stx_atime.tv_nsec = atime.nsec as _;
+            stat.stx_atime.tv_sec = atime.sec as _;
             stat.stx_mask |= STATX_ATIME;
         }
 
         if mask & STATX_MTIME != 0 {
-            stat.stx_mtime.tv_nsec = idata.mtime.nsec as _;
-            stat.stx_mtime.tv_sec = idata.mtime.sec as _;
+            let mtime = self.mtime.lock();
+            stat.stx_mtime.tv_nsec = mtime.nsec as _;
+            stat.stx_mtime.tv_sec = mtime.sec as _;
             stat.stx_mask |= STATX_MTIME;
         }
 
         if mask & STATX_CTIME != 0 {
-            stat.stx_ctime.tv_nsec = idata.ctime.nsec as _;
-            stat.stx_ctime.tv_sec = idata.ctime.sec as _;
+            let ctime = self.ctime.lock();
+            stat.stx_ctime.tv_nsec = ctime.nsec as _;
+            stat.stx_ctime.tv_sec = ctime.sec as _;
             stat.stx_mask |= STATX_CTIME;
         }
 
         if mask & STATX_SIZE != 0 {
-            stat.stx_size = idata.size as _;
+            stat.stx_size = self.size.load(Ordering::Relaxed) as _;
             stat.stx_mask |= STATX_SIZE;
         }
 
         stat.stx_mode = 0;
         if mask & STATX_MODE != 0 {
-            stat.stx_mode |= (idata.mode & !S_IFMT) as u16;
+            stat.stx_mode |= (mode & !S_IFMT) as u16;
             stat.stx_mask |= STATX_MODE;
         }
 
         if mask & STATX_TYPE != 0 {
-            stat.stx_mode |= (idata.mode & S_IFMT) as u16;
-            if s_isblk(idata.mode) || s_ischr(idata.mode) {
+            stat.stx_mode |= (mode & S_IFMT) as u16;
+            if s_isblk(mode) || s_ischr(mode) {
+                let devid = self.devid();
                 stat.stx_rdev_major = (devid? >> 8) & 0xff;
                 stat.stx_rdev_minor = devid? & 0xff;
             }
@@ -238,26 +190,27 @@ pub trait InodeOps: Send + Sync {
         }
 
         if mask & STATX_INO != 0 {
-            stat.stx_ino = inode.ino as _;
+            stat.stx_ino = self.ino as _;
             stat.stx_mask |= STATX_INO;
         }
 
         if mask & STATX_BLOCKS != 0 {
-            stat.stx_blocks = (idata.size + 512 - 1) / 512;
-            stat.stx_blksize = io_blksize as _;
+            stat.stx_blocks = (size + 512 - 1) / 512;
+            stat.stx_blksize = vfs.io_blksize() as _;
             stat.stx_mask |= STATX_BLOCKS;
         }
 
         if mask & STATX_UID != 0 {
-            stat.stx_uid = idata.uid as _;
+            stat.stx_uid = self.uid.load(Ordering::Relaxed) as _;
             stat.stx_mask |= STATX_UID;
         }
 
         if mask & STATX_GID != 0 {
-            stat.stx_gid = idata.gid as _;
+            stat.stx_gid = self.gid.load(Ordering::Relaxed) as _;
             stat.stx_mask |= STATX_GID;
         }
 
+        let fsdev = vfs.fs_devid();
         stat.stx_dev_major = (fsdev >> 8) & 0xff;
         stat.stx_dev_minor = fsdev & 0xff;
 
@@ -266,49 +219,100 @@ pub trait InodeOps: Send + Sync {
 
         Ok(())
     }
-}
 
-pub struct InodeCache<Fs: Vfs + 'static> {
-    cache: BTreeMap<Ino, Arc<Inode>>,
-    vfs: Weak<Fs>,
+    /// Allocate an `Arc<Self>` and initialise it in place.
+    ///
+    /// The embedded `InodeData` is written first via `InodeData::new(ino, vfs)`.
+    /// `f` is then called with a raw pointer to the still partially
+    /// initialised value and a reference obtained from the inode's `rwsem`
+    /// guard. After `f` returns the value is treated as fully initialised, so
+    /// `f` is expected to have written every remaining field through the
+    /// pointer.
+    // NOTE(review): despite the name, the guard passed to `f` comes from
+    // `lock_shared()`, not from an exclusive lock — confirm this is intended.
+    fn new_locked<F>(ino: Ino, vfs: Weak<dyn Vfs>, f: F) -> Arc<Self>
+    where
+        Self: Sized,
+        F: FnOnce(*mut Self, &()),
+    {
+        let mut uninit = Arc::<Self>::new_uninit();
+
+        // The `Arc` was created just above and has not been cloned, so
+        // `get_mut` yields the unique reference.
+        let uninit_mut = Arc::get_mut(&mut uninit).unwrap();
+
+        // NOTE(review): `data_mut()` is invoked through a `&mut Self` that
+        // points at uninitialised storage; confirm this is sound for every
+        // implementor.
+        // Safety: `idata` is owned by `uninit`
+        let idata = unsafe {
+            addr_of_mut!(*(*uninit_mut.as_mut_ptr()).data_mut())
+                .cast::<MaybeUninit<InodeData>>()
+                .as_mut()
+                .unwrap()
+        };
+
+        idata.write(InodeData::new(ino, vfs));
+
+        // The guard is a temporary of this call expression, so it is held for
+        // the whole duration of `f` and released right after.
+        f(
+            uninit_mut.as_mut_ptr(),
+            // Safety: `idata` is initialized
+            &unsafe { idata.assume_init_ref() }.rwsem.lock_shared(),
+        );
+
+        // Safety: `uninit` is initialized
+        unsafe { uninit.assume_init() }
+    }
 }
 
-impl<Fs: Vfs> InodeCache<Fs> {
-    pub fn new(vfs: Weak<Fs>) -> Self {
-        Self {
-            cache: BTreeMap::new(),
-            vfs,
+// TODO: allow defining multiple inode structs in a single invocation
+macro_rules! define_struct_inode {
+    ($v:vis struct $inode_t:ident;) => {
+        $v struct $inode_t {
+            /// Do not use this directly
+            idata: $crate::kernel::vfs::inode::InodeData,
         }
-    }
 
-    pub fn vfs(&self) -> Weak<Fs> {
-        self.vfs.clone()
-    }
+        impl core::ops::Deref for $inode_t {
+            type Target = $crate::kernel::vfs::inode::InodeData;
 
-    pub fn alloc(&self, ino: Ino, ops: Box<dyn InodeOps>) -> Arc<Inode> {
-        Arc::new(Inode {
-            ino,
-            vfs: self.vfs.clone(),
-            idata: Mutex::new(InodeData::default()),
-            ops,
-        })
-    }
+            fn deref(&self) -> &Self::Target {
+                &self.idata
+            }
+        }
 
-    pub fn submit(&mut self, inode: &Arc<Inode>) -> KResult<()> {
-        match self.cache.entry(inode.ino) {
-            Entry::Occupied(_) => Err(EEXIST),
-            Entry::Vacant(entry) => {
-                entry.insert(inode.clone());
-                Ok(())
+        impl core::ops::DerefMut for $inode_t {
+            fn deref_mut(&mut self) -> &mut Self::Target {
+                &mut self.idata
             }
         }
-    }
 
-    pub fn get(&self, ino: Ino) -> Option<Arc<Inode>> {
-        self.cache.get(&ino).cloned()
-    }
+        impl $crate::kernel::vfs::inode::InodeInner for $inode_t {
+            fn data(&self) -> &$crate::kernel::vfs::inode::InodeData {
+                &self.idata
+            }
 
-    pub fn free(&mut self, ino: Ino) {
-        self.cache.remove(&ino);
-    }
+            fn data_mut(&mut self) -> &mut $crate::kernel::vfs::inode::InodeData {
+                &mut self.idata
+            }
+        }
+    };
+    ($v:vis struct $inode_t:ident { $($vis:vis $name:ident: $type:ty,)* }) => {
+        $v struct $inode_t {
+            /// Do not use this directly
+            idata: $crate::kernel::vfs::inode::InodeData,
+            $($vis $name: $type,)*
+        }
+
+        impl core::ops::Deref for $inode_t {
+            type Target = $crate::kernel::vfs::inode::InodeData;
+
+            fn deref(&self) -> &Self::Target {
+                &self.idata
+            }
+        }
+
+        impl core::ops::DerefMut for $inode_t {
+            fn deref_mut(&mut self) -> &mut Self::Target {
+                &mut self.idata
+            }
+        }
+
+        impl $crate::kernel::vfs::inode::InodeInner for $inode_t {
+            fn data(&self) -> &$crate::kernel::vfs::inode::InodeData {
+                &self.idata
+            }
+
+            fn data_mut(&mut self) -> &mut $crate::kernel::vfs::inode::InodeData {
+                &mut self.idata
+            }
+        }
+    };
 }
+
+pub(crate) use define_struct_inode;

+ 58 - 11
src/kernel/vfs/mod.rs

@@ -1,25 +1,20 @@
 use crate::prelude::*;
 
-use bindings::{dev_t, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFMT, S_IFREG};
-use inode::{Ino, Mode};
+use alloc::sync::Arc;
+use bindings::{current_process, dev_t, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFMT, S_IFREG};
+use dentry::Dentry;
+use inode::Mode;
 
 pub mod dentry;
 pub mod ffi;
+pub mod file;
+pub mod filearray;
 pub mod inode;
 pub mod mount;
 pub mod vfs;
 
 pub type DevId = dev_t;
 
-/// # Return
-///
-/// Return -1 if an error occurred
-///
-/// Return 0 if no more entry available
-///
-/// Otherwise, return bytes to be added to the offset
-pub type ReadDirCallback<'lt> = dyn Fn(&[u8], Ino) -> KResult<()> + 'lt;
-
 pub fn s_isreg(mode: Mode) -> bool {
     (mode & S_IFMT) == S_IFREG
 }
@@ -46,3 +41,55 @@ pub struct TimeSpec {
     pub sec: u64,
     pub nsec: u64,
 }
+
+/// Filesystem context that the C++ side stores per process as a raw
+/// `Arc<FsContext>` handle (see the `r_fs_context_*` functions).
+#[derive(Clone)]
+pub struct FsContext {
+    /// Root dentry of this context; the init context uses the kernel root.
+    pub fsroot: Arc<Dentry>,
+    /// Current working directory, replaceable under the spin lock.
+    pub cwd: Spin<Arc<Dentry>>,
+    /// File mode creation mask; the init context starts with `0o022`.
+    pub umask: Spin<Mode>,
+}
+
+impl FsContext {
+    /// Borrow the filesystem context of the current process without changing
+    /// its reference count.
+    ///
+    /// Panics if `current_process` is null.
+    pub fn get_current() -> BorrowedArc<'static, Self> {
+        // SAFETY: There should always be a current process.
+        let current = unsafe { current_process.as_ref().unwrap() };
+        // NOTE(review): assumes `m_handle` holds a pointer produced by
+        // `Arc::into_raw()` in one of the `r_fs_context_new_*` functions —
+        // confirm on the C++ side.
+        let ptr = current.fs_context.m_handle as *const _ as *const Self;
+
+        BorrowedArc::from_raw(ptr)
+    }
+}
+
+/// Give up the strong reference represented by `other`.
+#[no_mangle]
+pub extern "C" fn r_fs_context_drop(other: *const FsContext) {
+    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
+    let reclaimed = unsafe { Arc::from_raw(other) };
+
+    // Dropping the rebuilt `Arc` releases the reference.
+    drop(reclaimed);
+}
+
+/// Create an independent context whose fields are clones of those of
+/// `other`, returned as a raw `Arc` pointer owned by the caller.
+#[no_mangle]
+pub extern "C" fn r_fs_context_new_cloned(other: *const FsContext) -> *const FsContext {
+    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
+    let source = BorrowedArc::from_raw(other);
+
+    let duplicate = FsContext {
+        fsroot: source.fsroot.clone(),
+        cwd: source.cwd.clone(),
+        umask: source.umask.clone(),
+    };
+
+    Arc::into_raw(Arc::new(duplicate))
+}
+
+/// Take one more reference to the context behind `other` and return it as a
+/// raw `Arc` pointer owned by the caller.
+#[no_mangle]
+pub extern "C" fn r_fs_context_new_shared(other: *const FsContext) -> *const FsContext {
+    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
+    let borrowed = BorrowedArc::from_raw(other);
+    let shared = borrowed.clone();
+
+    Arc::into_raw(shared)
+}
+
+/// Build the context for the init process: root and working directory are
+/// both the kernel root dentry and the umask is `0o022`.
+#[no_mangle]
+pub extern "C" fn r_fs_context_new_for_init() -> *const FsContext {
+    let context = FsContext {
+        fsroot: Dentry::kernel_root_dentry(),
+        cwd: Spin::new(Dentry::kernel_root_dentry()),
+        umask: Spin::new(0o022),
+    };
+
+    Arc::into_raw(Arc::new(context))
+}

+ 24 - 31
src/kernel/vfs/mount.rs

@@ -1,4 +1,4 @@
-use crate::prelude::*;
+use crate::{fs::tmpfs, prelude::*};
 
 use alloc::{
     collections::btree_map::{BTreeMap, Entry},
@@ -6,11 +6,12 @@ use alloc::{
 };
 use bindings::{EEXIST, ENODEV, ENOTDIR};
 
+use lazy_static::lazy_static;
+
 use super::{
     dentry::{dcache, Dentry},
     inode::Inode,
     vfs::Vfs,
-    Mutex,
 };
 
 pub const MS_RDONLY: u64 = 1 << 0;
@@ -31,10 +32,11 @@ const MOUNT_FLAGS: [(u64, &str); 6] = [
     (MS_LAZYTIME, ",lazytime"),
 ];
 
-static MOUNT_CREATORS: Mutex<BTreeMap<String, Box<dyn MountCreator>>> =
-    Mutex::new(BTreeMap::new());
-
-static MOUNTS: Mutex<Vec<(Arc<Dentry>, MountPointData)>> = Mutex::new(vec![]);
+lazy_static! {
+    static ref MOUNT_CREATORS: Spin<BTreeMap<String, Arc<dyn MountCreator>>> =
+        Spin::new(BTreeMap::new());
+    static ref MOUNTS: Spin<Vec<(Arc<Dentry>, MountPointData)>> = Spin::new(vec![]);
+}
 
 static mut ROOTFS: Option<Arc<Dentry>> = None;
 
@@ -44,11 +46,7 @@ pub struct Mount {
 }
 
 impl Mount {
-    pub fn new(
-        mp: &Dentry,
-        vfs: Arc<dyn Vfs>,
-        root_inode: Arc<Inode>,
-    ) -> KResult<Self> {
+    pub fn new(mp: &Dentry, vfs: Arc<dyn Vfs>, root_inode: Arc<dyn Inode>) -> KResult<Self> {
         let root_dentry = Dentry::create(mp.parent().clone(), mp.name());
         root_dentry.save_dir(root_inode)?;
 
@@ -67,19 +65,10 @@ unsafe impl Send for Mount {}
 unsafe impl Sync for Mount {}
 
 pub trait MountCreator: Send + Sync {
-    fn create_mount(
-        &self,
-        source: &str,
-        flags: u64,
-        data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount>;
+    fn create_mount(&self, source: &str, flags: u64, mp: &Arc<Dentry>) -> KResult<Mount>;
 }
 
-pub fn register_filesystem(
-    fstype: &str,
-    creator: Box<dyn MountCreator>,
-) -> KResult<()> {
+pub fn register_filesystem(fstype: &str, creator: Arc<dyn MountCreator>) -> KResult<()> {
     let mut creators = MOUNT_CREATORS.lock();
     match creators.entry(String::from(fstype)) {
         Entry::Occupied(_) => Err(EEXIST),
@@ -104,7 +93,6 @@ pub fn do_mount(
     mountpoint_str: &str,
     fstype: &str,
     flags: u64,
-    data: &[u8],
 ) -> KResult<()> {
     let mut flags = flags;
     if flags & MS_NOATIME == 0 {
@@ -119,11 +107,11 @@ pub fn do_mount(
         return Err(ENOTDIR);
     }
 
-    let mount = {
+    let creator = {
         let creators = { MOUNT_CREATORS.lock() };
-        let creator = creators.get(fstype).ok_or(ENODEV)?;
-        creator.create_mount(source, flags, data, mountpoint)?
+        creators.get(fstype).ok_or(ENODEV)?.clone()
     };
+    let mount = creator.create_mount(source, flags, mountpoint)?;
 
     let root_dentry = mount.root().clone();
 
@@ -174,7 +162,11 @@ pub fn dump_mounts(buffer: &mut dyn core::fmt::Write) {
     }
 }
 
-pub fn create_rootfs() {
+#[no_mangle]
+#[link_section = ".text.kinit"]
+pub extern "C" fn r_init_vfs() {
+    tmpfs::init();
+
     let source = String::from("rootfs");
     let fstype = String::from("tmpfs");
     let flags = MS_NOATIME;
@@ -184,7 +176,7 @@ pub fn create_rootfs() {
         let creator = creators.get(&fstype).ok_or(ENODEV).unwrap();
 
         creator
-            .create_mount(&source, flags, &[], dcache::_looped_droot())
+            .create_mount(&source, flags, dcache::_looped_droot())
             .unwrap()
     };
 
@@ -206,7 +198,8 @@ pub fn create_rootfs() {
         .push((dcache::_looped_droot().clone(), mpdata));
 }
 
-#[no_mangle]
-pub extern "C" fn r_get_root_dentry() -> *const Dentry {
-    unsafe { ROOTFS.as_ref().cloned().map(Arc::into_raw).unwrap() }
+impl Dentry {
+    /// Return a new reference to the dentry stored in `ROOTFS`.
+    ///
+    /// Panics if `ROOTFS` is still `None`.
+    pub fn kernel_root_dentry() -> Arc<Dentry> {
+        let root = unsafe { ROOTFS.as_ref() };
+        root.cloned().unwrap()
+    }
 }

+ 2 - 3
src/kernel/vfs/vfs.rs

@@ -2,9 +2,8 @@ use crate::prelude::*;
 
 use super::DevId;
 
-#[allow(unused_variables)]
-pub trait Vfs: Send + Sync {
+/// Interface every mounted filesystem exposes to the VFS layer.
+pub trait Vfs: Send + Sync + AsAny {
+    /// Block size that `Inode::statx` reports as `stx_blksize`.
     fn io_blksize(&self) -> usize;
+    /// Device id of the filesystem; `Inode::statx` splits it into
+    /// `stx_dev_major` and `stx_dev_minor`.
     fn fs_devid(&self) -> DevId;
-    fn as_any(&self) -> &dyn Any;
+    /// Whether the filesystem is read-only.
+    fn is_read_only(&self) -> bool;
 }

+ 4 - 0
src/kinit.cpp

@@ -61,6 +61,8 @@ static inline void set_uname() {
     strcpy(kernel::sys_utsname->domainname, "(none)");
 }
 
+extern "C" void r_init_vfs();
+
 SECTION(".text.kinit")
 void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn) {
     // call global constructors
@@ -80,6 +82,8 @@ void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn) {
 
     init_syscall_table();
 
+    r_init_vfs();
+
     init_scheduler(kernel_stack_pfn);
 }
 

+ 65 - 31
src/lib.rs

@@ -20,21 +20,26 @@ mod prelude;
 mod rcu;
 mod sync;
 
+use alloc::{ffi::CString, sync::Arc};
+use bindings::root::types::elf::{elf32_load, elf32_load_data};
+use kernel::vfs::{
+    dentry::Dentry,
+    mount::{do_mount, MS_NOATIME, MS_NODEV, MS_NOSUID, MS_RDONLY},
+    FsContext,
+};
+use path::Path;
 use prelude::*;
 
 #[panic_handler]
 fn panic(info: &core::panic::PanicInfo) -> ! {
-    dont_check!(println!("[kernel] panic: {:?}", info));
+    println_fatal!("panicked at {:?}\n\t\t{}", info.location(), info.message());
 
     unsafe { bindings::root::freeze() };
 }
 
 extern "C" {
     fn _do_allocate(size: usize) -> *mut core::ffi::c_void;
-    fn _do_deallocate(
-        ptr: *mut core::ffi::c_void,
-        size: core::ffi::c_size_t,
-    ) -> i32;
+    fn _do_deallocate(ptr: *mut core::ffi::c_void, size: core::ffi::c_size_t) -> i32;
 }
 
 use core::alloc::{GlobalAlloc, Layout};
@@ -63,36 +68,65 @@ unsafe impl GlobalAlloc for Allocator {
 static ALLOCATOR: Allocator = Allocator {};
 
 #[no_mangle]
-pub extern "C" fn late_init_rust() {
+/// Late kernel initialisation: bring up drivers and filesystems, then load
+/// the init program.
+///
+/// Registers the e1000e and AHCI drivers, initialises procfs and fat32,
+/// mounts `/dev/sda` as read-only fat32 on `/mnt`, and loads `/mnt/busybox`
+/// with `elf32_load`. On success the stack pointer and entry point of the
+/// loaded image are written to `out_sp` and `out_ip`.
+///
+/// Panics if any step fails, including a non-zero result from `elf32_load`:
+/// in that case `sp`/`ip` do not describe a runnable image and must not be
+/// handed back to the caller.
+pub extern "C" fn late_init_rust(out_sp: *mut usize, out_ip: *mut usize) {
     driver::e1000e::register_e1000e_driver();
     driver::ahci::register_ahci_driver();
 
-    fs::tmpfs::init();
     fs::procfs::init();
     fs::fat32::init();
 
-    kernel::vfs::mount::create_rootfs();
-}
+    // mount fat32 /mnt directory
+    let fs_context = FsContext::get_current();
+    let mnt_dir = Dentry::open(&fs_context, Path::new(b"/mnt/").unwrap(), true).unwrap();
+
+    mnt_dir.mkdir(0o755).unwrap();
+
+    do_mount(
+        &mnt_dir,
+        "/dev/sda",
+        "/mnt",
+        "fat32",
+        MS_RDONLY | MS_NOATIME | MS_NODEV | MS_NOSUID,
+    )
+    .unwrap();
+
+    let init = Dentry::open(&fs_context, Path::new(b"/mnt/busybox").unwrap(), true)
+        .expect("kernel panic: init not found!");
+
+    let argv = vec![
+        CString::new("/mnt/busybox").unwrap(),
+        CString::new("sh").unwrap(),
+        CString::new("/mnt/initsh").unwrap(),
+    ];
+
+    let envp = vec![
+        CString::new("LANG=C").unwrap(),
+        CString::new("HOME=/root").unwrap(),
+        CString::new("PATH=/mnt").unwrap(),
+        CString::new("PWD=/").unwrap(),
+    ];
 
-//
-// #[repr(C)]
-// #[allow(dead_code)]
-// struct Fp {
-//     fp: *const core::ffi::c_void,
-// }
-//
-// unsafe impl Sync for Fp {}
-//
-// #[allow(unused_macros)]
-// macro_rules! late_init {
-//     ($name:ident, $func:ident) => {
-//         #[used]
-//         #[link_section = ".late_init"]
-//         static $name: $crate::Fp = $crate::Fp {
-//             fp: $func as *const core::ffi::c_void,
-//         };
-//     };
-// }
-//
-// #[allow(unused_imports)]
-// pub(crate) use late_init;
+    // The pointer arrays borrow from `argv`/`envp`, which stay alive until
+    // the end of this function.
+    let argv_array = argv.iter().map(|x| x.as_ptr()).collect::<Vec<_>>();
+    let envp_array = envp.iter().map(|x| x.as_ptr()).collect::<Vec<_>>();
+
+    // load init
+    let mut load_data = elf32_load_data {
+        exec_dent: Arc::into_raw(init) as *mut _,
+        argv: argv_array.as_ptr(),
+        argv_count: argv_array.len(),
+        envp: envp_array.as_ptr(),
+        envp_count: envp_array.len(),
+        ip: 0,
+        sp: 0,
+    };
+
+    let result = unsafe { elf32_load(&mut load_data) };
+    if result != 0 {
+        // Stop here instead of returning a bogus entry point; the panic
+        // handler logs the message and freezes the machine.
+        panic!("Failed to load init: {}", result);
+    }
+
+    unsafe {
+        *out_sp = load_data.sp;
+        *out_ip = load_data.ip;
+    }
+}

+ 12 - 11
src/net/netdev.rs

@@ -1,7 +1,8 @@
 use alloc::{collections::btree_map::BTreeMap, sync::Arc};
-use spin::Mutex;
 
-use crate::bindings::root::EFAULT;
+use crate::{bindings::root::EFAULT, prelude::*};
+
+use lazy_static::lazy_static;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum LinkStatus {
@@ -19,7 +20,7 @@ pub enum LinkSpeed {
 
 pub type Mac = [u8; 6];
 
-pub trait Netdev {
+pub trait Netdev: Send {
     fn up(&mut self) -> Result<(), u32>;
     fn send(&mut self, data: &[u8]) -> Result<(), u32>;
     fn fire(&mut self) -> Result<(), u32>;
@@ -51,12 +52,14 @@ impl Ord for dyn Netdev {
     }
 }
 
-static mut NETDEVS_ID: Mutex<u32> = Mutex::new(0);
-static mut NETDEVS: Mutex<BTreeMap<u32, Arc<Mutex<dyn Netdev>>>> =
-    Mutex::new(BTreeMap::new());
+lazy_static! {
+    static ref NETDEVS_ID: Spin<u32> = Spin::new(0);
+    static ref NETDEVS: Spin<BTreeMap<u32, Arc<Mutex<dyn Netdev>>>> =
+        Spin::new(BTreeMap::new());
+}
 
 pub fn alloc_id() -> u32 {
-    let mut id = unsafe { NETDEVS_ID.lock() };
+    let mut id = NETDEVS_ID.lock();
     let retval = *id;
 
     *id += 1;
@@ -68,7 +71,7 @@ pub fn register_netdev(
 ) -> Result<Arc<Mutex<dyn Netdev>>, u32> {
     let devid = netdev.id();
 
-    let mut netdevs = unsafe { NETDEVS.lock() };
+    let mut netdevs = NETDEVS.lock();
 
     use alloc::collections::btree_map::Entry;
     match netdevs.entry(devid) {
@@ -82,7 +85,5 @@ pub fn register_netdev(
 }
 
 pub fn get_netdev(id: u32) -> Option<Arc<Mutex<dyn Netdev>>> {
-    let netdevs = unsafe { NETDEVS.lock() };
-
-    netdevs.get(&id).map(|netdev| netdev.clone())
+    NETDEVS.lock().get(&id).map(|netdev| netdev.clone())
 }

+ 34 - 145
src/prelude.rs

@@ -18,7 +18,12 @@ pub(crate) use dont_check;
 pub use crate::bindings::root as bindings;
 
 #[allow(unused_imports)]
-pub(crate) use crate::kernel::console::{print, println};
+pub(crate) use crate::kernel::console::{
+    print, println, println_debug, println_fatal, println_info, println_warn,
+};
+
+#[allow(unused_imports)]
+pub(crate) use crate::sync::might_sleep;
 
 #[allow(unused_imports)]
 pub(crate) use alloc::{boxed::Box, string::String, vec, vec::Vec};
@@ -27,150 +32,7 @@ pub(crate) use alloc::{boxed::Box, string::String, vec, vec::Vec};
 pub(crate) use core::{any::Any, fmt::Write, marker::PhantomData, str};
 use core::{mem::ManuallyDrop, ops::Deref};
 
-pub struct Yield;
-
-extern "C" {
-    fn r_preempt_disable();
-    fn r_preempt_enable();
-}
-
-#[inline(always)]
-pub fn preempt_disable() {
-    unsafe {
-        r_preempt_disable();
-    }
-}
-
-#[inline(always)]
-pub fn preempt_enable() {
-    unsafe {
-        r_preempt_enable();
-    }
-}
-
-impl spin::RelaxStrategy for Yield {
-    fn relax() {
-        panic!("ohohoh");
-    }
-}
-
-#[derive(Debug)]
-#[repr(transparent)]
-pub struct PreemptGuard;
-
-impl PreemptGuard {
-    #[inline(always)]
-    pub fn new() -> Self {
-        preempt_disable();
-        Self
-    }
-}
-
-impl Drop for PreemptGuard {
-    #[inline(always)]
-    fn drop(&mut self) {
-        preempt_enable();
-    }
-}
-
-#[repr(transparent)]
-pub struct MutexNoPreemptionGuard<'a, T: ?Sized> {
-    data_guard: spin::mutex::MutexGuard<'a, T>,
-    preempt_guard: PreemptGuard,
-}
-
-impl<'a, T: ?Sized> MutexNoPreemptionGuard<'a, T> {
-    #[inline(always)]
-    pub fn new(
-        preempt_guard: PreemptGuard,
-        data_guard: spin::mutex::MutexGuard<'a, T>,
-    ) -> Self {
-        Self {
-            data_guard,
-            preempt_guard,
-        }
-    }
-}
-
-impl<'a, T: ?Sized> core::ops::Deref for MutexNoPreemptionGuard<'a, T> {
-    type Target = <spin::mutex::MutexGuard<'a, T> as core::ops::Deref>::Target;
-
-    #[inline(always)]
-    fn deref(&self) -> &Self::Target {
-        &*self.data_guard
-    }
-}
-
-impl<'a, T: ?Sized> core::ops::DerefMut for MutexNoPreemptionGuard<'a, T> {
-    #[inline(always)]
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut *self.data_guard
-    }
-}
-
-impl<'a, T: ?Sized> AsRef<T> for MutexNoPreemptionGuard<'a, T> {
-    #[inline(always)]
-    fn as_ref(&self) -> &T {
-        &*self.data_guard
-    }
-}
-
-impl<'a, T: ?Sized> AsMut<T> for MutexNoPreemptionGuard<'a, T> {
-    #[inline(always)]
-    fn as_mut(&mut self) -> &mut T {
-        &mut *self.data_guard
-    }
-}
-
-#[repr(transparent)]
-pub struct MutexNoPreemption<T: ?Sized> {
-    lock: spin::mutex::Mutex<T, spin::Spin>,
-}
-
-impl<T> MutexNoPreemption<T> {
-    #[inline(always)]
-    pub const fn new(value: T) -> Self {
-        Self {
-            lock: spin::mutex::Mutex::new(value),
-        }
-    }
-}
-
-#[allow(dead_code)]
-impl<T: ?Sized> MutexNoPreemption<T> {
-    #[inline(always)]
-    pub fn lock(&self) -> MutexNoPreemptionGuard<T> {
-        let preempt_guard = PreemptGuard::new();
-        let data_guard = self.lock.lock();
-
-        MutexNoPreemptionGuard::new(preempt_guard, data_guard)
-    }
-
-    #[inline(always)]
-    pub fn is_locked(&self) -> bool {
-        self.lock.is_locked()
-    }
-
-    #[inline(always)]
-    pub fn try_lock(&self) -> Option<MutexNoPreemptionGuard<T>> {
-        let preempt_guard = PreemptGuard::new();
-        let data_guard = self.lock.try_lock();
-
-        data_guard.map(|data_guard| {
-            MutexNoPreemptionGuard::new(preempt_guard, data_guard)
-        })
-    }
-
-    #[inline(always)]
-    pub fn get_mut(&mut self) -> &mut T {
-        self.lock.get_mut()
-    }
-}
-
-#[allow(dead_code)]
-pub type RwLock<T> = spin::rwlock::RwLock<T, Yield>;
-pub type RwLockReadGuard<'a, T> = spin::rwlock::RwLockReadGuard<'a, T>;
-pub type Mutex<T> = MutexNoPreemption<T>;
+pub use crate::sync::{Mutex, RwSemaphore, Semaphore, Spin};
 
 pub struct BorrowedArc<'lt, T: ?Sized> {
     arc: ManuallyDrop<Arc<T>>,
@@ -208,3 +70,30 @@ impl<'lt, T: ?Sized> AsRef<Arc<T>> for BorrowedArc<'lt, T> {
         &self.arc
     }
 }
+
+/// Gives access to the implementor as a `dyn Any` reference.
+///
+/// Implementors must also be `Send + Sync` (supertrait bounds).
+pub trait AsAny: Send + Sync {
+    /// Returns `self` as a shared `dyn Any` reference.
+    fn as_any(&self) -> &dyn Any;
+    /// Returns `self` as an exclusive `dyn Any` reference.
+    fn as_any_mut(&mut self) -> &mut dyn Any;
+}
+
+/// Generates an `AsAny` implementation for the type `$t` in which both
+/// methods simply return `self`.
+macro_rules! impl_any {
+    ($t:ty) => {
+        impl AsAny for $t {
+            fn as_any(&self) -> &dyn Any {
+                self
+            }
+
+            fn as_any_mut(&mut self) -> &mut dyn Any {
+                self
+            }
+        }
+    };
+}
+
+/// Expands to `core::ptr::addr_of_mut!((*$pointer).$field)`, i.e. a raw
+/// mutable pointer to the field `$field` of the value behind `$pointer`.
+///
+/// The expansion dereferences `$pointer` in a place expression, so when
+/// `$pointer` is a raw pointer the call site has to be inside `unsafe`.
+macro_rules! addr_of_mut_field {
+    ($pointer:expr, $field:ident) => {
+        core::ptr::addr_of_mut!((*$pointer).$field)
+    };
+}
+
+pub(crate) use {addr_of_mut_field, impl_any};

+ 34 - 32
src/rcu.rs

@@ -3,23 +3,30 @@ use core::{
     sync::atomic::{AtomicPtr, Ordering},
 };
 
-use crate::prelude::*;
+use crate::{
+    prelude::*,
+    sync::{lock::Guard, semaphore::RwSemaphoreStrategy},
+};
 
 use alloc::sync::Arc;
 
+use lazy_static::lazy_static;
+
 pub struct RCUReadGuard<'data, T: 'data> {
     value: T,
-    guard: RwLockReadGuard<'static, ()>,
+    guard: Guard<'data, (), RwSemaphoreStrategy, false>,
     _phantom: PhantomData<&'data T>,
 }
 
-static READ_GUARD: RwLock<()> = RwLock::new(());
+lazy_static! {
+    static ref GLOBAL_RCU_SEM: RwSemaphore<()> = RwSemaphore::new(());
+}
 
 impl<'data, T: 'data> RCUReadGuard<'data, T> {
     fn lock(value: T) -> Self {
         Self {
             value,
-            guard: READ_GUARD.read(),
+            guard: GLOBAL_RCU_SEM.lock_shared(),
             _phantom: PhantomData,
         }
     }
@@ -34,7 +41,7 @@ impl<'data, T: 'data> Deref for RCUReadGuard<'data, T> {
 }
 
 fn rcu_sync() {
-    READ_GUARD.write();
+    GLOBAL_RCU_SEM.lock();
 }
 
 pub trait RCUNode<MySelf> {
@@ -45,15 +52,15 @@ pub trait RCUNode<MySelf> {
 pub struct RCUList<T: RCUNode<T>> {
     head: AtomicPtr<T>,
 
-    reader_lock: RwLock<()>,
+    reader_lock: RwSemaphore<()>,
     update_lock: Mutex<()>,
 }
 
 impl<T: RCUNode<T>> RCUList<T> {
-    pub const fn new() -> Self {
+    pub fn new() -> Self {
         Self {
             head: AtomicPtr::new(core::ptr::null_mut()),
-            reader_lock: RwLock::new(()),
+            reader_lock: RwSemaphore::new(()),
             update_lock: Mutex::new(()),
         }
     }
@@ -68,17 +75,16 @@ impl<T: RCUNode<T>> RCUList<T> {
         new_node.rcu_next().store(old_head, Ordering::Release);
 
         if let Some(old_head) = unsafe { old_head.as_ref() } {
-            old_head.rcu_prev().store(
-                Arc::into_raw(new_node.clone()) as *mut _,
-                Ordering::Release,
-            );
+            old_head
+                .rcu_prev()
+                .store(Arc::into_raw(new_node.clone()) as *mut _, Ordering::Release);
         }
 
         self.head
             .store(Arc::into_raw(new_node) as *mut _, Ordering::Release);
     }
 
-    pub fn remove(&self, node: Arc<T>) {
+    pub fn remove(&self, node: &Arc<T>) {
         let _lck = self.update_lock.lock();
 
         let prev = node.rcu_prev().load(Ordering::Acquire);
@@ -91,21 +97,19 @@ impl<T: RCUNode<T>> RCUList<T> {
         }
 
         {
-            let prev_next = unsafe { prev.as_ref().map(|rcu| rcu.rcu_next()) }
-                .unwrap_or(&self.head);
+            let prev_next =
+                unsafe { prev.as_ref().map(|rcu| rcu.rcu_next()) }.unwrap_or(&self.head);
 
             let me = prev_next.swap(next, Ordering::AcqRel);
             debug_assert!(me == Arc::as_ptr(&node) as *mut _);
             unsafe { Arc::from_raw(me) };
         }
 
-        let _lck = self.reader_lock.write();
+        let _lck = self.reader_lock.lock();
         node.rcu_prev()
             .store(core::ptr::null_mut(), Ordering::Release);
         node.rcu_next()
             .store(core::ptr::null_mut(), Ordering::Release);
-
-        drop(node);
     }
 
     pub fn replace(&self, old_node: &Arc<T>, new_node: Arc<T>) {
@@ -118,29 +122,25 @@ impl<T: RCUNode<T>> RCUList<T> {
         new_node.rcu_next().store(next, Ordering::Release);
 
         {
-            let prev_next = unsafe { prev.as_ref().map(|rcu| rcu.rcu_next()) }
-                .unwrap_or(&self.head);
+            let prev_next =
+                unsafe { prev.as_ref().map(|rcu| rcu.rcu_next()) }.unwrap_or(&self.head);
 
-            let old = prev_next.swap(
-                Arc::into_raw(new_node.clone()) as *mut _,
-                Ordering::AcqRel,
-            );
+            let old = prev_next.swap(Arc::into_raw(new_node.clone()) as *mut _, Ordering::AcqRel);
 
             debug_assert!(old == Arc::as_ptr(&old_node) as *mut _);
             unsafe { Arc::from_raw(old) };
         }
 
         if let Some(next) = unsafe { next.as_ref() } {
-            let old = next.rcu_prev().swap(
-                Arc::into_raw(new_node.clone()) as *mut _,
-                Ordering::AcqRel,
-            );
+            let old = next
+                .rcu_prev()
+                .swap(Arc::into_raw(new_node.clone()) as *mut _, Ordering::AcqRel);
 
             debug_assert!(old == Arc::as_ptr(&old_node) as *mut _);
             unsafe { Arc::from_raw(old) };
         }
 
-        let _lck = self.reader_lock.write();
+        let _lck = self.reader_lock.lock();
         old_node
             .rcu_prev()
             .store(core::ptr::null_mut(), Ordering::Release);
@@ -150,7 +150,7 @@ impl<T: RCUNode<T>> RCUList<T> {
     }
 
     pub fn iter(&self) -> RCUIterator<T> {
-        let _lck = self.reader_lock.read();
+        let _lck = self.reader_lock.lock_shared();
 
         RCUIterator {
             // SAFETY: We have a read lock, so the node is still alive.
@@ -162,7 +162,7 @@ impl<T: RCUNode<T>> RCUList<T> {
 
 pub struct RCUIterator<'lt, T: RCUNode<T>> {
     cur: *const T,
-    _lock: RwLockReadGuard<'lt, ()>,
+    _lock: Guard<'lt, (), RwSemaphoreStrategy, false>,
 }
 
 impl<'lt, T: RCUNode<T>> Iterator for RCUIterator<'lt, T> {
@@ -203,7 +203,9 @@ impl<T> RCUPointer<T> {
         }
     }
 
-    pub fn swap(&self, new: Option<Arc<T>>) -> Option<Arc<T>> {
+    /// # Safety
+    /// Caller must ensure that the pointer is freed after all readers are done.
+    pub unsafe fn swap(&self, new: Option<Arc<T>>) -> Option<Arc<T>> {
         let new = new
             .map(|arc| Arc::into_raw(arc) as *mut T)
             .unwrap_or(core::ptr::null_mut());

+ 82 - 3
src/sync.rs

@@ -1,10 +1,56 @@
-pub struct Locked<T: Sized + Sync, U: ?Sized> {
+pub mod condvar;
+pub mod lock;
+pub mod semaphore;
+pub mod spin;
+pub mod strategy;
+
+extern "C" {
+    fn r_preempt_disable();
+    fn r_preempt_enable();
+}
+
+/// Thin wrapper around the external `r_preempt_disable` symbol.
+#[inline(always)]
+fn preempt_disable() {
+    // NOTE(review): assumes the foreign function has no preconditions — confirm.
+    unsafe { r_preempt_disable() }
+}
+
+/// Thin wrapper around the external `r_preempt_enable` symbol.
+#[inline(always)]
+fn preempt_enable() {
+    // NOTE(review): assumes the foreign function has no preconditions — confirm.
+    unsafe { r_preempt_enable() }
+}
+
+pub type Spin<T> = lock::Lock<T, spin::SpinStrategy>;
+pub type Mutex<T> = lock::Lock<T, semaphore::SemaphoreStrategy<1>>;
+#[allow(dead_code)]
+pub type Semaphore<T> = lock::Lock<T, semaphore::SemaphoreStrategy>;
+pub type RwSemaphore<T> = lock::Lock<T, semaphore::RwSemaphoreStrategy>;
+
+#[allow(dead_code)]
+pub type SpinGuard<'lock, T> = lock::Guard<'lock, T, spin::SpinStrategy, true>;
+
+#[allow(dead_code)]
+pub type MutexGuard<'lock, T> = lock::Guard<'lock, T, semaphore::SemaphoreStrategy<1>, true>;
+
+#[allow(dead_code)]
+pub type SemGuard<'lock, T> = lock::Guard<'lock, T, semaphore::SemaphoreStrategy, true>;
+
+#[allow(dead_code)]
+pub type RwSemReadGuard<'lock, T> = lock::Guard<'lock, T, semaphore::RwSemaphoreStrategy, false>;
+
+#[allow(dead_code)]
+pub type RwSemWriteGuard<'lock, T> = lock::Guard<'lock, T, semaphore::RwSemaphoreStrategy, true>;
+
+pub struct Locked<T: Sized, U: ?Sized> {
     inner: T,
     guard: *const U,
 }
 
-unsafe impl<T: Sized + Sync, U: ?Sized> Sync for Locked<T, U> {}
-unsafe impl<T: Sized + Sync, U: ?Sized> Send for Locked<T, U> {}
+unsafe impl<T: Sized + Send, U: ?Sized> Send for Locked<T, U> {}
+unsafe impl<T: Sized + Send + Sync, U: ?Sized> Sync for Locked<T, U> {}
 
 impl<T: Sized + Sync, U: ?Sized> Locked<T, U> {
     pub fn new(value: T, from: &U) -> Self {
@@ -24,3 +70,36 @@ impl<T: Sized + Sync, U: ?Sized> Locked<T, U> {
         unsafe { &mut *(&raw const self.inner as *mut T) }
     }
 }
+
+/// Checks the value returned by `preempt_count()` against an expected value.
+///
+/// - `might_sleep!()` expects the count to be `0`.
+/// - `might_sleep!(n)` expects the count to be `n`.
+///
+/// When `cfg!(debug_assertions)` is true, a mismatch prints
+/// `"failed assertion"` through `println_fatal!` and then calls `freeze()`.
+/// Otherwise a mismatch fails an `assert_eq!` carrying a descriptive message.
+macro_rules! might_sleep {
+    // No argument: the preempt count has to be zero.
+    () => {
+        if cfg!(debug_assertions) {
+            if unsafe { $crate::bindings::root::kernel::async_::preempt_count() } != 0 {
+                println_fatal!("failed assertion");
+                unsafe { $crate::bindings::root::freeze() };
+            }
+        } else {
+            assert_eq!(
+                unsafe { $crate::bindings::root::kernel::async_::preempt_count() },
+                0,
+                "a might_sleep function called with preempt disabled"
+            );
+        }
+    };
+    // One argument: the preempt count has to equal `$n`.
+    ($n:expr) => {
+        if cfg!(debug_assertions) {
+            if unsafe { $crate::bindings::root::kernel::async_::preempt_count() } != $n {
+                println_fatal!("failed assertion");
+                unsafe { $crate::bindings::root::freeze() };
+            }
+        } else {
+            assert_eq!(
+                unsafe { $crate::bindings::root::kernel::async_::preempt_count() },
+                $n,
+                "a might_sleep function called with the preempt count not satisfying its requirement",
+            );
+        }
+    };
+}
+
+pub(crate) use might_sleep;

+ 113 - 0
src/sync/condvar.rs

@@ -0,0 +1,113 @@
+use alloc::collections::vec_deque::VecDeque;
+use bindings::{
+    current_thread,
+    kernel::task::{thread, thread_ISLEEP, thread_READY, thread_USLEEP},
+    schedule_now_preempt_disabled,
+};
+
+use crate::{prelude::*, sync::preempt_disable};
+
+use super::{lock::Guard, strategy::LockStrategy};
+
+/// Returns a mutable reference to the `current_thread` static from the
+/// bindings.
+///
+/// `current` should be per CPU, so no sync is needed
+// NOTE(review): the per-CPU property is asserted by the comment above only;
+// nothing in this file enforces it — confirm before relying on it.
+fn current() -> &'static mut *mut thread {
+    #[allow(static_mut_refs)]
+    unsafe {
+        &mut current_thread
+    }
+}
+
+/// Condition variable that keeps the raw pointers of waiting threads in a
+/// spin-locked FIFO queue (`push_back` on wait, `pop_front` on notify).
+pub struct CondVar {
+    /// Threads currently parked in `wait`, oldest first.
+    waiters: Spin<VecDeque<*mut thread>>,
+}
+
+// TODO!!!: acquire dispatcher lock because modifying thread attribute
+//          is racy. But we put this in the future work since that would
+//          require a lot of changes in the kernel task management system.
+unsafe impl Send for CondVar {}
+unsafe impl Sync for CondVar {}
+
+impl CondVar {
+    /// Creates a condition variable with an empty waiter queue.
+    pub fn new() -> Self {
+        let waiters = Spin::new(VecDeque::new());
+        Self { waiters }
+    }
+
+    /// Marks the oldest queued waiter as `thread_READY`; does nothing when
+    /// the queue is empty.
+    pub fn notify_one(&self) {
+        // TODO!!!: acquire dispatcher lock
+        let mut queue = self.waiters.lock();
+
+        let Some(ptr) = queue.pop_front() else {
+            return;
+        };
+
+        // A null pointer in the queue panics here, as before.
+        let woken = unsafe { ptr.as_mut() }.unwrap();
+        unsafe { woken.set_attr(thread_READY, true) };
+    }
+
+    /// Marks every queued waiter as `thread_READY` and empties the queue.
+    pub fn notify_all(&self) {
+        // TODO!!!: acquire dispatcher lock
+        let mut queue = self.waiters.lock();
+
+        if queue.is_empty() {
+            return;
+        }
+
+        queue.iter().for_each(|ptr| {
+            let woken = unsafe { ptr.as_mut() }.unwrap();
+            unsafe { woken.set_attr(thread_READY, true) };
+        });
+
+        queue.clear();
+    }
+
+    /// Parks the current thread on this condition variable, temporarily
+    /// releasing `guard` while it is parked and re-acquiring it afterwards.
+    ///
+    /// # Might Sleep
+    /// This function **might sleep**, so call it in a preemptible context
+    ///
+    /// # Return
+    /// - `true`: a pending signal was received
+    pub fn wait<'a, T, S: LockStrategy>(
+        &self,
+        guard: &mut Guard<'a, T, S>,
+        interruptible: bool,
+    ) -> bool {
+        preempt_disable();
+
+        // TODO!!!: acquire dispatcher lock
+        let me = *current();
+
+        // Pick the sleep attribute first, then apply it in a single call.
+        let sleep_kind = if interruptible {
+            thread_ISLEEP
+        } else {
+            thread_USLEEP
+        };
+        let me_ref = unsafe { me.as_mut() }.unwrap();
+        unsafe { me_ref.set_attr(sleep_kind, false) };
+
+        // The queue guard is a temporary and is released at the end of the
+        // statement, before the caller's lock is dropped.
+        self.waiters.lock().push_back(me);
+
+        unsafe { guard.force_unlock() };
+
+        might_sleep!(1);
+
+        let resumed_normally = unsafe { schedule_now_preempt_disabled() };
+
+        unsafe { guard.force_relock() };
+
+        !resumed_normally
+    }
+}

+ 154 - 0
src/sync/lock.rs

@@ -0,0 +1,154 @@
+use core::{
+    cell::UnsafeCell,
+    ops::{Deref, DerefMut},
+};
+
+use super::{spin::IrqStrategy, strategy::LockStrategy};
+
+/// A value protected by a lock whose acquire/release behaviour is supplied
+/// by the `Strategy` type parameter.
+pub struct Lock<Value: ?Sized, Strategy: LockStrategy> {
+    /// Per-lock state owned by the strategy (created by `Strategy::data()`).
+    strategy_data: Strategy::StrategyData,
+    /// The protected value; accessed only through guards or `get_mut`.
+    value: UnsafeCell<Value>,
+}
+
+// NOTE(review): `Sync` is granted for any `T: Send`. Shared guards
+// (`lock_shared`) hand out `&T`, so for strategies that really admit several
+// concurrent readers a `T: Sync` bound may be needed as well — confirm.
+unsafe impl<T: ?Sized + Send, S: LockStrategy> Send for Lock<T, S> {}
+unsafe impl<T: ?Sized + Send, S: LockStrategy> Sync for Lock<T, S> {}
+
+impl<Value, Strategy: LockStrategy> Lock<Value, Strategy> {
+    /// Wraps `value` in a new, unlocked lock with fresh strategy data.
+    #[inline(always)]
+    pub fn new(value: Value) -> Self {
+        let strategy_data = Strategy::data();
+        let value = UnsafeCell::new(value);
+
+        Self {
+            strategy_data,
+            value,
+        }
+    }
+}
+
+impl<Value: Clone, Strategy: LockStrategy> Clone for Lock<Value, Strategy> {
+    /// Clones the protected value under a shared lock and wraps the copy in
+    /// a new lock with its own strategy data.
+    fn clone(&self) -> Self {
+        let strategy_data = Strategy::data();
+        // The shared guard is a temporary; it is released right after the
+        // value has been cloned.
+        let snapshot = self.lock_shared().clone();
+
+        Self {
+            strategy_data,
+            value: UnsafeCell::new(snapshot),
+        }
+    }
+}
+
+impl<Value: Default, Strategy: LockStrategy> Default for Lock<Value, Strategy> {
+    /// Builds a lock around `Value::default()` with fresh strategy data.
+    fn default() -> Self {
+        let strategy_data = Strategy::data();
+        let value = UnsafeCell::new(Value::default());
+
+        Self {
+            strategy_data,
+            value,
+        }
+    }
+}
+
+impl<Value: ?Sized, Strategy: LockStrategy> Lock<Value, Strategy> {
+    /// Acquires the lock through `Strategy::do_lock` and returns a writable
+    /// guard that releases it on drop.
+    #[inline(always)]
+    pub fn lock<'lt>(&'lt self) -> Guard<'lt, Value, Strategy> {
+        Guard {
+            _phantom: core::marker::PhantomData,
+            value: &self.value,
+            strategy_data: &self.strategy_data,
+            context: unsafe { Strategy::do_lock(&self.strategy_data) },
+        }
+    }
+
+    /// Same as [`lock`](Self::lock), but acquires through
+    /// `IrqStrategy<Strategy>` instead of `Strategy` directly.
+    #[inline(always)]
+    pub fn lock_irq<'lt>(&'lt self) -> Guard<'lt, Value, IrqStrategy<Strategy>> {
+        Guard {
+            _phantom: core::marker::PhantomData,
+            value: &self.value,
+            strategy_data: &self.strategy_data,
+            context: unsafe { IrqStrategy::<Strategy>::do_lock(&self.strategy_data) },
+        }
+    }
+
+    /// Acquires the lock through `Strategy::do_lock_shared` and returns a
+    /// read-only guard (`Write = false`) that releases it on drop.
+    #[inline(always)]
+    pub fn lock_shared<'lt>(&'lt self) -> Guard<'lt, Value, Strategy, false> {
+        Guard {
+            _phantom: core::marker::PhantomData,
+            value: &self.value,
+            strategy_data: &self.strategy_data,
+            context: unsafe { Strategy::do_lock_shared(&self.strategy_data) },
+        }
+    }
+
+    /// Same as [`lock_shared`](Self::lock_shared), but acquires through
+    /// `IrqStrategy<Strategy>`.
+    #[inline(always)]
+    pub fn lock_shared_irq<'lt>(&'lt self) -> Guard<'lt, Value, IrqStrategy<Strategy>, false> {
+        Guard {
+            _phantom: core::marker::PhantomData,
+            value: &self.value,
+            strategy_data: &self.strategy_data,
+            // Use the shared entry point: a read-only guard must not be
+            // backed by an explicit exclusive acquisition.
+            context: unsafe { IrqStrategy::<Strategy>::do_lock_shared(&self.strategy_data) },
+        }
+    }
+
+    /// Returns the protected value without locking; the exclusive borrow of
+    /// the lock itself rules out any outstanding guard.
+    #[inline(always)]
+    pub fn get_mut(&mut self) -> &mut Value {
+        self.value.get_mut()
+    }
+}
+
+/// Handle for a held `Lock`; dropping it calls `Strategy::do_unlock`.
+///
+/// `Write` selects the access level: every guard dereferences to `&Value`,
+/// while only `Write = true` guards also provide `DerefMut`/`AsMut` and the
+/// `force_unlock`/`force_relock` pair.
+pub struct Guard<'lock, Value: ?Sized, Strategy: LockStrategy, const Write: bool = true> {
+    _phantom: core::marker::PhantomData<Strategy>,
+    /// The protected value inside the originating lock.
+    value: &'lock UnsafeCell<Value>,
+    /// The originating lock's strategy state.
+    strategy_data: &'lock Strategy::StrategyData,
+    /// Whatever the strategy returned when the lock was acquired.
+    context: Strategy::GuardContext,
+}
+
+impl<'lock, Value: ?Sized, Strategy: LockStrategy> Guard<'lock, Value, Strategy> {
+    /// Releases the lock through `Strategy::do_temporary_unlock` while
+    /// keeping the guard alive.
+    ///
+    /// # Safety
+    /// Use of the lock after calling this function without relocking is undefined behavior.
+    #[inline(always)]
+    pub unsafe fn force_unlock(&mut self) {
+        let data = self.strategy_data;
+        Strategy::do_temporary_unlock(data, &mut self.context)
+    }
+
+    /// Re-acquires the lock through `Strategy::do_relock`.
+    ///
+    /// # Safety
+    /// Calling this function more than once will cause deadlocks.
+    #[inline(always)]
+    pub unsafe fn force_relock(&mut self) {
+        let data = self.strategy_data;
+        Strategy::do_relock(data, &mut self.context)
+    }
+}
+
+impl<'lock, Value: ?Sized, Strategy: LockStrategy, const Write: bool> Deref
+    for Guard<'lock, Value, Strategy, Write>
+{
+    type Target = Value;
+
+    /// Shared access to the protected value; available on every guard.
+    fn deref(&self) -> &Self::Target {
+        let ptr = self.value.get();
+        unsafe { &*ptr }
+    }
+}
+
+impl<'lock, Value: ?Sized, Strategy: LockStrategy> DerefMut
+    for Guard<'lock, Value, Strategy, true>
+{
+    /// Exclusive access to the protected value; only for `Write = true`.
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        let ptr = self.value.get();
+        unsafe { &mut *ptr }
+    }
+}
+
+impl<'lock, Value: ?Sized, Strategy: LockStrategy, const Write: bool> AsRef<Value>
+    for Guard<'lock, Value, Strategy, Write>
+{
+    /// Same reference as `Deref::deref` yields.
+    fn as_ref(&self) -> &Value {
+        &**self
+    }
+}
+
+impl<'lock, Value: ?Sized, Strategy: LockStrategy> AsMut<Value>
+    for Guard<'lock, Value, Strategy, true>
+{
+    /// Same reference as `DerefMut::deref_mut` yields.
+    fn as_mut(&mut self) -> &mut Value {
+        &mut **self
+    }
+}
+
+impl<'lock, Value: ?Sized, Strategy: LockStrategy, const Write: bool> Drop
+    for Guard<'lock, Value, Strategy, Write>
+{
+    /// Releases the lock through `Strategy::do_unlock`.
+    fn drop(&mut self) {
+        let data = self.strategy_data;
+        unsafe { Strategy::do_unlock(data, &mut self.context) }
+    }
+}

+ 157 - 0
src/sync/semaphore.rs

@@ -0,0 +1,157 @@
+use super::{condvar::CondVar, strategy::LockStrategy, Spin};
+
+/// Counting-semaphore lock strategy admitting at most `MAX` holders.
+pub struct SemaphoreStrategy<const MAX: usize = { core::usize::MAX }>;
+
+impl<const MAX: usize> SemaphoreStrategy<MAX> {
+    /// `true` while at least one holder is counted.
+    #[inline(always)]
+    fn is_locked(data: &<Self as LockStrategy>::StrategyData) -> bool {
+        // The counter guard is a temporary, released after the comparison.
+        *data.counter.lock() > 0
+    }
+}
+
+/// State shared by all users of one `SemaphoreStrategy` lock.
+pub struct SemaphoreData {
+    /// Number of current holders (incremented on lock, decremented on unlock).
+    counter: Spin<usize>,
+    /// Where would-be holders wait while the counter is at its maximum.
+    cv: CondVar,
+}
+
+unsafe impl<const MAX: usize> LockStrategy for SemaphoreStrategy<MAX> {
+    type StrategyData = SemaphoreData;
+    type GuardContext = ();
+
+    /// Fresh semaphore state: zero holders, no waiters.
+    #[inline(always)]
+    fn data() -> Self::StrategyData {
+        let counter = Spin::new(0);
+        let cv = CondVar::new();
+
+        SemaphoreData { counter, cv }
+    }
+
+    /// Takes one slot of the semaphore, waiting on the condition variable
+    /// while all `MAX` slots are in use.
+    ///
+    /// # Might Sleep
+    #[inline(always)]
+    unsafe fn do_lock(data: &Self::StrategyData) -> Self::GuardContext {
+        loop {
+            let mut holders = data.counter.lock();
+            assert!(*holders <= MAX);
+
+            if *holders >= MAX {
+                // Full: park until a holder leaves, then re-check from the top.
+                // TODO!!!: interruptible wait
+                data.cv.wait(&mut holders, false);
+                continue;
+            }
+
+            *holders += 1;
+            return;
+        }
+    }
+
+    /// Gives one slot back and wakes a single waiter.
+    ///
+    /// Panics if no slot is currently held.
+    #[inline(always)]
+    unsafe fn do_unlock(data: &Self::StrategyData, _: &mut Self::GuardContext) {
+        let mut holders = data.counter.lock();
+        assert!(*holders <= MAX);
+
+        if *holders == 0 {
+            panic!("Semaphore in inconsistent state");
+        }
+
+        *holders -= 1;
+        data.cv.notify_one();
+    }
+}
+
+/// Reader/writer semaphore strategy admitting at most `READ_MAX` readers.
+pub struct RwSemaphoreStrategy<const READ_MAX: isize = { core::isize::MAX }>;
+
+impl<const READ_MAX: isize> RwSemaphoreStrategy<READ_MAX> {
+    /// `true` while the counter is positive (readers present).
+    #[inline(always)]
+    fn is_read_locked(data: &<Self as LockStrategy>::StrategyData) -> bool {
+        *data.counter.lock() > 0
+    }
+
+    /// `true` while the counter is negative (writer present).
+    #[inline(always)]
+    fn is_write_locked(data: &<Self as LockStrategy>::StrategyData) -> bool {
+        *data.counter.lock() < 0
+    }
+}
+
+/// State shared by all users of one `RwSemaphoreStrategy` lock.
+pub struct RwSemaphoreData {
+    /// `-1` while a writer holds the lock, `0` when free, otherwise the
+    /// number of readers.
+    counter: Spin<isize>,
+    /// Where readers wait.
+    read_cv: CondVar,
+    /// Where writers wait.
+    write_cv: CondVar,
+}
+
+unsafe impl<const READ_MAX: isize> LockStrategy for RwSemaphoreStrategy<READ_MAX> {
+    type StrategyData = RwSemaphoreData;
+    type GuardContext = ();
+
+    /// Fresh state: counter at zero (free), no waiters on either queue.
+    #[inline(always)]
+    fn data() -> Self::StrategyData {
+        RwSemaphoreData {
+            counter: Spin::new(0),
+            read_cv: CondVar::new(),
+            write_cv: CondVar::new(),
+        }
+    }
+
+    /// Acquire the semaphore in write mode
+    ///
+    /// Succeeds only when the counter is zero (no reader, no writer) and
+    /// leaves it at `-1`; otherwise waits on the writer queue and retries.
+    ///
+    /// # Might Sleep
+    #[inline(always)]
+    unsafe fn do_lock(data: &Self::StrategyData) -> Self::GuardContext {
+        loop {
+            let mut counter = data.counter.lock();
+            assert!(*counter >= -1 && *counter <= READ_MAX);
+
+            if *counter == 0 {
+                *counter -= 1;
+                return;
+            }
+
+            // TODO!!!: interruptible wait
+            data.write_cv.wait(&mut counter, false);
+        }
+    }
+
+    /// Acquire the semaphore in read mode
+    ///
+    /// Succeeds when no writer holds the lock and fewer than `READ_MAX`
+    /// readers are present; otherwise waits on the reader queue and retries.
+    ///
+    /// # Might Sleep
+    #[inline(always)]
+    unsafe fn do_lock_shared(data: &Self::StrategyData) -> Self::GuardContext {
+        loop {
+            let mut counter = data.counter.lock();
+            assert!(*counter >= -1 && *counter <= READ_MAX);
+
+            if *counter >= 0 && *counter < READ_MAX {
+                *counter += 1;
+                return;
+            }
+
+            // TODO!!!: interruptible wait
+            data.read_cv.wait(&mut counter, false);
+        }
+    }
+
+    /// Releases either the writer (`-1`) or one reader (`> 0`).
+    ///
+    /// Panics if the lock is not held at all.
+    #[inline(always)]
+    unsafe fn do_unlock(data: &Self::StrategyData, _: &mut Self::GuardContext) {
+        let mut counter = data.counter.lock();
+        assert!(*counter >= -1 && *counter <= READ_MAX);
+
+        match *counter {
+            -1 => {
+                // Writer leaving: every reader and one writer may retry.
+                *counter = 0;
+                data.read_cv.notify_all();
+                data.write_cv.notify_one();
+            }
+            n if n > 0 => {
+                *counter -= 1;
+                if n == READ_MAX {
+                    // The reader limit was reached, so readers may be parked
+                    // on `read_cv` even though no writer is involved. Without
+                    // this wake-up they would sleep until some writer
+                    // happened to acquire and release the lock.
+                    data.read_cv.notify_one();
+                }
+                if *counter == 0 {
+                    // Last reader gone: a writer may now proceed.
+                    data.write_cv.notify_one();
+                }
+            }
+            _ => panic!("Semaphore in inconsistent state"),
+        }
+    }
+}

+ 105 - 0
src/sync/spin.rs

@@ -0,0 +1,105 @@
+use core::{
+    arch::asm,
+    sync::atomic::{AtomicBool, Ordering},
+};
+
+use crate::sync::preempt_disable;
+
+use super::{preempt_enable, strategy::LockStrategy};
+
+/// Busy-waiting lock strategy backed by a single `AtomicBool`.
+pub struct SpinStrategy;
+
+impl SpinStrategy {
+    /// Reads the lock flag with `Relaxed` ordering; `true` means held.
+    #[inline(always)]
+    fn is_locked(data: &<Self as LockStrategy>::StrategyData) -> bool {
+        data.load(Ordering::Relaxed)
+    }
+}
+
+unsafe impl LockStrategy for SpinStrategy {
+    type StrategyData = AtomicBool;
+    type GuardContext = ();
+
+    /// A new flag in the unlocked (`false`) state.
+    #[inline(always)]
+    fn data() -> Self::StrategyData {
+        AtomicBool::new(false)
+    }
+
+    /// Calls `preempt_disable()` and then spins until the flag has been
+    /// switched from `false` to `true` by this caller.
+    #[inline(always)]
+    unsafe fn do_lock(data: &Self::StrategyData) -> Self::GuardContext {
+        use Ordering::{Acquire, Relaxed};
+        preempt_disable();
+
+        // Outer loop: try to take the flag (`Acquire` on success).
+        while data
+            .compare_exchange_weak(false, true, Acquire, Relaxed)
+            .is_err()
+        {
+            // Inner loop: wait on plain relaxed loads until the flag looks
+            // free before attempting the compare-exchange again.
+            while Self::is_locked(data) {
+                core::hint::spin_loop();
+            }
+        }
+    }
+
+    /// Clears the flag with `Release` ordering, then calls
+    /// `preempt_enable()`.
+    #[inline(always)]
+    unsafe fn do_unlock(data: &Self::StrategyData, _: &mut Self::GuardContext) {
+        data.store(false, Ordering::Release);
+        preempt_enable();
+    }
+}
+
+/// Adapter strategy: wraps another `LockStrategy` and additionally saves the
+/// flags register and executes `cli` when locking, restoring the saved flags
+/// when unlocking.
+pub struct IrqStrategy<Strategy: LockStrategy> {
+    _phantom: core::marker::PhantomData<Strategy>,
+}
+
+unsafe impl<Strategy: LockStrategy> LockStrategy for IrqStrategy<Strategy> {
+    type StrategyData = Strategy::StrategyData;
+    /// The inner strategy's context plus the flags saved at lock time.
+    type GuardContext = (Strategy::GuardContext, usize);
+
+    /// Same strategy data as the wrapped strategy.
+    #[inline(always)]
+    fn data() -> Self::StrategyData {
+        Strategy::data()
+    }
+
+    /// Saves the flags register, executes `cli`, then takes the inner lock
+    /// exclusively.
+    #[inline(always)]
+    unsafe fn do_lock(data: &Self::StrategyData) -> Self::GuardContext {
+        let context: usize;
+        asm!(
+            "pushf",
+            "pop {context}",
+            "cli",
+            context = out(reg) context,
+        );
+
+        (Strategy::do_lock(data), context)
+    }
+
+    /// Saves the flags register, executes `cli`, then takes the inner lock
+    /// in shared mode.
+    ///
+    /// Without this override the trait default would route shared
+    /// acquisition through `do_lock`, i.e. take the inner lock exclusively.
+    #[inline(always)]
+    unsafe fn do_lock_shared(data: &Self::StrategyData) -> Self::GuardContext {
+        let context: usize;
+        asm!(
+            "pushf",
+            "pop {context}",
+            "cli",
+            context = out(reg) context,
+        );
+
+        (Strategy::do_lock_shared(data), context)
+    }
+
+    /// Releases the inner lock, then restores the flags saved at lock time.
+    #[inline(always)]
+    unsafe fn do_unlock(
+        data: &Self::StrategyData,
+        context: &mut Self::GuardContext,
+    ) {
+        Strategy::do_unlock(data, &mut context.0);
+
+        asm!(
+            "push {context}",
+            "popf",
+            context = in(reg) context.1,
+        )
+    }
+
+    /// Temporarily releases the inner lock; the saved flags are left
+    /// untouched and not restored.
+    ///
+    /// Delegates to the inner `do_temporary_unlock` so that it pairs with
+    /// the inner `do_relock` used below, even for strategies that override
+    /// that pair.
+    #[inline(always)]
+    unsafe fn do_temporary_unlock(
+        data: &Self::StrategyData,
+        context: &mut Self::GuardContext,
+    ) {
+        Strategy::do_temporary_unlock(data, &mut context.0)
+    }
+
+    /// Re-acquires the inner lock after a temporary unlock.
+    #[inline(always)]
+    unsafe fn do_relock(
+        data: &Self::StrategyData,
+        context: &mut Self::GuardContext,
+    ) {
+        Strategy::do_relock(data, &mut context.0);
+    }
+}

+ 33 - 0
src/sync/strategy.rs

@@ -0,0 +1,33 @@
+/// Acquire/release policy plugged into `Lock` and `Guard`.
+///
+/// The trait is `unsafe` to implement.
+// NOTE(review): the exact contract implementors must uphold is not written
+// down here; presumably mutual exclusion between `do_lock` and the matching
+// `do_unlock` — confirm and document.
+pub unsafe trait LockStrategy {
+    /// Per-lock state stored inside every lock using this strategy.
+    type StrategyData;
+    /// Per-acquisition state produced by locking and handed back on unlock.
+    type GuardContext;
+
+    /// Creates the state for a new, unlocked lock.
+    fn data() -> Self::StrategyData;
+
+    /// Acquires the lock and returns the context for this acquisition.
+    unsafe fn do_lock(data: &Self::StrategyData) -> Self::GuardContext;
+
+    /// Releases an acquisition described by `context`.
+    unsafe fn do_unlock(
+        data: &Self::StrategyData,
+        context: &mut Self::GuardContext,
+    );
+
+    /// Acquires the lock in shared mode. The default simply forwards to
+    /// `do_lock`.
+    unsafe fn do_lock_shared(data: &Self::StrategyData) -> Self::GuardContext {
+        Self::do_lock(data)
+    }
+
+    /// Releases the lock with the intention of re-acquiring it through
+    /// `do_relock`. The default forwards to `do_unlock`.
+    #[inline(always)]
+    unsafe fn do_temporary_unlock(
+        data: &Self::StrategyData,
+        context: &mut Self::GuardContext,
+    ) {
+        Self::do_unlock(data, context);
+    }
+
+    /// Re-acquires the lock after `do_temporary_unlock`. The default calls
+    /// `do_lock` and overwrites `context` with the result.
+    #[inline(always)]
+    unsafe fn do_relock(
+        data: &Self::StrategyData,
+        context: &mut Self::GuardContext,
+    ) {
+        *context = Self::do_lock(data);
+    }
+}

+ 12 - 18
src/types/elf.cpp

@@ -12,6 +12,7 @@
 #include <kernel/mem/vm_area.hpp>
 #include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
+#include <kernel/vfs/dentry.hpp>
 
 static inline void __user_push32(uintptr_t* sp, uint32_t d) {
     // TODO: use copy_to_user
@@ -28,37 +29,31 @@ static inline void __user_push_string32(uintptr_t* sp, const char* str) {
 }
 
 int types::elf::elf32_load(types::elf::elf32_load_data& d) {
-    auto& exec = d.exec_dent;
+    auto exec = fs::dentry_pointer{d.exec_dent};
     if (!exec)
         return -ENOENT;
 
-    auto* inode = fs::r_dentry_get_inode(exec.get());
-
     types::elf::elf32_header hdr{};
-    auto n_read =
-        fs::fs_read(inode, (char*)&hdr, sizeof(types::elf::elf32_header), 0,
-                 sizeof(types::elf::elf32_header));
+    auto n_read = fs::fs_read(exec.get(), (char*)&hdr, sizeof(types::elf::elf32_header), 0,
+                              sizeof(types::elf::elf32_header));
 
     if (n_read != sizeof(types::elf::elf32_header))
         return -EINVAL;
 
-    if (hdr.magic[0] != 0x7f || hdr.magic[1] != 'E' || hdr.magic[2] != 'L' ||
-        hdr.magic[3] != 'F')
+    if (hdr.magic[0] != 0x7f || hdr.magic[1] != 'E' || hdr.magic[2] != 'L' || hdr.magic[3] != 'F')
         return -EINVAL;
 
     size_t phents_size = hdr.phentsize * hdr.phnum;
     size_t shents_size = hdr.shentsize * hdr.shnum;
     std::vector<types::elf::elf32_program_header_entry> phents(hdr.phnum);
-    n_read = fs_read(inode, (char*)phents.data(), phents_size, hdr.phoff,
-                      phents_size);
+    n_read = fs::fs_read(exec.get(), (char*)phents.data(), phents_size, hdr.phoff, phents_size);
 
     // broken file or I/O error
     if (n_read != phents_size)
         return -EINVAL;
 
     std::vector<types::elf::elf32_section_header_entry> shents(hdr.shnum);
-    n_read = fs_read(inode, (char*)shents.data(), shents_size, hdr.shoff,
-                      shents_size);
+    n_read = fs::fs_read(exec.get(), (char*)shents.data(), shents_size, hdr.shoff, shents_size);
 
     // broken file or I/O error
     if (n_read != shents_size)
@@ -86,8 +81,7 @@ int types::elf::elf32_load(types::elf::elf32_load_data& d) {
 
             args.vaddr = vaddr;
             args.length = flen;
-            // TODO!!!!!!!: get ownership
-            args.file_inode = inode;
+            args.file = fs::d_get(exec);
             args.file_offset = fileoff;
 
             args.flags = MM_MAPPED;
@@ -149,12 +143,12 @@ int types::elf::elf32_load(types::elf::elf32_load_data& d) {
 
     // fill information block area
     std::vector<elf32_addr_t> args, envs;
-    for (const auto& env : d.envp) {
-        __user_push_string32(sp, env.c_str());
+    for (size_t i = 0; i < d.envp_count; ++i) {
+        __user_push_string32(sp, d.envp[i]);
         envs.push_back((uintptr_t)*sp);
     }
-    for (const auto& arg : d.argv) {
-        __user_push_string32(sp, arg.c_str());
+    for (size_t i = 0; i < d.argv_count; ++i) {
+        __user_push_string32(sp, d.argv[i]);
         args.push_back((uintptr_t)*sp);
     }