Ver código fonte

rewrite: some of the process part and syscalls

greatbridf 5 meses atrás
pai
commit
b9756e3233
93 arquivos alterados com 5465 adições e 4006 exclusões
  1. 1 26
      CMakeLists.txt
  2. 13 0
      Cargo.lock
  3. 2 0
      Cargo.toml
  4. 14 0
      arch/Cargo.lock
  5. 7 0
      arch/Cargo.toml
  6. 92 0
      arch/src/lib.rs
  7. 6 0
      arch/x86_64/Cargo.toml
  8. 13 0
      arch/x86_64/src/interrupt.rs
  9. 76 0
      arch/x86_64/src/io.rs
  10. 56 0
      arch/x86_64/src/lib.rs
  11. 100 0
      arch/x86_64/src/task.rs
  12. 0 29
      include/kernel/async/waitlist.hpp
  13. 0 20
      include/kernel/hw/serial.hpp
  14. 0 11
      include/kernel/hw/timer.hpp
  15. 2 15
      include/kernel/log.hpp
  16. 0 38
      include/kernel/mem/paging.hpp
  17. 0 37
      include/kernel/module.hpp
  18. 1 141
      include/kernel/process.hpp
  19. 0 74
      include/kernel/signal.hpp
  20. 0 112
      include/kernel/syscall.hpp
  21. 0 5
      include/kernel/task/current.hpp
  22. 0 16
      include/kernel/task/readyqueue.hpp
  23. 0 76
      include/kernel/task/thread.hpp
  24. 0 75
      include/kernel/tty.hpp
  25. 0 21
      include/kernel/user/thread_local.hpp
  26. 0 52
      src/asm/interrupt.s
  27. 1 1
      src/boot.s
  28. 0 71
      src/dev/builtin-chardev.cc
  29. 21 0
      src/driver.rs
  30. 8 10
      src/driver/ahci/port.rs
  31. 155 0
      src/driver/serial.rs
  32. 14 0
      src/driver/timer.rs
  33. 3 0
      src/io.rs
  34. 7 51
      src/kernel.ld
  35. 10 0
      src/kernel.rs
  36. 0 1
      src/kernel/allocator.cc
  37. 0 58
      src/kernel/async/waitlist.cc
  38. 137 0
      src/kernel/chardev.rs
  39. 27 11
      src/kernel/console.rs
  40. 32 0
      src/kernel/constants.rs
  41. 0 1
      src/kernel/hw/pci.cc
  42. 0 115
      src/kernel/hw/serial.cc
  43. 0 28
      src/kernel/hw/timer.cc
  44. 6 13
      src/kernel/interrupt.cpp
  45. 10 0
      src/kernel/mem.rs
  46. 102 0
      src/kernel/mem/mm_area.rs
  47. 0 322
      src/kernel/mem/mm_list.cc
  48. 320 0
      src/kernel/mem/mm_list.rs
  49. 288 0
      src/kernel/mem/page_table.rs
  50. 0 84
      src/kernel/mem/paging.cc
  51. 67 35
      src/kernel/mem/paging.rs
  52. 2 2
      src/kernel/mem/phys.rs
  53. 162 0
      src/kernel/mem/vrange.rs
  54. 0 31
      src/kernel/module.cc
  55. 0 299
      src/kernel/process.cpp
  56. 0 214
      src/kernel/signal.cpp
  57. 0 286
      src/kernel/syscall.cpp
  58. 182 90
      src/kernel/syscall.rs
  59. 83 45
      src/kernel/syscall/file_rw.rs
  60. 0 130
      src/kernel/syscall/fileops.cc
  61. 0 50
      src/kernel/syscall/infoops.cc
  62. 112 0
      src/kernel/syscall/mm.rs
  63. 15 0
      src/kernel/syscall/net.rs
  64. 0 329
      src/kernel/syscall/procops.cc
  65. 358 47
      src/kernel/syscall/procops.rs
  66. 102 0
      src/kernel/syscall/sysinfo.rs
  67. 13 0
      src/kernel/task.rs
  68. 128 0
      src/kernel/task/kstack.rs
  69. 0 47
      src/kernel/task/readyqueue.cc
  70. 211 0
      src/kernel/task/scheduler.rs
  71. 395 0
      src/kernel/task/signal.rs
  72. 0 197
      src/kernel/task/thread.cc
  73. 982 0
      src/kernel/task/thread.rs
  74. 703 0
      src/kernel/terminal.rs
  75. 39 0
      src/kernel/timer.rs
  76. 0 341
      src/kernel/tty.cpp
  77. 5 0
      src/kernel/user.rs
  78. 129 33
      src/kernel/user/dataflow.rs
  79. 0 23
      src/kernel/user/thread_local.cc
  80. 0 1
      src/kernel/vfs.cpp
  81. 9 0
      src/kernel/vfs/dentry.rs
  82. 59 70
      src/kernel/vfs/file.rs
  83. 26 10
      src/kernel/vfs/filearray.rs
  84. 23 39
      src/kernel/vfs/mod.rs
  85. 5 7
      src/kernel/vfs/mount.rs
  86. 3 32
      src/kinit.cpp
  87. 22 6
      src/lib.rs
  88. 1 1
      src/prelude.rs
  89. 25 18
      src/sync.rs
  90. 55 79
      src/sync/condvar.rs
  91. 8 0
      src/sync/lock.rs
  92. 10 13
      src/sync/semaphore.rs
  93. 7 17
      src/sync/spin.rs

+ 1 - 26
CMakeLists.txt

@@ -38,65 +38,40 @@ set(BOOTLOADER_SOURCES src/boot.s
                        src/asm/interrupt.s
                        )
 
-set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
-                        src/kinit.cpp
-                        src/kernel/async/waitlist.cc
+set(KERNEL_MAIN_SOURCES src/kinit.cpp
                         src/kernel/async/lock.cc
                         src/kernel/allocator.cc
                         src/kernel/interrupt.cpp
                         src/kernel/process.cpp
-                        src/kernel/tty.cpp
-                        src/kernel/syscall.cpp
-                        src/kernel/syscall/fileops.cc
-                        src/kernel/syscall/infoops.cc
-                        src/kernel/syscall/procops.cc
-                        src/kernel/mem/mm_list.cc
                         src/kernel/mem/paging.cc
                         src/kernel/mem/slab.cc
-                        src/kernel/module.cc
                         src/kernel/vfs.cpp
                         src/kernel/vga.cpp
                         src/kernel/hw/acpi.cc
                         src/kernel/hw/pci.cc
-                        src/kernel/hw/serial.cc
-                        src/kernel/hw/timer.cc
-                        src/kernel/task/thread.cc
-                        src/kernel/task/readyqueue.cc
-                        src/kernel/user/thread_local.cc
-                        src/kernel/signal.cpp
                         src/net/ethernet.cc
                         src/types/crc.cc
                         src/types/elf.cpp
                         src/types/libstdcpp.cpp
                         include/defs.hpp
-                        include/kernel/async/waitlist.hpp
                         include/kernel/async/lock.hpp
-                        include/kernel/tty.hpp
                         include/kernel/interrupt.hpp
                         include/kernel/irq.hpp
                         include/kernel/process.hpp
-                        include/kernel/syscall.hpp
                         include/kernel/mem/mm_list.hpp
                         include/kernel/mem/paging.hpp
                         include/kernel/mem/slab.hpp
                         include/kernel/mem/types.hpp
                         include/kernel/mem/vm_area.hpp
-                        include/kernel/module.hpp
                         include/kernel/utsname.hpp
                         include/kernel/vfs.hpp
                         include/kernel/vfs/dentry.hpp
                         include/kernel/vga.hpp
-                        include/kernel/signal.hpp
                         include/kernel/task/forward.hpp
-                        include/kernel/task/thread.hpp
-                        include/kernel/task/readyqueue.hpp
                         include/kernel/hw/acpi.hpp
                         include/kernel/hw/pci.hpp
                         include/kernel/hw/port.hpp
-                        include/kernel/hw/serial.hpp
-                        include/kernel/hw/timer.hpp
                         include/kernel/input/keycodes.h
-                        include/kernel/user/thread_local.hpp
                         include/net/arp.hpp
                         include/net/ethernet.hpp
                         include/net/netdev.hpp

+ 13 - 0
Cargo.lock

@@ -11,6 +11,13 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "arch"
+version = "0.1.0"
+dependencies = [
+ "x86_64",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.4.0"
@@ -79,7 +86,9 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 name = "gbos-rust-part"
 version = "0.1.0"
 dependencies = [
+ "arch",
  "bindgen",
+ "bitflags",
  "itertools",
  "lazy_static",
  "spin",
@@ -327,3 +336,7 @@ name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "x86_64"
+version = "0.1.0"

+ 2 - 0
Cargo.toml

@@ -7,6 +7,8 @@ edition = "2021"
 crate-type = ["staticlib"]
 
 [dependencies]
+arch = { path="./arch" }
+bitflags = "2.6.0"
 itertools = { version = "0.13.0", default-features = false }
 lazy_static = { version = "1.5.0", features = ["spin_no_std"] }
 spin = "0.9.8"

+ 14 - 0
arch/Cargo.lock

@@ -0,0 +1,14 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "arch"
+version = "0.1.0"
+dependencies = [
+ "x86_64",
+]
+
+[[package]]
+name = "x86_64"
+version = "0.1.0"

+ 7 - 0
arch/Cargo.toml

@@ -0,0 +1,7 @@
+[package]
+name = "arch"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+x86_64 = { path="./x86_64" }

+ 92 - 0
arch/src/lib.rs

@@ -0,0 +1,92 @@
+#![no_std]
+
+/// Architecture-neutral virtual-memory primitives.
+///
+/// Every function here forwards directly to the `x86_64::vm` implementation.
+pub mod vm {
+    /// Invalidates the translation for `vaddr` by calling `x86_64::vm::invlpg`.
+    pub fn invlpg(vaddr: usize) {
+        x86_64::vm::invlpg(vaddr)
+    }
+
+    /// Invalidates translations wholesale by calling `x86_64::vm::invlpg_all`.
+    pub fn invlpg_all() {
+        x86_64::vm::invlpg_all()
+    }
+
+    /// Returns the raw value of the current page-table register (CR3 on x86_64).
+    pub fn current_page_table() -> usize {
+        x86_64::vm::get_cr3()
+    }
+
+    /// Passes `pfn` unchanged to `x86_64::vm::set_cr3`.
+    ///
+    /// NOTE(review): the parameter is named `pfn`, yet no shift or conversion is
+    /// applied before it reaches CR3 -- confirm whether callers pass a frame
+    /// number or a physical address.
+    pub fn switch_page_table(pfn: usize) {
+        x86_64::vm::set_cr3(pfn)
+    }
+}
+
+/// Architecture-neutral CPU/task control primitives, forwarded to `x86_64::task`.
+pub mod task {
+    /// Halts the CPU once (`hlt` on x86_64) and returns when execution resumes.
+    #[inline(always)]
+    pub fn halt() {
+        x86_64::task::halt()
+    }
+
+    /// Emits a spin-wait hint (`pause` on x86_64).
+    #[inline(always)]
+    pub fn pause() {
+        x86_64::task::pause()
+    }
+
+    /// Never returns: the x86_64 implementation loops forever, disabling
+    /// interrupts and halting on each iteration.
+    #[inline(always)]
+    pub fn freeze() -> ! {
+        x86_64::task::freeze()
+    }
+
+    /// Switch to the `next` task. `IF` state is also switched.
+    ///
+    /// This function should only be used to switch between tasks that do not need SMP synchronization.
+    ///
+    /// # Arguments
+    /// * `current_task_sp` - Pointer to the stack pointer of the current task.
+    /// * `next_task_sp` - Pointer to the stack pointer of the next task.
+    #[inline(always)]
+    pub fn context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize) {
+        x86_64::task::context_switch_light(current_task_sp, next_task_sp);
+    }
+}
+
+/// Architecture-neutral interrupt masking, forwarded to `x86_64::interrupt`.
+pub mod interrupt {
+    /// Enables interrupts on the current CPU (`sti` on x86_64).
+    #[inline(always)]
+    pub fn enable() {
+        x86_64::interrupt::enable()
+    }
+
+    /// Disables interrupts on the current CPU (`cli` on x86_64).
+    #[inline(always)]
+    pub fn disable() {
+        x86_64::interrupt::disable()
+    }
+}
+
+/// Port I/O primitives, forwarded to `x86_64::io`.
+///
+/// `in*` functions read from `port`; `out*` functions write `data` to `port`.
+/// The suffix selects the access width: `b` = 8 bits, `w` = 16 bits, `l` = 32 bits.
+pub mod io {
+    /// Reads an 8-bit value from `port`.
+    #[inline(always)]
+    pub fn inb(port: u16) -> u8 {
+        x86_64::io::inb(port)
+    }
+
+    /// Writes the 8-bit value `data` to `port`.
+    #[inline(always)]
+    pub fn outb(port: u16, data: u8) {
+        x86_64::io::outb(port, data)
+    }
+
+    /// Reads a 16-bit value from `port`.
+    #[inline(always)]
+    pub fn inw(port: u16) -> u16 {
+        x86_64::io::inw(port)
+    }
+
+    /// Writes the 16-bit value `data` to `port`.
+    #[inline(always)]
+    pub fn outw(port: u16, data: u16) {
+        x86_64::io::outw(port, data)
+    }
+
+    /// Reads a 32-bit value from `port`.
+    #[inline(always)]
+    pub fn inl(port: u16) -> u32 {
+        x86_64::io::inl(port)
+    }
+
+    /// Writes the 32-bit value `data` to `port`.
+    #[inline(always)]
+    pub fn outl(port: u16, data: u32) {
+        x86_64::io::outl(port, data)
+    }
+}

+ 6 - 0
arch/x86_64/Cargo.toml

@@ -0,0 +1,6 @@
+[package]
+name = "x86_64"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]

+ 13 - 0
arch/x86_64/src/interrupt.rs

@@ -0,0 +1,13 @@
+use core::arch::asm;
+
+/// Executes `sti`, setting the interrupt flag so maskable interrupts are delivered.
+pub fn enable() {
+    unsafe {
+        asm!("sti");
+    }
+}
+
+/// Executes `cli`, clearing the interrupt flag so maskable interrupts are held off.
+pub fn disable() {
+    unsafe {
+        asm!("cli");
+    }
+}

+ 76 - 0
arch/x86_64/src/io.rs

@@ -0,0 +1,76 @@
+use core::arch::asm;
+
+/// Reads one byte from I/O port `no`.
+///
+/// The port number is placed in DX and the result is returned in AL, as the
+/// `inb` instruction requires.
+///
+/// NOTE(review): the asm is marked `nomem`; confirm that the compiler being free
+/// to reorder this around memory accesses is acceptable for the devices involved.
+pub fn inb(no: u16) -> u8 {
+    let data;
+    unsafe {
+        asm!(
+            "inb %dx, %al",
+            in("dx") no,
+            out("al") data,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+
+    data
+}
+
+/// Reads a 16-bit word from I/O port `no`.
+///
+/// The port number is placed in DX and the result is returned in AX.
+pub fn inw(no: u16) -> u16 {
+    let data;
+    unsafe {
+        asm!(
+            "inw %dx, %ax",
+            in("dx") no,
+            out("ax") data,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+
+    data
+}
+
+/// Reads a 32-bit doubleword from I/O port `no`.
+///
+/// The port number is placed in DX and the result is returned in EAX.
+pub fn inl(no: u16) -> u32 {
+    let data;
+    unsafe {
+        asm!(
+            "inl %dx, %eax",
+            in("dx") no,
+            out("eax") data,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+
+    data
+}
+
+/// Writes the byte `data` to I/O port `no`.
+///
+/// `data` is placed in AL and the port number in DX for the `outb` instruction.
+pub fn outb(no: u16, data: u8) {
+    unsafe {
+        asm!(
+            "outb %al, %dx",
+            in("al") data,
+            in("dx") no,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+}
+
+/// Writes the 16-bit word `data` to I/O port `no`.
+///
+/// `data` is placed in AX and the port number in DX for the `outw` instruction.
+pub fn outw(no: u16, data: u16) {
+    unsafe {
+        asm!(
+            "outw %ax, %dx",
+            in("ax") data,
+            in("dx") no,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+}
+
+/// Writes the 32-bit doubleword `data` to I/O port `no`.
+///
+/// `data` is placed in EAX and the port number in DX for the `outl` instruction.
+pub fn outl(no: u16, data: u32) {
+    unsafe {
+        asm!(
+            "outl %eax, %dx",
+            in("eax") data,
+            in("dx") no,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+}

+ 56 - 0
arch/x86_64/src/lib.rs

@@ -0,0 +1,56 @@
+#![no_std]
+
+pub mod vm {
+    use core::arch::asm;
+
+    /// Executes `invlpg` on the address `vaddr`, invalidating the cached
+    /// translation for the page that contains it.
+    #[inline(always)]
+    pub fn invlpg(vaddr: usize) {
+        unsafe {
+            asm!(
+                "invlpg ({})",
+                in(reg) vaddr,
+                options(att_syntax)
+            );
+        }
+    }
+
+    /// Reloads CR3 with its own current value (read into RAX, then written back).
+    ///
+    /// Writing CR3 is the conventional x86 way to drop all non-global TLB
+    /// entries; RAX is declared as clobbered.
+    #[inline(always)]
+    pub fn invlpg_all() {
+        unsafe {
+            asm!(
+                "mov %cr3, %rax",
+                "mov %rax, %cr3",
+                out("rax") _,
+                options(att_syntax)
+            );
+        }
+    }
+
+    /// Returns the raw contents of the CR3 register.
+    #[inline(always)]
+    pub fn get_cr3() -> usize {
+        let cr3: usize;
+        unsafe {
+            asm!(
+                // AT&T operand order: source (%cr3) first, destination second.
+                "mov %cr3, {0}",
+                out(reg) cr3,
+                options(att_syntax)
+            );
+        }
+        cr3
+    }
+
+    /// Writes `pfn` into the CR3 register, making it the active page table.
+    ///
+    /// The value is loaded into CR3 unmodified; no shifting or flag handling is
+    /// done here.
+    ///
+    /// NOTE(review): despite the parameter name, CR3 expects the physical base
+    /// address of the top-level table -- confirm what callers actually pass.
+    #[inline(always)]
+    pub fn set_cr3(pfn: usize) {
+        unsafe {
+            asm!(
+                // AT&T operand order is `mov src, dst`. The previous template,
+                // `mov %cr3, {0}`, read CR3 into the input register instead of
+                // writing it, so the page table was never switched.
+                "mov {0}, %cr3",
+                in(reg) pfn,
+                options(att_syntax)
+            );
+        }
+    }
+}
+
+pub mod interrupt;
+pub mod io;
+pub mod task;

+ 100 - 0
arch/x86_64/src/task.rs

@@ -0,0 +1,100 @@
+use core::arch::{asm, global_asm};
+
+/// Executes a single `hlt` instruction and returns once the CPU resumes.
+#[inline(always)]
+pub fn halt() {
+    unsafe {
+        asm!("hlt", options(att_syntax, nostack));
+    }
+}
+
+/// Executes a single `pause` instruction (spin-wait hint).
+#[inline(always)]
+pub fn pause() {
+    unsafe {
+        asm!("pause", options(att_syntax, nostack));
+    }
+}
+
+/// Stops this CPU for good: loops forever, executing `cli` and then `hlt`.
+///
+/// The loop re-disables interrupts and halts again if `hlt` ever returns.
+#[inline(always)]
+pub fn freeze() -> ! {
+    loop {
+        unsafe {
+            asm!("cli", options(att_syntax, nostack));
+        }
+
+        halt();
+    }
+}
+
+// Assembly implementation of `__context_switch_light(current_task_sp, next_task_sp)`.
+//
+// Save path: pushes RFLAGS, reserves 0x38 bytes, stores rbx/rbp/r12-r15 at
+// 0x08..0x30(%rsp), pushes the old value of `*current_task_sp` (%rdi), then
+// stores the resulting %rsp into `*current_task_sp`.
+// Restore path: loads %rsp from `*next_task_sp` (%rsi), pops the saved value
+// back into `*next_task_sp`, pops one padding word, calls the external symbol
+// `after_ctx_switch`, reloads the callee-saved registers, pops RFLAGS and
+// returns on the target stack.
+//
+// NOTE(review): the `movrst` macro is defined but the restore path uses plain
+// `mov`, so the restored registers carry no `.cfi_restore` annotations.
+global_asm!(
+    r"
+    .macro movcfi reg, offset
+    	mov \reg, \offset(%rsp)
+    	.cfi_rel_offset \reg, \offset
+    .endm
+
+    .macro movrst reg, offset
+    	mov \offset(%rsp), \reg
+    	.cfi_restore \reg
+    .endm
+
+    .globl __context_switch_light
+    .type __context_switch_light @function
+    __context_switch_light:
+    .cfi_startproc
+
+        pushf
+    .cfi_def_cfa_offset 0x10
+
+	    sub $0x38, %rsp  # extra 8 bytes to align to 16 bytes
+    .cfi_def_cfa_offset 0x48
+
+	    movcfi %rbx, 0x08
+	    movcfi %rbp, 0x10
+	    movcfi %r12, 0x18
+	    movcfi %r13, 0x20
+	    movcfi %r14, 0x28
+	    movcfi %r15, 0x30
+
+        push (%rdi) 	 # save sp of previous stack frame of current
+	                     # acts as saving bp
+    .cfi_def_cfa_offset 0x50
+
+        mov %rsp, (%rdi) # save sp of current stack
+        mov (%rsi), %rsp # load sp of target stack
+
+        pop (%rsi)       # load sp of previous stack frame of target
+	                     # acts as restoring previous bp
+    .cfi_def_cfa_offset 0x48
+
+	    pop %rax         # align to 16 bytes
+    .cfi_def_cfa_offset 0x40
+
+	    call after_ctx_switch
+
+	    mov 0x28(%rsp), %r15
+	    mov 0x20(%rsp), %r14
+	    mov 0x18(%rsp), %r13
+	    mov 0x10(%rsp), %r12
+	    mov 0x08(%rsp), %rbp
+        mov 0x00(%rsp), %rbx
+
+	    add $0x30, %rsp
+    .cfi_def_cfa_offset 0x10
+
+        popf
+    .cfi_def_cfa_offset 0x08
+
+        ret
+    .cfi_endproc
+    ",
+    options(att_syntax),
+);
+
+extern "C" {
+    // Defined by the `global_asm!` block in this file.
+    fn __context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize);
+}
+
+/// Saves the current context and resumes the one whose stack pointer is stored
+/// at `next_task_sp`.
+///
+/// The assembly routine writes the outgoing stack pointer through
+/// `current_task_sp` and reads the incoming one through `next_task_sp`; it also
+/// saves and restores RFLAGS across the switch.
+///
+/// NOTE(review): this is a safe `fn` that dereferences both raw pointers inside
+/// the assembly -- callers must pass valid, writable pointers; consider whether
+/// it should be `unsafe fn`.
+#[inline(always)]
+pub fn context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize) {
+    unsafe { __context_switch_light(current_task_sp, next_task_sp) }
+}

+ 0 - 29
include/kernel/async/waitlist.hpp

@@ -1,29 +0,0 @@
-#pragma once
-
-#include <set>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/task/forward.hpp>
-
-namespace kernel::async {
-
-class wait_list {
-   private:
-    mutex m_mtx;
-    std::set<task::thread*> m_subscribers;
-
-    wait_list(const wait_list&) = delete;
-
-   public:
-    explicit wait_list() = default;
-
-    // @return whether the wait is interrupted
-    bool wait(mutex& lck);
-
-    void subscribe();
-
-    void notify_one();
-    void notify_all();
-};
-
-} // namespace kernel::async

+ 0 - 20
include/kernel/hw/serial.hpp

@@ -1,20 +0,0 @@
-#pragma once
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define PORT_SERIAL0 (0x3f8)
-#define PORT_SERIAL1 (0x2f8)
-
-int32_t init_serial_port(port_id_t port);
-
-int32_t is_serial_has_data(port_id_t port);
-uint8_t serial_read_data(port_id_t port);
-
-int32_t is_serial_ready_for_transmition(port_id_t port);
-void serial_send_data(port_id_t port, uint8_t data);
-
-#ifdef __cplusplus
-}
-#endif

+ 0 - 11
include/kernel/hw/timer.hpp

@@ -1,11 +0,0 @@
-#pragma once
-
-#include <cstddef>
-
-namespace kernel::hw::timer {
-void init_pit(void);
-void inc_tick(void);
-
-std::size_t current_ticks(void);
-
-} // namespace kernel::hw::timer

+ 2 - 15
include/kernel/log.hpp

@@ -1,20 +1,7 @@
 #pragma once
 
-#include <stdio.h>
-
-#include <kernel/tty.hpp>
-
-#define kmsgf(fmt, ...)                                                  \
-    if (1) {                                                             \
-        char buf[512];                                                   \
-        snprintf(buf, sizeof(buf), fmt "\n" __VA_OPT__(, ) __VA_ARGS__); \
-        if (kernel::tty::console)                                        \
-            kernel::tty::console->print(buf);                            \
-    }
-
-#define kmsg(msg)             \
-    if (kernel::tty::console) \
-    kernel::tty::console->print(msg "\n")
+#define kmsgf(fmt, ...)
+#define kmsg(msg)
 
 #ifdef NDEBUG
 #define kmsgf_debug(...)

+ 0 - 38
include/kernel/mem/paging.hpp

@@ -45,15 +45,11 @@ constexpr psattr_t PA_USER_DATA = PA_DATA | PA_G | PA_US;
 
 constexpr psattr_t PA_PAGE_TABLE = PA_P | PA_RW;
 constexpr psattr_t PA_KERNEL_PAGE_TABLE = PA_PAGE_TABLE | PA_G;
-constexpr psattr_t PA_USER_PAGE_TABLE = PA_PAGE_TABLE | PA_US;
 
 constexpr psattr_t PA_DATA_HUGE = PA_DATA | PA_PS;
 constexpr psattr_t PA_KERNEL_DATA_HUGE = PA_DATA_HUGE | PA_G;
 constexpr psattr_t PA_USER_DATA_HUGE = PA_DATA_HUGE | PA_US;
 
-constexpr psattr_t PA_ANONYMOUS_PAGE = PA_P | PA_US | PA_COW | PA_ANON;
-constexpr psattr_t PA_MMAPPED_PAGE = PA_US | PA_COW | PA_ANON | PA_MMAP;
-
 namespace __inner {
     using pse_t = uint64_t;
 
@@ -134,38 +130,4 @@ constexpr unsigned long PAGE_FAULT_SGX = 0x00008000;
 
 void handle_page_fault(interrupt_stack* int_stack);
 
-class vaddr_range {
-    std::size_t n;
-
-    int idx4;
-    int idx3;
-    int idx2;
-    int idx1;
-
-    PSE pml4;
-    PSE pdpt;
-    PSE pd;
-    PSE pt;
-
-    uintptr_t m_start;
-    uintptr_t m_end;
-
-    bool is_privilege;
-
-   public:
-    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool is_privilege = false);
-    explicit vaddr_range(std::nullptr_t);
-
-    vaddr_range begin() const noexcept;
-    vaddr_range end() const noexcept;
-
-    PSE operator*() const noexcept;
-
-    vaddr_range& operator++();
-    operator bool() const noexcept;
-
-    // compares remaining pages to iterate
-    bool operator==(const vaddr_range& other) const noexcept;
-};
-
 } // namespace kernel::mem::paging

+ 0 - 37
include/kernel/module.hpp

@@ -1,37 +0,0 @@
-#pragma once
-
-#include <memory>
-
-#include <types/types.h>
-
-#define MODULE_LOADER(name) \
-    static std::unique_ptr<kernel::kmod::kmod> __module##name##_loader()
-
-#define INTERNAL_MODULE(name, type)                                         \
-    MODULE_LOADER(name);                                                    \
-    SECTION(".kmods")                                                       \
-    __attribute__((used))                                                   \
-    std::unique_ptr<kernel::kmod::kmod> (*const __module##name##_entry)() = \
-        __module##name##_loader;                                            \
-    MODULE_LOADER(name) {                                                   \
-        return std::make_unique<type>();                                    \
-    }
-
-namespace kernel::kmod {
-
-struct kmod {
-    const char* const name;
-
-    explicit kmod(const char* name);
-
-    virtual ~kmod() = default;
-    kmod(const kmod&) = delete;
-    kmod& operator=(const kmod&) = delete;
-
-    virtual int init() = 0;
-};
-
-extern "C" std::unique_ptr<kmod> (*const KMOD_LOADERS_START[])();
-void load_internal_modules();
-
-} // namespace kernel::kmod

+ 1 - 141
include/kernel/process.hpp

@@ -9,6 +9,7 @@
 #include <assert.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <signal.h>
 #include <stdint.h>
 #include <sys/types.h>
 
@@ -17,156 +18,15 @@
 #include <types/path.hpp>
 #include <types/types.h>
 
-#include <kernel/async/waitlist.hpp>
 #include <kernel/interrupt.hpp>
 #include <kernel/mem/mm_list.hpp>
 #include <kernel/mem/paging.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/task/current.hpp>
-#include <kernel/task/thread.hpp>
-#include <kernel/tty.hpp>
-#include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
 
-class process;
-
-class proclist;
-
-inline process* volatile current_process;
-inline proclist* procs;
-
-struct process_attr {
-    uint16_t system : 1;
-    uint16_t zombie : 1 = 0;
-};
-
-class process {
-   public:
-    struct wait_obj {
-        pid_t pid;
-        int code;
-    };
-
-   public:
-    kernel::mem::mm_list mms{};
-    std::set<kernel::task::thread> thds;
-    kernel::async::wait_list waitlist;
-
-    kernel::async::mutex mtx_waitprocs;
-    std::list<wait_obj> waitprocs;
-
-    process_attr attr{};
-    fs::rust_file_array files;
-    fs::rust_fs_context fs_context;
-
-    pid_t pid{};
-    pid_t ppid{};
-    pid_t pgid{};
-    pid_t sid{};
-
-    kernel::tty::tty* control_tty{};
-    std::set<pid_t> children;
-
-   public:
-    process(const process&) = delete;
-    explicit process(const process& parent, pid_t pid);
-
-    // this function is used for system initialization
-    // DO NOT use this after the system is on
-    explicit process(pid_t pid, pid_t ppid);
-
-    constexpr bool is_system(void) const { return attr.system; }
-    constexpr bool is_zombie(void) const { return attr.zombie; }
-
-    void send_signal(kernel::signal_list::signo_type signal);
-};
-
-class proclist final {
-   private:
-    std::map<pid_t, process> m_procs;
-    pid_t m_nextpid = 2;
-
-    constexpr pid_t next_pid() { return m_nextpid++; }
-    process& real_emplace(pid_t pid, pid_t ppid);
-
-   public:
-    proclist();
-
-    constexpr process& copy_from(process& proc) {
-        pid_t pid = next_pid();
-        auto [iter, inserted] = m_procs.try_emplace(pid, proc, pid);
-        assert(inserted);
-
-        proc.children.insert(pid);
-        return iter->second;
-    }
-
-    constexpr void remove(pid_t pid) {
-        make_children_orphans(pid);
-
-        auto proc_iter = m_procs.find(pid);
-
-        auto ppid = proc_iter->second.ppid;
-        find(ppid).children.erase(pid);
-
-        m_procs.erase(proc_iter);
-    }
-
-    constexpr std::pair<process*, bool> try_find(pid_t pid) const {
-        auto iter = m_procs.find(pid);
-        if (iter)
-            return {(process*)&iter->second, true};
-        else
-            return {nullptr, false};
-    }
-
-    // if process doesn't exist, the behavior is undefined
-    constexpr process& find(pid_t pid) {
-        auto [ptr, found] = try_find(pid);
-        assert(found);
-        return *ptr;
-    }
-
-    constexpr void make_children_orphans(pid_t pid) {
-        auto& children = find(pid).children;
-        auto& init_children = find(1).children;
-
-        for (auto item : children) {
-            init_children.insert(item);
-            find(item).ppid = 1;
-        }
-
-        children.clear();
-    }
-
-    // the process MUST exist, or the behavior is undefined
-    void send_signal(pid_t pid, kernel::signal_list::signo_type signal) {
-        auto& proc = find(pid);
-        proc.send_signal(signal);
-    }
-    void send_signal_grp(pid_t pgid, kernel::signal_list::signo_type signal) {
-        // TODO: find processes that are in the same session quickly
-        for (auto& [pid, proc] : m_procs) {
-            if (proc.pgid != pgid)
-                continue;
-            proc.send_signal(signal);
-        }
-    }
-
-    void kill(pid_t pid, int exit_code);
-
-    constexpr auto begin() const { return m_procs.begin(); }
-    constexpr auto end() const { return m_procs.end(); }
-};
-
 void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn);
 /// @return true if returned normally, false if being interrupted
-bool schedule_now(void);
-bool schedule_now_preempt_disabled();
 void NORETURN schedule_noreturn(void);
 
 void NORETURN freeze(void);
 void NORETURN kill_current(int signo);
-
-void check_signal(void);

+ 0 - 74
include/kernel/signal.hpp

@@ -1,74 +0,0 @@
-#pragma once
-
-#include <list>
-#include <map>
-
-#include <signal.h>
-#include <stdint.h>
-
-#include <types/cplusplus.hpp>
-#include <types/types.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/interrupt.hpp>
-
-namespace kernel {
-
-using sigmask_type = uint64_t;
-
-struct sigaction {
-    sighandler_t sa_handler;
-    unsigned long sa_flags;
-    sigrestorer_t sa_restorer;
-    sigmask_type sa_mask;
-};
-
-class signal_list {
-   public:
-    using signo_type = uint32_t;
-    using list_type = std::list<signo_type>;
-
-   private:
-    list_type m_list;
-    sigmask_type m_mask{};
-    std::map<signo_type, sigaction> m_handlers;
-    async::mutex m_mtx;
-
-   public:
-    static constexpr bool check_valid(signo_type sig) {
-        return sig >= 1 && sig <= 64;
-    }
-
-   public:
-    constexpr signal_list() = default;
-    constexpr signal_list(const signal_list& val)
-        : m_list{val.m_list}
-        , m_mask{val.m_mask}
-        , m_handlers{val.m_handlers}
-        , m_mtx{} {}
-
-    constexpr signal_list(signal_list&& val)
-        : m_list{std::move(val.m_list)}
-        , m_mask{std::move(val.m_mask)}
-        , m_handlers{std::move(val.m_handlers)}
-        , m_mtx{} {}
-
-    void on_exec();
-
-    sigmask_type get_mask() const;
-    void set_mask(sigmask_type mask);
-    void mask(sigmask_type mask);
-    void unmask(sigmask_type mask);
-
-    void set_handler(signo_type signal, const sigaction& action);
-    void get_handler(signo_type signal, sigaction& action) const;
-
-    signo_type pending_signal();
-
-    // return value: whether the thread should wake up
-    bool raise(signo_type signal);
-    void handle(interrupt_stack* context, mmx_registers* mmxregs);
-    void after_signal(signo_type signal);
-};
-
-} // namespace kernel

+ 0 - 112
include/kernel/syscall.hpp

@@ -1,112 +0,0 @@
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include <bits/alltypes.h>
-#include <poll.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/uio.h>
-#include <sys/utsname.h>
-#include <time.h>
-
-#include <types/types.h>
-
-#include <kernel/interrupt.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/user/thread_local.hpp>
-
-#define SYSCALL64_ARG1(type, name) type name = (type)((data)->head.s_regs.rdi)
-#define SYSCALL64_ARG2(type, name) type name = (type)((data)->head.s_regs.rsi)
-#define SYSCALL64_ARG3(type, name) type name = (type)((data)->head.s_regs.rdx)
-#define SYSCALL64_ARG4(type, name) type name = (type)((data)->head.s_regs.r10)
-#define SYSCALL64_ARG5(type, name) type name = (type)((data)->head.s_regs.r8)
-#define SYSCALL64_ARG6(type, name) type name = (type)((data)->head.s_regs.r9)
-
-namespace kernel {
-void init_syscall_table();
-
-void handle_syscall32(int no, interrupt_stack* data, mmx_registers* mmxregs);
-void handle_syscall64(int no, interrupt_stack* data, mmx_registers* mmxregs);
-
-namespace syscall {
-    // in fileops.cc
-    ssize_t do_write(int fd, const char __user* buf, size_t n);
-    int do_close(int fd);
-    int do_dup(int old_fd);
-    int do_dup2(int old_fd, int new_fd);
-    int do_pipe(int __user* pipefd);
-    ssize_t do_getdents(int fd, char __user* buf, size_t cnt);
-    ssize_t do_getdents64(int fd, char __user* buf, size_t cnt);
-    int do_open(const char __user* path, int flags, mode_t mode);
-    int do_symlink(const char __user* target, const char __user* linkpath);
-    int do_readlink(const char __user* pathname, char __user* buf, size_t buf_size);
-    int do_ioctl(int fd, unsigned long request, uintptr_t arg3);
-    ssize_t do_readv(int fd, const iovec* iov, int iovcnt);
-    ssize_t do_writev(int fd, const iovec* iov, int iovcnt);
-    off_t do_lseek(int fd, off_t offset, int whence);
-    uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags, int fd,
-                            off_t pgoffset);
-    int do_munmap(uintptr_t addr, size_t len);
-    ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset, size_t count);
-    int do_statx(int dirfd, const char __user* path, int flags, unsigned int mask,
-                 statx __user* statxbuf);
-    int do_fcntl(int fd, int cmd, unsigned long arg);
-    int do_poll(pollfd __user* fds, nfds_t nfds, int timeout);
-    int do_mknod(const char __user* pathname, mode_t mode, dev_t dev);
-    int do_access(const char __user* pathname, int mode);
-    int do_unlink(const char __user* pathname);
-    int do_truncate(const char __user* pathname, long length);
-    int do_mkdir(const char __user* pathname, mode_t mode);
-    int do_socket(int domain, int type, int protocol);
-
-    // in procops.cc
-    int do_chdir(const char __user* path);
-    [[noreturn]] int do_exit(int status);
-    int do_waitpid(pid_t waitpid, int __user* arg1, int options);
-    pid_t do_getsid(pid_t pid);
-    pid_t do_setsid();
-    pid_t do_getpgid(pid_t pid);
-    int do_setpgid(pid_t pid, pid_t pgid);
-    int do_set_thread_area(user::user_desc __user* ptr);
-    pid_t do_set_tid_address(int __user* tidptr);
-    int do_prctl(int option, uintptr_t arg2);
-    int do_arch_prctl(int option, uintptr_t arg2);
-    pid_t do_getpid();
-    pid_t do_getppid();
-    uid_t do_getuid();
-    uid_t do_geteuid();
-    gid_t do_getgid();
-    pid_t do_gettid();
-    int do_getcwd(char __user* buf, size_t buf_size);
-    uintptr_t do_brk(uintptr_t addr);
-    int do_umask(mode_t mask);
-    int do_kill(pid_t pid, int sig);
-    int do_tkill(pid_t pid, int sig);
-    int do_rt_sigprocmask(int how, const kernel::sigmask_type __user* set,
-                          kernel::sigmask_type __user* oldset, size_t sigsetsize);
-    int do_rt_sigaction(int signum, const sigaction __user* act, sigaction __user* oldact,
-                        size_t sigsetsize);
-    int do_newuname(new_utsname __user* buf);
-
-    struct execve_retval {
-        uintptr_t ip;
-        uintptr_t sp;
-        int status;
-    };
-
-    execve_retval do_execve(const std::string& exec, const std::vector<std::string>& args,
-                            const std::vector<std::string>& envs);
-
-    // in mount.cc
-    int do_mount(const char __user* source, const char __user* target, const char __user* fstype,
-                 unsigned long flags, const void __user* _fsdata);
-
-    // in infoops.cc
-    int do_clock_gettime(clockid_t clk_id, timespec __user* tp);
-    int do_gettimeofday(timeval __user* tv, void __user* tz);
-
-} // namespace syscall
-
-} // namespace kernel

+ 0 - 5
include/kernel/task/current.hpp

@@ -1,5 +0,0 @@
-#pragma once
-
-#include <kernel/task/thread.hpp>
-
-inline kernel::task::thread* volatile current_thread;

+ 0 - 16
include/kernel/task/readyqueue.hpp

@@ -1,16 +0,0 @@
-#pragma once
-
-#include <list>
-
-#include <kernel/task/thread.hpp>
-
-namespace kernel::task::dispatcher {
-
-void enqueue(thread* thd);
-void dequeue(thread* thd);
-
-void setup_idle(thread* idle_thd);
-
-thread* next();
-
-} // namespace kernel::task::dispatcher

+ 0 - 76
include/kernel/task/thread.hpp

@@ -1,76 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <string>
-
-#include <stdint.h>
-#include <sys/types.h>
-
-#include <types/types.h>
-
-#include <kernel/mem/paging.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/user/thread_local.hpp>
-
-namespace kernel::task {
-
-using tid_t = std::size_t;
-
-struct thread {
-   public:
-    using thd_attr_t = uint32_t;
-    static constexpr thd_attr_t SYSTEM = 0x01;
-    static constexpr thd_attr_t READY = 0x02;
-    static constexpr thd_attr_t STOPPED = 0x04;
-    static constexpr thd_attr_t ZOMBIE = 0x08;
-    static constexpr thd_attr_t ISLEEP = 0x10;
-    static constexpr thd_attr_t USLEEP = 0x20;
-
-   private:
-    struct kernel_stack {
-        mem::paging::pfn_t pfn;
-        uintptr_t sp;
-
-        kernel_stack();
-        kernel_stack(const kernel_stack& other);
-        kernel_stack(kernel_stack&& other);
-        ~kernel_stack();
-
-        uint64_t pushq(uint64_t val);
-        uint32_t pushl(uint32_t val);
-
-        void load_interrupt_stack() const;
-    };
-
-   public:
-    kernel_stack kstack;
-    pid_t owner;
-    thd_attr_t attr;
-    signal_list signals;
-
-    int* __user set_child_tid{};
-    int* __user clear_child_tid{};
-
-    std::string name{};
-    uint64_t tls_desc32{};
-    std::size_t elected_times{};
-
-    explicit thread(std::string name, pid_t owner);
-    thread(const thread& val, pid_t owner);
-
-    int set_thread_area(user::user_desc* ptr);
-    int load_thread_area32() const;
-
-    void set_attr(thd_attr_t new_attr, bool forced = false);
-
-    void send_signal(signal_list::signo_type signal);
-
-    thread(thread&& val) = default;
-
-    tid_t tid() const;
-
-    bool operator<(const thread& rhs) const;
-    bool operator==(const thread& rhs) const;
-};
-
-} // namespace kernel::task

+ 0 - 75
include/kernel/tty.hpp

@@ -1,75 +0,0 @@
-#pragma once
-
-#include <string>
-
-#include <stdint.h>
-#include <sys/types.h>
-#include <termios.h>
-
-#include <types/allocator.hpp>
-#include <types/buffer.hpp>
-#include <types/cplusplus.hpp>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-
-namespace kernel::tty {
-
-class tty : public types::non_copyable {
-   public:
-    static constexpr size_t BUFFER_SIZE = 4096;
-
-   private:
-    void _real_commit_char(int c);
-    void _echo_char(int c);
-
-    int _do_erase(bool should_echo);
-
-   public:
-    explicit tty(std::string name);
-    virtual void putchar(char c) = 0;
-    void print(const char* str);
-    ssize_t read(char* buf, size_t buf_size, size_t n);
-    ssize_t write(const char* buf, size_t n);
-
-    // characters committed to buffer will be handled
-    // by the input line discipline (N_TTY)
-    void commit_char(int c);
-
-    // print character to the output
-    // characters will be handled by the output line discipline
-    void show_char(int c);
-
-    void clear_read_buf(void);
-
-    // TODO: formal poll support
-    int poll();
-
-    int ioctl(int request, unsigned long arg3);
-
-    constexpr void set_pgrp(pid_t pgid) { fg_pgroup = pgid; }
-
-    constexpr pid_t get_pgrp(void) const { return fg_pgroup; }
-
-    termios termio;
-    std::string name;
-
-   protected:
-    async::mutex mtx_buf;
-    types::buffer buf;
-    async::wait_list waitlist;
-
-    pid_t fg_pgroup;
-};
-
-class vga_tty : public virtual tty {
-   public:
-    vga_tty();
-    virtual void putchar(char c) override;
-};
-
-inline tty* console;
-
-int register_tty(tty* tty_dev);
-
-} // namespace kernel::tty

+ 0 - 21
include/kernel/user/thread_local.hpp

@@ -1,21 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-namespace kernel::user {
-
-struct user_desc {
-    uint32_t entry_number;
-    uint32_t base_addr;
-    uint32_t limit;
-    uint32_t seg_32bit : 1;
-    uint32_t contents : 2;
-    uint32_t read_exec_only : 1;
-    uint32_t limit_in_pages : 1;
-    uint32_t seg_not_present : 1;
-    uint32_t useable : 1;
-};
-
-void load_thread_area32(uint64_t desc);
-
-} // namespace kernel::user

+ 0 - 52
src/asm/interrupt.s

@@ -104,58 +104,6 @@ ISR_stub_restore:
 	iretq
 	.cfi_endproc
 
-# parameters
-# #1: sp* current_task_sp
-# #2: sp* target_task_sp
-.globl asm_ctx_switch
-.type  asm_ctx_switch @function
-asm_ctx_switch:
-	.cfi_startproc
-    pushf
-	.cfi_def_cfa_offset 0x10
-
-	sub $0x38, %rsp  # extra 8 bytes to align to 16 bytes
-	.cfi_def_cfa_offset 0x48
-
-	movcfi %rbx, 0x08
-	movcfi %rbp, 0x10
-	movcfi %r12, 0x18
-	movcfi %r13, 0x20
-	movcfi %r14, 0x28
-	movcfi %r15, 0x30
-
-    push (%rdi) 	 # save sp of previous stack frame of current
-	                 # acts as saving bp
-	.cfi_def_cfa_offset 0x50
-
-    mov %rsp, (%rdi) # save sp of current stack
-    mov (%rsi), %rsp # load sp of target stack
-
-    pop (%rsi)       # load sp of previous stack frame of target
-	                 # acts as restoring previous bp
-	.cfi_def_cfa_offset 0x48
-
-	pop %rax         # align to 16 bytes
-	.cfi_def_cfa_offset 0x40
-
-	call after_ctx_switch
-
-	mov 0x28(%rsp), %r15
-	mov 0x20(%rsp), %r14
-	mov 0x18(%rsp), %r13
-	mov 0x10(%rsp), %r12
-	mov 0x08(%rsp), %rbp
-    mov 0x00(%rsp), %rbx
-
-	add $0x30, %rsp
-	.cfi_def_cfa_offset 0x10
-
-    popf
-	.cfi_def_cfa_offset 0x08
-
-    ret
-	.cfi_endproc
-
 .altmacro
 .macro build_isr_no_err name
 	.align 8

+ 1 - 1
src/boot.s

@@ -293,7 +293,7 @@ fill_pxe:
 .L64bit_entry:
     jmp start_64bit
 
-.section .text.kinit
+.section .text
 start_64bit:
     # set stack pointer and clear stack bottom
     mov %rsp, %rdi

+ 0 - 71
src/dev/builtin-chardev.cc

@@ -1,71 +0,0 @@
-#include <kernel/module.hpp>
-#include <kernel/tty.hpp>
-#include <kernel/vfs.hpp>
-
-using namespace kernel::kmod;
-using namespace kernel::tty;
-
-static ssize_t null_read(char*, size_t, size_t) {
-    return 0;
-}
-
-static ssize_t null_write(const char*, size_t n) {
-    return n;
-}
-
-static ssize_t zero_read(char* buf, size_t buf_size, size_t n) {
-    if (n > buf_size)
-        n = buf_size;
-
-    memset(buf, 0, n);
-    return n;
-}
-
-static ssize_t zero_write(const char*, size_t n) {
-    return n;
-}
-
-// TODO: add interface to bind console device to other devices
-ssize_t console_read(char* buf, size_t buf_size, size_t n) {
-    return console->read(buf, buf_size, n);
-}
-
-ssize_t console_write(const char* buf, size_t n) {
-    size_t orig_n = n;
-    while (n--)
-        console->putchar(*(buf++));
-
-    return orig_n;
-}
-
-class builtin_chardev : public virtual kmod {
-   public:
-    builtin_chardev() : kmod("builtin-chardev") {}
-    int init() override {
-        using namespace fs;
-        // null
-        chrdev_ops null_ops{
-            .read = null_read,
-            .write = null_write,
-        };
-        register_char_device(make_device(1, 3), null_ops);
-
-        // zero
-        chrdev_ops zero_ops{
-            .read = zero_read,
-            .write = zero_write,
-        };
-        register_char_device(make_device(1, 5), zero_ops);
-
-        // console
-        chrdev_ops console_ops{
-            .read = console_read,
-            .write = console_write,
-        };
-        register_char_device(make_device(5, 1), console_ops);
-
-        return 0;
-    }
-};
-
-INTERNAL_MODULE(builtin_chardev, builtin_chardev);

+ 21 - 0
src/driver.rs

@@ -1,2 +1,23 @@
 pub mod ahci;
 pub mod e1000e;
+pub mod serial;
+pub mod timer;
+
+// TODO!!!: Put it somewhere else.
+struct Port8 { // an 8-bit I/O port; private, so visible to `driver` and its submodules only
+    no: u16, // port number handed to `arch::io::inb` / `arch::io::outb`
+}
+
+impl Port8 {
+    const fn new(no: u16) -> Self { // `const` so a port can be declared as a `const` item
+        Self { no }
+    }
+
+    fn read(&self) -> u8 { // reads one byte from this port via `arch::io::inb`
+        arch::io::inb(self.no)
+    }
+
+    fn write(&self, data: u8) { // writes one byte to this port via `arch::io::outb`
+        arch::io::outb(self.no, data)
+    }
+}

+ 8 - 10
src/driver/ahci/port.rs

@@ -7,7 +7,7 @@ use crate::kernel::block::{BlockDeviceRequest, BlockRequestQueue};
 use crate::kernel::mem::paging::Page;
 
 use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
-use crate::sync::condvar::CondVar;
+use crate::sync::UCondVar;
 
 use super::command::{Command, IdentifyCommand, ReadLBACommand};
 use super::{
@@ -106,7 +106,7 @@ impl CommandSlotInner {
 
 struct CommandSlot {
     inner: Spin<CommandSlotInner>,
-    cv: CondVar,
+    cv: UCondVar,
 }
 
 impl CommandSlot {
@@ -116,7 +116,7 @@ impl CommandSlot {
                 state: SlotState::Idle,
                 cmdheader,
             }),
-            cv: CondVar::new(),
+            cv: UCondVar::new(),
         }
     }
 }
@@ -153,7 +153,7 @@ pub struct AdapterPort {
     page: Page,
     slots: [CommandSlot; 32],
     free_list: Spin<FreeList>,
-    free_list_cv: CondVar,
+    free_list_cv: UCondVar,
 
     /// Statistics for this port
     pub stats: Spin<AdapterPortStats>,
@@ -176,7 +176,7 @@ impl AdapterPort {
                 CommandSlot::new(unsafe { cmdheaders_start.offset(index as isize) })
             }),
             free_list: Spin::new(FreeList::new()),
-            free_list_cv: CondVar::new(),
+            free_list_cv: UCondVar::new(),
             page,
             stats: Spin::default(),
         }
@@ -222,10 +222,8 @@ impl AdapterPort {
         loop {
             match free_list.free.pop_front() {
                 Some(slot) => break slot,
-                None => {
-                    self.free_list_cv.wait(&mut free_list, false);
-                }
-            }
+                None => self.free_list_cv.wait(&mut free_list),
+            };
         }
     }
 
@@ -323,7 +321,7 @@ impl AdapterPort {
                     saved = true;
                     self.save_working(slot_index as u32);
                 }
-                slot_object.cv.wait(&mut slot, false);
+                slot_object.cv.wait(&mut slot);
             }
         } else {
             // TODO: check error

+ 155 - 0
src/driver/serial.rs

@@ -0,0 +1,155 @@
+use alloc::{format, sync::Arc};
+use bindings::EIO;
+
+use crate::{
+    kernel::{
+        block::make_device, interrupt::register_irq_handler, CharDevice, CharDeviceType, Console,
+        Terminal, TerminalDevice,
+    },
+    prelude::*,
+};
+
+use super::Port8;
+
+struct Serial {
+    id: u32, // port index; selects the IRQ line and the char device minor number
+    name: Arc<str>, // "ttyS{id}"
+
+    terminal: Option<Arc<Terminal>>, // `None` until `register_char_device` attaches one
+
+    tx_rx: Port8, // base + 0: data in/out (divisor low byte while DLAB is enabled)
+    int_ena: Port8, // base + 1: interrupt enable (divisor high byte while DLAB is enabled)
+    int_ident: Port8, // base + 2: written in `new` to configure the FIFO
+    line_control: Port8, // base + 3
+    modem_control: Port8, // base + 4
+    line_status: Port8, // base + 5: bit 0 is polled for received data
+    modem_status: Port8, // base + 6
+    scratch: Port8, // base + 7
+}
+
+impl Serial {
+    const COM0_BASE: u16 = 0x3f8;
+    const COM1_BASE: u16 = 0x2f8;
+
+    const COM0_IRQ: u8 = 4;
+    const COM1_IRQ: u8 = 3;
+
+    fn enable_interrupts(&self) {
+        // Enable interrupt #0: Received data available
+        self.int_ena.write(0x01);
+    }
+
+    /// Set up the UART at `base_port` and probe it in loopback mode; `Err(EIO)` if the echo fails.
+    pub fn new(id: u32, base_port: u16) -> KResult<Self> {
+        let port = Self {
+            id,
+            name: Arc::from(format!("ttyS{id}")),
+            terminal: None,
+            tx_rx: Port8::new(base_port),
+            int_ena: Port8::new(base_port + 1),
+            int_ident: Port8::new(base_port + 2),
+            line_control: Port8::new(base_port + 3),
+            modem_control: Port8::new(base_port + 4),
+            line_status: Port8::new(base_port + 5),
+            modem_status: Port8::new(base_port + 6),
+            scratch: Port8::new(base_port + 7),
+        };
+
+        port.int_ena.write(0x00); // Disable all interrupts
+        port.line_control.write(0x80); // Enable DLAB (set baud rate divisor)
+        port.tx_rx.write(0x01); // Set divisor to 1 (lo byte) 115200 baud rate; 0 is not a valid divisor
+        port.int_ena.write(0x00); //              0 (hi byte)
+        port.line_control.write(0x03); // 8 bits, no parity, one stop bit (also clears DLAB)
+        port.int_ident.write(0xc7); // Enable FIFO, clear them, with 14-byte threshold
+        port.modem_control.write(0x0b); // IRQs enabled (OUT2), DTR/RTS set
+        port.modem_control.write(0x1e); // Set in loopback mode, test the serial chip
+        port.tx_rx.write(0x19); // Loopback probe: a working chip echoes this byte back
+        if port.tx_rx.read() != 0x19 {
+            return Err(EIO);
+        }
+
+        port.modem_control.write(0x0f); // Return to normal operation mode (loopback off)
+        Ok(port)
+    }
+
+    fn irq_handler(&self) {
+        let terminal = self.terminal.as_ref();
+        while self.line_status.read() & 0x01 != 0 {
+            let ch = self.tx_rx.read();
+
+            if let Some(terminal) = terminal {
+                terminal.commit_char(ch);
+            }
+        }
+    }
+
+    fn register_char_device(port: Self) -> KResult<()> {
+        let mut port = Arc::new(port);
+        let terminal = Terminal::new(port.clone());
+
+        // TODO!!!!!!: This is unsafe (the `Arc` was already cloned for `Terminal::new` above),
+        //             we should avoid this. Under smp, the publish of terminal must be atomic.
+        unsafe { Arc::get_mut_unchecked(&mut port) }.terminal = Some(terminal.clone());
+
+        {
+            let port = port.clone();
+            let irq_no = match port.id {
+                0 => Serial::COM0_IRQ,
+                1 => Serial::COM1_IRQ,
+                _ => unreachable!(), // `init` only constructs ports 0 and 1
+            };
+
+            register_irq_handler(irq_no as i32, move || {
+                port.irq_handler();
+            })?;
+        }
+        port.enable_interrupts(); // only once the IRQ handler is installed
+        dont_check!(Console::register_terminal(&terminal)); // presumably ignores the result — TODO confirm
+
+        CharDevice::register(
+            make_device(4, 64 + port.id),
+            port.name.clone(),
+            CharDeviceType::Terminal(terminal),
+        )?;
+
+        Ok(())
+    }
+}
+
+impl TerminalDevice for Serial {
+    fn putchar(&self, ch: u8) {
+        loop {
+            // Busy-poll the line status register and service whichever bits are set.
+            let status = self.line_status.read();
+
+            // Bit 0: a received byte is pending; drain it and commit it to the terminal.
+            if status & 0x01 != 0 {
+                let ch = self.tx_rx.read(); // shadows the outgoing byte inside this block only
+
+                if let Some(terminal) = self.terminal.as_ref() {
+                    terminal.commit_char(ch);
+                }
+            }
+
+            if status & 0x20 != 0 { // bit 5: transmitter can accept a byte (16550 LSR layout)
+                self.tx_rx.write(ch);
+                return;
+            }
+        }
+    }
+}
+
+/// Probes COM0 and COM1 and registers every port that passes its self-test.
+pub fn init() -> KResult<()> {
+    let probed = [
+        Serial::new(0, Serial::COM0_BASE),
+        Serial::new(1, Serial::COM1_BASE),
+    ];
+    // A port that failed probing is skipped; a registration error aborts.
+    for port in probed {
+        if let Ok(port) = port {
+            Serial::register_char_device(port)?;
+        }
+    }
+    Ok(())
+}

+ 14 - 0
src/driver/timer.rs

@@ -0,0 +1,14 @@
+use super::Port8;
+
+const COUNT: Port8 = Port8::new(0x40);
+const CONTROL: Port8 = Port8::new(0x43);
+
+pub fn init() {
+    // Control word 0x34: channel 0, lobyte/hibyte access, mode 2 (rate generator), binary
+    CONTROL.write(0x34);
+
+    // Send interval number, low byte first
+    // 0x2e9a = 11930 = 100Hz
+    COUNT.write(0x9a);
+    COUNT.write(0x2e);
+}

+ 3 - 0
src/io.rs

@@ -4,6 +4,7 @@ use crate::prelude::*;
 
 use core::{fmt::Write, mem::MaybeUninit};
 
+#[must_use]
 pub enum FillResult {
     Done(usize),
     Partial(usize),
@@ -33,6 +34,8 @@ impl FillResult {
 pub trait Buffer {
     fn total(&self) -> usize;
     fn wrote(&self) -> usize;
+
+    #[must_use]
     fn fill(&mut self, data: &[u8]) -> KResult<FillResult>;
 
     fn available(&self) -> usize {

+ 7 - 51
src/kernel.ld

@@ -30,54 +30,8 @@ SECTIONS
         . = ALIGN(0x1000);
     } > STAGE1
 
-    .kinit :
-        AT(LOADADDR(.stage1) + SIZEOF(.stage1))
-    {
-        KIMAGE_START = .;
-        KINIT_START = .;
-
-        *(.text.kinit)
-
-        . = ALIGN(16);
-        *(.rodata.kinit)
-
-        KINIT_START_ADDR = .;
-        QUAD(ABSOLUTE(KINIT_START));
-
-        KINIT_END_ADDR = .;
-        QUAD(ABSOLUTE(KINIT_END));
-
-        KINIT_PAGES = .;
-        QUAD((KINIT_END - KINIT_START) / 0x1000);
-
-        KIMAGE_PAGES_VALUE = .;
-        QUAD((KIMAGE_END - KIMAGE_START) / 0x1000);
-
-        . = ALIGN(16);
-        start_ctors = .;
-        KEEP(*(.init_array));
-        KEEP(*(SORT_BY_INIT_PRIORITY(.init_array*)));
-        KEEP(*(.ctors));
-        KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
-        end_ctors = .;
-
-        . = ALIGN(16);
-        START_SYSCALL_HANDLERS = .;
-        KEEP(*(.syscall_handlers));
-        END_SYSCALL_HANDLERS = .;
-
-        . = ALIGN(16);
-        *(.data.kinit)
-
-        . = ALIGN(16);
-        *(.bss.kinit)
-
-        . = ALIGN(0x1000);
-        KINIT_END = .;
-    } > KIMAGE
-
     .text :
-        AT(LOADADDR(.kinit) + SIZEOF(.kinit))
+        AT(LOADADDR(.stage1) + SIZEOF(.stage1))
     {
         TEXT_START = .;
         *(.text)
@@ -95,9 +49,12 @@ SECTIONS
         *(.rodata*)
 
         . = ALIGN(16);
-        KMOD_LOADERS_START = .;
-        KEEP(*(.kmods));
-        QUAD(0);
+        start_ctors = .;
+        KEEP(*(.init_array));
+        KEEP(*(SORT_BY_INIT_PRIORITY(.init_array*)));
+        KEEP(*(.ctors));
+        KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
+        end_ctors = .;
 
         . = ALIGN(16);
         FIX_START = .;
@@ -105,7 +62,6 @@ SECTIONS
         FIX_END = .;
 
         . = ALIGN(16);
-
         BSS_ADDR = .;
         QUAD(ABSOLUTE(BSS_START));
         BSS_LENGTH = .;

+ 10 - 0
src/kernel.rs

@@ -3,5 +3,15 @@ pub mod console;
 pub mod interrupt;
 pub mod mem;
 pub mod syscall;
+pub mod task;
+pub mod timer;
 pub mod user;
 pub mod vfs;
+
+mod chardev;
+mod constants;
+mod terminal;
+
+pub use chardev::{CharDevice, CharDeviceType, VirtualCharDevice};
+pub use console::Console;
+pub use terminal::{Terminal, TerminalDevice};

+ 0 - 1
src/kernel/allocator.cc

@@ -218,7 +218,6 @@ static constexpr int __cache_index(std::size_t size) {
     return -1;
 }
 
-SECTION(".text.kinit")
 void kernel::kinit::init_allocator() {
     mem::init_slab_cache(caches + 0, 32);
     mem::init_slab_cache(caches + 1, 64);

+ 0 - 58
src/kernel/async/waitlist.cc

@@ -1,58 +0,0 @@
-#include <assert.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-#include <kernel/process.hpp>
-#include <kernel/task/thread.hpp>
-
-using namespace kernel::async;
-
-bool wait_list::wait(mutex& lock) {
-    preempt_disable();
-    this->subscribe();
-
-    auto* curthd = current_thread;
-    curthd->set_attr(kernel::task::thread::ISLEEP);
-
-    lock.unlock();
-    bool has_signals = schedule_now_preempt_disabled();
-    lock.lock();
-
-    m_subscribers.erase(curthd);
-    return !has_signals;
-}
-
-void wait_list::subscribe() {
-    lock_guard lck(m_mtx);
-
-    auto* thd = current_thread;
-
-    bool inserted;
-    std::tie(std::ignore, inserted) = m_subscribers.insert(thd);
-
-    assert(inserted);
-}
-
-void wait_list::notify_one() {
-    lock_guard lck(m_mtx);
-
-    if (m_subscribers.empty())
-        return;
-
-    auto iter = m_subscribers.begin();
-    (*iter)->set_attr(kernel::task::thread::READY);
-
-    m_subscribers.erase(iter);
-}
-
-void wait_list::notify_all() {
-    lock_guard lck(m_mtx);
-
-    if (m_subscribers.empty())
-        return;
-
-    for (auto thd : m_subscribers)
-        thd->set_attr(kernel::task::thread::READY);
-
-    m_subscribers.clear();
-}

+ 137 - 0
src/kernel/chardev.rs

@@ -0,0 +1,137 @@
+use alloc::{
+    boxed::Box,
+    collections::btree_map::{BTreeMap, Entry},
+    sync::Arc,
+};
+use bindings::{EEXIST, EIO};
+
+use crate::{io::Buffer, kernel::console::CONSOLE, prelude::*};
+
+use super::{block::make_device, terminal::Terminal, vfs::DevId};
+
+use lazy_static::lazy_static;
+
+pub trait VirtualCharDevice: Send + Sync {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize>;
+    fn write(&self, data: &[u8]) -> KResult<usize>;
+}
+
+pub enum CharDeviceType {
+    Terminal(Arc<Terminal>),
+    Virtual(Box<dyn VirtualCharDevice>),
+}
+
+pub struct CharDevice {
+    name: Arc<str>,
+    device: CharDeviceType,
+}
+
+lazy_static! {
+    pub static ref CHAR_DEVICES: Spin<BTreeMap<DevId, Arc<CharDevice>>> =
+        Spin::new(BTreeMap::new());
+}
+
+impl CharDevice {
+    /// Reads from the backing terminal or virtual device into `buffer`.
+    pub fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        match &self.device {
+            CharDeviceType::Virtual(device) => device.read(buffer),
+            CharDeviceType::Terminal(terminal) => terminal.read(buffer),
+        }
+    }
+
+    /// Writes `data`; a terminal shows every byte and reports the full length.
+    pub fn write(&self, data: &[u8]) -> KResult<usize> {
+        let terminal = match &self.device {
+            CharDeviceType::Terminal(terminal) => terminal,
+            CharDeviceType::Virtual(device) => return device.write(data),
+        };
+        data.iter().for_each(|&ch| terminal.show_char(ch));
+        Ok(data.len())
+    }
+
+    /// Returns the device registered under `devid`, if any.
+    pub fn get(devid: DevId) -> Option<Arc<CharDevice>> {
+        CHAR_DEVICES.lock().get(&devid).cloned()
+    }
+
+    /// Registers `device` under `devid`; fails with `EEXIST` if the id is taken.
+    pub fn register(devid: DevId, name: Arc<str>, device: CharDeviceType) -> KResult<()> {
+        // The lock guard lives for the whole `match`, covering both check and insert.
+        match CHAR_DEVICES.lock().entry(devid) {
+            Entry::Occupied(_) => return Err(EEXIST),
+            Entry::Vacant(slot) => slot.insert(Arc::new(CharDevice { name, device })),
+        };
+        Ok(())
+    }
+}
+
+struct NullDevice;
+impl VirtualCharDevice for NullDevice {
+    fn read(&self, _buffer: &mut dyn Buffer) -> KResult<usize> {
+        Ok(0) // never produces data: every read reports zero bytes
+    }
+
+    fn write(&self, data: &[u8]) -> KResult<usize> {
+        Ok(data.len()) // discard the data but report all of it as written
+    }
+}
+
+struct ZeroDevice;
+impl VirtualCharDevice for ZeroDevice {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        // TODO: Copy from empty page.
+        while !buffer.fill(&[0; 16])?.should_stop() {} // feed zeros, 16 bytes at a time, until told to stop
+        Ok(buffer.wrote())
+    }
+
+    fn write(&self, data: &[u8]) -> KResult<usize> {
+        Ok(data.len()) // discard the data but report all of it as written
+    }
+}
+
+struct ConsoleDevice;
+impl VirtualCharDevice for ConsoleDevice {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        // Fetch the terminal in its own statement: a `match` scrutinee would keep the
+        // CONSOLE guard (IRQs off) alive across the possibly blocking read below.
+        let terminal = CONSOLE.lock_irq().get_terminal().ok_or(EIO)?;
+        terminal.read(buffer)
+    }
+
+    fn write(&self, data: &[u8]) -> KResult<usize> {
+        match CONSOLE.lock_irq().get_terminal() { // NOTE(review): guard stays held for the whole match
+            None => Err(EIO),
+            Some(console) => {
+                for &ch in data.iter() {
+                    console.show_char(ch);
+                }
+                Ok(data.len())
+            }
+        }
+    }
+}
+
+impl CharDevice {
+    pub fn init() -> KResult<()> {
+        Self::register(
+            make_device(1, 3),
+            Arc::from("null"),
+            CharDeviceType::Virtual(Box::new(NullDevice)),
+        )?;
+
+        Self::register(
+            make_device(1, 5),
+            Arc::from("zero"),
+            CharDeviceType::Virtual(Box::new(ZeroDevice)),
+        )?;
+
+        Self::register(
+            make_device(5, 1),
+            Arc::from("console"),
+            CharDeviceType::Virtual(Box::new(ConsoleDevice)),
+        )?;
+
+        Ok(())
+    }
+}

+ 27 - 11
src/kernel/console.rs

@@ -1,18 +1,34 @@
 use crate::prelude::*;
 
+use alloc::sync::Arc;
+use bindings::EEXIST;
 use lazy_static::lazy_static;
 
-pub struct Console;
+pub struct Console {
+    terminal: Option<Arc<Terminal>>,
+}
+
+impl Console {
+    pub fn get_terminal(&self) -> Option<Arc<Terminal>> {
+        self.terminal.clone()
+    }
+
+    pub fn register_terminal(terminal: &Arc<Terminal>) -> KResult<()> {
+        let mut console = CONSOLE.lock_irq();
+        if console.terminal.is_some() {
+            return Err(EEXIST);
+        }
+
+        console.terminal = Some(terminal.clone());
+        Ok(())
+    }
+}
 
 impl Write for Console {
     fn write_str(&mut self, s: &str) -> core::fmt::Result {
-        use crate::bindings::root::kernel::tty::console as _console;
-
-        if let Some(console) = unsafe { _console.as_mut() } {
+        if let Some(console) = &self.terminal {
             for &ch in s.as_bytes() {
-                unsafe {
-                    console.show_char(ch as i32);
-                }
+                console.show_char(ch)
             }
         }
 
@@ -26,7 +42,7 @@ pub fn _print(args: core::fmt::Arguments) {
 }
 
 lazy_static! {
-    pub static ref CONSOLE: Spin<Console> = Spin::new(Console {});
+    pub static ref CONSOLE: Spin<Console> = Spin::new(Console { terminal: None });
 }
 
 macro_rules! print {
@@ -68,6 +84,6 @@ macro_rules! println_fatal {
     };
 }
 
-pub(crate) use {
-    print, println, println_debug, println_fatal, println_info, println_warn,
-};
+use super::terminal::Terminal;
+
+pub(crate) use {print, println, println_debug, println_fatal, println_info, println_warn};

+ 32 - 0
src/kernel/constants.rs

@@ -0,0 +1,32 @@
+use bitflags::bitflags;
+
+pub const TCGETS: u32 = 0x5401;
+pub const TCSETS: u32 = 0x5402;
+pub const TIOCGPGRP: u32 = 0x540f;
+pub const TIOCSPGRP: u32 = 0x5410;
+pub const TIOCGWINSZ: u32 = 0x5413;
+
+pub const PR_SET_NAME: u32 = 15;
+pub const PR_GET_NAME: u32 = 16;
+
+pub const SIG_BLOCK: u32 = 0;
+pub const SIG_UNBLOCK: u32 = 1;
+pub const SIG_SETMASK: u32 = 2;
+
+pub const CLOCK_REALTIME: u32 = 0;
+pub const CLOCK_MONOTONIC: u32 = 1;
+
+bitflags! {
+    pub struct UserMmapFlags: u32 {
+        const MAP_SHARED = 0x01;
+        const MAP_PRIVATE = 0x02;
+        const MAP_FIXED = 0x10;
+        const MAP_ANONYMOUS = 0x20;
+    }
+
+    pub struct UserMmapProtocol: u32 {
+        const PROT_READ = 0x01;
+        const PROT_WRITE = 0x02;
+        const PROT_EXEC = 0x04;
+    }
+}

+ 0 - 1
src/kernel/hw/pci.cc

@@ -87,7 +87,6 @@ int register_driver_r(uint16_t vendor, uint16_t device,
 
 namespace kernel::kinit {
 
-SECTION(".text.kinit")
 void init_pci() {
     using namespace hw::acpi;
     using namespace hw::pci;

+ 0 - 115
src/kernel/hw/serial.cc

@@ -1,115 +0,0 @@
-#include <errno.h>
-#include <stdio.h>
-
-#include <kernel/hw/port.hpp>
-#include <kernel/irq.hpp>
-#include <kernel/log.hpp>
-#include <kernel/module.hpp>
-#include <kernel/tty.hpp>
-
-using namespace kernel::tty;
-using namespace kernel::hw;
-using namespace kernel::irq;
-using namespace kernel::kmod;
-
-constexpr int PORT0 = 0x3f8;
-constexpr int PORT1 = 0x2f8;
-
-using port_group = const p8[6];
-
-constexpr p8 port0[] = {
-    p8{PORT0 + 0}, p8{PORT0 + 1}, p8{PORT0 + 2},
-    p8{PORT0 + 3}, p8{PORT0 + 4}, p8{PORT0 + 5},
-};
-
-constexpr p8 port1[] = {
-    p8{PORT1 + 0}, p8{PORT1 + 1}, p8{PORT1 + 2},
-    p8{PORT1 + 3}, p8{PORT1 + 4}, p8{PORT1 + 5},
-};
-
-static void _serial0_receive_data_interrupt() {
-    while (*port0[5] & 1)
-        console->commit_char(*port0[0]);
-}
-
-static void _serial1_receive_data_interrupt() {
-    while (*port1[5] & 1)
-        console->commit_char(*port1[0]);
-}
-
-static inline int _init_port(port_group ports) {
-    // taken from osdev.org
-
-    ports[1] = 0x00; // Disable all interrupts
-    ports[3] = 0x80; // Enable DLAB (set baud rate divisor)
-    // TODO: set baud rate
-    ports[0] = 0x00; // Set divisor to 0 -3- (lo byte) 115200 -38400- baud
-    ports[1] = 0x00; //                  (hi byte)
-    ports[3] = 0x03; // 8 bits, no parity, one stop bit
-    ports[2] = 0xC7; // Enable FIFO, clear them, with 14-byte threshold
-    // TODO: IRQ disabled
-    ports[4] = 0x0B; // IRQs enabled, RTS/DSR set
-    ports[4] = 0x1E; // Set in loopback mode, test the serial chip
-    ports[0] = 0xAE; // Test serial chip (send byte 0xAE and check if serial
-                     // returns same byte)
-
-    // Check if serial is faulty (i.e: not same byte as sent)
-    if (*ports[0] != 0xAE)
-        return -EIO;
-
-    // If serial is not faulty set it in normal operation mode
-    // (not-loopback with IRQs enabled and OUT#1 and OUT#2 bits enabled)
-    ports[4] = 0x0F;
-
-    ports[1] = 0x01; // Enable interrupts #0: Received Data Available
-
-    return 0;
-}
-
-class serial_tty : public virtual tty {
-    const p8* ports;
-
-   public:
-    serial_tty(port_group ports, int id) : tty{"ttyS"}, ports(ports) {
-        name += '0' + id;
-    }
-
-    virtual void putchar(char c) override {
-        while (true) {
-            auto status = *ports[5];
-            if (status & 0x1)
-                this->commit_char(*ports[0]);
-            if (status & 0x20)
-                break;
-        }
-
-        ports[0] = c;
-    }
-};
-
-class serial_module : public virtual kmod {
-   public:
-    serial_module() : kmod("serial-tty") {}
-
-    virtual int init() override {
-        if (int ret = _init_port(port0); ret == 0) {
-            auto* dev = new serial_tty(port0, 0);
-            register_handler(4, _serial0_receive_data_interrupt);
-
-            if (int ret = register_tty(dev); ret != 0)
-                kmsg("[serial] cannot register ttyS0");
-        }
-
-        if (int ret = _init_port(port1); ret == 0) {
-            auto* dev = new serial_tty(port1, 0);
-            register_handler(3, _serial1_receive_data_interrupt);
-
-            if (int ret = register_tty(dev); ret != 0)
-                kmsg("[serial] cannot register ttyS1");
-        }
-
-        return 0;
-    }
-};
-
-INTERNAL_MODULE(serial, serial_module);

+ 0 - 28
src/kernel/hw/timer.cc

@@ -1,28 +0,0 @@
-#include <types/types.h>
-
-#include <kernel/hw/port.hpp>
-#include <kernel/hw/timer.hpp>
-
-constexpr kernel::hw::p8 port_control(0x43);
-constexpr kernel::hw::p8 port_count(0x40);
-
-static std::size_t _current_ticks = 0;
-
-SECTION(".text.kinit")
-void kernel::hw::timer::init_pit(void) {
-    // set interval
-    port_control = 0x34;
-
-    // send interval number
-    // 0x2e9a = 11930 = 100Hz
-    port_count = 0x9a;
-    port_count = 0x2e;
-}
-
-void kernel::hw::timer::inc_tick(void) {
-    ++_current_ticks;
-}
-
-size_t kernel::hw::timer::current_ticks(void) {
-    return _current_ticks;
-}

+ 6 - 13
src/kernel/interrupt.cpp

@@ -1,21 +1,19 @@
 #include "kernel/async/lock.hpp"
+
 #include <list>
 #include <vector>
 
 #include <assert.h>
 #include <stdint.h>
-#include <stdio.h>
 
 #include <types/types.h>
 
 #include <kernel/hw/port.hpp>
-#include <kernel/hw/timer.hpp>
 #include <kernel/interrupt.hpp>
 #include <kernel/irq.hpp>
 #include <kernel/log.hpp>
 #include <kernel/mem/paging.hpp>
 #include <kernel/process.hpp>
-#include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
 
 #define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
@@ -42,9 +40,8 @@ static struct IDT_entry IDT[256];
 
 extern "C" uintptr_t ISR_START_ADDR;
 
-SECTION(".text.kinit")
-static inline void set_idt_entry(IDT_entry (&idt)[256], int n, uintptr_t offset,
-                                 uint16_t selector, uint8_t type) {
+static inline void set_idt_entry(IDT_entry (&idt)[256], int n, uintptr_t offset, uint16_t selector,
+                                 uint8_t type) {
     idt[n].offset_low = offset & 0xffff;
     idt[n].segment = selector;
     idt[n].IST = 0;
@@ -57,13 +54,10 @@ static inline void set_idt_entry(IDT_entry (&idt)[256], int n, uintptr_t offset,
 using kernel::irq::irq_handler_t;
 static std::vector<std::list<irq_handler_t>> s_irq_handlers;
 
-SECTION(".text.kinit")
 void kernel::kinit::init_interrupt() {
     for (int i = 0; i < 0x30; ++i)
-        set_idt_entry(IDT, i, ISR_START_ADDR + 8 * i, 0x08,
-                      KERNEL_INTERRUPT_GATE_TYPE);
-    set_idt_entry(IDT, 0x80, ISR_START_ADDR + 8 * 0x80, 0x08,
-                  USER_INTERRUPT_GATE_TYPE);
+        set_idt_entry(IDT, i, ISR_START_ADDR + 8 * i, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
+    set_idt_entry(IDT, 0x80, ISR_START_ADDR + 8 * 0x80, 0x08, USER_INTERRUPT_GATE_TYPE);
 
     uint64_t idt_descriptor[2];
     idt_descriptor[0] = (sizeof(IDT_entry) * 256) << 48;
@@ -138,8 +132,7 @@ static inline void irq_handler(interrupt_stack* context, mmx_registers*) {
         port_pic2_command = PIC_EOI;
 }
 
-extern "C" void interrupt_handler(interrupt_stack* context,
-                                  mmx_registers* mmxregs) {
+extern "C" void interrupt_handler(interrupt_stack* context, mmx_registers* mmxregs) {
     if (context->int_no < 0x20) // interrupt is a fault
         fault_handler(context, mmxregs);
     else if (context->int_no == 0x80) // syscall by int 0x80

+ 10 - 0
src/kernel/mem.rs

@@ -1,2 +1,12 @@
 pub mod paging;
 pub mod phys;
+
+mod mm_area;
+mod mm_list;
+mod page_table;
+mod vrange;
+
+pub(self) use mm_area::MMArea;
+pub use mm_list::{MMList, Mapping, Permission, FileMapping};
+pub(self) use page_table::{PTEIterator, PageTable, PTE};
+pub use vrange::{VAddr, VRange};

+ 102 - 0
src/kernel/mem/mm_area.rs

@@ -0,0 +1,102 @@
+use core::{borrow::Borrow, cell::UnsafeCell, cmp::Ordering};
+
+use super::{Mapping, Permission, VAddr, VRange};
+
+/// A region of virtual address space together with what backs it and how it
+/// may be accessed.
+///
+/// Equality and ordering of areas are delegated to their `VRange`.
+#[derive(Debug)]
+pub struct MMArea {
+    /// Address range covered by the area. Stored in an `UnsafeCell` so that
+    /// `grow` can resize it through a shared reference.
+    range: UnsafeCell<VRange>,
+    /// What backs the area (anonymous memory or a file).
+    mapping: Mapping,
+    /// Write/execute permission of the area.
+    permission: Permission,
+}
+
+impl Clone for MMArea {
+    /// Duplicates the area: the current range is copied into a fresh cell and
+    /// the mapping is cloned.
+    fn clone(&self) -> Self {
+        let range = UnsafeCell::new(self.range());
+        let mapping = self.mapping.clone();
+        let permission = self.permission;
+
+        Self {
+            range,
+            mapping,
+            permission,
+        }
+    }
+}
+
+impl MMArea {
+    /// Creates an area covering `range`, backed by `mapping`, with `permission`.
+    pub fn new(range: VRange, mapping: Mapping, permission: Permission) -> Self {
+        Self {
+            range: range.into(),
+            mapping,
+            permission,
+        }
+    }
+
+    /// Borrows the range stored in the cell.
+    fn range_borrow(&self) -> &VRange {
+        // SAFETY: The only way we get a reference to `MMArea` object is through `MMListInner`.
+        // And `MMListInner` is locked with IRQ disabled.
+        unsafe { &*self.range.get() }
+    }
+
+    /// Returns a copy of the range covered by this area.
+    pub fn range(&self) -> VRange {
+        *self.range_borrow()
+    }
+
+    /// Returns the length of the range covered by this area.
+    pub fn len(&self) -> usize {
+        self.range_borrow().len()
+    }
+
+    /// Extends the end of the area by `count`.
+    ///
+    /// # Safety
+    /// This function should be called only when we can guarantee that the range
+    /// won't overlap with any other range in some scope.
+    pub fn grow(&self, count: usize) {
+        // Compute the new range from a copy first, so that no shared reference
+        // into the cell is alive while we write to it.
+        let grown = self.range().grow(count);
+
+        // SAFETY: Same locking argument as `range_borrow`; no other reference
+        // to the range exists at this point.
+        unsafe { *self.range.get() = grown };
+    }
+
+    /// Splits the area at the page-aligned address `at`.
+    ///
+    /// Returns `(left, right)`:
+    /// - `(Some, None)` if the whole area compares less than `at`,
+    /// - `(None, Some)` if the whole area compares greater than `at`, or if
+    ///   `at` is the start of the area,
+    /// - `(Some, Some)` otherwise, where `left` covers `[start, at)` and
+    ///   `right` covers `[at, end)`. A file mapping on the right side has its
+    ///   offset advanced by the size of the left side.
+    ///
+    /// # Panics
+    /// Panics if `at` is not page aligned.
+    pub fn split(mut self, at: VAddr) -> (Option<Self>, Option<Self>) {
+        assert_eq!(at.floor(), at);
+
+        match self.range_borrow().cmp(&VRange::from(at)) {
+            Ordering::Less => (Some(self), None),
+            Ordering::Greater => (None, Some(self)),
+            Ordering::Equal => {
+                let start = self.range_borrow().start();
+                let end = self.range_borrow().end();
+
+                let diff = at - start;
+                if diff == 0 {
+                    return (None, Some(self));
+                }
+
+                let right = Self {
+                    range: VRange::new(at, end).into(),
+                    permission: self.permission,
+                    mapping: match &self.mapping {
+                        Mapping::Anonymous => Mapping::Anonymous,
+                        Mapping::File(mapping) => Mapping::File(mapping.offset(diff)),
+                    },
+                };
+
+                // The left part keeps exactly `[start, at)`. Set the range
+                // explicitly instead of shrinking by `diff`, which is the size
+                // of the left part and not the amount to cut off.
+                *self.range.get_mut() = VRange::new(start, at);
+                (Some(self), Some(right))
+            }
+        }
+    }
+}
+
+impl Eq for MMArea {}
+
+/// Areas compare equal exactly when their ranges compare equal.
+impl PartialEq for MMArea {
+    fn eq(&self, rhs: &Self) -> bool {
+        self.range_borrow() == rhs.range_borrow()
+    }
+}
+
+/// Partial ordering is delegated to the ranges.
+impl PartialOrd for MMArea {
+    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
+        PartialOrd::partial_cmp(self.range_borrow(), rhs.range_borrow())
+    }
+}
+
+/// Total ordering is delegated to the ranges.
+impl Ord for MMArea {
+    fn cmp(&self, rhs: &Self) -> Ordering {
+        Ord::cmp(self.range_borrow(), rhs.range_borrow())
+    }
+}
+
+/// Lets an ordered collection of `MMArea` be queried with a `VRange` key.
+impl Borrow<VRange> for MMArea {
+    fn borrow(&self) -> &VRange {
+        self.range_borrow()
+    }
+}

+ 0 - 322
src/kernel/mem/mm_list.cc

@@ -1,322 +0,0 @@
-#include <assert.h>
-#include <errno.h>
-#include <stdint.h>
-
-#include <kernel/mem/mm_list.hpp>
-#include <kernel/mem/paging.hpp>
-#include <kernel/mem/vm_area.hpp>
-
-using namespace kernel::mem;
-
-static inline void __invalidate_all_tlb() {
-    asm volatile(
-        "mov %%cr3, %%rax\n\t"
-        "mov %%rax, %%cr3\n\t"
-        :
-        :
-        : "rax", "memory");
-}
-
-static inline void __dealloc_page_table_all(paging::pfn_t pt, int depth, int from, int to) {
-    using namespace paging;
-
-    if (depth > 1) {
-        for (int i = from; i < to; ++i) {
-            auto pse = PSE{pt}[i];
-            if (!(pse.attributes() & PA_P))
-                continue;
-
-            int pfn = pse.pfn();
-            __dealloc_page_table_all(pfn, depth - 1, 0, 512);
-        }
-    }
-
-    free_page(pt);
-}
-
-static inline void __dealloc_page_table(paging::pfn_t pt) {
-    using namespace paging;
-    auto start_idx = idx_p4(0);
-    auto end_idx = idx_p4(KERNEL_SPACE_START);
-
-    __dealloc_page_table_all(pt, 4, start_idx, end_idx);
-}
-
-mm_list::mm_list() : m_pt{paging::alloc_page_table()}, m_brk{m_areas.end()} {
-    memcpy(physaddr<void>{m_pt}, paging::KERNEL_PAGE_TABLE_PHYS_ADDR, 0x1000);
-}
-
-mm_list::mm_list(const mm_list& other) : mm_list{} {
-    m_areas = other.m_areas;
-
-    using namespace paging;
-    for (auto iter = m_areas.begin(); iter != m_areas.end(); ++iter) {
-        auto& area = *iter;
-
-        if (area.flags & MM_BREAK)
-            m_brk = iter;
-
-        auto this_iter = vaddr_range{m_pt, area.start, area.end};
-        auto other_iter = vaddr_range{other.m_pt, area.start, area.end};
-
-        while (this_iter) {
-            auto this_pte = *this_iter, other_pte = *other_iter;
-            auto attributes = other_pte.attributes();
-            auto pfn = other_pte.pfn();
-
-            attributes &= ~(PA_RW | PA_A | PA_D);
-            attributes |= PA_COW;
-            this_pte.set(attributes, pfn);
-
-            increase_refcount(pfn_to_page(pfn));
-
-            // TODO: create a function to set COW mappings
-            attributes = other_pte.attributes();
-            attributes &= ~PA_RW;
-            attributes |= PA_COW;
-            other_pte.set(attributes, pfn);
-
-            ++this_iter, ++other_iter;
-        }
-    }
-
-    __invalidate_all_tlb();
-}
-
-mm_list::~mm_list() {
-    if (!m_pt)
-        return;
-
-    clear();
-    __dealloc_page_table(m_pt);
-}
-
-bool mm_list::is_avail(uintptr_t start, std::size_t len) const noexcept {
-    start &= ~0xfff;
-    uintptr_t end = (start + len + 0xfff) & ~0xfff;
-    len = end - start;
-
-    if (end > USER_SPACE_MEMORY_TOP)
-        return false;
-
-    for (const auto& area : m_areas) {
-        if (!area.is_avail(start, end))
-            return false;
-    }
-    return true;
-}
-
-bool mm_list::is_avail(uintptr_t addr) const {
-    if (addr >= USER_SPACE_MEMORY_TOP)
-        return false;
-
-    auto iter = m_areas.find(addr);
-    return iter == m_areas.end();
-}
-
-uintptr_t mm_list::find_avail(uintptr_t hint, size_t len) const {
-    auto addr = std::max(hint, MMAP_MIN_ADDR);
-
-    while (!is_avail(addr, len)) {
-        auto iter = m_areas.lower_bound(addr);
-        if (iter == m_areas.end())
-            return 0;
-
-        addr = iter->end;
-    }
-
-    return addr;
-}
-
-void mm_list::switch_pd() const noexcept {
-    asm volatile("mov %0, %%cr3" : : "r"(m_pt) : "memory");
-}
-
-int mm_list::register_brk(uintptr_t addr) {
-    assert(m_brk == m_areas.end());
-    if (!is_avail(addr))
-        return -ENOMEM;
-
-    bool inserted;
-    std::tie(m_brk, inserted) = m_areas.emplace(addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
-
-    assert(inserted);
-    return 0;
-}
-
-uintptr_t mm_list::set_brk(uintptr_t addr) {
-    using namespace paging;
-    assert(m_brk != m_areas.end());
-    uintptr_t curbrk = m_brk->end;
-
-    addr += 4096 - 1;
-    addr &= ~0xfff;
-
-    if (addr <= curbrk || !is_avail(curbrk, addr - curbrk))
-        return curbrk;
-
-    for (auto pte : vaddr_range{m_pt, curbrk, addr})
-        pte.set(PA_ANONYMOUS_PAGE | PA_NXE, EMPTY_PAGE_PFN);
-
-    m_brk->end = addr;
-    return m_brk->end;
-}
-
-void mm_list::clear() {
-    for (auto iter = m_areas.begin(); iter != m_areas.end(); ++iter)
-        unmap(iter, false);
-
-    __invalidate_all_tlb();
-
-    m_areas.clear();
-    m_brk = m_areas.end();
-}
-
-mm_list::iterator mm_list::split(iterator area, uintptr_t addr) {
-    assert(!(addr & 0xfff));
-    assert(addr > area->start && addr < area->end);
-
-    std::size_t old_len = addr - area->start;
-    std::size_t new_file_offset = 0;
-
-    if (area->mapped_file)
-        new_file_offset = area->file_offset + old_len;
-
-    auto new_end = area->end;
-    area->end = addr;
-
-    auto [iter, inserted] =
-        m_areas.emplace(addr, area->flags, new_end, d_get(area->mapped_file), new_file_offset);
-
-    assert(inserted);
-    return iter;
-}
-
-int mm_list::unmap(iterator area, bool should_invalidate_tlb) {
-    using namespace paging;
-
-    bool should_use_invlpg = area->end - area->start <= 0x4000;
-    auto range = vaddr_range{m_pt, area->start, area->end};
-    uintptr_t cur_addr = area->start;
-
-    // TODO: write back dirty pages
-    for (auto pte : range) {
-        free_page(pte.pfn());
-        pte.clear();
-
-        if (should_invalidate_tlb && should_use_invlpg) {
-            asm volatile("invlpg (%0)" : : "r"(cur_addr) : "memory");
-            cur_addr += 0x1000;
-        }
-    }
-
-    if (should_invalidate_tlb && !should_use_invlpg)
-        __invalidate_all_tlb();
-
-    return 0;
-}
-
-int mm_list::unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb) {
-    // standard says that addr and len MUST be
-    // page-aligned or the call is invalid
-    if (start & 0xfff)
-        return -EINVAL;
-
-    uintptr_t end = (start + length + 0xfff) & ~0xfff;
-
-    // check address validity
-    if (end > KERNEL_SPACE_START)
-        return -EINVAL;
-    if (end > USER_SPACE_MEMORY_TOP)
-        return -ENOMEM;
-
-    auto iter = m_areas.lower_bound(start);
-    auto iter_end = m_areas.upper_bound(end);
-
-    // start <= iter <= end a.k.a. !(start > *iter) && !(*iter > end)
-    while (iter != iter_end) {
-        // start == iter:
-        // start is between (iter->start, iter->end)
-        //
-        // strip out the area before start
-        if (!(start < *iter) && start != iter->start)
-            iter = split(iter, start);
-
-        // iter.end <= end
-        // it is safe to unmap the area directly
-        if (*iter < end) {
-            if (int ret = unmap(iter, should_invalidate_tlb); ret != 0)
-                return ret;
-
-            iter = m_areas.erase(iter);
-            continue;
-        }
-
-        // end == iter:
-        // end is between [iter->start, iter->end)
-        //
-        // if end == iter->start, no need to strip the area
-        if (end == iter->start) {
-            ++iter;
-            continue;
-        }
-
-        (void)split(iter, end);
-        if (int ret = unmap(iter, should_invalidate_tlb); ret != 0)
-            return ret;
-
-        iter = m_areas.erase(iter);
-
-        // no need to check areas after this
-        break;
-    }
-
-    return 0;
-}
-
-int mm_list::mmap(const map_args& args) {
-    auto& vaddr = args.vaddr;
-    auto& length = args.length;
-    auto& file = args.file;
-    auto& foff = args.file_offset;
-    auto& flags = args.flags;
-
-    assert((vaddr & 0xfff) == 0 && (foff & 0xfff) == 0);
-    assert((length & 0xfff) == 0 && length != 0);
-
-    if (!is_avail(vaddr, length))
-        return -EEXIST;
-
-    using namespace kernel::mem::paging;
-
-    // PA_RW is set during page fault while PA_NXE is preserved
-    // so we set PA_NXE now
-    psattr_t attributes = PA_US;
-    if (!(flags & MM_EXECUTE))
-        attributes |= PA_NXE;
-
-    if (flags & MM_MAPPED) {
-        assert(file);
-
-        auto [area, inserted] =
-            m_areas.emplace(vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, d_get(file), foff);
-        assert(inserted);
-
-        attributes |= PA_MMAPPED_PAGE;
-        for (auto pte : vaddr_range{m_pt, vaddr, vaddr + length})
-            pte.set(attributes, EMPTY_PAGE_PFN);
-    } else if (flags & MM_ANONYMOUS) {
-        // private mapping of zero-filled pages
-        // TODO: shared mapping
-        auto [area, inserted] = m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
-        assert(inserted);
-
-        attributes |= PA_ANONYMOUS_PAGE;
-        for (auto pte : vaddr_range{m_pt, vaddr, vaddr + length})
-            pte.set(attributes, EMPTY_PAGE_PFN);
-    } else {
-        return -EINVAL;
-    }
-
-    return 0;
-}

+ 320 - 0
src/kernel/mem/mm_list.rs

@@ -0,0 +1,320 @@
+use crate::prelude::*;
+
+use alloc::{collections::btree_set::BTreeSet, sync::Arc};
+use bindings::{EEXIST, EINVAL, ENOMEM};
+
+use crate::kernel::vfs::dentry::Dentry;
+
+use super::{MMArea, PageTable, VAddr, VRange};
+
+/// A file-backed mapping: the backing dentry plus an offset into it.
+#[derive(Debug, Clone)]
+pub struct FileMapping {
+    /// The file that backs the mapping.
+    file: Arc<Dentry>,
+    /// Offset into the file; `FileMapping::new` asserts it is a multiple of 4096.
+    offset: usize,
+}
+
+/// Access permission of a mapped area.
+#[derive(Debug, Clone, Copy)]
+pub struct Permission {
+    /// Whether the area may be written to.
+    pub write: bool,
+    /// Whether the area may be executed; when `false` the page table sets the
+    /// no-execute attribute on its entries.
+    pub execute: bool,
+}
+
+/// What backs a mapped area.
+#[derive(Debug, Clone)]
+pub enum Mapping {
+    /// Not backed by a file.
+    Anonymous,
+    /// Backed by a file at a given offset.
+    File(FileMapping),
+}
+
+/// The lock-protected state of an `MMList`.
+#[derive(Debug)]
+struct MMListInner {
+    /// All mapped areas, ordered by their range.
+    areas: BTreeSet<MMArea>,
+    /// The page table the areas are installed into.
+    page_table: PageTable,
+    /// Key used to look up the program break area in `areas`; `None` until
+    /// `register_break` has been called.
+    break_start: Option<VRange>,
+    /// Current program break; `None` until `register_break` has been called.
+    break_pos: Option<VAddr>,
+}
+
+/// The set of memory mappings of an address space, guarded by a spin lock.
+#[derive(Debug)]
+pub struct MMList {
+    /// # Safety
+    /// This field might be used in IRQ context, so it should be locked with `lock_irq()`.
+    inner: Spin<MMListInner>,
+}
+
+impl FileMapping {
+    /// Creates a mapping of `file` that starts at `offset`.
+    ///
+    /// # Panics
+    /// Panics if `offset` is not a multiple of 4096.
+    pub fn new(file: Arc<Dentry>, offset: usize) -> Self {
+        assert_eq!(offset & 0xfff, 0);
+        Self { file, offset }
+    }
+
+    /// Returns a mapping of the same file whose offset is advanced by `offset`.
+    ///
+    /// # Panics
+    /// Panics if the resulting offset is not a multiple of 4096.
+    pub fn offset(&self, offset: usize) -> Self {
+        let file = self.file.clone();
+        let advanced = self.offset + offset;
+        Self::new(file, advanced)
+    }
+}
+
+impl MMListInner {
+    /// Unmaps every area from the page table, drops all areas and forgets the
+    /// program break.
+    fn clear_user(&mut self) {
+        self.areas.retain(|area| {
+            self.page_table.unmap(area);
+            false
+        });
+        self.break_start = None;
+        self.break_pos = None;
+    }
+
+    /// Returns the area that compares equal to `addr`, if any.
+    fn overlapping_addr(&self, addr: VAddr) -> Option<&MMArea> {
+        self.areas.get(&VRange::from(addr))
+    }
+
+    /// Returns `true` if `addr` is a user address that no existing area covers,
+    /// i.e. if it is available for a new mapping.
+    fn check_overlapping_addr(&self, addr: VAddr) -> bool {
+        // Every caller treats `true` as "free to use", so the lookup must come
+        // back empty.
+        addr.is_user() && self.overlapping_addr(addr).is_none()
+    }
+
+    /// Iterates over the areas found by looking `range` up in the area set.
+    fn overlapping_range(&self, range: VRange) -> impl DoubleEndedIterator<Item = &MMArea> + '_ {
+        self.areas.range(range.into_range())
+    }
+
+    /// Returns `true` if `range` lies in user space and no existing area
+    /// overlaps it, i.e. if it is available for a new mapping.
+    fn check_overlapping_range(&self, range: VRange) -> bool {
+        range.is_user() && self.overlapping_range(range).next().is_none()
+    }
+
+    /// Searches upwards from `hint` for a free user range of `len` bytes
+    /// (rounded out to whole pages) and returns its start, or `None` if the
+    /// search leaves user space.
+    fn find_available(&self, hint: VAddr, len: usize) -> Option<VAddr> {
+        let mut range = VRange::new(hint.floor(), (hint + len).ceil());
+        let len = range.len();
+
+        loop {
+            if !range.is_user() {
+                return None;
+            }
+
+            match self.overlapping_range(range).next_back() {
+                None => return Some(range.start()),
+                Some(area) => {
+                    // Retry right after the last area that is in the way.
+                    range = VRange::new(area.range().end().ceil(), area.range().end().ceil() + len);
+                }
+            }
+        }
+    }
+
+    /// Unmaps `[start, start + len)`, with the end rounded up to a page,
+    /// splitting areas that are only partially covered.
+    ///
+    /// Returns `EINVAL` if the range is not entirely in user space.
+    ///
+    /// # Panics
+    /// Panics if `start` is not page aligned.
+    fn unmap(&mut self, start: VAddr, len: usize) -> KResult<()> {
+        assert_eq!(start.floor(), start);
+        let end = (start + len).ceil();
+        let range = VRange::new(start, end);
+        if !range.is_user() {
+            return Err(EINVAL);
+        }
+
+        let check_range = VRange::from(range.start())..VRange::from(range.end());
+        let mut front_remaining = None;
+        let mut back_remaining = None;
+
+        // NOTE(review): which areas are selected here, and whether a single
+        // area spanning both `range.start()` and `range.end()` is handled,
+        // depends on how `VRange` implements its ordering - confirm against
+        // `vrange.rs`.
+        self.areas.retain(|area| {
+            if !check_range.contains(&area.range()) {
+                return true;
+            }
+            if area.range() == range.start().into() {
+                // The area straddles the start: keep the part before it.
+                let (left, right) = area.clone().split(range.start());
+                self.page_table.unmap(&right.unwrap());
+
+                if let Some(left) = left {
+                    assert!(
+                        front_remaining.replace(left).is_none(),
+                        "There should be only one `front`."
+                    );
+                }
+            } else if area.range() == range.end().into() {
+                // The area straddles the end: keep the part after it.
+                let (left, right) = area.clone().split(range.end());
+                self.page_table.unmap(&left.unwrap());
+
+                assert!(
+                    back_remaining
+                        .replace(right.expect("`right` should be valid"))
+                        .is_none(),
+                    "There should be only one `back`."
+                );
+            } else {
+                self.page_table.unmap(area);
+            }
+
+            false
+        });
+
+        if let Some(front) = front_remaining {
+            self.areas.insert(front);
+        }
+        if let Some(back) = back_remaining {
+            self.areas.insert(back);
+        }
+
+        Ok(())
+    }
+
+    /// Maps `[at, at + len)` with the given backing and permission.
+    ///
+    /// A `len` of zero only inserts an empty marker area at `at` (this is how
+    /// the program break is registered) and does not touch the page table.
+    ///
+    /// Returns `EEXIST` if the target is not available.
+    ///
+    /// # Panics
+    /// Panics if `at` or `len` is not page aligned.
+    fn mmap(
+        &mut self,
+        at: VAddr,
+        len: usize,
+        mapping: Mapping,
+        permission: Permission,
+    ) -> KResult<()> {
+        assert_eq!(at.floor(), at);
+        assert_eq!(len & 0xfff, 0);
+        let range = VRange::new(at, at + len);
+
+        // A zero-length request is an area marker insertion, for which only
+        // the address itself has to be free.
+        let available = if len == 0 {
+            self.check_overlapping_addr(at)
+        } else {
+            self.check_overlapping_range(range)
+        };
+        if !available {
+            return Err(EEXIST);
+        }
+
+        // An empty range has no page table entries to set up.
+        if len != 0 {
+            match &mapping {
+                Mapping::Anonymous => self.page_table.set_anonymous(range, permission),
+                Mapping::File(_) => self.page_table.set_mmapped(range, permission),
+            }
+        }
+
+        self.areas.insert(MMArea::new(range, mapping, permission));
+        Ok(())
+    }
+
+    /// Queries (`pos == None`) or raises the program break.
+    ///
+    /// Returns the resulting break. The break is left unchanged when `pos`
+    /// (rounded up to a page) is not above the current break, or when the
+    /// range to be added is not available.
+    ///
+    /// # Panics
+    /// Panics if no program break has been registered.
+    fn set_break(&mut self, pos: Option<VAddr>) -> VAddr {
+        // SAFETY: `set_break` is only called in syscalls, where program break should be valid.
+        assert!(self.break_start.is_some() && self.break_pos.is_some());
+        let break_start = self.break_start.unwrap();
+        let current_break = self.break_pos.unwrap();
+        let pos = match pos {
+            None => return current_break,
+            Some(pos) => pos.ceil(),
+        };
+
+        // Nothing to add: shrinking is not supported and an equal position is
+        // a no-op.
+        if pos <= current_break {
+            return current_break;
+        }
+
+        // NOTE(review): whether the break area itself counts as overlapping
+        // `range` depends on `VRange` ordering - confirm against `vrange.rs`.
+        let range = VRange::new(current_break, pos);
+        if !self.check_overlapping_range(range) {
+            return current_break;
+        }
+
+        self.page_table.set_anonymous(
+            range,
+            Permission {
+                write: true,
+                execute: false,
+            },
+        );
+
+        let program_break = self
+            .areas
+            .get(&break_start)
+            .expect("Program break area should be valid");
+        program_break.grow(pos - current_break);
+
+        self.break_pos = Some(pos);
+        pos
+    }
+}
+
+impl MMList {
+    /// Creates an empty list with a fresh page table and no program break.
+    pub fn new() -> Arc<Self> {
+        let inner = MMListInner {
+            areas: BTreeSet::new(),
+            page_table: PageTable::new(),
+            break_start: None,
+            break_pos: None,
+        };
+
+        Arc::new(Self {
+            inner: Spin::new(inner),
+        })
+    }
+
+    /// Creates a copy of this list whose pages are shared copy-on-write with
+    /// the original.
+    ///
+    /// # Safety
+    /// Calling this function on the `MMList` of current process will need invalidating
+    /// the TLB cache after the clone will be done. We might set some of the pages as COW.
+    pub fn new_cloned(&self) -> Arc<Self> {
+        let source = self.inner.lock_irq();
+
+        let cloned = MMListInner {
+            areas: source.areas.clone(),
+            page_table: PageTable::new(),
+            break_start: source.break_start,
+            break_pos: source.break_pos,
+        };
+        let list = Arc::new(Self {
+            inner: Spin::new(cloned),
+        });
+
+        // SAFETY: `self.inner` already locked with IRQ disabled.
+        {
+            let target = list.inner.lock();
+
+            for area in target.areas.iter() {
+                let new_entries = target.page_table.iter_user(area.range());
+                let old_entries = source.page_table.iter_user(area.range());
+
+                new_entries
+                    .zip(old_entries)
+                    .for_each(|(new, old)| new.setup_cow(old));
+            }
+        }
+
+        list
+    }
+
+    /// Removes every user mapping.
+    ///
+    /// # Safety
+    /// Calling this function on the `MMList` of current process will need invalidating
+    /// the TLB cache after the clone will be done. We might remove some mappings.
+    pub fn clear_user(&self) {
+        let mut inner = self.inner.lock_irq();
+        inner.clear_user()
+    }
+
+    /// Makes the page table of this list the active one.
+    pub fn switch_page_table(&self) {
+        let inner = self.inner.lock_irq();
+        inner.page_table.switch();
+    }
+
+    /// No need to do invalidation manually, `PageTable` already does it.
+    pub fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
+        let mut inner = self.inner.lock_irq();
+        inner.unmap(start, len)
+    }
+
+    /// Maps `len` bytes at `at` and returns the address actually used.
+    ///
+    /// If `at` is taken and `fixed` is `false`, another address is searched
+    /// for, starting from `at`; `ENOMEM` is returned if none is found. With
+    /// `fixed` set, a taken address yields `EEXIST`.
+    pub fn mmap(
+        &self,
+        at: VAddr,
+        len: usize,
+        mapping: Mapping,
+        permission: Permission,
+        fixed: bool,
+    ) -> KResult<VAddr> {
+        let mut inner = self.inner.lock_irq();
+
+        match inner.mmap(at, len, mapping.clone(), permission) {
+            Ok(()) => return Ok(at),
+            // Only a non-fixed request may fall through and be relocated.
+            Err(EEXIST) if !fixed => {}
+            Err(err) => return Err(err),
+        }
+
+        let at = inner.find_available(at, len).ok_or(ENOMEM)?;
+        inner.mmap(at, len, mapping, permission)?;
+        Ok(at)
+    }
+
+    /// Queries (`None`) or moves the program break; see `MMListInner::set_break`.
+    pub fn set_break(&self, pos: Option<VAddr>) -> VAddr {
+        let mut inner = self.inner.lock_irq();
+        inner.set_break(pos)
+    }
+
+    /// Registers `start` as the initial program break by inserting an empty,
+    /// writable, non-executable anonymous area there.
+    ///
+    /// # Panics
+    /// Panics if a break is already registered or if the area cannot be
+    /// inserted.
+    pub fn register_break(&self, start: VAddr) {
+        let mut inner = self.inner.lock_irq();
+        assert!(inner.break_start.is_none() && inner.break_pos.is_none());
+
+        let permission = Permission {
+            write: true,
+            execute: false,
+        };
+        inner
+            .mmap(start, 0, Mapping::Anonymous, permission)
+            .expect("Probably, we have a bug in the ELF loader?");
+
+        inner.break_start = Some(start.into());
+        inner.break_pos = Some(start);
+    }
+}
+
+impl Drop for MMList {
+    /// Unmaps all user areas before the list goes away.
+    fn drop(&mut self) {
+        let mut inner = self.inner.lock_irq();
+        inner.clear_user()
+    }
+}

+ 288 - 0
src/kernel/mem/page_table.rs

@@ -0,0 +1,288 @@
+use crate::prelude::*;
+
+use crate::bindings::root::{EINVAL, KERNEL_PML4};
+
+use super::{
+    paging::Page,
+    phys::{CachedPP, PhysPtr as _},
+    VAddr, VRange,
+};
+use super::{MMArea, Permission};
+
+const EMPTY_PAGE_PFN: usize = 0x8000; // frame value installed by `set_mmapped`/`set_anonymous`; NOTE(review): presumably a shared empty page - confirm
+
+const PA_P: usize = 0x001; // bit 0: entry is present
+const PA_RW: usize = 0x002; // bit 1: writable
+const PA_US: usize = 0x004; // bit 2: accessible from user mode
+const PA_PWT: usize = 0x008; // bit 3: page-level write-through
+const PA_PCD: usize = 0x010; // bit 4: page-level cache disable
+const PA_A: usize = 0x020; // bit 5: accessed
+const PA_D: usize = 0x040; // bit 6: dirty
+const PA_PS: usize = 0x080; // bit 7: page size
+const PA_G: usize = 0x100; // bit 8: global
+const PA_COW: usize = 0x200; // bit 9: set by `setup_cow` and on lazily mapped entries
+const PA_MMAP: usize = 0x400; // bit 10: set by `set_mmapped`
+const PA_ANON: usize = 0x800; // bit 11: set by `set_anonymous` and `set_mmapped`
+const PA_NXE: usize = 0x8000_0000_0000_0000; // bit 63: set when the mapping is not executable
+const PA_MASK: usize = 0xfff0_0000_0000_0fff; // bits 0-11 and 52-63, which include all flags above
+
+/// A single page table entry, stored as one raw word that combines a frame
+/// value with `PA_*` attribute bits.
+#[repr(transparent)]
+#[derive(Debug, Clone, Copy)]
+pub struct PTE(usize);
+
+/// A page table hierarchy, identified by the page that holds its top level.
+#[derive(Debug)]
+pub struct PageTable {
+    /// The top-level table; `PageTable::new` fills it from `KERNEL_PML4`.
+    page: Page,
+}
+
+/// Iterator over the lowest-level page table entries of an address range.
+///
+/// `Kernel` selects the attributes given to intermediate tables that have to
+/// be created while walking (see `PTE::parse_page_table`).
+pub struct PTEIterator<'lt, const Kernel: bool> {
+    /// Number of pages in the range, computed as `(end - start) >> 12`.
+    count: usize,
+    /// Current index into the level 4 table.
+    i4: u16,
+    /// Current index into the level 3 table.
+    i3: u16,
+    /// Current index into the level 2 table.
+    i2: u16,
+    /// Current index into the level 1 table.
+    i1: u16,
+    /// The level 4 (top) table.
+    p4: Page,
+    /// The level 3 table currently being walked.
+    p3: Page,
+    /// The level 2 table currently being walked.
+    p2: Page,
+    /// The level 1 table currently being walked; items are taken from it.
+    p1: Page,
+
+    /// Start of the iterated range.
+    start: VAddr,
+    /// End of the iterated range.
+    end: VAddr,
+    _phantom: core::marker::PhantomData<&'lt ()>,
+}
+
+impl PTE {
+    /// Returns whether the entry has the user-accessible bit set.
+    pub fn is_user(&self) -> bool {
+        self.0 & PA_US != 0
+    }
+
+    /// Returns whether the entry has the present bit set.
+    pub fn is_present(&self) -> bool {
+        self.0 & PA_P != 0
+    }
+
+    /// Returns the frame part of the entry, i.e. everything outside `PA_MASK`.
+    ///
+    /// Masking with `PA_MASK` rather than the low 12 bits keeps high attribute
+    /// bits such as `PA_NXE` out of the returned address.
+    pub fn pfn(&self) -> usize {
+        self.0 & !PA_MASK
+    }
+
+    /// Returns the attribute part of the entry, i.e. everything inside
+    /// `PA_MASK`, which includes `PA_NXE`.
+    pub fn attributes(&self) -> usize {
+        self.0 & PA_MASK
+    }
+
+    /// Overwrites the entry with `pfn` combined with `attributes`.
+    pub fn set(&mut self, pfn: usize, attributes: usize) {
+        self.0 = pfn | attributes;
+    }
+
+    /// Replaces the frame part, keeping the attributes.
+    pub fn set_pfn(&mut self, pfn: usize) {
+        self.set(pfn, self.attributes())
+    }
+
+    /// Replaces the attributes, keeping the frame part.
+    pub fn set_attributes(&mut self, attributes: usize) {
+        self.set(self.pfn(), attributes)
+    }
+
+    /// Returns the page table this entry points to, allocating and installing
+    /// a zeroed one if the entry is not present.
+    ///
+    /// A newly installed table is present and writable, and additionally
+    /// global if `kernel` is set or user-accessible otherwise.
+    pub fn parse_page_table(&mut self, kernel: bool) -> Page {
+        let attributes = if kernel {
+            PA_P | PA_RW | PA_G
+        } else {
+            PA_P | PA_RW | PA_US
+        };
+
+        if self.is_present() {
+            Page::get(self.pfn(), 0)
+        } else {
+            let page = Page::alloc_one();
+            page.zero();
+            self.set(page.as_phys(), attributes);
+
+            page
+        }
+    }
+
+    /// Makes this entry share the frame of `from` copy-on-write.
+    ///
+    /// This entry receives the attributes of `from` without the writable,
+    /// accessed and dirty bits and with `PA_COW` set; `from` loses its
+    /// writable bit and gains `PA_COW`.
+    pub fn setup_cow(&mut self, from: &mut Self) {
+        self.set(
+            Page::get(from.pfn(), 0).into_pfn(),
+            (from.attributes() & !(PA_RW | PA_A | PA_D)) | PA_COW,
+        );
+
+        from.set_attributes((from.attributes() & !PA_RW) | PA_COW);
+    }
+
+    /// Resets the entry to zero.
+    pub fn clear(&mut self) {
+        self.set(0, 0)
+    }
+
+    /// Take the ownership of the page from the PTE, clear the PTE and return the page.
+    pub fn take(&mut self) -> Page {
+        // SAFETY: Acquire the ownership of the page from the page table and then
+        // clear the PTE so no one could be able to access the page from here later on.
+        let page = unsafe { Page::from_pfn(self.pfn(), 0) };
+        self.clear();
+        page
+    }
+}
+
+impl<const Kernel: bool> PTEIterator<'_, Kernel> {
+    /// Creates an iterator over the entries mapping `[start, end)` in the
+    /// hierarchy rooted at `pt`.
+    ///
+    /// Intermediate tables on the path to `start` are created if missing, even
+    /// when the range is empty. An empty range (`start == end`) produces an
+    /// iterator that yields nothing.
+    ///
+    /// Returns `EINVAL` if `start` is above `end`.
+    fn new(pt: Page, start: VAddr, end: VAddr) -> KResult<Self> {
+        if start > end {
+            return Err(EINVAL);
+        }
+
+        let i4 = Self::index(4, start);
+        let i3 = Self::index(3, start);
+        let i2 = Self::index(2, start);
+        let i1 = Self::index(1, start);
+
+        // Walk down one level at a time: each table is looked up in the table
+        // found on the previous level, not in the top-level table.
+        let p3 = pt.as_page_table()[i4].parse_page_table(Kernel);
+        let p2 = p3.as_page_table()[i3].parse_page_table(Kernel);
+        let p1 = p2.as_page_table()[i2].parse_page_table(Kernel);
+
+        Ok(Self {
+            count: (end.0 - start.0) >> 12,
+            i4: i4 as u16,
+            i3: i3 as u16,
+            i2: i2 as u16,
+            i1: i1 as u16,
+            p4: pt,
+            p3,
+            p2,
+            p1,
+            start,
+            end,
+            _phantom: core::marker::PhantomData,
+        })
+    }
+
+    /// Bit position at which the index for `level` (1 = lowest) starts.
+    fn offset(level: u32) -> usize {
+        12 + (level as usize - 1) * 9
+    }
+
+    /// Extracts the 9-bit table index of `vaddr` for `level`.
+    fn index(level: u32, vaddr: VAddr) -> usize {
+        (vaddr.0 >> Self::offset(level)) & 0x1ff
+    }
+}
+
+impl<'lt, const Kernel: bool> Iterator for PTEIterator<'lt, Kernel> {
+    type Item = &'lt mut PTE;
+
+    /// Yields the next lowest-level entry of the range, creating intermediate
+    /// tables on demand.
+    ///
+    /// # Panics
+    /// Panics if the range runs past the last entry of the top-level table.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.count == 0 {
+            return None;
+        }
+        // Account for the entry handed out below; without this the iterator
+        // would never terminate.
+        self.count -= 1;
+
+        let retval = &mut self.p1.as_page_table()[self.i1 as usize];
+
+        // That was the last entry: do not walk into (and possibly allocate)
+        // tables that lie beyond the range.
+        if self.count == 0 {
+            return Some(retval);
+        }
+
+        // Advance the indices like an odometer and reload exactly the tables
+        // whose index changed.
+        self.i1 = (self.i1 + 1) % 512;
+        if self.i1 == 0 {
+            self.i2 = (self.i2 + 1) % 512;
+            if self.i2 == 0 {
+                self.i3 = (self.i3 + 1) % 512;
+                if self.i3 == 0 {
+                    self.i4 = (self.i4 + 1) % 512;
+                    if self.i4 == 0 {
+                        panic!("PTEIterator: out of range");
+                    }
+                    self.p3 = self.p4.as_page_table()[self.i4 as usize].parse_page_table(Kernel);
+                }
+                self.p2 = self.p3.as_page_table()[self.i3 as usize].parse_page_table(Kernel);
+            }
+            self.p1 = self.p2.as_page_table()[self.i2 as usize].parse_page_table(Kernel);
+        }
+
+        Some(retval)
+    }
+}
+
+impl PageTable {
+    /// Creates a page table whose top level is a copy of the kernel's.
+    pub fn new() -> Self {
+        let page = Page::alloc_one();
+        page.zero();
+
+        let kernel_space_page_table = CachedPP::new(KERNEL_PML4 as usize);
+        // SAFETY: Source and destination are distinct pages, and `page` was
+        // just allocated so nothing else refers to it yet.
+        // NOTE(review): assumes `page.len()` is the page size in bytes and that
+        // `KERNEL_PML4` is at least that large - confirm.
+        unsafe {
+            // Copy through `u8` pointers so that the count is in bytes; with a
+            // zero-sized pointee such as `()` nothing would be copied at all.
+            page.as_cached()
+                .as_ptr::<u8>()
+                .copy_from_nonoverlapping(kernel_space_page_table.as_ptr(), page.len())
+        };
+
+        Self { page }
+    }
+
+    /// Iterates over the entries of `range` (rounded out to whole pages),
+    /// creating missing intermediate tables as user tables.
+    pub fn iter_user(&self, range: VRange) -> PTEIterator<'_, false> {
+        PTEIterator::new(self.page.clone(), range.start().floor(), range.end().ceil()).unwrap()
+    }
+
+    /// Iterates over the entries of `range` (rounded out to whole pages),
+    /// creating missing intermediate tables as kernel tables.
+    pub fn iter_kernel(&self, range: VRange) -> PTEIterator<'_, true> {
+        PTEIterator::new(self.page.clone(), range.start().floor(), range.end().ceil()).unwrap()
+    }
+
+    /// Makes this page table the active one.
+    pub fn switch(&self) {
+        arch::vm::switch_page_table(self.page.as_phys())
+    }
+
+    /// Removes the entries of `area`, releasing the pages they own.
+    ///
+    /// If this is the active page table the affected translations are
+    /// invalidated as well: page by page for areas of fewer than four pages,
+    /// all at once otherwise.
+    pub fn unmap(&self, area: &MMArea) {
+        let range = area.range();
+        let use_invlpg = range.len() / 4096 < 4;
+        let iter = self.iter_user(range);
+
+        if self.page.as_phys() != arch::vm::current_page_table() {
+            // Not the active table: nothing can be cached for it.
+            for pte in iter {
+                pte.take();
+            }
+            return;
+        }
+
+        if use_invlpg {
+            let base = range.start().floor().0;
+            for (offset_pages, pte) in iter.enumerate() {
+                pte.take();
+
+                let vaddr = base + offset_pages * 4096;
+                arch::vm::invlpg(vaddr);
+            }
+        } else {
+            for pte in iter {
+                pte.take();
+            }
+            arch::vm::invlpg_all();
+        }
+    }
+
+    /// Marks `range` as a lazily populated file mapping.
+    pub fn set_mmapped(&self, range: VRange, permission: Permission) {
+        // PA_RW is set during page fault handling.
+        // PA_NXE is preserved across page faults, so we set PA_NXE now.
+        let attributes = if permission.execute {
+            PA_US | PA_COW | PA_ANON | PA_MMAP
+        } else {
+            PA_US | PA_COW | PA_ANON | PA_MMAP | PA_NXE
+        };
+
+        for pte in self.iter_user(range) {
+            pte.set(EMPTY_PAGE_PFN, attributes);
+        }
+    }
+
+    /// Marks `range` as anonymous memory that initially points at the empty
+    /// page.
+    pub fn set_anonymous(&self, range: VRange, permission: Permission) {
+        // PA_RW is set during page fault handling.
+        // PA_NXE is preserved across page faults, so we set PA_NXE now.
+        let attributes = if permission.execute {
+            PA_P | PA_US | PA_COW | PA_ANON
+        } else {
+            PA_P | PA_US | PA_COW | PA_ANON | PA_NXE
+        };
+
+        for pte in self.iter_user(range) {
+            pte.set(EMPTY_PAGE_PFN, attributes);
+        }
+    }
+}
+
+/// Takes every present, user-accessible entry out of the table `pt` and, while
+/// `level` is above 1, does the same for the table each entry pointed to.
+fn drop_page_table_recursive(pt: &Page, level: usize) {
+    for pte in pt.as_page_table().iter_mut() {
+        if !(pte.is_present() && pte.is_user()) {
+            continue;
+        }
+
+        let child = pte.take();
+        if level > 1 {
+            drop_page_table_recursive(&child, level - 1);
+        }
+    }
+}
+
+/// Releases the user part of the hierarchy, starting at the top (level 4)
+/// table.
+impl Drop for PageTable {
+    fn drop(&mut self) {
+        drop_page_table_recursive(&self.page, 4);
+    }
+}

+ 0 - 84
src/kernel/mem/paging.cc

@@ -21,14 +21,6 @@ static inline void __page_fault_die(uintptr_t vaddr) {
     freeze();
 }
 
-static inline PSE __parse_pse(PSE pse, bool priv) {
-    auto attr = priv ? PA_KERNEL_PAGE_TABLE : PA_USER_PAGE_TABLE;
-    if (!(pse.attributes() & PA_P))
-        pse.set(attr, alloc_page_table());
-
-    return pse.parse();
-}
-
 static struct zone_info {
     page* next;
     std::size_t count;
@@ -408,79 +400,3 @@ void kernel::mem::paging::handle_page_fault(interrupt_stack* int_stack) {
         pe.set(attr, pfn);
     }
 }
-
-vaddr_range::vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool priv)
-    : n{start >= end ? 0 : ((end - start) >> 12)}
-    , idx4{!n ? 0 : idx_p4(start)}
-    , idx3{!n ? 0 : idx_p3(start)}
-    , idx2{!n ? 0 : idx_p2(start)}
-    , idx1{!n ? 0 : idx_p1(start)}
-    , pml4{!n ? PSE{0} : PSE{pt}}
-    , pdpt{!n ? PSE{0} : __parse_pse(pml4[idx4], priv)}
-    , pd{!n ? PSE{0} : __parse_pse(pdpt[idx3], priv)}
-    , pt{!n ? PSE{0} : __parse_pse(pd[idx2], priv)}
-    , m_start{!n ? 0 : start}
-    , m_end{!n ? 0 : end}
-    , is_privilege{!n ? false : priv} {}
-
-vaddr_range::vaddr_range(std::nullptr_t)
-    : n{}
-    , idx4{}
-    , idx3{}
-    , idx2{}
-    , idx1{}
-    , pml4{0}
-    , pdpt{0}
-    , pd{0}
-    , pt{0}
-    , m_start{}
-    , m_end{}
-    , is_privilege{} {}
-
-vaddr_range vaddr_range::begin() const noexcept {
-    return *this;
-}
-
-vaddr_range vaddr_range::end() const noexcept {
-    return vaddr_range{nullptr};
-}
-
-PSE vaddr_range::operator*() const noexcept {
-    return pt[idx1];
-}
-
-vaddr_range& vaddr_range::operator++() {
-    --n;
-
-    if ((idx1 = (idx1 + 1) % 512) != 0)
-        return *this;
-
-    do {
-        if ((idx2 = (idx2 + 1) % 512) != 0)
-            break;
-        do {
-            if ((idx3 = (idx3 + 1) % 512) != 0)
-                break;
-
-            idx4 = (idx4 + 1) % 512;
-
-            // if idx4 is 0 after update, we have an overflow
-            assert(idx4 != 0);
-
-            pdpt = __parse_pse(pml4[idx4], is_privilege);
-        } while (false);
-
-        pd = __parse_pse(pdpt[idx3], is_privilege);
-    } while (false);
-
-    pt = __parse_pse(pd[idx2], is_privilege);
-    return *this;
-}
-
-vaddr_range::operator bool() const noexcept {
-    return n;
-}
-
-bool vaddr_range::operator==(const vaddr_range& other) const noexcept {
-    return n == other.n;
-}

+ 67 - 35
src/kernel/mem/paging.rs

@@ -1,37 +1,76 @@
+use crate::bindings::root::kernel::mem::paging::{
+    alloc_page as c_alloc_page, alloc_pages as c_alloc_pages, free_pages as c_free_pages,
+    increase_refcount as c_increase_refcount, page as c_page, page_to_pfn as c_page_to_pfn,
+    pfn_to_page as c_pfn_to_page, PAGE_BUDDY,
+};
 use crate::bindings::root::EFAULT;
 use crate::kernel::mem::phys;
 use core::fmt;
 
 use super::phys::PhysPtr;
+use super::PTE;
 
 pub struct Page {
-    page_ptr: *mut crate::bindings::root::kernel::mem::paging::page,
+    page_ptr: *mut c_page,
     order: u32,
 }
 
 impl Page {
     pub fn alloc_one() -> Self {
-        use crate::bindings::root::kernel::mem::paging::alloc_page;
-        let page_ptr = unsafe { alloc_page() };
+        let page_ptr = unsafe { c_alloc_page() };
 
         Self { page_ptr, order: 0 }
     }
 
     pub fn alloc_many(order: u32) -> Self {
-        use crate::bindings::root::kernel::mem::paging::alloc_pages;
-        let page_ptr = unsafe { alloc_pages(order) };
+        let page_ptr = unsafe { c_alloc_pages(order) };
 
         Self { page_ptr, order }
     }
 
+    /// Get `Page` from `pfn` without increasing the reference count.
+    ///
+    /// # Safety
+    ///
+    /// Caller must ensure that the `pfn` is no longer used or there will be a memory leak.
+    ///
+    /// NOTE(review): nothing here touches the refcount, while dropping the returned `Page`
+    /// calls `free_pages` on it. The hazard therefore looks like releasing a reference that
+    /// is still relied on elsewhere rather than a leak — confirm the intended wording.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `pfn_to_page` returns a null pointer, if the page does not have
+    /// `PAGE_BUDDY` set, or if the low 8 bits of its flags differ from `order`.
+    pub unsafe fn from_pfn(pfn: usize, order: u32) -> Self {
+        let page_ptr = unsafe { c_pfn_to_page(pfn) };
+
+        // Only buddy pages can be used here.
+        assert!(unsafe { page_ptr.as_ref().unwrap() }.flags & PAGE_BUDDY != 0);
+
+        // Check if the order is correct.
+        assert_eq!(
+            unsafe { page_ptr.as_ref().unwrap() }.flags & 0xff,
+            order as u64
+        );
+
+        Self { page_ptr, order }
+    }
+
+    /// Get `Page` from `pfn` and increase the reference count.
+    ///
+    /// The refcount is bumped first, then the page is wrapped through `from_pfn`, so the
+    /// same assertions (and panics) apply.
+    ///
+    /// NOTE(review): this is a safe function, yet `pfn` is not validated here; the `SAFETY`
+    /// comments below describe what callers are expected to pass in.
+    pub fn get(pfn: usize, order: u32) -> Self {
+        // SAFETY: `pfn` is a valid physical frame number with refcount > 0.
+        unsafe { Self::increase_refcount(pfn) };
+
+        // SAFETY: `pfn` has increased refcount.
+        unsafe { Self::from_pfn(pfn, order) }
+    }
+
+    /// Consumes the `Page` and returns the physical frame number without dropping the reference
+    /// count the page holds.
+    pub fn into_pfn(self) -> usize {
+        let pfn = unsafe { c_page_to_pfn(self.page_ptr) };
+        // Skip `Drop for Page`, which would otherwise call `free_pages`.
+        core::mem::forget(self);
+        pfn
+    }
+
     pub fn len(&self) -> usize {
         1 << (self.order + 12)
     }
 
     pub fn as_phys(&self) -> usize {
-        use crate::bindings::root::kernel::mem::paging::page_to_pfn;
-
-        unsafe { page_to_pfn(self.page_ptr) }
+        unsafe { c_page_to_pfn(self.page_ptr) }
     }
 
     pub fn as_cached(&self) -> phys::CachedPP {
@@ -46,11 +85,21 @@ impl Page {
         use phys::PhysPtr;
 
         unsafe {
-            core::ptr::write_bytes(
-                self.as_cached().as_ptr::<u8>(),
-                0,
-                self.len(),
-            );
+            core::ptr::write_bytes(self.as_cached().as_ptr::<u8>(), 0, self.len());
+        }
+    }
+
+    /// Views the memory behind `as_cached()` as an array of 512 `PTE`s.
+    ///
+    /// Panics if the slice returned by `as_mut_slice(512)` cannot be converted into a
+    /// `[PTE; 512]`.
+    ///
+    /// NOTE(review): `'lt` is not tied to `&self`, and a mutable reference is handed out
+    /// from a shared borrow. Nothing in this function prevents aliasing or use after the
+    /// page is freed, so callers have to uphold that themselves.
+    pub fn as_page_table<'lt>(&self) -> &'lt mut [PTE; 512] {
+        self.as_cached().as_mut_slice(512).try_into().unwrap()
+    }
+
+    /// Increases the reference count of the page that `pfn_to_page(pfn)` returns.
+    ///
+    /// # Safety
+    /// Caller must ensure that the page is properly freed.
+    pub unsafe fn increase_refcount(pfn: usize) {
+        let page = unsafe { c_pfn_to_page(pfn) };
+
+        unsafe {
+            c_increase_refcount(page);
         }
     }
 }
@@ -58,9 +107,7 @@ impl Page {
 impl Clone for Page {
     fn clone(&self) -> Self {
         unsafe {
-            crate::bindings::root::kernel::mem::paging::increase_refcount(
-                self.page_ptr,
-            );
+            c_increase_refcount(self.page_ptr);
         }
 
         Self {
@@ -73,10 +120,7 @@ impl Clone for Page {
 impl Drop for Page {
     fn drop(&mut self) {
         unsafe {
-            crate::bindings::root::kernel::mem::paging::free_pages(
-                self.page_ptr,
-                self.order,
-            );
+            c_free_pages(self.page_ptr, self.order);
         }
     }
 }
@@ -118,20 +162,12 @@ impl PageBuffer {
     }
 
     pub fn as_slice(&self) -> &[u8] {
-        unsafe {
-            core::slice::from_raw_parts(
-                self.page.as_cached().as_ptr::<u8>(),
-                self.offset,
-            )
-        }
+        unsafe { core::slice::from_raw_parts(self.page.as_cached().as_ptr::<u8>(), self.offset) }
     }
 
     pub fn as_mut_slice(&self) -> &mut [u8] {
         unsafe {
-            core::slice::from_raw_parts_mut(
-                self.page.as_cached().as_ptr::<u8>(),
-                self.offset,
-            )
+            core::slice::from_raw_parts_mut(self.page.as_cached().as_ptr::<u8>(), self.offset)
         }
     }
 
@@ -177,11 +213,7 @@ pub fn copy_to_page(src: &[u8], dst: &Page) -> Result<(), u32> {
     }
 
     unsafe {
-        core::ptr::copy_nonoverlapping(
-            src.as_ptr(),
-            dst.as_cached().as_ptr(),
-            src.len(),
-        );
+        core::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_cached().as_ptr(), src.len());
     }
 
     Ok(())

+ 2 - 2
src/kernel/mem/phys.rs

@@ -31,11 +31,11 @@ pub struct NoCachePP {
 }
 
 impl CachedPP {
-    pub fn new(addr: usize) -> Self {
+    pub const fn new(addr: usize) -> Self {
         Self { addr }
     }
 
-    pub fn offset(&self, offset: usize) -> Self {
+    pub const fn offset(&self, offset: usize) -> Self {
         Self {
             addr: self.addr + offset,
         }

+ 162 - 0
src/kernel/mem/vrange.rs

@@ -0,0 +1,162 @@
+use core::{
+    cmp::Ordering,
+    fmt::{self, Debug, Formatter},
+    ops::{Add, RangeBounds, Sub},
+};
+
+/// A virtual address, stored as a plain `usize`.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct VAddr(pub usize);
+
+/// A range of virtual addresses running from `start` to `end`.
+///
+/// `VRange::new` asserts `start <= end`, and the `Debug` impl prints the range in the
+/// half-open form `[start, end)`. A range with `start == end` is empty and is what
+/// `From<VAddr>` produces for a single address.
+#[derive(Clone, Copy)]
+pub struct VRange {
+    start: VAddr,
+    end: VAddr,
+}
+
+/// Limit used by the `is_user` checks: addresses must be strictly below it, and a range's
+/// `end` may be equal to it.
+const USER_SPACE_MEMORY_TOP: VAddr = VAddr(0x8000_0000_0000);
+
+impl VAddr {
+    /// Rounds the address down to a multiple of `0x1000` by clearing the low 12 bits.
+    pub fn floor(&self) -> Self {
+        VAddr(self.0 & !0xfff)
+    }
+
+    /// Rounds the address up to a multiple of `0x1000`; aligned addresses are unchanged.
+    ///
+    /// NOTE(review): `self.0 + 0xfff` is plain `usize` addition, so it overflows for
+    /// addresses within `0xfff` of `usize::MAX`.
+    pub fn ceil(&self) -> Self {
+        VAddr((self.0 + 0xfff) & !0xfff)
+    }
+
+    /// Returns `true` when the address is non-zero and below `USER_SPACE_MEMORY_TOP`.
+    pub fn is_user(&self) -> bool {
+        self.0 != 0 && self < &USER_SPACE_MEMORY_TOP
+    }
+}
+
+/// `VAddr - VAddr` yields the distance between the two addresses as a `usize`.
+impl Sub for VAddr {
+    type Output = usize;
+
+    // Plain `usize` subtraction: `rhs` must not be greater than `self`.
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+/// `VAddr + usize` moves the address up by `rhs`.
+impl Add<usize> for VAddr {
+    type Output = Self;
+
+    // Plain `usize` addition: the sum must fit in a `usize`.
+    fn add(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 + rhs)
+    }
+}
+
+/// `VAddr - usize` moves the address down by `rhs`.
+impl Sub<usize> for VAddr {
+    type Output = Self;
+
+    // Plain `usize` subtraction: `rhs` must not be greater than the address value.
+    fn sub(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 - rhs)
+    }
+}
+
+/// Formats the address as `V` followed by its value in `0x`-prefixed hexadecimal.
+impl Debug for VAddr {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "V{:#x}", self.0)
+    }
+}
+
+/// Formats the range as `[start, end)` using the `Debug` output of both addresses.
+impl Debug for VRange {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        write!(f, "[{:?}, {:?})", self.start, self.end)
+    }
+}
+
+// Equality and partial ordering are both derived from `Ord::cmp` for `VRange`, so all
+// comparison operators share that implementation's containment-based notion of "equal".
+impl Eq for VRange {}
+impl PartialOrd for VRange {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// Two ranges are `==` exactly when `Ord::cmp` returns `Ordering::Equal`; this is not a
+/// field-by-field comparison.
+impl PartialEq for VRange {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+/// Any two ranges that have one of them containing the other are considered equal.
+///
+/// Otherwise the range with the smaller `start` is `Less`. This includes ranges that
+/// overlap only partially: they compare as `Less`/`Greater`, not `Equal`.
+///
+/// NOTE(review): `Equal` is not transitive under this definition (a range equals every
+/// range it contains, while those need not equal each other). Presumably the ordering is
+/// meant for lookups in an ordered collection of non-overlapping ranges — confirm.
+impl Ord for VRange {
+    fn cmp(&self, other: &Self) -> Ordering {
+        // Same start: one range always contains the other (an empty range included).
+        if self.start == other.start {
+            return Ordering::Equal;
+        }
+
+        // Same end, different start. An empty range here sits exactly at the other range's
+        // `end`, which `Debug` prints as exclusive, so it orders after that range. Two
+        // non-empty ranges sharing an end contain one another.
+        if self.end == other.end {
+            if self.start == self.end {
+                return Ordering::Greater;
+            }
+            if other.start == other.end {
+                return Ordering::Less;
+            }
+            return Ordering::Equal;
+        }
+
+        // `self` starts first: equal only if it also ends after `other`.
+        if self.start < other.start {
+            if other.end < self.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Less;
+            }
+        }
+
+        // `other` starts first: mirror image of the case above.
+        if other.start < self.start {
+            if self.end < other.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Greater;
+            }
+        }
+
+        // The starts are neither equal nor ordered either way, which cannot happen.
+        unreachable!()
+    }
+}
+
+/// Converts a single address into the empty range `[addr, addr)`.
+impl From<VAddr> for VRange {
+    fn from(addr: VAddr) -> Self {
+        VRange::new(addr, addr)
+    }
+}
+
+impl VRange {
+    /// Creates the range from `start` to `end`.
+    ///
+    /// Panics if `start > end`.
+    pub fn new(start: VAddr, end: VAddr) -> Self {
+        assert!(start <= end);
+        VRange { start, end }
+    }
+
+    /// Returns `true` when the two ranges compare equal under `Ord for VRange`.
+    ///
+    /// NOTE(review): that ordering treats only containment as equality, so two ranges that
+    /// overlap partially yield `false` here — confirm this is what callers expect.
+    pub fn is_overlapped(&self, other: &Self) -> bool {
+        self == other
+    }
+
+    /// Returns `true` when `start` is below `USER_SPACE_MEMORY_TOP` and `end` does not
+    /// exceed it. Unlike `VAddr::is_user`, a zero address is not rejected.
+    pub fn is_user(&self) -> bool {
+        self.start < USER_SPACE_MEMORY_TOP && self.end <= USER_SPACE_MEMORY_TOP
+    }
+
+    /// First address of the range.
+    pub fn start(&self) -> VAddr {
+        self.start
+    }
+
+    /// End address of the range.
+    pub fn end(&self) -> VAddr {
+        self.end
+    }
+
+    /// Difference between `end` and `start`.
+    pub fn len(&self) -> usize {
+        self.end.0 - self.start.0
+    }
+
+    /// Returns a copy of the range whose `end` is lowered by `count`; `start` is kept.
+    ///
+    /// Panics if `count` is larger than `len()`.
+    pub fn shrink(&self, count: usize) -> Self {
+        assert!(count <= self.len());
+        VRange::new(self.start, self.end - count)
+    }
+
+    /// Returns a copy of the range whose `end` is raised by `count`; `start` is kept.
+    pub fn grow(&self, count: usize) -> Self {
+        VRange::new(self.start, self.end + count)
+    }
+
+    /// Builds the bounds `[start, start)..[end, end)` out of the two empty ranges at the
+    /// endpoints.
+    /// NOTE(review): presumably meant for range queries on an ordered collection keyed by
+    /// `VRange` — confirm against the callers.
+    pub fn into_range(self) -> impl RangeBounds<Self> {
+        VRange::from(self.start())..VRange::from(self.end())
+    }
+}

+ 0 - 31
src/kernel/module.cc

@@ -1,31 +0,0 @@
-#include <map>
-
-#include <assert.h>
-
-#include <kernel/log.hpp>
-#include <kernel/module.hpp>
-
-namespace kernel::kmod {
-
-kmod::kmod(const char* name) : name(name) {}
-
-static std::map<std::string, std::unique_ptr<kmod>> modules;
-
-void load_internal_modules() {
-    for (auto loader = KMOD_LOADERS_START; *loader; ++loader) {
-        auto mod = (*loader)();
-        if (!mod)
-            continue;
-
-        if (int ret = mod->init(); ret != 0) {
-            kmsgf("[kernel] An error(%x) occured while loading \"%s\"", ret,
-                  mod->name);
-            continue;
-        }
-
-        auto [_, inserted] = modules.try_emplace(mod->name, std::move(mod));
-        assert(inserted);
-    }
-}
-
-} // namespace kernel::kmod

+ 0 - 299
src/kernel/process.cpp

@@ -12,243 +12,14 @@
 #include <kernel/async/lock.hpp>
 #include <kernel/log.hpp>
 #include <kernel/mem/paging.hpp>
-#include <kernel/module.hpp>
 #include <kernel/process.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/task/readyqueue.hpp>
-#include <kernel/task/thread.hpp>
-#include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
 
-extern "C" fs::rust_file_array::handle* r_filearray_new_for_init();
-extern "C" fs::rust_fs_context::handle* r_fs_context_new_for_init();
-extern "C" fs::rust_file_array::handle* r_filearray_new_cloned(
-    struct fs::rust_file_array::handle* other);
-extern "C" fs::rust_fs_context::handle* r_fs_context_new_cloned(
-    struct fs::rust_fs_context::handle* other);
-extern "C" void r_filearray_drop(struct fs::rust_file_array::handle* other);
-extern "C" void r_fs_context_drop(struct fs::rust_fs_context::handle* other);
-
-fs::rust_fs_context::rust_fs_context(rust_fs_context::handle* handle) : m_handle(handle) {}
-fs::rust_file_array::rust_file_array(rust_file_array::handle* handle) : m_handle(handle) {}
-
-fs::rust_fs_context::~rust_fs_context() {
-    drop();
-}
-
-fs::rust_file_array::~rust_file_array() {
-    drop();
-}
-
-void fs::rust_fs_context::drop() {
-    if (m_handle) {
-        r_fs_context_drop(m_handle);
-        m_handle = nullptr;
-    }
-}
-
-void fs::rust_file_array::drop() {
-    if (m_handle) {
-        r_filearray_drop(m_handle);
-        m_handle = nullptr;
-    }
-}
-
-fs::rust_fs_context::handle* fs::rust_fs_context::get() const {
-    assert(m_handle);
-    return m_handle;
-}
-
-fs::rust_file_array::handle* fs::rust_file_array::get() const {
-    assert(m_handle);
-    return m_handle;
-}
-
-process::process(const process& parent, pid_t pid)
-    : mms{parent.mms}
-    , attr{parent.attr}
-    , files{r_filearray_new_cloned(parent.files.get())}
-    , fs_context{r_fs_context_new_cloned(parent.fs_context.get())}
-    , pid{pid}
-    , ppid{parent.pid}
-    , pgid{parent.pgid}
-    , sid{parent.sid}
-    , control_tty{parent.control_tty} {}
-
-process::process(pid_t pid, pid_t ppid)
-    : attr{.system = true}
-    , files{r_filearray_new_for_init()}
-    , fs_context{r_fs_context_new_for_init()}
-    , pid{pid}
-    , ppid{ppid} {
-    bool inserted;
-    std::tie(std::ignore, inserted) = thds.emplace("", pid);
-    assert(inserted);
-}
-
-using signo_type = kernel::signal_list::signo_type;
-
-void process::send_signal(signo_type signal) {
-    for (auto& thd : thds)
-        thd.send_signal(signal);
-}
-
-void kernel_threadd_main(void) {
-    kmsg("[kernel] kthread daemon started");
-
-    // TODO: create new kthread
-    for (;;)
-        asm volatile("hlt");
-}
-
-static inline void __spawn(kernel::task::thread& thd, uintptr_t entry) {
-    auto prev_sp = thd.kstack.sp;
-
-    // return(start) address
-    thd.kstack.pushq(entry);
-    thd.kstack.pushq(0x200);   // flags
-    thd.kstack.pushq(0);       // r15
-    thd.kstack.pushq(0);       // r14
-    thd.kstack.pushq(0);       // r13
-    thd.kstack.pushq(0);       // r12
-    thd.kstack.pushq(0);       // rbp
-    thd.kstack.pushq(0);       // rbx
-    thd.kstack.pushq(0);       // 0 for alignment
-    thd.kstack.pushq(prev_sp); // previous sp
-}
-
-SECTION(".text.kinit")
-proclist::proclist() {
-    // init process has no parent
-    auto& init = real_emplace(1, 0);
-    assert(init.pid == 1 && init.ppid == 0);
-
-    auto thd = init.thds.begin();
-    thd->name.assign("[kernel init]");
-
-    init.attr.system = 0;
-    thd->attr &= ~kernel::task::thread::SYSTEM;
-
-    current_process = &init;
-    current_thread = &thd;
-
-    kernel::task::dispatcher::enqueue(current_thread);
-
-    current_thread->kstack.load_interrupt_stack();
-    current_process->mms.switch_pd();
-
-    if (1) {
-        // pid 0 is kernel thread daemon
-        auto& proc = real_emplace(0, 0);
-        assert(proc.pid == 0 && proc.ppid == 0);
-
-        // create thread
-        auto thd = proc.thds.begin();
-        thd->name.assign("[kernel thread daemon]");
-
-        __spawn(*thd, (uintptr_t)kernel_threadd_main);
-
-        kernel::task::dispatcher::setup_idle(&thd);
-    }
-}
-
-process& proclist::real_emplace(pid_t pid, pid_t ppid) {
-    auto [iter, inserted] = m_procs.try_emplace(pid, pid, ppid);
-    assert(inserted);
-
-    return iter->second;
-}
-
-void proclist::kill(pid_t pid, int exit_code) {
-    auto& proc = this->find(pid);
-
-    // init should never exit
-    if (proc.ppid == 0) {
-        kmsg("kernel panic: init exited!");
-        freeze();
-    }
-
-    kernel::async::preempt_disable();
-
-    // put all threads into sleep
-    for (auto& thd : proc.thds)
-        thd.set_attr(kernel::task::thread::ZOMBIE);
-
-    // TODO: CHANGE THIS
-    //       files should only be closed when this is the last thread
-    //
-    // write back mmap'ped files and close them
-    proc.files.drop();
-
-    // free fs_context
-    proc.fs_context.drop();
-
-    // unmap all user memory areas
-    proc.mms.clear();
-
-    // make child processes orphans (children of init)
-    this->make_children_orphans(pid);
-
-    proc.attr.zombie = 1;
-
-    // notify parent process and init
-    auto& parent = this->find(proc.ppid);
-    auto& init = this->find(1);
-
-    using kernel::async::lock_guard;
-    bool flag = false;
-    if (1) {
-        lock_guard lck(init.mtx_waitprocs);
-
-        if (1) {
-            lock_guard lck(proc.mtx_waitprocs);
-
-            for (const auto& item : proc.waitprocs) {
-                if (WIFSTOPPED(item.code) || WIFCONTINUED(item.code))
-                    continue;
-
-                init.waitprocs.push_back(item);
-                flag = true;
-            }
-
-            proc.waitprocs.clear();
-        }
-    }
-
-    if (flag)
-        init.waitlist.notify_all();
-
-    if (1) {
-        lock_guard lck(parent.mtx_waitprocs);
-        parent.waitprocs.push_back({pid, exit_code});
-    }
-
-    parent.waitlist.notify_all();
-
-    kernel::async::preempt_enable();
-}
-
-static void release_kinit() {
-    // free .kinit
-    using namespace kernel::mem::paging;
-    extern uintptr_t volatile KINIT_START_ADDR, KINIT_END_ADDR, KINIT_PAGES;
-
-    std::size_t pages = KINIT_PAGES;
-    auto range = vaddr_range{KERNEL_PML4, KINIT_START_ADDR, KINIT_END_ADDR, true};
-    for (auto pte : range)
-        pte.clear();
-
-    create_zone(KERNEL_IMAGE_PADDR, KERNEL_IMAGE_PADDR + 0x1000 * pages);
-}
-
 extern "C" void late_init_rust(uintptr_t* out_sp, uintptr_t* out_ip);
 
 void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn) {
     kernel::mem::paging::free_pages(kernel_stack_pfn, 9);
-    release_kinit();
-
-    kernel::kmod::load_internal_modules();
 
     uintptr_t sp, ip;
     late_init_rust(&sp, &ip);
@@ -282,7 +53,6 @@ void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn) {
     freeze();
 }
 
-SECTION(".text.kinit")
 void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn) {
     procs = new proclist;
 
@@ -320,76 +90,7 @@ void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn) {
     freeze();
 }
 
-extern "C" void asm_ctx_switch(uintptr_t* curr_sp, uintptr_t* next_sp);
-
-extern "C" void after_ctx_switch() {
-    current_thread->kstack.load_interrupt_stack();
-    current_thread->load_thread_area32();
-
-    kernel::async::preempt_enable();
-}
-
-// call this with preempt_count == 1
-// after this function returns, preempt_count will be 0
-static bool do_schedule() {
-    asm volatile("" : : : "memory");
-    auto* next_thd = kernel::task::dispatcher::next();
-
-    if (current_thread != next_thd) {
-        auto* proc = &procs->find(next_thd->owner);
-        if (current_process != proc) {
-            proc->mms.switch_pd();
-            current_process = proc;
-        }
-
-        auto* curr_thd = current_thread;
-        current_thread = next_thd;
-
-        // this implies preempt_enable()
-        asm_ctx_switch(&curr_thd->kstack.sp, &next_thd->kstack.sp);
-    } else {
-        kernel::async::preempt_enable();
-    }
-
-    return current_thread->signals.pending_signal() == 0;
-}
-
-static inline void check_preempt_count(kernel::async::preempt_count_t n) {
-    if (kernel::async::preempt_count() != n) [[unlikely]] {
-        kmsgf(
-            "[kernel:fatal] trying to call schedule_now() with preempt count "
-            "%d, expected %d",
-            kernel::async::preempt_count(), n);
-        assert(kernel::async::preempt_count() == n);
-    }
-}
-
-bool schedule_now() {
-    check_preempt_count(0);
-    kernel::async::preempt_disable();
-    bool result = do_schedule();
-    return result;
-}
-
-// call this with preempt_count == 1
-bool schedule_now_preempt_disabled() {
-    check_preempt_count(1);
-    return do_schedule();
-}
-
-void NORETURN schedule_noreturn(void) {
-    schedule_now();
-    kmsgf("[kernel:fatal] an schedule_noreturn() DOES return");
-    freeze();
-}
-
 void NORETURN freeze(void) {
     for (;;)
         asm volatile("cli\n\thlt");
 }
-
-// TODO!!!: make sure we call this after having done all clean up works
-void NORETURN kill_current(int signo) {
-    procs->kill(current_process->pid, (signo + 128) << 8 | (signo & 0xff));
-    schedule_noreturn();
-}

+ 0 - 214
src/kernel/signal.cpp

@@ -1,214 +0,0 @@
-#include <signal.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/interrupt.hpp>
-#include <kernel/process.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/task/thread.hpp>
-
-#define sigmask(sig) (1ULL << ((sig)-1))
-
-#define sigmask_now (sigmask(SIGKILL) | sigmask(SIGSTOP))
-
-#define sigmask_ignore (sigmask(SIGCHLD) | sigmask(SIGURG) | sigmask(SIGWINCH))
-
-#define sigmask_coredump                                                       \
-    (sigmask(SIGQUIT) | sigmask(SIGILL) | sigmask(SIGTRAP) |                   \
-     sigmask(SIGABRT) | sigmask(SIGFPE) | sigmask(SIGSEGV) | sigmask(SIGBUS) | \
-     sigmask(SIGSYS) | sigmask(SIGXCPU) | sigmask(SIGXFSZ))
-
-#define sigmask_stop \
-    (sigmask(SIGSTOP) | sigmask(SIGTSTP) | sigmask(SIGTTIN) | sigmask(SIGTTOU))
-
-using kernel::signal_list;
-using signo_type = signal_list::signo_type;
-
-static void continue_process(int signal) {
-    auto& parent = procs->find(current_process->ppid);
-
-    // signal parent we're running
-    parent.waitprocs.push_back({current_process->pid, 0xffff});
-
-    current_thread->signals.after_signal(signal);
-}
-
-static void stop_process(int signal) {
-    auto& parent = procs->find(current_process->ppid);
-
-    current_thread->set_attr(kernel::task::thread::STOPPED);
-
-    // signal parent we're stopped
-    parent.waitprocs.push_back({current_process->pid, 0x7f});
-    parent.waitlist.notify_all();
-
-    while (true) {
-        if (schedule_now())
-            break;
-    }
-
-    current_thread->signals.after_signal(signal);
-}
-
-static void terminate_process(int signo) {
-    kill_current(signo);
-}
-
-static void terminate_process_with_core_dump(int signo) {
-    terminate_process(signo & 0x80);
-}
-
-void signal_list::set_handler(signo_type signal, const sigaction& action) {
-    if (action.sa_handler == SIG_DFL)
-        m_handlers.erase(signal);
-    else
-        m_handlers[signal] = action;
-}
-
-void signal_list::get_handler(signo_type signal, sigaction& action) const {
-    auto iter = m_handlers.find(signal);
-    if (iter == m_handlers.end()) {
-        action.sa_handler = SIG_DFL;
-        action.sa_flags = 0;
-        action.sa_restorer = nullptr;
-        action.sa_mask = 0;
-    } else {
-        action = iter->second;
-    }
-}
-
-void signal_list::on_exec() {
-    std::erase_if(m_handlers,
-                  [](auto& pair) { return pair.second.sa_handler != SIG_IGN; });
-}
-
-bool signal_list::raise(signo_type signal) {
-    async::lock_guard lck{m_mtx};
-
-    // TODO: clear pending signals
-    if (signal == SIGCONT) {
-        m_list.remove_if([](signo_type sig) {
-            return sig == SIGSTOP || sig == SIGTSTP || sig == SIGTTIN ||
-                   sig == SIGTTOU;
-        });
-        return true;
-    }
-
-    if (sigmask(signal) & sigmask_stop) {
-        m_list.remove(SIGCONT);
-        return false;
-    }
-
-    auto iter = m_handlers.find(signal);
-    if (iter != m_handlers.end()) {
-        if (iter->second.sa_handler == SIG_IGN)
-            return false;
-    } else {
-        if (sigmask(signal) & sigmask_ignore)
-            return false;
-    }
-
-    m_list.push_back(signal);
-    m_mask |= sigmask(signal);
-
-    return true;
-}
-
-signo_type signal_list::pending_signal() {
-    async::lock_guard lck{m_mtx};
-    for (auto iter = m_list.begin(); iter != m_list.end(); ++iter) {
-        auto iter_handler = m_handlers.find(*iter);
-
-        // signal default action
-        if (iter_handler == m_handlers.end()) {
-            if (!(sigmask(*iter) & sigmask_ignore))
-                return *iter;
-            iter = m_list.erase(iter);
-            continue;
-        }
-
-        if (iter_handler->second.sa_handler == SIG_IGN) {
-            iter = m_list.erase(iter);
-            continue;
-        }
-
-        return *iter;
-    }
-
-    return 0;
-}
-
-void signal_list::handle(interrupt_stack* context, mmx_registers* mmxregs) {
-    unsigned int signal;
-    if (1) {
-        async::lock_guard lck{m_mtx};
-        // assume that the pending signal is at the front of the list
-        signal = m_list.front();
-        m_list.pop_front();
-    }
-
-    // default handlers
-    if (sigmask(signal) & sigmask_now) {
-        if (signal == SIGKILL)
-            terminate_process(signal);
-        else // SIGSTOP
-            stop_process(signal);
-    }
-
-    auto iter = m_handlers.find(signal);
-    if (iter == m_handlers.end()) {
-        if (signal == SIGCONT)
-            continue_process(signal);
-        else if (sigmask(signal) & sigmask_stop)
-            stop_process(signal);
-        else if (sigmask(signal) & sigmask_coredump)
-            terminate_process_with_core_dump(signal);
-        else if (!(sigmask(signal) & sigmask_ignore))
-            terminate_process(signal);
-        // signal is ignored by default
-        return;
-    }
-
-    auto& handler = iter->second;
-    if (!(handler.sa_flags & SA_RESTORER))
-        raise(SIGSYS);
-
-    // save current interrupt context to 128 bytes above current user stack
-    uintptr_t sp = (uintptr_t)context->rsp;
-    sp -= (128 + sizeof(mmx_registers) + sizeof(interrupt_stack) + 16);
-    sp &= ~0xf;
-
-    auto tmpsp = sp;
-    *(uint64_t*)tmpsp = signal; // signal handler argument: int signo
-    tmpsp += 8;
-    *(uintptr_t*)tmpsp = context->rsp; // original rsp
-    tmpsp += 8;
-
-    memcpy((void*)tmpsp, mmxregs, sizeof(mmx_registers));
-    tmpsp += sizeof(mmx_registers); // mmx registers
-    memcpy((void*)tmpsp, context, sizeof(interrupt_stack));
-    tmpsp += sizeof(interrupt_stack); // context
-
-    sp -= sizeof(void*);
-    // signal handler return address: restorer
-    *(uintptr_t*)sp = (uintptr_t)handler.sa_restorer;
-
-    context->rsp = sp;
-    context->v_rip = (uintptr_t)handler.sa_handler;
-}
-
-void signal_list::after_signal(signo_type signal) {
-    m_mask &= ~sigmask(signal);
-}
-
-kernel::sigmask_type signal_list::get_mask() const {
-    return m_mask;
-}
-void signal_list::set_mask(sigmask_type mask) {
-    m_mask = mask & ~sigmask_now;
-}
-void signal_list::mask(sigmask_type mask) {
-    set_mask(m_mask | mask);
-}
-void signal_list::unmask(sigmask_type mask) {
-    set_mask(m_mask & ~mask);
-}

+ 0 - 286
src/kernel/syscall.cpp

@@ -1,286 +0,0 @@
-#include <assert.h>
-#include <bits/alltypes.h>
-#include <errno.h>
-#include <poll.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/prctl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/uio.h>
-#include <sys/utsname.h>
-#include <sys/wait.h>
-#include <termios.h>
-#include <time.h>
-#include <unistd.h>
-
-#include <types/allocator.hpp>
-#include <types/elf.hpp>
-#include <types/path.hpp>
-#include <types/types.h>
-#include <types/user_types.hpp>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/hw/timer.hpp>
-#include <kernel/interrupt.hpp>
-#include <kernel/log.hpp>
-#include <kernel/process.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/syscall.hpp>
-#include <kernel/task/readyqueue.hpp>
-#include <kernel/task/thread.hpp>
-#include <kernel/tty.hpp>
-#include <kernel/user/thread_local.hpp>
-#include <kernel/utsname.hpp>
-#include <kernel/vfs.hpp>
-
-#define SYSCALL_HANDLERS_SIZE (404)
-
-#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
-
-#define SYSCALL32_ARG1(type, name) type name = (type)((data)->regs.rbx)
-#define SYSCALL32_ARG2(type, name) type name = (type)((data)->regs.rcx)
-#define SYSCALL32_ARG3(type, name) type name = (type)((data)->regs.rdx)
-#define SYSCALL32_ARG4(type, name) type name = (type)((data)->regs.rsi)
-#define SYSCALL32_ARG5(type, name) type name = (type)((data)->regs.rdi)
-#define SYSCALL32_ARG6(type, name) type name = (type)((data)->regs.rbp)
-
-#define _DEFINE_SYSCALL32_ARGS1(type, name, ...) \
-    SYSCALL32_ARG1(type, name);                  \
-    __VA_OPT__(_DEFINE_SYSCALL32_ARGS2(__VA_ARGS__))
-
-#define _DEFINE_SYSCALL32_ARGS2(type, name, ...) \
-    SYSCALL32_ARG2(type, name);                  \
-    __VA_OPT__(_DEFINE_SYSCALL32_ARGS3(__VA_ARGS__))
-
-#define _DEFINE_SYSCALL32_ARGS3(type, name, ...) \
-    SYSCALL32_ARG3(type, name);                  \
-    __VA_OPT__(_DEFINE_SYSCALL32_ARGS4(__VA_ARGS__))
-
-#define _DEFINE_SYSCALL32_ARGS4(type, name, ...) \
-    SYSCALL32_ARG4(type, name);                  \
-    __VA_OPT__(_DEFINE_SYSCALL32_ARGS5(__VA_ARGS__))
-
-#define _DEFINE_SYSCALL32_ARGS5(type, name, ...) \
-    SYSCALL32_ARG5(type, name);                  \
-    __VA_OPT__(_DEFINE_SYSCALL32_ARGS6(__VA_ARGS__))
-
-#define _DEFINE_SYSCALL32_ARGS6(type, name, ...) SYSCALL32_ARG6(type, name);
-
-#define _DEFINE_SYSCALL32_END_PARAMS1(type, name, ...) \
-    name __VA_OPT__(, _DEFINE_SYSCALL32_END_PARAMS2(__VA_ARGS__))
-#define _DEFINE_SYSCALL32_END_PARAMS2(type, name, ...) \
-    name __VA_OPT__(, _DEFINE_SYSCALL32_END_PARAMS3(__VA_ARGS__))
-#define _DEFINE_SYSCALL32_END_PARAMS3(type, name, ...) \
-    name __VA_OPT__(, _DEFINE_SYSCALL32_END_PARAMS4(__VA_ARGS__))
-#define _DEFINE_SYSCALL32_END_PARAMS4(type, name, ...) \
-    name __VA_OPT__(, _DEFINE_SYSCALL32_END_PARAMS5(__VA_ARGS__))
-#define _DEFINE_SYSCALL32_END_PARAMS5(type, name, ...) \
-    name __VA_OPT__(, _DEFINE_SYSCALL32_END_PARAMS6(__VA_ARGS__))
-#define _DEFINE_SYSCALL32_END_PARAMS6(type, name, ...) name __VA_OPT__(, void)
-
-#define _DEFINE_SYSCALL32_END(name, ...) \
-    kernel::syscall::do_##name(__VA_OPT__(_DEFINE_SYSCALL32_END_PARAMS1(__VA_ARGS__)))
-
-#define DEFINE_SYSCALL32_TO(sname, to, ...)                                              \
-    static uint32_t _syscall32_##sname(interrupt_stack* data, mmx_registers* mmxregs) {  \
-        (void)data, (void)mmxregs;                                                       \
-        __VA_OPT__(_DEFINE_SYSCALL32_ARGS1(__VA_ARGS__);)                                \
-        return (uint32_t)(uintptr_t)_DEFINE_SYSCALL32_END(to __VA_OPT__(, __VA_ARGS__)); \
-    }
-
-#define DEFINE_SYSCALL32(name, ...) DEFINE_SYSCALL32_TO(name, name __VA_OPT__(, ) __VA_ARGS__)
-
-#define DEFINE_SYSCALL32_NORETURN(name, ...)                                 \
-    [[noreturn]] static uint32_t _syscall32_##name(interrupt_stack* data,    \
-                                                   mmx_registers* mmxregs) { \
-        (void)data, (void)mmxregs;                                           \
-        __VA_OPT__(_DEFINE_SYSCALL32_ARGS1(__VA_ARGS__);)                    \
-        _DEFINE_SYSCALL32_END(name, __VA_ARGS__);                            \
-    }
-
-struct syscall_handler_t {
-    uint32_t (*handler)(interrupt_stack*, mmx_registers*);
-    const char* name;
-};
-
-static syscall_handler_t syscall_handlers[SYSCALL_HANDLERS_SIZE];
-
-static inline void not_implemented(const char* pos, int line) {
-    kmsgf(
-        "[kernel] the function at %s:%d is not implemented, killing the "
-        "pid%d...",
-        pos, line, current_process->pid);
-    current_thread->send_signal(SIGSYS);
-}
-
-DEFINE_SYSCALL32(munmap, uintptr_t, addr, size_t, len)
-DEFINE_SYSCALL32(poll, pollfd __user*, fds, nfds_t, nfds, int, timeout)
-DEFINE_SYSCALL32(socket, int, domain, int, type, int, protocol)
-
-DEFINE_SYSCALL32(mmap_pgoff, uintptr_t, addr, size_t, len, int, prot, int, flags, int, fd, off_t,
-                 pgoffset)
-
-DEFINE_SYSCALL32(waitpid, pid_t, waitpid, int __user*, arg1, int, options)
-DEFINE_SYSCALL32(getsid, pid_t, pid)
-DEFINE_SYSCALL32(setsid)
-DEFINE_SYSCALL32(getpgid, pid_t, pid)
-DEFINE_SYSCALL32(setpgid, pid_t, pid, pid_t, pgid)
-DEFINE_SYSCALL32(getpid)
-DEFINE_SYSCALL32(getppid)
-DEFINE_SYSCALL32(getuid)
-DEFINE_SYSCALL32(geteuid)
-DEFINE_SYSCALL32_TO(geteuid32, geteuid)
-DEFINE_SYSCALL32(getgid)
-DEFINE_SYSCALL32_TO(getgid32, getgid)
-DEFINE_SYSCALL32(gettid)
-DEFINE_SYSCALL32(set_thread_area, kernel::user::user_desc __user*, ptr)
-DEFINE_SYSCALL32(set_tid_address, int __user*, tidptr)
-DEFINE_SYSCALL32(prctl, int, option, uintptr_t, arg2)
-DEFINE_SYSCALL32(arch_prctl, int, option, uintptr_t, arg2)
-DEFINE_SYSCALL32(brk, uintptr_t, addr)
-DEFINE_SYSCALL32(kill, pid_t, pid, int, sig)
-DEFINE_SYSCALL32(tkill, pid_t, tid, int, sig)
-DEFINE_SYSCALL32(rt_sigprocmask, int, how, const kernel::sigmask_type __user*, set,
-                 kernel::sigmask_type __user*, oldset, size_t, sigsetsize)
-DEFINE_SYSCALL32(rt_sigaction, int, signum, const kernel::sigaction __user*, act,
-                 kernel::sigaction __user*, oldact, size_t, sigsetsize)
-DEFINE_SYSCALL32(newuname, new_utsname __user*, buf)
-
-DEFINE_SYSCALL32_NORETURN(exit, int, status)
-
-DEFINE_SYSCALL32(gettimeofday, timeval __user*, tv, void __user*, tz)
-DEFINE_SYSCALL32_TO(clock_gettime64, clock_gettime, clockid_t, clk_id, timespec __user*, tp)
-
-extern "C" void NORETURN ISR_stub_restore();
-static uint32_t _syscall32_fork(interrupt_stack* data, mmx_registers* mmxregs) {
-    auto& newproc = procs->copy_from(*current_process);
-    auto [iter_newthd, inserted] = newproc.thds.emplace(*current_thread, newproc.pid);
-    assert(inserted);
-    auto* newthd = &*iter_newthd;
-
-    auto newthd_prev_sp = newthd->kstack.sp;
-    assert(!(newthd_prev_sp & 0xf));
-
-    newthd->kstack.sp -= sizeof(interrupt_stack);
-    memcpy((void*)(newthd->kstack.sp), data, sizeof(interrupt_stack));
-
-    ((interrupt_stack*)(newthd->kstack.sp))->regs.rax = 0; // return value
-    auto isr_restore_sp = newthd->kstack.sp;
-
-    newthd->kstack.sp -= sizeof(mmx_registers);
-    memcpy((void*)(newthd->kstack.sp), mmxregs, sizeof(mmx_registers));
-
-    // asm_ctx_switch stack
-    // return(start) address
-    newthd->kstack.pushq((uintptr_t)ISR_stub_restore);
-    newthd->kstack.pushq(0);              // flags
-    newthd->kstack.pushq(0);              // r15
-    newthd->kstack.pushq(0);              // r14
-    newthd->kstack.pushq(0);              // r13
-    newthd->kstack.pushq(0);              // r12
-    newthd->kstack.pushq(0);              // rbp
-    newthd->kstack.pushq(isr_restore_sp); // rbx
-    newthd->kstack.pushq(0);              // 0 for alignment
-    newthd->kstack.pushq(newthd_prev_sp); // previous sp
-
-    kernel::task::dispatcher::enqueue(newthd);
-    return newproc.pid;
-}
-
-[[noreturn]] static uint32_t _syscall32_exit_group(interrupt_stack* data, mmx_registers* mmxregs) {
-    // we implement exit_group as exit for now
-    _syscall32_exit(data, mmxregs);
-}
-
-static uint32_t _syscall32_wait4(interrupt_stack* data, mmx_registers* mmxregs) {
-    SYSCALL32_ARG4(void __user*, rusage);
-
-    // TODO: getrusage
-    if (rusage)
-        return -EINVAL;
-
-    return _syscall32_waitpid(data, mmxregs);
-}
-
-void kernel::handle_syscall32(int no, interrupt_stack* data, mmx_registers* mmxregs) {
-    if (no >= SYSCALL_HANDLERS_SIZE || !syscall_handlers[no].handler) {
-        kmsgf("[kernel] syscall %d(%x) isn't implemented", no, no);
-        NOT_IMPLEMENTED;
-
-        if (current_thread->signals.pending_signal())
-            current_thread->signals.handle(data, mmxregs);
-        return;
-    }
-
-    // kmsgf_debug("[kernel:debug] (pid\t%d) %s() => {{", current_process->pid,
-    // syscall_handlers[no].name);
-
-    asm volatile("sti");
-    data->regs.rax = syscall_handlers[no].handler(data, mmxregs);
-    data->regs.r8 = 0;
-    data->regs.r9 = 0;
-    data->regs.r10 = 0;
-    data->regs.r11 = 0;
-    data->regs.r12 = 0;
-    data->regs.r13 = 0;
-    data->regs.r14 = 0;
-    data->regs.r15 = 0;
-
-    // kmsgf_debug("[kernel:debug] }} => %x", data->regs.rax);
-
-    if (current_thread->signals.pending_signal())
-        current_thread->signals.handle(data, mmxregs);
-}
-
-#define REGISTER_SYSCALL_HANDLER(no, _name) register_syscall_handler(no, _syscall32_##_name, #_name)
-
-extern "C" void register_syscall_handler(uint32_t no,
-                                         uint32_t (*handler)(interrupt_stack*, mmx_registers*),
-                                         const char* name) {
-    syscall_handlers[no].handler = handler;
-    syscall_handlers[no].name = name;
-}
-
-extern "C" void r_register_syscall();
-
-SECTION(".text.kinit")
-void kernel::init_syscall_table() {
-    REGISTER_SYSCALL_HANDLER(0x01, exit);
-    REGISTER_SYSCALL_HANDLER(0x02, fork);
-    REGISTER_SYSCALL_HANDLER(0x07, waitpid);
-    REGISTER_SYSCALL_HANDLER(0x14, getpid);
-    REGISTER_SYSCALL_HANDLER(0x25, kill);
-    REGISTER_SYSCALL_HANDLER(0x2d, brk);
-    REGISTER_SYSCALL_HANDLER(0x2f, getgid);
-    REGISTER_SYSCALL_HANDLER(0x39, setpgid);
-    REGISTER_SYSCALL_HANDLER(0x40, getppid);
-    REGISTER_SYSCALL_HANDLER(0x42, setsid);
-    REGISTER_SYSCALL_HANDLER(0x4e, gettimeofday);
-    REGISTER_SYSCALL_HANDLER(0x5b, munmap);
-    REGISTER_SYSCALL_HANDLER(0x72, wait4);
-    REGISTER_SYSCALL_HANDLER(0x7a, newuname);
-    REGISTER_SYSCALL_HANDLER(0x84, getpgid);
-    REGISTER_SYSCALL_HANDLER(0x93, getsid);
-    REGISTER_SYSCALL_HANDLER(0xa8, poll);
-    REGISTER_SYSCALL_HANDLER(0xac, prctl);
-    REGISTER_SYSCALL_HANDLER(0xae, rt_sigaction);
-    REGISTER_SYSCALL_HANDLER(0xaf, rt_sigprocmask);
-    REGISTER_SYSCALL_HANDLER(0xc0, mmap_pgoff);
-    REGISTER_SYSCALL_HANDLER(0xc7, getuid);
-    REGISTER_SYSCALL_HANDLER(0xc8, getgid32);
-    REGISTER_SYSCALL_HANDLER(0xc9, geteuid);
-    REGISTER_SYSCALL_HANDLER(0xca, geteuid32);
-    REGISTER_SYSCALL_HANDLER(0xe0, gettid);
-    REGISTER_SYSCALL_HANDLER(0xee, tkill);
-    REGISTER_SYSCALL_HANDLER(0xf3, set_thread_area);
-    REGISTER_SYSCALL_HANDLER(0xfc, exit_group);
-    REGISTER_SYSCALL_HANDLER(0x102, set_tid_address);
-    REGISTER_SYSCALL_HANDLER(0x167, socket);
-    REGISTER_SYSCALL_HANDLER(0x180, arch_prctl);
-    REGISTER_SYSCALL_HANDLER(0x193, clock_gettime64);
-
-    r_register_syscall();
-}

+ 182 - 90
src/kernel/syscall.rs

@@ -1,128 +1,220 @@
-use crate::bindings::root::{interrupt_stack, mmx_registers};
+use crate::{
+    bindings::root::{interrupt_stack, mmx_registers},
+    kernel::task::{ProcessList, Signal},
+    println_warn,
+};
 
 mod file_rw;
+mod mm;
+mod net;
 mod procops;
+mod sysinfo;
+
+/// Unit type that carries the `MapArgument` implementations used by
+/// `syscall32_call!`.
+pub(self) struct MapArgumentImpl;
+/// Conversion from a raw 64-bit register value into a handler argument of
+/// type `T`.
+pub(self) trait MapArgument<'a, T: 'a> {
+    /// Converts the raw register `value` into a `T`.
+    fn map_arg(value: u64) -> T;
+}
 
+/// Conversion of a handler's `Ok` value into the raw `usize` that the
+/// generated syscall wrapper returns.
 pub(self) trait MapReturnValue {
-    fn map(self) -> u32;
+    /// Consumes the handler result and produces the raw return value.
+    fn map_ret(self) -> usize;
 }
 
 impl MapReturnValue for () {
-    fn map(self) -> u32 {
+    // Handlers without a result report `0`.
+    fn map_ret(self) -> usize {
         0
     }
 }
 
 impl MapReturnValue for u32 {
-    fn map(self) -> u32 {
-        self
+    // Widened with `as`; the value itself is unchanged.
+    fn map_ret(self) -> usize {
+        self as usize
     }
 }
 
 impl MapReturnValue for usize {
-    fn map(self) -> u32 {
-        self as u32
+    // Already in the target type; returned as is.
+    fn map_ret(self) -> usize {
+        self
     }
 }
 
+/// `u64` arguments take the register value unchanged.
+impl MapArgument<'_, u64> for MapArgumentImpl {
+    fn map_arg(value: u64) -> u64 {
+        value
+    }
+}
+
+/// `u32` arguments keep only the low 32 bits of the register.
+impl MapArgument<'_, u32> for MapArgumentImpl {
+    fn map_arg(value: u64) -> u32 {
+        value as u32
+    }
+}
+
+/// `i32` arguments keep the low 32 bits of the register, reinterpreted as a
+/// signed value.
+impl MapArgument<'_, i32> for MapArgumentImpl {
+    fn map_arg(value: u64) -> i32 {
+        value as i32
+    }
+}
+
+/// `usize` arguments are produced with an `as` cast, which is lossless where
+/// `usize` is 64 bits wide.
+impl MapArgument<'_, usize> for MapArgumentImpl {
+    fn map_arg(value: u64) -> usize {
+        value as usize
+    }
+}
+
+/// Read-only pointer arguments: the register value is used as the address.
+/// No validation happens here.
+impl<'a, T: 'a> MapArgument<'a, *const T> for MapArgumentImpl {
+    fn map_arg(value: u64) -> *const T {
+        value as *const _
+    }
+}
+
+/// Mutable pointer arguments: the register value is used as the address.
+/// No validation happens here.
+impl<'a, T: 'a> MapArgument<'a, *mut T> for MapArgumentImpl {
+    fn map_arg(value: u64) -> *mut T {
+        value as *mut _
+    }
+}
+
+/// Expands to the saved register that holds syscall argument number N
+/// (0-based) in the given interrupt stack:
+/// 0 -> rbx, 1 -> rcx, 2 -> rdx, 3 -> rsi, 4 -> rdi, 5 -> rbp.
+macro_rules! arg_register {
+    (0, $stack:ident) => { $stack.regs.rbx };
+    (1, $stack:ident) => { $stack.regs.rcx };
+    (2, $stack:ident) => { $stack.regs.rdx };
+    (3, $stack:ident) => { $stack.regs.rsi };
+    (4, $stack:ident) => { $stack.regs.rdi };
+    (5, $stack:ident) => { $stack.regs.rbp };
+}
+
+/// Expands to an expression that decodes the handler arguments from the saved
+/// registers, calls `$handler` and maps its result to a raw `usize`.
+///
+/// Arguments are taken in declaration order from `arg_register!` and converted
+/// with `MapArgumentImpl::map_arg`. `Ok(val)` goes through
+/// `MapReturnValue::map_ret`; `Err(err)` becomes `(-(err as i32)) as usize`.
 macro_rules! syscall32_call {
-    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty) => {{
-        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
-        match $handler($arg1) {
-            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
-            Err(err) => (-(err as i32)) as u32,
-        }
-    }};
-    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty, $arg2:ident: $argt2:ty) => {{
-        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
-        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
-        match $handler($arg1, $arg2) {
-            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
-            Err(err) => (-(err as i32)) as u32,
-        }
-    }};
-    ($int_stack:ident, $handler:ident, $arg1:ident: $argt1:ty, $arg2:ident: $argt2:ty, $arg3:ident: $argt3:ty) => {{
-        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
-        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
-        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
-        match $handler($arg1, $arg2, $arg3) {
-            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
-            Err(err) => (-(err as i32)) as u32,
-        }
-    }};
-    ($int_stack:ident, $handler:ident,
-     $arg1:ident: $argt1:ty,
-     $arg2:ident: $argt2:ty,
-     $arg3:ident: $argt3:ty,
-     $arg4:ident: $argt4:ty) => {{
-        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
-        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
-        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
-        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
-        match $handler($arg1, $arg2, $arg3, $arg4) {
-            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
-            Err(err) => (-(err as i32)) as u32,
-        }
-    }};
-    ($int_stack:ident, $handler:ident,
-     $arg1:ident: $argt1:ty,
-     $arg2:ident: $argt2:ty,
-     $arg3:ident: $argt3:ty,
-     $arg4:ident: $argt4:ty,
-     $arg5:ident: $argt5:ty) => {{
-        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
-        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
-        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
-        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
-        let $arg5: $argt5 = $int_stack.regs.rdi as $argt5;
-        match $handler($arg1, $arg2, $arg3, $arg4, $arg5) {
-            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
-            Err(err) => (-(err as i32)) as u32,
-        }
-    }};
-    ($int_stack:ident, $handler:ident,
-     $arg1:ident: $argt1:ty,
-     $arg2:ident: $argt2:ty,
-     $arg3:ident: $argt3:ty,
-     $arg4:ident: $argt4:ty,
-     $arg5:ident: $argt5:ty,
-     $arg6:ident: $argt6:ty) => {{
-        let $arg1: $argt1 = $int_stack.regs.rbx as $argt1;
-        let $arg2: $argt2 = $int_stack.regs.rcx as $argt2;
-        let $arg3: $argt3 = $int_stack.regs.rdx as $argt3;
-        let $arg4: $argt4 = $int_stack.regs.rsi as $argt4;
-        let $arg5: $argt5 = $int_stack.regs.rdi as $argt5;
-        let $arg6: $argt6 = $int_stack.regs.rbp as $argt6;
-        match $handler($arg1, $arg2, $arg3, $arg4, $arg5, $arg6) {
-            Ok(val) => $crate::kernel::syscall::MapReturnValue::map(val),
-            Err(err) => (-(err as i32)) as u32,
+    ($is:ident, $handler:ident, $($arg:ident: $type:ty),*) => {{
+        use $crate::kernel::syscall::{MapArgument, MapArgumentImpl, arg_register};
+        use $crate::kernel::syscall::{MapReturnValue};
+
+        // `${index()}` is the position of the current repetition, so the
+        // N-th declared argument is read from the N-th argument register.
+        $(
+            let $arg: $type =
+                MapArgumentImpl::map_arg(arg_register!(${index()}, $is));
+        )*
+
+        match $handler($($arg),*) {
+            Ok(val) => MapReturnValue::map_ret(val),
+            Err(err) => (-(err as i32)) as usize,
         }
     }};
 }
 
+/// Defines the wrapper function `$name` with the signature stored in the
+/// syscall table, forwarding to `$handler`.
+///
+/// The first arm covers handlers without arguments; the second decodes the
+/// listed arguments through `syscall32_call!`.
 macro_rules! define_syscall32 {
+    // No arguments: call the handler directly and map its result.
+    ($name:ident, $handler:ident) => {
+        fn $name(_int_stack: &mut $crate::bindings::root::interrupt_stack,
+            _mmxregs: &mut $crate::bindings::root::mmx_registers) -> usize {
+            use $crate::kernel::syscall::MapReturnValue;
+
+            match $handler() {
+                Ok(val) => MapReturnValue::map_ret(val),
+                Err(err) => (-(err as i32)) as usize,
+            }
+        }
+    };
     ($name:ident, $handler:ident, $($arg:ident: $argt:ty),*) => {
-        unsafe extern "C" fn $name(
-            int_stack: *mut $crate::bindings::root::interrupt_stack,
-            _mmxregs: *mut $crate::bindings::root::mmx_registers) -> u32 {
-            let int_stack = int_stack.as_mut().unwrap();
-            $crate::kernel::syscall::syscall32_call!(int_stack, $handler, $($arg: $argt),*)
+        fn $name(
+            int_stack: &mut $crate::bindings::root::interrupt_stack,
+            _mmxregs: &mut $crate::bindings::root::mmx_registers) -> usize {
+            use $crate::kernel::syscall::syscall32_call;
+
+            syscall32_call!(int_stack, $handler, $($arg: $argt),*)
         }
     };
 }
 
-pub(self) use {define_syscall32, syscall32_call};
+/// Registers the function `sys_<name>` as the handler of syscall `<number>`,
+/// using the stringified `<name>` as its label.
+macro_rules! register_syscall {
+    ($number:expr, $syscall:ident) => {
+        $crate::kernel::syscall::register_syscall_handler(
+            $number,
+            concat_idents!(sys_, $syscall),
+            stringify!($syscall),
+        );
+    };
+}
+
+use super::task::Thread;
 
-extern "C" {
-    fn register_syscall_handler(
-        no: u32,
-        handler: unsafe extern "C" fn(*mut interrupt_stack, *mut mmx_registers) -> u32,
-        name: *const i8,
+pub(self) use {arg_register, define_syscall32, register_syscall, syscall32_call};
+
+/// One entry of the syscall table.
+pub(self) struct SyscallHandler {
+    /// Wrapper invoked with the saved interrupt stack and MMX registers; its
+    /// return value is stored in `rax` by `handle_syscall32`.
+    handler: fn(&mut interrupt_stack, &mut mmx_registers) -> usize,
+    /// Name given at registration time.
+    name: &'static str,
+}
+
+/// Stores `handler` in slot `no` of `SYSCALL_HANDLERS`.
+///
+/// # Panics
+///
+/// Panics if `no` is outside the table or the slot is already occupied.
+pub(self) fn register_syscall_handler(
+    no: usize,
+    handler: fn(&mut interrupt_stack, &mut mmx_registers) -> usize,
+    name: &'static str,
+) {
+    // SAFETY: this is the place where `SYSCALL_HANDLERS` is mutated, so it must
+    // not run concurrently with any other access to the table.
+    // NOTE(review): presumably only called during early initialization, before
+    // `handle_syscall32` can be reached; confirm against the callers.
+    let syscall = unsafe { SYSCALL_HANDLERS.get_mut(no) }.unwrap();
+    assert!(
+        syscall.replace(SyscallHandler { handler, name }).is_none(),
+        "Syscall {} is already registered",
+        no
     );
 }
 
-#[no_mangle]
-pub unsafe extern "C" fn r_register_syscall() {
+/// Runs the `register` function of every syscall submodule
+/// (`file_rw`, `procops`, `mm`, `net`, `sysinfo`).
+pub fn register_syscalls() {
     file_rw::register();
     procops::register();
+    mm::register();
+    net::register();
+    sysinfo::register();
 }
+
+/// Number of slots in the syscall table; syscall numbers at or above this
+/// value have no slot.
+const SYSCALL_HANDLERS_SIZE: usize = 404;
+/// Syscall table indexed by syscall number; `None` marks a slot without a
+/// registered handler.
+static mut SYSCALL_HANDLERS: [Option<SyscallHandler>; SYSCALL_HANDLERS_SIZE] =
+    [const { None }; SYSCALL_HANDLERS_SIZE];
+
+/// C-callable syscall entry: looks up syscall number `no` in
+/// `SYSCALL_HANDLERS` and runs the handler against the saved register frame.
+///
+/// A number without a registered handler (including one outside the table)
+/// logs a warning and calls `ProcessList::kill_current(Signal::SIGSYS)`.
+/// Otherwise interrupts are enabled, the handler runs, its return value is
+/// written to `rax` and `r8`-`r15` are zeroed. After the match, a pending
+/// signal of the current thread is handled.
+///
+/// # Panics
+///
+/// Panics if `int_stack` or `mmxregs` is null.
+#[no_mangle]
+pub extern "C" fn handle_syscall32(
+    no: usize,
+    int_stack: *mut interrupt_stack,
+    mmxregs: *mut mmx_registers,
+) {
+    // SAFETY: `SYSCALL_HANDLERS` are never modified after initialization.
+    let syscall = unsafe { SYSCALL_HANDLERS.get(no) }.and_then(Option::as_ref);
+
+    // SAFETY: assumes the caller passes valid, exclusive pointers for
+    // `int_stack` and `mmxregs`; only null is checked here (by `unwrap`).
+    // NOTE(review): confirm this at the interrupt entry code.
+    let int_stack = unsafe { int_stack.as_mut().unwrap() };
+    let mmxregs = unsafe { mmxregs.as_mut().unwrap() };
+
+    match syscall {
+        None => {
+            println_warn!("Syscall {} isn't implemented", no);
+            ProcessList::kill_current(Signal::SIGSYS);
+        }
+        Some(handler) => {
+            arch::interrupt::enable();
+            let retval = (handler.handler)(int_stack, mmxregs);
+
+            // Store the result in `rax` and clear `r8`-`r15` in the saved
+            // register frame.
+            int_stack.regs.rax = retval as u64;
+            int_stack.regs.r8 = 0;
+            int_stack.regs.r9 = 0;
+            int_stack.regs.r10 = 0;
+            int_stack.regs.r11 = 0;
+            int_stack.regs.r12 = 0;
+            int_stack.regs.r13 = 0;
+            int_stack.regs.r14 = 0;
+            int_stack.regs.r15 = 0;
+        }
+    }
+
+    if Thread::current().signal_list.has_pending_signal() {
+        Thread::current().signal_list.handle(int_stack, mmxregs);
+    }
 }

+ 83 - 45
src/kernel/syscall/file_rw.rs

@@ -8,14 +8,23 @@ use bindings::{
 use crate::{
     io::{Buffer, BufferFill},
     kernel::{
-        user::dataflow::{CheckedUserPointer, UserBuffer, UserString},
-        vfs::{dentry::Dentry, file::SeekOption, filearray::FileArray, FsContext},
+        task::Thread,
+        user::{
+            dataflow::{CheckedUserPointer, UserBuffer, UserString},
+            UserPointer, UserPointerMut,
+        },
+        vfs::{
+            dentry::Dentry,
+            file::{PollEvent, SeekOption},
+            filearray::FileArray,
+            FsContext,
+        },
     },
     path::Path,
     prelude::*,
 };
 
-use super::{define_syscall32, register_syscall_handler};
+use super::{define_syscall32, register_syscall};
 
 fn do_read(fd: u32, buffer: *mut u8, bufsize: usize) -> KResult<usize> {
     let mut buffer = UserBuffer::new(buffer, bufsize)?;
@@ -198,24 +207,22 @@ struct IoVec32 {
     len: u32,
 }
 
-fn do_readv(fd: u32, iov_user: *const u8, iovcnt: u32) -> KResult<usize> {
+fn do_readv(fd: u32, iov_user: *const IoVec32, iovcnt: u32) -> KResult<usize> {
     let files = FileArray::get_current();
     let file = files.get(fd).ok_or(EBADF)?;
 
-    let iov_user =
-        CheckedUserPointer::new(iov_user, iovcnt as usize * core::mem::size_of::<IoVec32>())?;
-    let mut iov_user_copied: Vec<IoVec32> = vec![];
-    iov_user_copied.resize(iovcnt as usize, IoVec32::default());
-
-    iov_user.read(
-        iov_user_copied.as_mut_ptr() as *mut (),
-        iov_user_copied.len() * core::mem::size_of::<IoVec32>(),
-    )?;
-
-    let iov_buffers = iov_user_copied
-        .into_iter()
-        .take_while(|iov| iov.len != 0)
-        .map(|iov| UserBuffer::new(iov.base as *mut u8, iov.len as usize))
+    let mut iov_user = UserPointer::new(iov_user as *mut IoVec32)?;
+    let iov_buffers = (0..iovcnt)
+        .map(|_| {
+            let iov_result = iov_user.read()?;
+            iov_user = iov_user.offset(1)?;
+            Ok(iov_result)
+        })
+        .filter_map(|iov_result| match iov_result {
+            Err(err) => Some(Err(err)),
+            Ok(IoVec32 { len: 0, .. }) => None,
+            Ok(IoVec32 { base, len }) => Some(UserBuffer::new(base as *mut u8, len as usize)),
+        })
         .collect::<KResult<Vec<_>>>()?;
 
     let mut tot = 0usize;
@@ -224,7 +231,7 @@ fn do_readv(fd: u32, iov_user: *const u8, iovcnt: u32) -> KResult<usize> {
         let nread = file.read(&mut buffer)?;
         tot += nread;
 
-        if nread == 0 || nread != buffer.total() {
+        if nread != buffer.total() {
             break;
         }
     }
@@ -236,6 +243,7 @@ fn do_writev(fd: u32, iov_user: *const u8, iovcnt: u32) -> KResult<usize> {
     let files = FileArray::get_current();
     let file = files.get(fd).ok_or(EBADF)?;
 
+    // TODO: Rewrite this with `UserPointer`.
     let iov_user =
         CheckedUserPointer::new(iov_user, iovcnt as usize * core::mem::size_of::<IoVec32>())?;
     let mut iov_user_copied: Vec<IoVec32> = vec![];
@@ -312,6 +320,34 @@ fn do_fcntl64(fd: u32, cmd: u32, arg: usize) -> KResult<usize> {
     FileArray::get_current().fcntl(fd, cmd, arg)
 }
 
+/// One poll entry as read from and written back to user memory by `do_poll`.
+// NOTE(review): presumably mirrors the user-space `pollfd` layout, where `fd`
+// is a signed int; confirm whether negative descriptors need handling.
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+struct UserPollFd {
+    /// File descriptor to poll.
+    fd: u32,
+    /// Requested events, passed to `PollEvent::from_bits_retain`.
+    events: u16,
+    /// Events reported back by `do_poll`.
+    revents: u16,
+}
+
+/// Handler of the `poll` syscall.
+///
+/// Only two cases are supported: `nfds == 0`, which returns `Ok(0)`, and a
+/// single descriptor with an infinite timeout, which polls the file, writes
+/// the resulting events back to user memory and returns `Ok(1)`.
+///
+/// # Errors
+///
+/// * `EINVAL` for more than one descriptor or a finite timeout; neither is
+///   implemented yet.
+/// * `EBADF` if the descriptor is not open.
+/// * Any error from accessing the user pointer or from `file.poll`.
+fn do_poll(fds: *mut UserPollFd, nfds: u32, timeout: u32) -> KResult<u32> {
+    match nfds {
+        0 => Ok(0),
+        // Both values come straight from user space, so an unsupported request
+        // must fail the call rather than panic the kernel.
+        2.. => Err(bindings::EINVAL),
+        1 => {
+            // `timeout` is the user's signed value truncated to `u32`, so
+            // `u32::MAX` stands for -1 (wait forever), the only supported mode.
+            if timeout != u32::MAX {
+                return Err(bindings::EINVAL);
+            }
+            let fds = UserPointerMut::new(fds)?;
+            let mut fd = fds.read()?;
+
+            let file = Thread::current().files.get(fd.fd).ok_or(EBADF)?;
+            fd.revents = file.poll(PollEvent::from_bits_retain(fd.events))?.bits();
+
+            fds.write(fd)?;
+            Ok(1)
+        }
+    }
+}
+
 define_syscall32!(sys_read, do_read, fd: u32, buffer: *mut u8, bufsize: usize);
 define_syscall32!(sys_write, do_write, fd: u32, buffer: *const u8, count: usize);
 define_syscall32!(sys_open, do_open, path: *const u8, flags: u32, mode: u32);
@@ -329,35 +365,37 @@ define_syscall32!(sys_symlink, do_symlink, target: *const u8, linkpath: *const u
 define_syscall32!(sys_readlink, do_readlink, pathname: *const u8, buffer: *mut u8, bufsize: usize);
 define_syscall32!(sys_llseek, do_llseek, fd: u32, offset_high: u32, offset_low: u32, result: *mut u64, whence: u32);
 define_syscall32!(sys_mknod, do_mknod, pathname: *const u8, mode: u32, dev: u32);
-define_syscall32!(sys_readv, do_readv, fd: u32, iov_user: *const u8, iovcnt: u32);
+define_syscall32!(sys_readv, do_readv, fd: u32, iov_user: *const IoVec32, iovcnt: u32);
 define_syscall32!(sys_writev, do_writev, fd: u32, iov_user: *const u8, iovcnt: u32);
 define_syscall32!(sys_access, do_access, pathname: *const u8, mode: u32);
 define_syscall32!(sys_sendfile64, do_sendfile64, out_fd: u32, in_fd: u32, offset: *mut u8, count: usize);
 define_syscall32!(sys_ioctl, do_ioctl, fd: u32, request: usize, arg3: usize);
 define_syscall32!(sys_fcntl64, do_fcntl64, fd: u32, cmd: u32, arg: usize);
-
-pub(super) unsafe fn register() {
-    register_syscall_handler(0x03, sys_read, b"read\0".as_ptr() as *const _);
-    register_syscall_handler(0x04, sys_write, b"write\0".as_ptr() as *const _);
-    register_syscall_handler(0x05, sys_open, b"open\0".as_ptr() as *const _);
-    register_syscall_handler(0x06, sys_close, b"close\0".as_ptr() as *const _);
-    register_syscall_handler(0x0a, sys_unlink, b"unlink\0".as_ptr() as *const _);
-    register_syscall_handler(0x0e, sys_mknod, b"mknod\0".as_ptr() as *const _);
-    register_syscall_handler(0x21, sys_access, b"access\0".as_ptr() as *const _);
-    register_syscall_handler(0x27, sys_mkdir, b"mkdir\0".as_ptr() as *const _);
-    register_syscall_handler(0x29, sys_dup, b"dup\0".as_ptr() as *const _);
-    register_syscall_handler(0x2a, sys_pipe, b"pipe\0".as_ptr() as *const _);
-    register_syscall_handler(0x36, sys_ioctl, b"ioctl\0".as_ptr() as *const _);
-    register_syscall_handler(0x3f, sys_dup2, b"dup2\0".as_ptr() as *const _);
-    register_syscall_handler(0x53, sys_symlink, b"symlink\0".as_ptr() as *const _);
-    register_syscall_handler(0x55, sys_readlink, b"readlink\0".as_ptr() as *const _);
-    register_syscall_handler(0x5c, sys_truncate, b"truncate\0".as_ptr() as *const _);
-    register_syscall_handler(0x8c, sys_llseek, b"llseek\0".as_ptr() as *const _);
-    register_syscall_handler(0x8d, sys_getdents, b"getdents\0".as_ptr() as *const _);
-    register_syscall_handler(0x91, sys_readv, b"readv\0".as_ptr() as *const _);
-    register_syscall_handler(0x92, sys_writev, b"writev\0".as_ptr() as *const _);
-    register_syscall_handler(0xdc, sys_getdents64, b"getdents64\0".as_ptr() as *const _);
-    register_syscall_handler(0xdd, sys_fcntl64, b"fcntl64\0".as_ptr() as *const _);
-    register_syscall_handler(0xef, sys_sendfile64, b"sendfile64\0".as_ptr() as *const _);
-    register_syscall_handler(0x17f, sys_statx, b"statx\0".as_ptr() as *const _);
+define_syscall32!(sys_poll, do_poll, fds: *mut UserPollFd, nfds: u32, timeout: u32);
+
+/// Registers the file-related syscalls defined in this module under their
+/// syscall numbers.
+pub(super) fn register() {
+    register_syscall!(0x03, read);
+    register_syscall!(0x04, write);
+    register_syscall!(0x05, open);
+    register_syscall!(0x06, close);
+    register_syscall!(0x0a, unlink);
+    register_syscall!(0x0e, mknod);
+    register_syscall!(0x21, access);
+    register_syscall!(0x27, mkdir);
+    register_syscall!(0x29, dup);
+    register_syscall!(0x2a, pipe);
+    register_syscall!(0x36, ioctl);
+    register_syscall!(0x3f, dup2);
+    register_syscall!(0x53, symlink);
+    register_syscall!(0x55, readlink);
+    register_syscall!(0x5c, truncate);
+    register_syscall!(0x8c, llseek);
+    register_syscall!(0x8d, getdents);
+    register_syscall!(0x91, readv);
+    register_syscall!(0x92, writev);
+    register_syscall!(0xa8, poll);
+    register_syscall!(0xdc, getdents64);
+    register_syscall!(0xdd, fcntl64);
+    register_syscall!(0xef, sendfile64);
+    register_syscall!(0x17f, statx);
 }

+ 0 - 130
src/kernel/syscall/fileops.cc

@@ -1,130 +0,0 @@
-#include <errno.h>
-#include <poll.h>
-#include <sys/mman.h>
-#include <unistd.h>
-
-#include <types/path.hpp>
-
-#include <kernel/log.hpp>
-#include <kernel/mem/vm_area.hpp>
-#include <kernel/process.hpp>
-#include <kernel/syscall.hpp>
-#include <kernel/vfs.hpp>
-
-#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
-
-static inline void not_implemented(const char* pos, int line) {
-    kmsgf(
-        "[kernel] the function at %s:%d is not implemented, killing the "
-        "pid%d...",
-        pos, line, current_process->pid);
-    current_thread->send_signal(SIGSYS);
-}
-
-uintptr_t kernel::syscall::do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags, int fd,
-                                         off_t pgoffset) {
-    if (addr & 0xfff)
-        return -EINVAL;
-    if (len == 0)
-        return -EINVAL;
-
-    len = (len + 0xfff) & ~0xfff;
-
-    // TODO: shared mappings
-    if (flags & MAP_SHARED)
-        return -ENOMEM;
-
-    if (flags & MAP_ANONYMOUS) {
-        if (fd != -1)
-            return -EINVAL;
-        if (pgoffset != 0)
-            return -EINVAL;
-
-        // TODO: shared mappings
-        if (!(flags & MAP_PRIVATE))
-            return -EINVAL;
-
-        auto& mms = current_process->mms;
-
-        // do unmapping, equal to munmap, MAP_FIXED set
-        if (prot == PROT_NONE) {
-            if (int ret = mms.unmap(addr, len, true); ret != 0)
-                return ret;
-        } else {
-            // TODO: add NULL check in mm_list
-            if (!addr || !mms.is_avail(addr, len)) {
-                if (flags & MAP_FIXED)
-                    return -ENOMEM;
-                addr = mms.find_avail(addr, len);
-            }
-
-            // TODO: check current cs
-            if (addr + len > 0x100000000ULL)
-                return -ENOMEM;
-
-            mem::mm_list::map_args args{};
-            args.vaddr = addr;
-            args.length = len;
-            args.flags = mem::MM_ANONYMOUS;
-
-            if (prot & PROT_WRITE)
-                args.flags |= mem::MM_WRITE;
-
-            if (prot & PROT_EXEC)
-                args.flags |= mem::MM_EXECUTE;
-
-            if (int ret = mms.mmap(args); ret != 0)
-                return ret;
-        }
-    }
-
-    return addr;
-}
-
-int kernel::syscall::do_munmap(uintptr_t addr, size_t len) {
-    if (addr & 0xfff)
-        return -EINVAL;
-
-    return current_process->mms.unmap(addr, len, true);
-}
-
-int kernel::syscall::do_poll(pollfd __user* fds, nfds_t nfds, int timeout) {
-    if (nfds == 0)
-        return 0;
-
-    if (nfds > 1) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    // TODO: handle timeout
-    // if (timeout != -1) {
-    // }
-    (void)timeout;
-
-    // for now, we will poll from console only
-    int ret = tty::console->poll();
-    if (ret < 0)
-        return ret;
-
-    fds[0].revents = POLLIN;
-    return ret;
-
-    // TODO: check address validity
-    // TODO: poll multiple fds and other type of files
-    // for (nfds_t i = 0; i < nfds; ++i) {
-    //     auto& pfd = fds[i];
-
-    //     auto* file = current_process->files[pfd.fd];
-    //     if (!file || !S_ISCHR(file->mode))
-    //         return -EINVAL;
-
-    //     // poll the fds
-    // }
-    //
-    // return 0;
-}
-
-int kernel::syscall::do_socket(int domain, int type, int protocol) {
-    return -EINVAL;
-}

+ 0 - 50
src/kernel/syscall/infoops.cc

@@ -1,50 +0,0 @@
-#include <bits/alltypes.h>
-#include <time.h>
-
-#include <kernel/hw/timer.hpp>
-#include <kernel/log.hpp>
-#include <kernel/process.hpp>
-#include <kernel/syscall.hpp>
-
-#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
-
-static inline void not_implemented(const char* pos, int line) {
-    kmsgf(
-        "[kernel] the function at %s:%d is not implemented, killing the "
-        "pid%d...",
-        pos, line, current_process->pid);
-    current_thread->send_signal(SIGSYS);
-}
-
-int kernel::syscall::do_clock_gettime(clockid_t clk_id, timespec __user* tp) {
-    if (clk_id != CLOCK_REALTIME && clk_id != CLOCK_MONOTONIC) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    if (!tp)
-        return -EFAULT;
-
-    auto time = hw::timer::current_ticks();
-
-    // TODO: copy_to_user
-    tp->tv_sec = time / 100;
-    tp->tv_nsec = 10000000 * (time % 100);
-
-    return 0;
-}
-
-int kernel::syscall::do_gettimeofday(timeval __user* tv, void __user* tz) {
-    // TODO: return time of the day, not time from this boot
-    if (tz) [[unlikely]]
-        return -EINVAL;
-
-    if (tv) {
-        // TODO: use copy_to_user
-        auto ticks = kernel::hw::timer::current_ticks();
-        tv->tv_sec = ticks / 100;
-        tv->tv_usec = ticks * 10 * 1000;
-    }
-
-    return 0;
-}

+ 112 - 0
src/kernel/syscall/mm.rs

@@ -0,0 +1,112 @@
+use bindings::{EINVAL, ENOMEM};
+
+use crate::{
+    kernel::{
+        constants::{UserMmapFlags, UserMmapProtocol},
+        mem::{Mapping, Permission, VAddr},
+        task::Thread,
+    },
+    prelude::*,
+};
+
+use super::{define_syscall32, register_syscall, MapArgument, MapArgumentImpl};
+
+/// Guard for functionality that is not implemented yet.
+///
+/// Yields `Ok(())` when `condition` holds and `Err(err)` otherwise, so callers
+/// can bail out of unsupported paths with `?`.
+fn check_impl(condition: bool, err: u32) -> KResult<()> {
+    condition.then_some(()).ok_or(err)
+}
+
+/// Handler for `mmap_pgoff`; only private, anonymous mappings are supported.
+///
+/// * `addr` must be page aligned (it is compared against `addr.floor()`).
+/// * `len` must be non-zero and is rounded up to a multiple of 4 KiB.
+/// * `fd` must be `u32::MAX` (i.e. `-1`) and `pgoffset` must be `0`.
+///
+/// Returns the mapped address, or `0` when `prot` is empty (in which case the
+/// range is unmapped instead).
+///
+/// # Errors
+///
+/// * `EINVAL` for a misaligned `addr`, zero `len`, a missing `MAP_PRIVATE`, or a
+///   file-backed request.
+/// * `ENOMEM` when `MAP_ANONYMOUS` is missing or when rounding `len` up to a
+///   page boundary would overflow.
+/// * Whatever `mm_list.unmap` / `mm_list.mmap` report.
+fn do_mmap_pgoff(
+    addr: usize,
+    len: usize,
+    prot: UserMmapProtocol,
+    flags: UserMmapFlags,
+    fd: u32,
+    pgoffset: usize,
+) -> KResult<usize> {
+    let addr = VAddr(addr);
+    if addr.floor() != addr || len == 0 {
+        return Err(EINVAL);
+    }
+
+    // Round up to whole pages. A length so large that the rounding overflows
+    // can never be satisfied, so reject it instead of panicking or wrapping to 0.
+    let len = len.checked_add(0xfff).ok_or(ENOMEM)? & !0xfff;
+    check_impl(flags.contains(UserMmapFlags::MAP_ANONYMOUS), ENOMEM)?;
+    check_impl(flags.contains(UserMmapFlags::MAP_PRIVATE), EINVAL)?;
+    if fd != u32::MAX || pgoffset != 0 {
+        return Err(EINVAL);
+    }
+
+    let mm_list = &Thread::current().process.mm_list;
+
+    // PROT_NONE, we do unmapping.
+    if prot.is_empty() {
+        mm_list.unmap(addr, len)?;
+        return Ok(0);
+    }
+    // Otherwise, do mmapping.
+
+    // TODO!!!: If we are doing mmap's in 32-bit mode, we should check whether
+    //          `addr` is above user reachable memory.
+    mm_list
+        .mmap(
+            addr,
+            len,
+            Mapping::Anonymous,
+            Permission {
+                write: prot.contains(UserMmapProtocol::PROT_WRITE),
+                execute: prot.contains(UserMmapProtocol::PROT_EXEC),
+            },
+            flags.contains(UserMmapFlags::MAP_FIXED),
+        )
+        .map(|addr| addr.0)
+}
+
+/// Handler for `munmap`: unmap `len` bytes (rounded up to whole 4 KiB pages)
+/// starting at the page-aligned address `addr`.
+///
+/// Returns `0` on success.
+///
+/// # Errors
+///
+/// `EINVAL` if `addr` is not page aligned, `len` is zero, or rounding `len` up
+/// to a page boundary would overflow; otherwise whatever `mm_list.unmap` reports.
+fn do_munmap(addr: usize, len: usize) -> KResult<usize> {
+    let addr = VAddr(addr);
+    if addr.floor() != addr || len == 0 {
+        return Err(EINVAL);
+    }
+
+    // Checked rounding: `len + 0xfff` would overflow for lengths near `usize::MAX`.
+    let len = len.checked_add(0xfff).ok_or(EINVAL)? & !0xfff;
+    Thread::current()
+        .process
+        .mm_list
+        .unmap(addr, len)
+        .map(|_| 0)
+}
+
+/// Handler for `brk`: forwards the requested break to `mm_list.set_break` and
+/// returns the address it reports. An `addr` of `0` is passed on as `None`.
+fn do_brk(addr: usize) -> KResult<usize> {
+    let requested = (addr != 0).then_some(VAddr(addr));
+    let new_break = Thread::current().process.mm_list.set_break(requested);
+    Ok(new_break.0)
+}
+
+/// Decode a raw syscall argument into `UserMmapProtocol`.
+impl MapArgument<'_, UserMmapProtocol> for MapArgumentImpl {
+    fn map_arg(value: u64) -> UserMmapProtocol {
+        // Only the low 32 bits are considered; `from_bits_truncate` drops any
+        // bits that do not correspond to a declared flag instead of failing.
+        UserMmapProtocol::from_bits_truncate(value as u32)
+    }
+}
+
+/// Decode a raw syscall argument into `UserMmapFlags`.
+impl MapArgument<'_, UserMmapFlags> for MapArgumentImpl {
+    fn map_arg(value: u64) -> UserMmapFlags {
+        // Only the low 32 bits are considered; `from_bits_truncate` drops any
+        // bits that do not correspond to a declared flag instead of failing.
+        UserMmapFlags::from_bits_truncate(value as u32)
+    }
+}
+
+define_syscall32!(sys_brk, do_brk, addr: usize);
+define_syscall32!(sys_munmap, do_munmap, addr: usize, len: usize);
+define_syscall32!(sys_mmap_pgoff, do_mmap_pgoff,
+    addr: usize, len: usize,
+    prot: UserMmapProtocol,
+    flags: UserMmapFlags,
+    fd: u32, pgoffset: usize);
+
+/// Register the memory-management syscall handlers defined in this module
+/// (`brk`, `munmap`, `mmap_pgoff`) under their syscall numbers.
+pub(super) fn register() {
+    register_syscall!(0x2d, brk);
+    register_syscall!(0x5b, munmap);
+    register_syscall!(0xc0, mmap_pgoff);
+}

+ 15 - 0
src/kernel/syscall/net.rs

@@ -0,0 +1,15 @@
+use bindings::EINVAL;
+
+use crate::prelude::*;
+
+use super::{define_syscall32, register_syscall};
+
+/// Placeholder handler for `socket`: all arguments are ignored and the call
+/// always fails with `EINVAL`.
+fn do_socket(_domain: u32, _socket_type: u32, _protocol: u32) -> KResult<u32> {
+    Err(EINVAL)
+}
+
+define_syscall32!(sys_socket, do_socket, domain: u32, socket_type: u32, protocol: u32);
+
+/// Register the networking syscall handlers defined in this module.
+pub(super) fn register() {
+    register_syscall!(0x167, socket);
+}

+ 0 - 329
src/kernel/syscall/procops.cc

@@ -1,329 +0,0 @@
-#include <string>
-#include <vector>
-
-#include <sys/prctl.h>
-#include <sys/utsname.h>
-#include <sys/wait.h>
-
-#include <types/elf.hpp>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/log.hpp>
-#include <kernel/process.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/syscall.hpp>
-#include <kernel/utsname.hpp>
-#include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-using namespace kernel::syscall;
-
-#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
-
-static inline void not_implemented(const char* pos, int line) {
-    kmsgf(
-        "[kernel] the function at %s:%d is not implemented, killing the "
-        "pid%d...",
-        pos, line, current_process->pid);
-    current_thread->send_signal(SIGSYS);
-}
-
-int kernel::syscall::do_exit(int status) {
-    // TODO: terminating a thread only
-    assert(current_process->thds.size() == 1);
-
-    // terminating a whole process:
-    procs->kill(current_process->pid, (status & 0xff) << 8);
-
-    // switch to new process and continue
-    schedule_noreturn();
-}
-
-int kernel::syscall::do_waitpid(pid_t waitpid, int __user* arg1, int options) {
-    if (waitpid != -1)
-        return -EINVAL;
-
-    auto& cv = current_process->waitlist;
-    async::lock_guard lck(current_process->mtx_waitprocs);
-
-    auto& waitlist = current_process->waitprocs;
-
-    // TODO: check if it is waiting for stopped process
-    if (options & ~(WNOHANG | WUNTRACED)) {
-        NOT_IMPLEMENTED;
-        return -EINVAL;
-    }
-
-    while (waitlist.empty()) {
-        if (current_process->children.empty())
-            return -ECHILD;
-
-        if (options & WNOHANG)
-            return 0;
-
-        bool interrupted = cv.wait(current_process->mtx_waitprocs);
-        if (interrupted)
-            return -EINTR;
-    }
-
-    for (auto iter = waitlist.begin(); iter != waitlist.end(); ++iter) {
-        if (WIFSTOPPED(iter->code) && !(options & WUNTRACED))
-            continue;
-
-        pid_t pid = iter->pid;
-
-        // TODO: copy_to_user
-        *arg1 = iter->code;
-
-        procs->remove(pid);
-        waitlist.erase(iter);
-
-        return pid;
-    }
-
-    // we should never reach here
-    freeze();
-    return -EINVAL;
-}
-
-pid_t kernel::syscall::do_setsid() {
-    if (current_process->pid == current_process->pgid)
-        return -EPERM;
-
-    current_process->sid = current_process->pid;
-    current_process->pgid = current_process->pid;
-
-    // TODO: get tty* from fd or block device id
-    tty::console->set_pgrp(current_process->pid);
-    current_process->control_tty = tty::console;
-
-    return current_process->pid;
-}
-
-pid_t kernel::syscall::do_getsid(pid_t pid) {
-    auto [pproc, found] = procs->try_find(pid);
-    if (!found)
-        return -ESRCH;
-    if (pproc->sid != current_process->sid)
-        return -EPERM;
-
-    return pproc->sid;
-}
-
-int kernel::syscall::do_setpgid(pid_t pid, pid_t pgid) {
-    if (pgid < 0)
-        return -EINVAL;
-
-    if (pid == 0)
-        pid = current_process->pid;
-
-    if (pgid == 0)
-        pgid = pid;
-
-    auto [pproc, found] = procs->try_find(pid);
-    if (!found)
-        return -ESRCH;
-
-    // TODO: check whether pgid and the original
-    //       pgid is in the same session
-
-    pproc->pgid = pgid;
-
-    return 0;
-}
-
-int kernel::syscall::do_set_thread_area(kernel::user::user_desc __user* ptr) {
-    auto ret = current_thread->set_thread_area(ptr);
-    if (ret != 0)
-        return ret;
-
-    current_thread->load_thread_area32();
-    return 0;
-}
-
-pid_t kernel::syscall::do_set_tid_address(int __user* tidptr) {
-    // TODO: copy_from_user
-    current_thread->set_child_tid = tidptr;
-    return current_thread->tid();
-}
-
-int kernel::syscall::do_prctl(int option, uintptr_t arg2) {
-    switch (option) {
-        case PR_SET_NAME: {
-            // TODO: copy_from_user
-            auto* name = (const char __user*)arg2;
-            current_thread->name.assign(name, 15);
-            break;
-        }
-        case PR_GET_NAME: {
-            auto* name = (char __user*)arg2;
-            // TODO: copy_to_user
-            strncpy(name, current_thread->name.c_str(), 16);
-            name[15] = 0;
-            break;
-        }
-        default:
-            return -EINVAL;
-    }
-
-    return 0;
-}
-
-int kernel::syscall::do_arch_prctl(int option, uintptr_t arg2) {
-    switch (option) {
-        case PR_SET_NAME: {
-            // TODO: copy_from_user
-            auto* name = (const char __user*)arg2;
-            current_thread->name.assign(name, 15);
-            break;
-        }
-        case PR_GET_NAME: {
-            auto* name = (char __user*)arg2;
-            // TODO: copy_to_user
-            strncpy(name, current_thread->name.c_str(), 16);
-            name[15] = 0;
-            break;
-        }
-        default:
-            return -EINVAL;
-    }
-
-    return 0;
-}
-
-int kernel::syscall::do_kill(pid_t pid, int sig) {
-    auto [pproc, found] = procs->try_find(pid);
-    if (!found)
-        return -ESRCH;
-
-    if (!kernel::signal_list::check_valid(sig))
-        return -EINVAL;
-
-    if (pproc->is_system())
-        return 0;
-
-    // TODO: check permission
-    procs->send_signal(pid, sig);
-
-    return 0;
-}
-
-int kernel::syscall::do_tkill(pid_t tid, int sig) {
-    NOT_IMPLEMENTED;
-    return -EINVAL;
-
-    auto [pproc, found] = procs->try_find(tid);
-    if (!found)
-        return -ESRCH;
-
-    if (!kernel::signal_list::check_valid(sig))
-        return -EINVAL;
-
-    if (pproc->is_system())
-        return 0;
-
-    // TODO: check permission
-    procs->send_signal(tid, sig);
-
-    return 0;
-}
-
-int kernel::syscall::do_rt_sigprocmask(int how, const sigmask_type __user* set,
-                                       sigmask_type __user* oldset, size_t sigsetsize) {
-    if (sigsetsize != sizeof(sigmask_type))
-        return -EINVAL;
-
-    sigmask_type sigs = current_thread->signals.get_mask();
-
-    // TODO: use copy_to_user
-    if (oldset)
-        memcpy(oldset, &sigs, sizeof(sigmask_type));
-
-    if (!set)
-        return 0;
-
-    // TODO: use copy_from_user
-    switch (how) {
-        case SIG_BLOCK:
-            current_thread->signals.mask(*set);
-            break;
-        case SIG_UNBLOCK:
-            current_thread->signals.unmask(*set);
-            break;
-        case SIG_SETMASK:
-            current_thread->signals.set_mask(*set);
-            break;
-    }
-
-    return 0;
-}
-
-int kernel::syscall::do_rt_sigaction(int signum, const sigaction __user* act,
-                                     sigaction __user* oldact, size_t sigsetsize) {
-    if (sigsetsize != sizeof(sigmask_type))
-        return -EINVAL;
-
-    if (!kernel::signal_list::check_valid(signum) || signum == SIGKILL || signum == SIGSTOP)
-        return -EINVAL;
-
-    // TODO: use copy_to_user
-    if (oldact)
-        current_thread->signals.get_handler(signum, *oldact);
-
-    if (!act)
-        return 0;
-
-    // TODO: use copy_from_user
-    current_thread->signals.set_handler(signum, *act);
-
-    return 0;
-}
-
-int kernel::syscall::do_newuname(new_utsname __user* buf) {
-    if (!buf)
-        return -EFAULT;
-
-    // TODO: use copy_to_user
-    memcpy(buf, sys_utsname, sizeof(new_utsname));
-
-    return 0;
-}
-
-pid_t kernel::syscall::do_getpgid(pid_t pid) {
-    if (pid == 0)
-        return current_process->pgid;
-
-    auto [pproc, found] = procs->try_find(pid);
-    if (!found)
-        return -ESRCH;
-
-    return pproc->pgid;
-}
-
-pid_t kernel::syscall::do_getpid() {
-    return current_process->pid;
-}
-
-pid_t kernel::syscall::do_getppid() {
-    return current_process->ppid;
-}
-
-uid_t kernel::syscall::do_getuid() {
-    return 0; // all users are root for now
-}
-
-uid_t kernel::syscall::do_geteuid() {
-    return 0; // all users are root for now
-}
-
-gid_t kernel::syscall::do_getgid() {
-    return 0; // all users are root for now
-}
-
-pid_t kernel::syscall::do_gettid() {
-    return current_thread->tid();
-}
-
-uintptr_t kernel::syscall::do_brk(uintptr_t addr) {
-    return current_process->mms.set_brk(addr);
-}

+ 358 - 47
src/kernel/syscall/procops.rs

@@ -1,24 +1,24 @@
-use core::ffi::CStr;
-
 use alloc::borrow::ToOwned;
 use alloc::ffi::CString;
 use alloc::sync::Arc;
 use bindings::types::elf::{elf32_load, elf32_load_data, ELF_LOAD_FAIL_NORETURN};
-use bindings::{
-    current_process, current_thread, interrupt_stack, kill_current, mmx_registers, EFAULT, EINVAL,
-    ENOENT, ENOTDIR, SIGSEGV,
-};
+use bindings::{interrupt_stack, mmx_registers, EINVAL, ENOENT, ENOTDIR, ESRCH};
+use bitflags::bitflags;
 
 use crate::io::Buffer;
+use crate::kernel::constants::{PR_GET_NAME, PR_SET_NAME, SIG_BLOCK, SIG_SETMASK, SIG_UNBLOCK};
+use crate::kernel::task::{
+    Process, ProcessList, Scheduler, Signal, SignalAction, Thread, UserDescriptor, WaitObject,
+};
 use crate::kernel::user::dataflow::UserString;
+use crate::kernel::user::{UserPointer, UserPointerMut};
 use crate::kernel::vfs::dentry::Dentry;
-use crate::kernel::vfs::filearray::FileArray;
 use crate::path::Path;
 use crate::{kernel::user::dataflow::UserBuffer, prelude::*};
 
 use crate::kernel::vfs::{self, FsContext};
 
-use super::{define_syscall32, register_syscall_handler};
+use super::{define_syscall32, register_syscall};
 
 fn do_umask(mask: u32) -> KResult<u32> {
     let context = FsContext::get_current();
@@ -98,74 +98,385 @@ fn do_execve(exec: &[u8], argv: &[CString], envp: &[CString]) -> KResult<(usize,
         sp: 0,
     };
 
-    BorrowedArc::<FileArray>::from_raw(
-        unsafe { current_process.as_mut() }.unwrap().files.m_handle as *const _,
-    )
-    .on_exec();
-
+    Thread::current().files.on_exec();
     match unsafe { elf32_load(&mut load_data) } {
+        ELF_LOAD_FAIL_NORETURN => ProcessList::kill_current(Signal::SIGSEGV),
         0 => {
-            unsafe { current_thread.as_mut().unwrap().signals.on_exec() };
+            Thread::current().signal_list.clear_non_ignore();
             Ok((load_data.ip, load_data.sp))
         }
-        n => {
-            if n == ELF_LOAD_FAIL_NORETURN {
-                unsafe { kill_current(SIGSEGV as i32) }
-            }
-            Err(-n as u32)
-        }
+        n => Err(-n as u32),
     }
 }
 
-unsafe extern "C" fn sys_execve(
-    int_stack: *mut interrupt_stack,
-    _mmxregs: *mut mmx_registers,
-) -> u32 {
+fn sys_execve(int_stack: &mut interrupt_stack, _mmxregs: &mut mmx_registers) -> usize {
     match (|| -> KResult<()> {
-        let exec = int_stack.as_mut().unwrap().regs.rbx as *const u8;
+        let exec = int_stack.regs.rbx as *const u8;
         let exec = UserString::new(exec)?;
 
         // TODO!!!!!: copy from user
-        let mut argv = int_stack.as_mut().unwrap().regs.rcx as *const u32;
-        let mut envp = int_stack.as_mut().unwrap().regs.rdx as *const u32;
-
-        if argv.is_null() || envp.is_null() {
-            return Err(EFAULT);
-        }
+        let mut argv: UserPointer<u32> = UserPointer::new_vaddr(int_stack.regs.rcx as _)?;
+        let mut envp: UserPointer<u32> = UserPointer::new_vaddr(int_stack.regs.rdx as _)?;
 
         let mut argv_vec = Vec::new();
         let mut envp_vec = Vec::new();
 
-        while argv.read() != 0 {
-            argv_vec.push(CStr::from_ptr(argv.read() as *const i8).to_owned());
-            argv = argv.add(1);
+        loop {
+            let arg = argv.read()?;
+            if arg == 0 {
+                break;
+            }
+
+            let arg = UserString::new(arg as *const u8)?;
+            argv_vec.push(arg.as_cstr().to_owned());
+            argv = argv.offset(1)?;
         }
 
-        while envp.read() != 0 {
-            envp_vec.push(CStr::from_ptr(envp.read() as *const i8).to_owned());
-            envp = envp.add(1);
+        loop {
+            let arg = envp.read()?;
+            if arg == 0 {
+                break;
+            }
+
+            let arg = UserString::new(arg as *const u8)?;
+            envp_vec.push(arg.as_cstr().to_owned());
+            envp = envp.offset(1)?;
         }
 
         let (ip, sp) = do_execve(exec.as_cstr().to_bytes(), &argv_vec, &envp_vec)?;
 
-        int_stack.as_mut().unwrap().v_rip = ip;
-        int_stack.as_mut().unwrap().rsp = sp;
+        int_stack.v_rip = ip;
+        int_stack.rsp = sp;
         Ok(())
     })() {
         Ok(_) => 0,
-        Err(err) => -(err as i32) as u32,
+        Err(err) => -(err as i32) as _,
+    }
+}
+
+/// Handler for `exit`: kill the current thread's process with the exit code
+/// `(status & 0xff) << 8`, then hand the CPU to the scheduler for good.
+///
+/// The `KResult<()>` return type exists only to fit the syscall definition
+/// macro; the trailing `panic!` fires if `schedule_noreturn` ever returns.
+// TODO: Find a better way.
+#[allow(unreachable_code)]
+fn do_exit(status: u32) -> KResult<()> {
+    ProcessList::get().do_kill_process(&Thread::current().process, (status & 0xff) << 8);
+    Scheduler::schedule_noreturn();
+    panic!("schedule_noreturn returned!");
+}
+
+bitflags! {
+    /// Option bits accepted by `do_waitpid`. Any other bit makes
+    /// `UserWaitOptions::from_bits` return `None`.
+    pub struct UserWaitOptions: u32 {
+        /// Passed as the first flag to `Process::wait`.
+        const WNOHANG = 1;
+        /// Passed as the second flag to `Process::wait`.
+        const WUNTRACED = 2;
+        /// Passed as the third flag to `Process::wait`.
+        const WCONTINUED = 8;
+    }
+}
+
+/// Handler for `waitpid`.
+///
+/// Only `waitpid == u32::MAX` (i.e. `-1`) and the option bits declared in
+/// `UserWaitOptions` are supported; anything else hits `unimplemented!`, which
+/// panics rather than returning an error.
+///
+/// Returns the pid of the reported child, or `0` when `Process::wait` yields
+/// no wait object. The status code is written to `arg1` when it is non-null.
+fn do_waitpid(waitpid: u32, arg1: *mut u32, options: u32) -> KResult<u32> {
+    if waitpid != u32::MAX {
+        unimplemented!("waitpid with pid {waitpid}")
+    }
+    let options = match UserWaitOptions::from_bits(options) {
+        None => unimplemented!("waitpid with options {options}"),
+        Some(options) => options,
+    };
+
+    let wait_object = Thread::current().process.wait(
+        options.contains(UserWaitOptions::WNOHANG),
+        options.contains(UserWaitOptions::WUNTRACED),
+        options.contains(UserWaitOptions::WCONTINUED),
+    )?;
+
+    match wait_object {
+        None => Ok(0),
+        Some(WaitObject { pid, code }) => {
+            // A null status pointer means the caller does not want the code.
+            if !arg1.is_null() {
+                UserPointerMut::new(arg1)?.write(code)?;
+            }
+            Ok(pid)
+        }
     }
 }
 
+/// Handler for `setsid`: delegates to `Process::setsid` on the current process.
+fn do_setsid() -> KResult<u32> {
+    Thread::current().process.setsid()
+}
+
+/// Handler for `setpgid`.
+///
+/// A `pid` of `0` stands for the calling process and a `pgid` of `0` stands for
+/// the (resolved) `pid`. Negative `pgid` values are rejected with `EINVAL`.
+fn do_setpgid(pid: u32, pgid: i32) -> KResult<()> {
+    let target_pid = match pid {
+        0 => Thread::current().process.pid,
+        explicit => explicit,
+    };
+
+    if pgid < 0 {
+        return Err(EINVAL);
+    }
+    let target_pgid = if pgid == 0 { target_pid } else { pgid as u32 };
+
+    Thread::current().process.setpgid(target_pid, target_pgid)
+}
+
+/// Handler for `getsid`: session id of process `pid`, where `0` means the
+/// calling process. Fails with `ESRCH` when no such process is found.
+fn do_getsid(pid: u32) -> KResult<u32> {
+    match pid {
+        0 => Ok(Thread::current().process.sid()),
+        _ => ProcessList::get()
+            .try_find_process(pid)
+            .map(|found| found.sid())
+            .ok_or(ESRCH),
+    }
+}
+
+/// Handler for `getpgid`: process group id of process `pid`, where `0` means
+/// the calling process. Fails with `ESRCH` when no such process is found.
+fn do_getpgid(pid: u32) -> KResult<u32> {
+    match pid {
+        0 => Ok(Thread::current().process.pgid()),
+        _ => ProcessList::get()
+            .try_find_process(pid)
+            .map(|found| found.pgid())
+            .ok_or(ESRCH),
+    }
+}
+
+/// Handler for `getpid`: pid of the current thread's process.
+fn do_getpid() -> KResult<u32> {
+    Ok(Thread::current().process.pid)
+}
+
+/// Handler for `getppid`: pid of the parent process, or `0` when
+/// `Process::parent` returns `None`.
+fn do_getppid() -> KResult<u32> {
+    Ok(Thread::current().process.parent().map_or(0, |x| x.pid))
+}
+
+/// Handler for `getuid`: always reports uid `0`.
+fn do_getuid() -> KResult<u32> {
+    // All users are root for now.
+    Ok(0)
+}
+
+/// Handler for `geteuid`: always reports effective uid `0`.
+fn do_geteuid() -> KResult<u32> {
+    // All users are root for now.
+    Ok(0)
+}
+
+/// Handler for `getgid`: always reports gid `0`.
+fn do_getgid() -> KResult<u32> {
+    // All users are root for now.
+    Ok(0)
+}
+
+/// Handler for `gettid`: tid of the current thread.
+fn do_gettid() -> KResult<u32> {
+    Ok(Thread::current().tid)
+}
+
+/// Handler for `set_thread_area`.
+///
+/// Copies the user descriptor in, lets `Thread::set_thread_area` apply (and
+/// possibly modify) it, copies the result back to user memory, and finally
+/// loads the thread area via `load_thread_area32`.
+fn do_set_thread_area(desc: *mut UserDescriptor) -> KResult<()> {
+    let desc_pointer = UserPointerMut::new(desc)?;
+    let mut desc = desc_pointer.read()?;
+
+    Thread::current().set_thread_area(&mut desc)?;
+    // Write back, since `set_thread_area` received the descriptor mutably.
+    desc_pointer.write(desc)?;
+
+    Thread::current().load_thread_area32();
+    Ok(())
+}
+
+/// Handler for `set_tid_address`: currently only validates `tidptr` through
+/// `UserPointerMut::new` (the pointer is not stored) and returns the current tid.
+fn do_set_tid_address(tidptr: *mut u32) -> KResult<u32> {
+    // TODO!!!: Implement this. We don't use it for now.
+    let _tidptr = UserPointerMut::new(tidptr)?;
+    Ok(Thread::current().tid)
+}
+
+/// Handler for `prctl` (also wired up as the `arch_prctl` handler).
+///
+/// * `PR_SET_NAME`: reads a 16-byte buffer from `arg2` and uses the bytes before
+///   the first NUL (at most 15) as the thread name.
+/// * `PR_GET_NAME`: writes the thread name to `arg2` as a 16-byte buffer,
+///   truncated to 15 bytes and NUL padded.
+/// * Any other `option` fails with `EINVAL`.
+fn do_prctl(option: u32, arg2: usize) -> KResult<()> {
+    match option {
+        PR_SET_NAME => {
+            let name = UserPointer::new(arg2 as *mut [u8; 16])?.read()?;
+            // No terminator within 16 bytes: keep the first 15 bytes.
+            let len = name.iter().position(|&c| c == 0).unwrap_or(15);
+            Thread::current().set_name(name[..len].into());
+            Ok(())
+        }
+        PR_GET_NAME => {
+            let name = Thread::current().get_name();
+            let len = name.len().min(15);
+            // Byte 15 is always 0, so the user buffer ends up NUL terminated.
+            let name: [u8; 16] = core::array::from_fn(|i| if i < len { name[i] } else { 0 });
+            UserPointerMut::new(arg2 as *mut [u8; 16])?.write(name)?;
+            Ok(())
+        }
+        _ => Err(EINVAL),
+    }
+}
+
+/// Handler for `kill`.
+///
+/// The target is selected by `pid`:
+/// * `-1`: not implemented (hits `unimplemented!`, which panics).
+/// * `0`: the process group of the calling process.
+/// * positive: the process with that pid.
+/// * below `-1`: the process group whose pgid is the absolute value of `pid`.
+///
+/// # Errors
+///
+/// `ESRCH` when the target process or process group does not exist, or the
+/// error produced by `Signal::try_from` for an invalid `sig`.
+fn do_kill(pid: i32, sig: u32) -> KResult<()> {
+    match pid {
+        // Send signal to every process for which the calling process has
+        // permission to send signals.
+        -1 => unimplemented!("kill with pid -1"),
+        // Send signal to every process in the process group.
+        0 => Thread::current()
+            .process
+            .pgroup()
+            .raise(Signal::try_from(sig)?),
+        // Send signal to the process with the specified pid.
+        1.. => ProcessList::get()
+            .try_find_process(pid as u32)
+            .ok_or(ESRCH)?
+            .raise(Signal::try_from(sig)?),
+        // Send signal to the process group with the specified pgid equals to `-pid`.
+        // `unsigned_abs` is used because `-pid` overflows for `i32::MIN`.
+        ..-1 => ProcessList::get()
+            .try_find_pgroup(pid.unsigned_abs())
+            .ok_or(ESRCH)?
+            .raise(Signal::try_from(sig)?),
+    }
+
+    Ok(())
+}
+
+/// Handler for `tkill`: raise signal `sig` on the thread with id `tid`.
+///
+/// Fails with `ESRCH` when the thread is not found, or with the error from
+/// `Signal::try_from` when `sig` is not a valid signal number.
+fn do_tkill(tid: u32, sig: u32) -> KResult<()> {
+    ProcessList::get()
+        .try_find_thread(tid)
+        .ok_or(ESRCH)?
+        .raise(Signal::try_from(sig)?);
+    Ok(())
+}
+
+/// Handler for `rt_sigprocmask`.
+///
+/// * `sigsetsize` must equal `size_of::<u64>()`, otherwise `EINVAL`.
+/// * When `oldset` is non-null, the current mask is written to it.
+/// * When `set` is null, nothing else happens. Otherwise the mask read from
+///   `set` is applied according to `how` (`SIG_BLOCK`, `SIG_UNBLOCK` or
+///   `SIG_SETMASK`); any other `how` fails with `EINVAL`.
+fn do_rt_sigprocmask(how: u32, set: *mut u64, oldset: *mut u64, sigsetsize: usize) -> KResult<()> {
+    if sigsetsize != size_of::<u64>() {
+        return Err(EINVAL);
+    }
+
+    let old_mask = Thread::current().signal_list.get_mask();
+    if !oldset.is_null() {
+        UserPointerMut::new(oldset)?.write(old_mask)?;
+    }
+
+    // A null `set` means the caller only wanted to query the old mask.
+    if set.is_null() {
+        return Ok(());
+    }
+    let new_mask = UserPointer::new(set)?.read()?;
+
+    match how {
+        SIG_BLOCK => Thread::current().signal_list.mask(new_mask),
+        SIG_UNBLOCK => Thread::current().signal_list.unmask(new_mask),
+        SIG_SETMASK => Thread::current().signal_list.set_mask(new_mask),
+        _ => return Err(EINVAL),
+    }
+
+    Ok(())
+}
+
+/// Handler for `rt_sigaction`.
+///
+/// Writes the current action for `signum` to `oldact` (when non-null) and then
+/// installs the action read from `act` (when non-null).
+///
+/// Fails with `EINVAL` when `sigsetsize` is not `size_of::<u64>()` or when
+/// `signal.is_now()` is true, and with the error from `Signal::try_from` for
+/// an invalid `signum`.
+fn do_rt_sigaction(
+    signum: u32,
+    act: *const SignalAction,
+    oldact: *mut SignalAction,
+    sigsetsize: usize,
+) -> KResult<()> {
+    let signal = Signal::try_from(signum)?;
+    // NOTE(review): `is_now()` presumably identifies signals whose action cannot
+    // be changed (e.g. SIGKILL/SIGSTOP) — confirm against `Signal`.
+    if sigsetsize != size_of::<u64>() || signal.is_now() {
+        return Err(EINVAL);
+    }
+
+    // The old action is fetched before the new one is installed.
+    let old_action = Thread::current().signal_list.get_handler(signal);
+    if !oldact.is_null() {
+        UserPointerMut::new(oldact)?.write(old_action)?;
+    }
+
+    if !act.is_null() {
+        let new_action = UserPointer::new(act as *mut _)?.read()?;
+        Thread::current()
+            .signal_list
+            .set_handler(signal, new_action)?;
+    }
+
+    Ok(())
+}
+
 define_syscall32!(sys_chdir, do_chdir, path: *const u8);
 define_syscall32!(sys_umask, do_umask, mask: u32);
-define_syscall32!(sys_mount, do_mount, source: *const u8, target: *const u8, fstype: *const u8, flags: usize);
 define_syscall32!(sys_getcwd, do_getcwd, buffer: *mut u8, bufsize: usize);
+define_syscall32!(sys_exit, do_exit, status: u32);
+define_syscall32!(sys_waitpid, do_waitpid, waitpid: u32, arg1: *mut u32, options: u32);
+define_syscall32!(sys_setsid, do_setsid);
+define_syscall32!(sys_setpgid, do_setpgid, pid: u32, pgid: i32);
+define_syscall32!(sys_getsid, do_getsid, pid: u32);
+define_syscall32!(sys_getpgid, do_getpgid, pid: u32);
+define_syscall32!(sys_getpid, do_getpid);
+define_syscall32!(sys_getppid, do_getppid);
+define_syscall32!(sys_getuid, do_getuid);
+define_syscall32!(sys_geteuid, do_geteuid);
+define_syscall32!(sys_getgid, do_getgid);
+define_syscall32!(sys_gettid, do_gettid);
+define_syscall32!(sys_mount, do_mount,
+    source: *const u8, target: *const u8,fstype: *const u8, flags: usize);
+define_syscall32!(sys_set_thread_area, do_set_thread_area, desc: *mut UserDescriptor);
+define_syscall32!(sys_set_tid_address, do_set_tid_address, tidptr: *mut u32);
+define_syscall32!(sys_prctl, do_prctl, option: u32, arg2: usize);
+define_syscall32!(sys_arch_prctl, do_prctl, option: u32, arg2: usize);
+define_syscall32!(sys_kill, do_kill, pid: i32, sig: u32);
+define_syscall32!(sys_tkill, do_tkill, tid: u32, sig: u32);
+define_syscall32!(sys_rt_sigprocmask, do_rt_sigprocmask,
+    how: u32, set: *mut u64, oldset: *mut u64, sigsetsize: usize);
+define_syscall32!(sys_rt_sigaction, do_rt_sigaction,
+    signum: u32, act: *const SignalAction, oldact: *mut SignalAction, sigsetsize: usize);
+
+extern "C" {
+    /// Externally defined routine used by `sys_fork` as the entry point of the
+    /// child's kernel stack, so the child resumes as if returning from an
+    /// interrupt handler.
+    fn ISR_stub_restore();
+}
+
+/// Handler for `fork`.
+///
+/// Clones the current thread, builds the child's kernel stack so that it
+/// resumes in `ISR_stub_restore` with a copy of the parent's interrupt frame
+/// (with `rax` set to `0`) and mmx register context, wakes the child through
+/// the scheduler, and returns the child's pid to the parent.
+fn sys_fork(int_stack: &mut interrupt_stack, mmxregs: &mut mmx_registers) -> usize {
+    let new_thread = Thread::new_cloned(Thread::current());
+
+    // TODO: We should make the preparation of the kernel stack more abstract.
+    //       Currently, we can see that we are directly writing to the kernel stack,
+    //       which is platform dependent.
+    new_thread.prepare_kernel_stack(|kstack| {
+        let mut writer = kstack.get_writer();
+
+        // We make the child process return to `ISR_stub_restore`, pretending that we've
+        // just returned from a interrupt handler.
+        writer.entry = ISR_stub_restore;
+
+        let mut new_int_stack = int_stack.clone();
+
+        // Child's return value: 0
+        new_int_stack.regs.rax = 0;
+
+        writer.write(new_int_stack);
+
+        // In `ISR_stub_restore`, we will restore the mmx register context, followed by
+        // restoring the stack pointer by moving the value in `rbx` to `rsp`, which should
+        // point to the interrupt stack.
+        writer.rbx = writer.get_current_sp();
+
+        // Push the mmx register context to the stack.
+        writer.write(mmxregs.clone());
+
+        // Emit the saved-register frame (entry, flags, callee-saved registers).
+        writer.finish();
+    });
+
+    Scheduler::get().lock_irq().uwake(&new_thread);
+    new_thread.process.pid as usize
+}
 
-pub(super) unsafe fn register() {
-    register_syscall_handler(0x0b, sys_execve, b"execve\0".as_ptr() as *const _);
-    register_syscall_handler(0x0c, sys_chdir, b"chdir\0".as_ptr() as *const _);
-    register_syscall_handler(0x15, sys_mount, b"mount\0".as_ptr() as *const _);
-    register_syscall_handler(0x3c, sys_umask, b"umask\0".as_ptr() as *const _);
-    register_syscall_handler(0xb7, sys_getcwd, b"getcwd\0".as_ptr() as *const _);
+/// Register every process-related syscall handler of this module under its
+/// syscall number.
+///
+/// NOTE(review): `getgid` is registered at both 0x2f and 0xc8, `geteuid` at both
+/// 0xc9 and 0xca, and `exit` at both 0x01 and 0xfc. This is harmless while the
+/// id handlers all return 0, but the extra numbers presumably belong to
+/// distinct syscalls (e.g. `getegid`, `exit_group`) — confirm against the
+/// targeted syscall table.
+pub(super) fn register() {
+    register_syscall!(0x01, exit);
+    register_syscall!(0x02, fork);
+    register_syscall!(0x07, waitpid);
+    register_syscall!(0x0b, execve);
+    register_syscall!(0x0c, chdir);
+    register_syscall!(0x14, getpid);
+    register_syscall!(0x15, mount);
+    register_syscall!(0x25, kill);
+    register_syscall!(0x2f, getgid);
+    register_syscall!(0x39, setpgid);
+    register_syscall!(0x3c, umask);
+    register_syscall!(0x40, getppid);
+    register_syscall!(0x42, setsid);
+    register_syscall!(0x84, getpgid);
+    register_syscall!(0x93, getsid);
+    register_syscall!(0xac, prctl);
+    register_syscall!(0xae, rt_sigaction);
+    register_syscall!(0xaf, rt_sigprocmask);
+    register_syscall!(0xb7, getcwd);
+    register_syscall!(0xc7, getuid);
+    register_syscall!(0xc8, getgid);
+    register_syscall!(0xc9, geteuid);
+    register_syscall!(0xca, geteuid);
+    register_syscall!(0xe0, gettid);
+    register_syscall!(0xee, tkill);
+    register_syscall!(0xf3, set_thread_area);
+    register_syscall!(0xfc, exit);
+    register_syscall!(0x102, set_tid_address);
+    register_syscall!(0x180, arch_prctl);
+}

+ 102 - 0
src/kernel/syscall/sysinfo.rs

@@ -0,0 +1,102 @@
+use bindings::EINVAL;
+
+use crate::{
+    kernel::{
+        constants::{CLOCK_MONOTONIC, CLOCK_REALTIME},
+        timer::ticks,
+        user::UserPointerMut,
+    },
+    prelude::*,
+};
+
+use super::{define_syscall32, register_syscall};
+
+/// System identification record copied to user space by `do_newuname`.
+///
+/// Every field is a NUL-terminated byte string in a fixed 65-byte buffer.
+/// `#[repr(C)]` pins the field order, since the struct is written to user
+/// memory as-is and Rust's default layout is unspecified.
+#[repr(C)]
+#[derive(Clone, Copy)]
+struct NewUTSName {
+    sysname: [u8; 65],
+    nodename: [u8; 65],
+    release: [u8; 65],
+    version: [u8; 65],
+    machine: [u8; 65],
+    domainname: [u8; 65],
+}
+
+/// Copy `cstr` into `array` as a NUL-terminated string.
+///
+/// At most `array.len() - 1` bytes are copied so that the terminator always
+/// fits; longer input is truncated. An empty `array` is left untouched
+/// (previously this underflowed and panicked).
+fn copy_cstr_to_array(cstr: &[u8], array: &mut [u8]) {
+    // Reserve one byte for the terminator; no room at all means nothing to do.
+    let Some(capacity) = array.len().checked_sub(1) else {
+        return;
+    };
+
+    let len = cstr.len().min(capacity);
+    array[..len].copy_from_slice(&cstr[..len]);
+    array[len] = 0;
+}
+
+/// Handler for `newuname`: fill a `NewUTSName` with fixed identification
+/// strings and copy it to the user buffer.
+fn do_newuname(buffer: *mut NewUTSName) -> KResult<()> {
+    let buffer = UserPointerMut::new(buffer)?;
+
+    // Builds one zero-initialised, NUL-terminated 65-byte field.
+    let field = |text: &[u8]| {
+        let mut bytes = [0u8; 65];
+        copy_cstr_to_array(text, &mut bytes);
+        bytes
+    };
+
+    // Linux compatible
+    buffer.write(NewUTSName {
+        sysname: field(b"Linux"),
+        nodename: field(b"(none)"),
+        release: field(b"1.0.0"),
+        version: field(b"1.0.0"),
+        machine: field(b"x86"),
+        domainname: field(b"(none)"),
+    })
+}
+
+/// Seconds/microseconds pair written to user space by `do_gettimeofday`.
+///
+/// `#[repr(C)]` pins the field order, since the struct is copied to user
+/// memory as-is and Rust's default layout is unspecified.
+///
+/// NOTE(review): both fields are 64-bit; confirm that the 32-bit userspace
+/// calling `gettimeofday` expects this layout rather than 32-bit fields.
+#[repr(C)]
+#[derive(Clone, Copy)]
+struct TimeVal {
+    sec: u64,
+    usec: u64,
+}
+
+/// Seconds/nanoseconds pair written to user space by `do_clock_gettime64`.
+///
+/// `#[repr(C)]` pins the field order, since the struct is copied to user
+/// memory as-is and Rust's default layout is unspecified.
+#[repr(C)]
+#[derive(Clone, Copy)]
+struct TimeSpec {
+    sec: u64,
+    nsec: u64,
+}
+
+/// Handler for `gettimeofday`.
+///
+/// A non-null `timezone` is rejected with `EINVAL`. When `timeval` is non-null
+/// it receives the current tick count split into whole seconds and the
+/// remaining microseconds; a null `timeval` is a successful no-op.
+fn do_gettimeofday(timeval: *mut TimeVal, timezone: *mut ()) -> KResult<()> {
+    if !timezone.is_null() {
+        return Err(EINVAL);
+    }
+    if timeval.is_null() {
+        return Ok(());
+    }
+
+    let target = UserPointerMut::new(timeval)?;
+    let now = ticks();
+    let sec = now.in_secs() as u64;
+    let usec = now.in_usecs() as u64 % 1_000_000;
+
+    target.write(TimeVal { sec, usec })?;
+    Ok(())
+}
+
+/// Handler for `clock_gettime64`.
+///
+/// `CLOCK_REALTIME` and `CLOCK_MONOTONIC` are both answered from the same tick
+/// counter: whole seconds plus the remaining nanoseconds. Any other `clock_id`
+/// hits `unimplemented!`, which panics instead of returning an error.
+fn do_clock_gettime64(clock_id: u32, timespec: *mut TimeSpec) -> KResult<()> {
+    if clock_id != CLOCK_REALTIME && clock_id != CLOCK_MONOTONIC {
+        unimplemented!("Unsupported clock_id: {}", clock_id);
+    }
+
+    let timespec = UserPointerMut::new(timespec)?;
+    let ticks = ticks();
+    timespec.write(TimeSpec {
+        sec: ticks.in_secs() as u64,
+        nsec: ticks.in_nsecs() as u64 % 1_000_000_000,
+    })
+}
+
+define_syscall32!(sys_newuname, do_newuname, buffer: *mut NewUTSName);
+define_syscall32!(sys_gettimeofday, do_gettimeofday, timeval: *mut TimeVal, timezone: *mut ());
+define_syscall32!(sys_clock_gettime64, do_clock_gettime64, clock_id: u32, timespec: *mut TimeSpec);
+
+/// Register the system-information syscall handlers defined in this module
+/// (`gettimeofday`, `newuname`, `clock_gettime64`).
+pub(super) fn register() {
+    register_syscall!(0x4e, gettimeofday);
+    register_syscall!(0x7a, newuname);
+    register_syscall!(0x193, clock_gettime64);
+}

+ 13 - 0
src/kernel/task.rs

@@ -0,0 +1,13 @@
+mod kstack;
+mod scheduler;
+mod signal;
+mod thread;
+
+pub(self) use kstack::KernelStack;
+
+pub use scheduler::Scheduler;
+pub use signal::{Signal, SignalAction};
+pub use thread::{
+    Process, ProcessGroup, ProcessList, Session, Thread, ThreadState, UserDescriptor,
+    UserDescriptorFlags, WaitObject,
+};

+ 128 - 0
src/kernel/task/kstack.rs

@@ -0,0 +1,128 @@
+use crate::kernel::mem::{
+    paging::Page,
+    phys::{CachedPP, PhysPtr},
+};
+
+use core::cell::UnsafeCell;
+
+/// A kernel stack backed by a block of pages, together with its saved stack pointer.
+pub struct KernelStack {
+    /// Pages backing the stack.
+    pages: Page,
+    /// Address just past the end of `pages`; `sp` starts out here.
+    bottom: usize,
+    /// Saved stack pointer. Kept in an `UnsafeCell` so that `get_sp_ptr` can hand
+    /// out a `*mut usize` from a shared reference.
+    sp: UnsafeCell<usize>,
+}
+
+/// Helper for preparing the contents of a `KernelStack`.
+///
+/// Data can be placed on the stack with `write`. The public fields hold the
+/// values that `finish` pushes at the end.
+pub struct KernelStackWriter<'lt> {
+    /// The stack pointer being moved as data is written.
+    sp: &'lt mut usize,
+    /// Value of the stack pointer when the writer was created.
+    prev_sp: usize,
+
+    /// Entry function address; pushed first by `finish`.
+    pub entry: unsafe extern "C" fn(),
+    /// Value pushed in the rflags slot.
+    pub flags: usize,
+    pub r15: usize,
+    pub r14: usize,
+    pub r13: usize,
+    pub r12: usize,
+    pub rbp: usize,
+    pub rbx: usize,
+}
+
+/// Default `entry` of a `KernelStackWriter`; panics if it is ever called, which
+/// means the caller never assigned a real entry point.
+unsafe extern "C" fn __not_assigned_entry() {
+    panic!("__not_assigned_entry called");
+}
+
+impl<'lt> KernelStackWriter<'lt> {
+    /// Creates a writer that operates on `sp`, remembering its initial value.
+    ///
+    /// All saved registers start at zero and `entry` starts as
+    /// `__not_assigned_entry`, which panics when called.
+    fn new(sp: &'lt mut usize) -> Self {
+        let prev_sp = *sp;
+
+        Self {
+            sp,
+            entry: __not_assigned_entry,
+            flags: 0,
+            r15: 0,
+            r14: 0,
+            r13: 0,
+            r12: 0,
+            rbp: 0,
+            rbx: 0,
+            prev_sp,
+        }
+    }
+
+    /// Reserves room for `data` on the stack and copies it there.
+    ///
+    /// `data` and current sp should have an alignment of 16 bytes.
+    /// Otherwise, extra padding is added.
+    pub fn write<T: Copy>(&mut self, data: T) {
+        *self.sp -= core::mem::size_of::<T>();
+        *self.sp &= !0xf; // Align to 16 bytes
+
+        // SAFETY: `sp` is always valid.
+        unsafe {
+            (*self.sp as *mut T).write(data);
+        }
+    }
+
+    /// Returns the current value of the stack pointer.
+    pub fn get_current_sp(&self) -> usize {
+        *self.sp
+    }
+
+    /// Pushes one machine word onto the stack.
+    fn push(&mut self, val: usize) {
+        *self.sp -= core::mem::size_of::<usize>();
+
+        // The destination is the address stored in `sp`, hence the dereference.
+        // Casting the `&mut usize` itself to a pointer would overwrite the
+        // saved stack pointer instead of writing to the stack.
+        //
+        // SAFETY: `sp` is always valid.
+        unsafe {
+            (*self.sp as *mut usize).write(val);
+        }
+    }
+
+    /// Pushes the initial frame and consumes the writer.
+    ///
+    /// The push order is: entry address, rflags, r15, r14, r13, r12, rbp, rbx,
+    /// one zero word for alignment, and finally the stack pointer the writer
+    /// started with.
+    ///
+    /// NOTE(review): this order presumably has to match what the
+    /// context-switch routine pops; confirm against `arch::task`.
+    pub fn finish(mut self) {
+        self.push(self.entry as usize);
+        self.push(self.flags); // rflags
+        self.push(self.r15); // r15
+        self.push(self.r14); // r14
+        self.push(self.r13); // r13
+        self.push(self.r12); // r12
+        self.push(self.rbp); // rbp
+        self.push(self.rbx); // rbx
+        self.push(0); // 0 for alignment
+        self.push(self.prev_sp) // previous sp
+    }
+}
+
+impl KernelStack {
+    /// Kernel stack page order
+    /// 7 for `2^7 = 128 pages = 512 KiB`
+    const KERNEL_STACK_ORDER: u32 = 7;
+
+    /// Allocates the backing pages and points `sp` at the address just past
+    /// their end.
+    pub fn new() -> Self {
+        let pages = Page::alloc_many(Self::KERNEL_STACK_ORDER);
+        let bottom = pages.as_cached().offset(pages.len()).as_ptr::<u8>() as usize;
+
+        Self {
+            pages,
+            bottom,
+            sp: UnsafeCell::new(bottom),
+        }
+    }
+
+    /// Writes the current saved stack pointer into the TSS `RSP0` slot.
+    pub fn load_interrupt_stack(&self) {
+        // Physical address of the `RSP0` field.
+        // NOTE(review): presumably the TSS lives at 0x70 and `RSP0` sits 4 bytes
+        // in, which is also why the write below is unaligned; confirm.
+        const TSS_RSP0: CachedPP = CachedPP::new(0x00000074);
+
+        // TODO!!!: Make `TSS` a per cpu struct.
+        // SAFETY: `TSS_RSP0` is always valid.
+        unsafe {
+            TSS_RSP0
+                .as_ptr::<u64>()
+                .write_unaligned(*self.sp.get() as u64);
+        }
+    }
+
+    /// Returns a writer that operates on this stack's saved stack pointer.
+    pub fn get_writer(&mut self) -> KernelStackWriter {
+        KernelStackWriter::new(self.sp.get_mut())
+    }
+
+    /// Get a pointer to `self.sp` so we can use it in `context_switch()`.
+    ///
+    /// # Safety
+    /// Saving the pointer somewhere, or passing it to a function that keeps
+    /// using it later, is UB.
+    pub unsafe fn get_sp_ptr(&self) -> *mut usize {
+        self.sp.get()
+    }
+}

+ 0 - 47
src/kernel/task/readyqueue.cc

@@ -1,47 +0,0 @@
-#include <list>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/task/readyqueue.hpp>
-#include <kernel/task/thread.hpp>
-
-using namespace kernel::task;
-using kernel::async::mutex, kernel::async::lock_guard_irq;
-
-static mutex dispatcher_mtx;
-static std::list<thread*> dispatcher_thds;
-static thread* idle_task;
-
-void dispatcher::setup_idle(thread* _idle) {
-    idle_task = _idle;
-}
-
-void dispatcher::enqueue(thread* thd) {
-    lock_guard_irq lck(dispatcher_mtx);
-
-    dispatcher_thds.push_back(thd);
-}
-
-void dispatcher::dequeue(thread* thd) {
-    lock_guard_irq lck(dispatcher_mtx);
-
-    dispatcher_thds.remove(thd);
-}
-
-thread* dispatcher::next() {
-    lock_guard_irq lck(dispatcher_mtx);
-
-    if (dispatcher_thds.empty()) {
-        idle_task->elected_times++;
-        return idle_task;
-    }
-
-    auto* front = dispatcher_thds.front();
-
-    if (dispatcher_thds.size() != 1) {
-        dispatcher_thds.pop_front();
-        dispatcher_thds.push_back(front);
-    }
-
-    front->elected_times++;
-    return front;
-}

+ 211 - 0
src/kernel/task/scheduler.rs

@@ -0,0 +1,211 @@
+use core::sync::atomic::{compiler_fence, Ordering};
+
+use crate::prelude::*;
+
+use alloc::{
+    collections::vec_deque::VecDeque,
+    sync::{Arc, Weak},
+};
+
+use super::{Thread, ThreadState};
+
+/// The scheduler: owns the queue of threads that are ready to run.
+pub struct Scheduler {
+    /// Threads in the `Ready` state, in the order they will be picked.
+    /// Held as weak references, so the queue does not keep threads alive.
+    ready: VecDeque<Weak<Thread>>,
+}
+
+/// Idle task thread
+///
+/// # Safety
+/// This variable is treated as per cpu, so accesses to it are not synchronized.
+/// For now it is a single global.
+///
+/// TODO!!!: This should be per cpu in smp environment.
+static mut IDLE_TASK: Option<Arc<Thread>> = None;
+
+/// Current thread
+///
+/// # Safety
+/// This variable is treated as per cpu, so accesses to it are not synchronized.
+/// For now it is a single global.
+///
+/// TODO!!!: This should be per cpu in smp environment.
+static mut CURRENT: Option<Arc<Thread>> = None;
+
+impl Scheduler {
+    /// Returns the global scheduler. It is used in many places, so do not hold
+    /// the lock for long.
+    ///
+    /// # Safety
+    /// The lock returned by this function should be taken with `lock_irq` so that
+    /// no rescheduling happens while the scheduler is being accessed. Disabling
+    /// preemption achieves the same.
+    ///
+    /// Drop the lock before calling `schedule`.
+    pub fn get() -> &'static Spin<Self> {
+        todo!()
+    }
+
+    /// Returns the thread that is currently running. Panics if none has been set.
+    pub fn current<'lt>() -> &'lt Arc<Thread> {
+        // SAFETY: `CURRENT` is per cpu.
+        unsafe { CURRENT.as_ref().unwrap() }
+    }
+
+    /// Returns the idle task. Panics if none has been set.
+    pub fn idle_task() -> &'static Arc<Thread> {
+        // SAFETY: `IDLE_TASK` is per cpu.
+        unsafe { IDLE_TASK.as_ref().unwrap() }
+    }
+
+    /// Installs `thread` as the idle task.
+    pub(super) fn set_idle(thread: Arc<Thread>) {
+        // TODO!!!: Set per cpu variable.
+        unsafe { IDLE_TASK = Some(thread) };
+    }
+
+    /// Installs `thread` as the current thread.
+    pub(super) fn set_current(thread: Arc<Thread>) {
+        // TODO!!!: Set per cpu variable.
+        unsafe { CURRENT = Some(thread) };
+    }
+
+    /// Appends a weak reference to `thread` at the back of the ready queue.
+    fn enqueue(&mut self, thread: &Arc<Thread>) {
+        let weak = Arc::downgrade(thread);
+        self.ready.push_back(weak);
+    }
+
+    /// `Running` -> `USleep`. Panics if the thread is not `Running`.
+    pub fn usleep(&mut self, thread: &Arc<Thread>) {
+        let mut state = thread.state.lock();
+        assert!(matches!(*state, ThreadState::Running));
+
+        // A running thread is never in the ready queue, so there is nothing to dequeue.
+        *state = ThreadState::USleep;
+    }
+
+    /// `USleep` -> `Ready`, and queue the thread. Panics if the thread is not `USleep`.
+    pub fn uwake(&mut self, thread: &Arc<Thread>) {
+        let mut state = thread.state.lock();
+        assert!(matches!(*state, ThreadState::USleep));
+
+        *state = ThreadState::Ready;
+        self.enqueue(thread);
+    }
+
+    /// `Running` -> `ISleep`. Panics if the thread is not `Running`.
+    pub fn isleep(&mut self, thread: &Arc<Thread>) {
+        let mut state = thread.state.lock();
+        assert!(matches!(*state, ThreadState::Running));
+
+        // A running thread is never in the ready queue, so there is nothing to dequeue.
+        *state = ThreadState::ISleep;
+    }
+
+    /// `ISleep` -> `Ready`, and queue the thread.
+    ///
+    /// A thread in `USleep` is left untouched. Any other state panics.
+    pub fn iwake(&mut self, thread: &Arc<Thread>) {
+        let mut state = thread.state.lock();
+
+        match *state {
+            ThreadState::ISleep => {
+                *state = ThreadState::Ready;
+                self.enqueue(thread);
+            }
+            // Not woken by this kind of wake-up.
+            ThreadState::USleep => {}
+            _ => panic!(),
+        }
+    }
+
+    /// `Running` -> `Ready`, and queue the thread. Panics if the thread is not `Running`.
+    pub fn put_ready(&mut self, thread: &Arc<Thread>) {
+        let mut state = thread.state.lock();
+        assert!(matches!(*state, ThreadState::Running));
+
+        *state = ThreadState::Ready;
+        self.enqueue(thread);
+    }
+
+    /// `Ready` -> `Running`. Panics if the thread is not `Ready`.
+    ///
+    /// The caller has already taken the thread out of the queue, so nothing is
+    /// dequeued here.
+    pub fn set_running(&mut self, thread: &Arc<Thread>) {
+        let mut state = thread.state.lock();
+        assert!(matches!(*state, ThreadState::Ready));
+
+        *state = ThreadState::Running;
+    }
+
+    /// `Running` -> `Zombie`. Panics if the thread is not `Running`.
+    pub fn set_zombie(&mut self, thread: &Arc<Thread>) {
+        let mut state = thread.state.lock();
+        assert!(matches!(*state, ThreadState::Running));
+
+        *state = ThreadState::Zombie;
+    }
+}
+
+impl Scheduler {
+    /// Go to idle task. Call this with `preempt_count == 1`
+    ///
+    /// The idle task then decides which thread runs next.
+    ///
+    /// # Safety
+    /// We might never return from here.
+    /// Drop all variables that take ownership of some resource before calling this function.
+    pub fn schedule() {
+        // NOTE(review): presumably checks that `preempt_count` is 1; confirm
+        // against the macro definition.
+        might_sleep!(1);
+
+        // Make sure all works are done before scheduling.
+        compiler_fence(Ordering::SeqCst);
+
+        // TODO!!!!!: Use of reference here needs further consideration.
+        //
+        // Since we might never return to here, we can't take ownership of `current()`.
+        // Is it safe to believe that `current()` will never change across calls?
+        context_switch_light(Thread::current(), Scheduler::idle_task());
+    }
+
+    /// Same as `schedule`, for callers that must never be resumed.
+    /// Panics if control ever comes back.
+    pub fn schedule_noreturn() -> ! {
+        Self::schedule();
+        panic!("Scheduler::schedule_noreturn(): Should never return")
+    }
+}
+
+/// Switches from `from` to `to` by handing both threads' saved stack pointer
+/// slots to the arch-level context switch routine.
+fn context_switch_light(from: &Arc<Thread>, to: &Arc<Thread>) {
+    // NOTE(review): no safety argument is given here; presumably both pointers
+    // stay valid because the caller keeps both threads alive. Confirm.
+    unsafe {
+        arch::task::context_switch_light(from.get_sp_ptr(), to.get_sp_ptr());
+    }
+}
+
+/// Body of the idle task: picks the next thread to run and switches to it.
+///
+/// In this function, we should see `preempt_count == 1`.
+///
+/// Each loop iteration handles one switch into the idle task:
+/// 1. If the previous thread is still `Running` and nothing else is ready,
+///    switch straight back to it. Otherwise put it back into the ready queue.
+/// 2. If the ready queue is empty, halt the cpu and try again.
+/// 3. Otherwise pop the next thread, make it current, and switch to it.
+extern "C" fn idle_task() {
+    loop {
+        // SAFETY: No need to call `lock_irq`, preempt is already disabled.
+        let mut scheduler = Scheduler::get().lock();
+        // Copy the state out so the state lock is released right away.
+        let state = *Thread::current().state.lock();
+
+        // Previous thread is `Running`
+        if let ThreadState::Running = state {
+            // No other thread to run, return to current running thread without changing its state.
+            if scheduler.ready.is_empty() {
+                drop(scheduler);
+                context_switch_light(Scheduler::idle_task(), Thread::current());
+                continue;
+            } else {
+                // Put it into `Ready` state
+                scheduler.put_ready(&Thread::current());
+            }
+        }
+
+        // No thread to run, halt the cpu and rerun the loop.
+        if scheduler.ready.is_empty() {
+            drop(scheduler);
+            arch::task::halt();
+            continue;
+        }
+
+        // The queue holds weak references; a queued thread that has already been
+        // dropped makes the `unwrap` below panic.
+        let next_thread = scheduler
+            .ready
+            .pop_front()
+            .as_ref()
+            .map(|weak| weak.upgrade().unwrap())
+            .expect("We should have a thread to run");
+        scheduler.set_running(&next_thread);
+        drop(scheduler);
+
+        // Switch to the next thread's address space, then make it current.
+        next_thread.process.mm_list.switch_page_table();
+        unsafe { CURRENT = Some(next_thread) };
+
+        Thread::current().load_interrupt_stack();
+        Thread::current().load_thread_area32();
+
+        // TODO!!!: If the task comes from another cpu, we need to sync.
+        //
+        // The other cpu should see the changes of kernel stack of the target thread
+        // made in this cpu.
+        context_switch_light(Scheduler::idle_task(), Thread::current());
+    }
+}

+ 395 - 0
src/kernel/task/signal.rs

@@ -0,0 +1,395 @@
+use core::cmp::Reverse;
+
+use crate::{io::BufferFill, kernel::user::dataflow::UserBuffer, prelude::*};
+
+use alloc::collections::{binary_heap::BinaryHeap, btree_map::BTreeMap};
+use bindings::{
+    interrupt_stack, kill_current, mmx_registers, EFAULT, EINVAL, SA_RESTORER, SIGABRT, SIGBUS,
+    SIGCHLD, SIGCONT, SIGFPE, SIGILL, SIGKILL, SIGQUIT, SIGSEGV, SIGSTOP, SIGSYS, SIGTRAP, SIGTSTP,
+    SIGTTIN, SIGTTOU, SIGURG, SIGWINCH, SIGXCPU, SIGXFSZ,
+};
+
+use super::Thread;
+
+/// A signal number.
+///
+/// Ordering follows the number, so lower-numbered signals compare as smaller.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct Signal(u32);
+
+// Named constants for signal numbers 1 through 31.
+// `SIGIOT` is an alias of `SIGABRT` (6) and `SIGPOLL` is an alias of `SIGIO` (29).
+impl Signal {
+    pub const SIGHUP: Signal = Signal(1);
+    pub const SIGINT: Signal = Signal(2);
+    pub const SIGQUIT: Signal = Signal(3);
+    pub const SIGILL: Signal = Signal(4);
+    pub const SIGTRAP: Signal = Signal(5);
+    pub const SIGABRT: Signal = Signal(6);
+    pub const SIGIOT: Signal = Signal(6);
+    pub const SIGBUS: Signal = Signal(7);
+    pub const SIGFPE: Signal = Signal(8);
+    pub const SIGKILL: Signal = Signal(9);
+    pub const SIGUSR1: Signal = Signal(10);
+    pub const SIGSEGV: Signal = Signal(11);
+    pub const SIGUSR2: Signal = Signal(12);
+    pub const SIGPIPE: Signal = Signal(13);
+    pub const SIGALRM: Signal = Signal(14);
+    pub const SIGTERM: Signal = Signal(15);
+    pub const SIGSTKFLT: Signal = Signal(16);
+    pub const SIGCHLD: Signal = Signal(17);
+    pub const SIGCONT: Signal = Signal(18);
+    pub const SIGSTOP: Signal = Signal(19);
+    pub const SIGTSTP: Signal = Signal(20);
+    pub const SIGTTIN: Signal = Signal(21);
+    pub const SIGTTOU: Signal = Signal(22);
+    pub const SIGURG: Signal = Signal(23);
+    pub const SIGXCPU: Signal = Signal(24);
+    pub const SIGXFSZ: Signal = Signal(25);
+    pub const SIGVTALRM: Signal = Signal(26);
+    pub const SIGPROF: Signal = Signal(27);
+    pub const SIGWINCH: Signal = Signal(28);
+    pub const SIGIO: Signal = Signal(29);
+    pub const SIGPOLL: Signal = Signal(29);
+    pub const SIGPWR: Signal = Signal(30);
+    pub const SIGSYS: Signal = Signal(31);
+}
+
+/// What to do when a signal is delivered.
+#[derive(Debug, Clone, Copy)]
+pub struct SignalAction {
+    /// Handler address. `0` means the default action and `1` means ignore.
+    sa_handler: usize,
+    /// Flags; `SA_RESTORER` must be set for a handler to be invoked.
+    sa_flags: usize,
+    /// Address of the restorer, used as the handler's return address.
+    sa_restorer: usize,
+    /// NOTE(review): presumably the signals to block while the handler runs;
+    /// nothing in this file reads it yet.
+    sa_mask: usize,
+}
+
+/// State of a `SignalList`, kept behind its lock.
+#[derive(Debug, Clone)]
+struct SignalListInner {
+    /// Bitmask of masked signals; signal `n` is bit `n - 1`.
+    mask: u64,
+    /// Pending signals; wrapped in `Reverse` so the lowest number is popped first.
+    pending: BinaryHeap<Reverse<Signal>>,
+
+    // TODO!!!!!: Signal disposition should be per-process.
+    /// Installed actions. A signal without an entry uses its default action.
+    handlers: BTreeMap<Signal, SignalAction>,
+}
+
+/// Signal mask, pending signals and handlers, guarded by a lock.
+#[derive(Debug, Clone)]
+pub struct SignalList {
+    inner: Mutex<SignalListInner>,
+}
+
+/// Outcome of raising a signal, as returned by `SignalListInner::raise`.
+#[derive(Debug, Clone, Copy)]
+pub enum RaiseResult {
+    /// The signal was queued; returned for signals that are neither stop nor continue.
+    ShouldIWakeUp,
+    /// The signal was queued and is a continue signal.
+    ShouldUWakeUp,
+    /// Nothing more to do: the signal is ignored, or it was queued and is a stop signal.
+    Finished,
+    /// The signal is currently masked and was not queued.
+    Masked,
+}
+
+impl Signal {
+    /// True for `SIGCONT`.
+    fn is_continue(&self) -> bool {
+        matches!(self.0, SIGCONT)
+    }
+
+    /// True for the stop signals: `SIGSTOP`, `SIGTSTP`, `SIGTTIN`, `SIGTTOU`.
+    fn is_stop(&self) -> bool {
+        matches!(self.0, SIGSTOP | SIGTSTP | SIGTTIN | SIGTTOU)
+    }
+
+    /// True for the signals that are ignored by default: `SIGCHLD`, `SIGURG`, `SIGWINCH`.
+    fn is_ignore(&self) -> bool {
+        matches!(self.0, SIGCHLD | SIGURG | SIGWINCH)
+    }
+
+    /// True for `SIGKILL` and `SIGSTOP`, the two signals that cannot have a handler.
+    pub fn is_now(&self) -> bool {
+        matches!(self.0, SIGKILL | SIGSTOP)
+    }
+
+    /// True for the signals whose default action terminates with a core dump.
+    fn is_coredump(&self) -> bool {
+        matches!(
+            self.0,
+            SIGQUIT
+                | SIGILL
+                | SIGABRT
+                | SIGFPE
+                | SIGSEGV
+                | SIGBUS
+                | SIGTRAP
+                | SIGSYS
+                | SIGXCPU
+                | SIGXFSZ
+        )
+    }
+
+    /// Bit representing this signal in a mask: signal `n` maps to bit `n - 1`.
+    fn to_mask(&self) -> u64 {
+        let bit = self.0 - 1;
+        1 << bit
+    }
+
+    /// The raw signal number.
+    pub fn to_signum(&self) -> u32 {
+        self.0
+    }
+}
+
+/// Checked conversion from a raw signal number.
+///
+/// Numbers `1..=64` are accepted; anything else yields `EINVAL`.
+impl TryFrom<u32> for Signal {
+    type Error = u32;
+
+    fn try_from(signum: u32) -> Result<Self, Self::Error> {
+        match signum {
+            1..=64 => Ok(Self(signum)),
+            _ => Err(EINVAL),
+        }
+    }
+}
+
+impl SignalAction {
+    /// The action reported for signals without an installed handler: all fields zero.
+    fn default_action() -> Self {
+        Self {
+            sa_handler: 0,
+            sa_flags: 0,
+            sa_restorer: 0,
+            sa_mask: 0,
+        }
+    }
+
+    /// True if the handler is `SIG_IGN`.
+    fn is_ignore(&self) -> bool {
+        const SIG_IGN: usize = 1;
+        self.sa_handler == SIG_IGN
+    }
+
+    /// True if the handler is `SIG_DFL`.
+    fn is_default(&self) -> bool {
+        const SIG_DFL: usize = 0;
+        self.sa_handler == SIG_DFL
+    }
+
+    /// Saves the interrupted context on the user stack so that the handler can run.
+    ///
+    /// Written to the user stack, from the new stack pointer upwards: restorer
+    /// address (`u32`), signal number (`u32`), original `rsp`, MMX registers,
+    /// interrupt stack.
+    ///
+    /// # Return
+    /// `(new_ip, new_sp)`
+    ///
+    /// # Errors
+    /// `EINVAL` if `SA_RESTORER` is not set. `EFAULT` if the context does not
+    /// fit into the user buffer. Errors from `UserBuffer` are passed through.
+    ///
+    /// # Might Sleep
+    fn handle(
+        &self,
+        signum: u32,
+        int_stack: &mut interrupt_stack,
+        mmxregs: &mut mmx_registers,
+    ) -> KResult<(usize, usize)> {
+        // Only handlers that come with a restorer are supported.
+        if self.sa_flags & SA_RESTORER as usize == 0 {
+            return Err(EINVAL);
+        }
+
+        const CONTEXT_SIZE: usize = size_of::<interrupt_stack>()
+            + size_of::<mmx_registers>()
+            + 2 * size_of::<u32>() // Signum and address of sa_restorer
+            + size_of::<usize>(); // Original RSP
+
+        // Save current interrupt context to 128 bytes above current user stack
+        // and align to 16 bytes
+        // TODO!!!: Determine the size of the return address
+        let sp = (int_stack.rsp - (128 + CONTEXT_SIZE + size_of::<u32>())) & !0xf;
+        let restorer_address: u32 = self.sa_restorer as u32;
+        let mut stack = UserBuffer::new(sp as *mut _, CONTEXT_SIZE)?;
+
+        // The restorer address is written first, so it sits at the new stack pointer.
+        stack.copy(&restorer_address)?.ok_or(EFAULT)?; // Restorer address
+        stack.copy(&signum)?.ok_or(EFAULT)?; // Signal number
+        stack.copy(&int_stack.rsp)?.ok_or(EFAULT)?; // Original RSP
+        stack.copy(mmxregs)?.ok_or(EFAULT)?; // MMX registers
+        stack.copy(int_stack)?.ok_or(EFAULT)?; // Interrupt stack
+
+        Ok((self.sa_handler, sp))
+    }
+}
+
+impl SignalListInner {
+    /// Current signal mask.
+    fn get_mask(&self) -> u64 {
+        self.mask
+    }
+
+    /// Replaces the signal mask.
+    fn set_mask(&mut self, mask: u64) {
+        self.mask = mask;
+    }
+
+    /// Adds the bits of `mask` to the signal mask.
+    fn mask(&mut self, mask: u64) {
+        let combined = self.mask | mask;
+        self.set_mask(combined)
+    }
+
+    /// Removes the bits of `mask` from the signal mask.
+    fn unmask(&mut self, mask: u64) {
+        let remaining = self.mask & !mask;
+        self.set_mask(remaining)
+    }
+
+    /// True if `signal` is currently masked.
+    fn is_masked(&self, signal: Signal) -> bool {
+        (self.mask & signal.to_mask()) != 0
+    }
+
+    /// Takes the pending signal with the lowest number, if there is one.
+    fn pop(&mut self) -> Option<Signal> {
+        let Reverse(signal) = self.pending.pop()?;
+        Some(signal)
+    }
+
+    /// Queues `signal` unless it is masked or ignored, and tells the caller
+    /// which kind of wake-up is needed.
+    ///
+    /// A queued signal is also added to the mask.
+    fn raise(&mut self, signal: Signal) -> RaiseResult {
+        if self.is_masked(signal) {
+            return RaiseResult::Masked;
+        }
+
+        // With a handler installed, the handler decides whether the signal is
+        // ignored. Without one, the signal's default action decides.
+        let ignored = match self.handlers.get(&signal) {
+            Some(handler) => handler.is_ignore(),
+            None => signal.is_ignore(),
+        };
+        if ignored {
+            return RaiseResult::Finished;
+        }
+
+        self.mask(signal.to_mask());
+        self.pending.push(Reverse(signal));
+
+        if signal.is_stop() {
+            RaiseResult::Finished
+        } else if signal.is_continue() {
+            // TODO!!!!!!: Fix this. SIGCONT could wake up USleep threads.
+            RaiseResult::ShouldUWakeUp
+        } else {
+            RaiseResult::ShouldIWakeUp
+        }
+    }
+}
+
+impl SignalList {
+    /// Creates an empty list: nothing masked, nothing pending, no handlers installed.
+    pub fn new() -> Self {
+        Self {
+            inner: Mutex::new(SignalListInner {
+                mask: 0,
+                pending: BinaryHeap::new(),
+                handlers: BTreeMap::new(),
+            }),
+        }
+    }
+
+    /// Current signal mask.
+    pub fn get_mask(&self) -> u64 {
+        self.inner.lock().get_mask()
+    }
+
+    /// Replaces the signal mask.
+    pub fn set_mask(&self, mask: u64) {
+        self.inner.lock().set_mask(mask)
+    }
+
+    /// Adds the bits of `mask` to the signal mask, keeping the bits already set.
+    pub fn mask(&self, mask: u64) {
+        self.inner.lock().mask(mask)
+    }
+
+    /// Removes the bits of `mask` from the signal mask.
+    pub fn unmask(&self, mask: u64) {
+        self.inner.lock().unmask(mask)
+    }
+
+    /// Installs `action` for `signal`. Installing the default action removes
+    /// the entry instead.
+    ///
+    /// # Errors
+    /// `EINVAL` for `SIGKILL` and `SIGSTOP`, which cannot have a handler.
+    pub fn set_handler(&self, signal: Signal, action: &SignalAction) -> KResult<()> {
+        if signal.is_now() {
+            return Err(EINVAL);
+        }
+
+        let mut inner = self.inner.lock();
+        if action.is_default() {
+            inner.handlers.remove(&signal);
+        } else {
+            inner.handlers.insert(signal, *action);
+        }
+
+        Ok(())
+    }
+
+    /// Returns the action installed for `signal`, or the default action if
+    /// there is none.
+    pub fn get_handler(&self, signal: Signal) -> SignalAction {
+        self.inner
+            .lock()
+            .handlers
+            .get(&signal)
+            .cloned()
+            .unwrap_or_else(SignalAction::default_action)
+    }
+
+    /// Removes every installed handler except those set to `SIG_IGN`.
+    /// This is used when `execve` is called.
+    pub fn clear_non_ignore(&self) {
+        self.inner
+            .lock()
+            .handlers
+            .retain(|_, action| action.is_ignore());
+    }
+
+    /// Clear all pending signals.
+    /// This is used when `fork` is called.
+    pub fn clear_pending(&self) {
+        self.inner.lock().pending.clear()
+    }
+
+    /// True if at least one signal is pending.
+    pub fn has_pending_signal(&self) -> bool {
+        !self.inner.lock().pending.is_empty()
+    }
+
+    /// Do not use this, use `Thread::raise` instead.
+    pub(super) fn raise(&self, signal: Signal) -> RaiseResult {
+        self.inner.lock().raise(signal)
+    }
+
+    /// Handles pending signals, lowest number first, until one of them needs a
+    /// user handler to run or none are left.
+    ///
+    /// # Safety
+    /// This function might never return. Caller must make sure that local variables
+    /// that own resources are dropped before calling this function.
+    ///
+    /// # Return
+    /// `(new_ip, new_sp)` of the user handler to run, or `None` if there is
+    /// nothing to run in user space.
+    pub fn handle(
+        &self,
+        int_stack: &mut interrupt_stack,
+        mmxregs: &mut mmx_registers,
+    ) -> Option<(usize, usize)> {
+        // NOTE(review): this guard is held across `do_stop()`/`do_continue()`.
+        // If those block, other threads cannot raise signals on this list in
+        // the meantime. Confirm that this is intended.
+        let mut inner = self.inner.lock();
+
+        loop {
+            // Pop through the guard that is already held. Locking `self.inner`
+            // again here would take the same lock a second time.
+            let signal = inner.pop()?;
+
+            if signal.is_now() {
+                match signal {
+                    Signal::SIGKILL => terminate_process(signal),
+                    Signal::SIGSTOP => {
+                        Thread::current().do_stop();
+                        inner.unmask(signal.to_mask());
+                        // Done with this signal. Falling through to the default
+                        // actions below would stop the thread a second time.
+                        continue;
+                    }
+                    _ => unreachable!(),
+                }
+            }
+
+            match inner.handlers.get(&signal) {
+                // Default action
+                None => {
+                    match signal {
+                        s if s.is_continue() => {
+                            Thread::current().do_continue();
+                            inner.unmask(signal.to_mask());
+                            return None;
+                        }
+                        s if s.is_stop() => {
+                            Thread::current().do_stop();
+                            inner.unmask(signal.to_mask());
+                            continue;
+                        }
+                        s if s.is_coredump() => terminate_process_core_dump(signal),
+                        s if !s.is_ignore() => terminate_process(signal),
+                        _ => continue, // Ignore
+                    }
+                }
+                Some(handler) => {
+                    let result = handler.handle(signal.to_signum(), int_stack, mmxregs);
+                    match result {
+                        // The handler frame could not be set up: raise a fatal
+                        // signal instead and keep going.
+                        Err(EFAULT) => inner.raise(Signal::SIGSEGV),
+                        Err(_) => inner.raise(Signal::SIGSYS),
+                        Ok((ip, sp)) => return Some((ip, sp)),
+                    };
+                    continue;
+                }
+            }
+        }
+    }
+}
+
+// TODO!!!: Should we use `uwake` or `iwake`?
+/// Kills the current process, passing the signal number to `kill_current`.
+/// Never returns.
+fn terminate_process(signal: Signal) -> ! {
+    unsafe { kill_current(signal.to_signum() as i32) };
+}
+
+/// Kills the current process because of a signal whose default action is a
+/// core dump. Never returns.
+///
+/// The value passed to `kill_current` is the signal number with bit `0x80`
+/// set. The flag has to be OR-ed in: masking with `&` would yield 0 for every
+/// signal number below 128 and lose the signal number.
+///
+/// NOTE(review): `0x80` is presumably the core-dump flag of the wait status;
+/// confirm against `kill_current`.
+fn terminate_process_core_dump(signal: Signal) -> ! {
+    unsafe { kill_current(signal.to_signum() as i32 | 0x80) };
+}
+
+// NOTE(review): empty placeholder; nothing visible in this file calls it.
+fn schedule() {}

+ 0 - 197
src/kernel/task/thread.cc

@@ -1,197 +0,0 @@
-#include <queue>
-
-#include <stdint.h>
-
-#include <types/types.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/log.hpp>
-#include <kernel/mem/paging.hpp>
-#include <kernel/mem/phys.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/task/readyqueue.hpp>
-#include <kernel/task/thread.hpp>
-
-constexpr std::size_t KERNEL_STACK_ORDER = 7; // 2^7 * 4096 = 512KB
-
-using namespace kernel::task;
-using namespace kernel::mem;
-using namespace kernel::mem::paging;
-
-struct PACKED tss64_t {
-    uint32_t _reserved1;
-    uint64_t rsp[3];
-    uint64_t _reserved2;
-    uint64_t ist[7];
-    uint64_t _reserved3;
-    uint32_t _reserved4;
-};
-constexpr physaddr<tss64_t> tss{0x00000070};
-
-thread::thread(std::string name, pid_t owner) : owner{owner}, attr{READY | SYSTEM}, name{name} {}
-
-thread::thread(const thread& val, pid_t owner)
-    : owner{owner}, attr{val.attr}, name{val.name}, tls_desc32{val.tls_desc32} {}
-
-tid_t thread::tid() const {
-    return (tid_t)kstack.pfn;
-}
-
-bool thread::operator<(const thread& rhs) const {
-    return tid() < rhs.tid();
-}
-
-bool thread::operator==(const thread& rhs) const {
-    return tid() == rhs.tid();
-}
-
-static inline uintptr_t __stack_bottom(pfn_t pfn) {
-    return (uintptr_t)(void*)kernel::mem::physaddr<void>{pfn + (1 << KERNEL_STACK_ORDER) * 0x1000};
-}
-
-thread::kernel_stack::kernel_stack() {
-    pfn = page_to_pfn(alloc_pages(KERNEL_STACK_ORDER));
-    sp = __stack_bottom(pfn);
-}
-
-thread::kernel_stack::kernel_stack(const kernel_stack& other) : kernel_stack() {
-    auto offset = __stack_bottom(other.pfn) - other.sp;
-
-    sp -= offset;
-    memcpy((void*)sp, (void*)other.sp, offset);
-}
-
-thread::kernel_stack::kernel_stack(kernel_stack&& other)
-    : pfn(std::exchange(other.pfn, 0)), sp(std::exchange(other.sp, 0)) {}
-
-thread::kernel_stack::~kernel_stack() {
-    if (!pfn)
-        return;
-    free_pages(pfn, KERNEL_STACK_ORDER);
-}
-
-uint64_t thread::kernel_stack::pushq(uint64_t val) {
-    sp -= 8;
-    *(uint64_t*)sp = val;
-    return val;
-}
-
-uint32_t thread::kernel_stack::pushl(uint32_t val) {
-    sp -= 4;
-    *(uint32_t*)sp = val;
-    return val;
-}
-
-void thread::kernel_stack::load_interrupt_stack() const {
-    tss->rsp[0] = sp;
-}
-
-// TODO!!!: change of attribute should acquire dispatcher lock
-//          to prevent inconsistency of tasks in ready queue
-void thread::set_attr(thd_attr_t new_attr, bool forced) {
-    // TODO!!!: rewrite this with state machine based method to prevent
-    // inconsistency and random transition among states
-    if (attr & USLEEP && (new_attr != READY) && (new_attr != USLEEP)) {
-        kmsgf(
-            "[kernel:warn] trying to change thread state of %d from USLEEP to "
-            "%x, might be "
-            "doing something dumb.",
-            this->owner, new_attr);
-
-        return;
-    }
-
-    switch (new_attr) {
-        case SYSTEM:
-            attr |= SYSTEM;
-            break;
-        case READY:
-            if (attr & ZOMBIE) {
-                kmsgf("[kernel:warn] zombie process pid%d tries to wake up", owner);
-                break;
-            }
-
-            if (attr & READY)
-                break;
-
-            if (!forced && attr & USLEEP) {
-                kmsgf("[kernel:warn] trying to wake up %d from USLEEP", this->owner);
-                break;
-            }
-
-            attr &= SYSTEM;
-            attr |= READY;
-
-            dispatcher::enqueue(this);
-            break;
-        case ISLEEP:
-            attr &= SYSTEM;
-            attr |= ISLEEP;
-
-            dispatcher::dequeue(this);
-            break;
-        case USLEEP:
-            attr &= SYSTEM;
-            attr |= USLEEP;
-
-            dispatcher::dequeue(this);
-            break;
-        case STOPPED:
-            attr &= SYSTEM;
-            attr |= STOPPED;
-
-            dispatcher::dequeue(this);
-            break;
-        case ZOMBIE:
-            attr &= SYSTEM;
-            attr |= ZOMBIE;
-
-            dispatcher::dequeue(this);
-            break;
-        default:
-            kmsgf("[kernel:warn] unknown thread attribute: %x", new_attr);
-            break;
-    }
-}
-
-void thread::send_signal(signal_list::signo_type signal) {
-    if (signals.raise(signal))
-        this->set_attr(READY);
-}
-
-int thread::set_thread_area(kernel::user::user_desc* ptr) {
-    if (ptr->read_exec_only && ptr->seg_not_present) {
-        // TODO: use copy_to_user
-        auto* dst = (void*)(uintptr_t)ptr->base_addr;
-        std::size_t len = ptr->limit;
-        if (len > 0 && dst)
-            memset(dst, 0x00, len);
-        return 0;
-    }
-
-    if (ptr->entry_number == -1U)
-        ptr->entry_number = 7;
-    else
-        return -1;
-
-    if (!ptr->seg_32bit)
-        return -1;
-
-    if ((ptr->limit & 0xffff) != 0xffff) {
-        asm volatile("nop" : : : "memory");
-    }
-
-    tls_desc32 = ptr->limit & 0x0'ffff;
-    tls_desc32 |= (ptr->base_addr & 0x00'ffffffULL) << 16;
-    tls_desc32 |= 0x4'0'f2'000000'0000;
-    tls_desc32 |= (ptr->limit & 0xf'0000ULL) << (48 - 16);
-    tls_desc32 |= ((ptr->limit_in_pages + 0ULL) << 55);
-    tls_desc32 |= (ptr->base_addr & 0xff'000000ULL) << (56 - 24);
-
-    return 0;
-}
-
-int thread::load_thread_area32() const {
-    kernel::user::load_thread_area32(tls_desc32);
-    return 0;
-}

+ 982 - 0
src/kernel/task/thread.rs

@@ -0,0 +1,982 @@
+use core::{
+    arch::asm,
+    cell::RefCell,
+    cmp,
+    sync::atomic::{self, AtomicU32},
+};
+
+use crate::{
+    kernel::{
+        mem::{
+            phys::{CachedPP, PhysPtr},
+            MMList,
+        },
+        terminal::Terminal,
+        user::dataflow::CheckedUserPointer,
+        vfs::FsContext,
+    },
+    prelude::*,
+    sync::{preempt, CondVar},
+};
+
+use alloc::{
+    collections::{btree_map::BTreeMap, vec_deque::VecDeque},
+    sync::{Arc, Weak},
+};
+use bindings::{ECHILD, EINTR, EINVAL, EPERM, ESRCH};
+
+use crate::kernel::vfs::filearray::FileArray;
+
+use super::{
+    signal::{RaiseResult, Signal, SignalList},
+    KernelStack, Scheduler,
+};
+
+/// Scheduling state of a [`Thread`].
+#[derive(Debug, Clone, Copy)]
+pub enum ThreadState {
+    /// Initial state of a newly constructed thread; required by
+    /// `Thread::prepare_kernel_stack`.
+    Preparing,
+    /// State asserted for the source thread in `Thread::new_cloned`.
+    Running,
+    Ready,
+    /// The thread has been killed (see `ProcessList::do_kill_process`).
+    Zombie,
+    /// NOTE(review): presumably an interruptible sleep (woken via `iwake`) — confirm.
+    ISleep,
+    /// Sleep entered by `Thread::do_stop` and after `Thread::prepare_kernel_stack`.
+    /// NOTE(review): presumably uninterruptible (woken via `uwake`) — confirm.
+    USleep,
+}
+
+/// A child status change queued on a parent's [`WaitList`].
+#[derive(Debug, Clone, Copy)]
+pub struct WaitObject {
+    /// Id of the process whose state changed.
+    pub pid: u32,
+    /// Wait status word: `0x7f` in the low byte marks a stop, `0xffff` marks a
+    /// continue, anything else is an exit status.
+    pub code: u32,
+}
+
+impl WaitObject {
+    /// Whether this object reports a stopped child.
+    ///
+    /// The whole low byte has to equal `0x7f`. Masking with `0x7f` only would also
+    /// match the "continued" status `0xffff` and make the two events
+    /// indistinguishable.
+    pub fn is_stopped(&self) -> bool {
+        self.code & 0xff == 0x7f
+    }
+
+    /// Whether this object reports a continued child (status word `0xffff`).
+    pub fn is_continue(&self) -> bool {
+        self.code == 0xffff
+    }
+}
+
+/// Mutable part of a [`Session`], kept behind the session's spin lock.
+#[derive(Debug)]
+struct SessionInner {
+    /// Foreground process group
+    foreground: Weak<ProcessGroup>,
+    /// Controlling terminal; `None` when the session is created.
+    control_terminal: Option<Arc<Terminal>>,
+    /// Process groups that joined this session, keyed by pgid.
+    groups: BTreeMap<u32, Weak<ProcessGroup>>,
+}
+
+/// A session: a set of process groups sharing a foreground group and an
+/// optional controlling terminal.
+#[derive(Debug)]
+pub struct Session {
+    /// Session id; the constructors in this file always pass the leader's pid.
+    sid: u32,
+    /// The process that created the session.
+    leader: Weak<Process>,
+
+    /// State that changes after construction.
+    inner: Spin<SessionInner>,
+}
+
+/// A process group: a set of processes that receive group-wide signals together.
+#[derive(Debug)]
+pub struct ProcessGroup {
+    /// Process group id; equal to the leader's pid when built by `ProcessGroup::new`.
+    pgid: u32,
+    /// The process that created the group.
+    leader: Weak<Process>,
+    /// The session this group belongs to.
+    session: Weak<Session>,
+
+    /// Member processes, keyed by pid.
+    processes: Spin<BTreeMap<u32, Weak<Process>>>,
+}
+
+/// Mutable part of a [`Process`], kept behind the process' spin lock.
+#[derive(Debug)]
+struct ProcessInner {
+    /// Parent process
+    ///
+    /// Parent process must be valid during the whole life of the process.
+    /// The only case that parent process may be `None` is when this is the init process
+    /// or the process is kernel thread.
+    parent: Option<Arc<Process>>,
+
+    /// Process group
+    pgroup: Arc<ProcessGroup>,
+
+    /// Session
+    session: Arc<Session>,
+
+    /// Children list
+    ///
+    /// Keyed by tid. Each entry is the thread created for a child process by
+    /// `Thread::new_cloned`, not the child `Process` itself.
+    children: BTreeMap<u32, Weak<Thread>>,
+
+    /// Thread list
+    ///
+    /// Threads that belong to this process, keyed by tid.
+    threads: BTreeMap<u32, Weak<Thread>>,
+}
+
+/// Queue of child status changes a process can collect with `Process::wait`.
+#[derive(Debug)]
+pub struct WaitList {
+    /// Pending status changes, oldest first.
+    wait_procs: Spin<VecDeque<WaitObject>>,
+    /// Notified whenever something is pushed onto `wait_procs`.
+    cv_wait_procs: CondVar,
+}
+
+/// A process: an address space plus the bookkeeping shared by its threads.
+#[derive(Debug)]
+pub struct Process {
+    /// Process id
+    ///
+    /// This should never change during the life of the process.
+    pub pid: u32,
+
+    /// Status changes reported by this process' children.
+    pub wait_list: WaitList,
+    /// Memory mappings of the process.
+    pub mm_list: Arc<MMList>,
+    /// Parent, group, session, children and threads; see [`ProcessInner`].
+    inner: Spin<ProcessInner>,
+}
+
+// Processes are ordered and compared by `pid` only.
+impl Ord for Process {
+    fn cmp(&self, rhs: &Self) -> cmp::Ordering {
+        Ord::cmp(&self.pid, &rhs.pid)
+    }
+}
+
+impl PartialOrd for Process {
+    fn partial_cmp(&self, rhs: &Self) -> Option<cmp::Ordering> {
+        Some(self.cmp(rhs))
+    }
+}
+
+impl PartialEq for Process {
+    fn eq(&self, rhs: &Self) -> bool {
+        matches!(self.cmp(rhs), cmp::Ordering::Equal)
+    }
+}
+
+impl Eq for Process {}
+
+/// Mutable part of a [`Thread`], kept behind the thread's spin lock.
+#[derive(Debug)]
+struct ThreadInner {
+    /// Thread name
+    name: Arc<[u8]>,
+
+    /// Thread TLS descriptor 32-bit
+    ///
+    /// Built by `Thread::set_thread_area`; `0` means that no TLS segment is set.
+    tls_desc32: u64,
+
+    /// User pointer
+    /// Store child thread's tid when child thread returns to user space.
+    set_child_tid: usize,
+}
+
+/// A schedulable thread of a [`Process`].
+pub struct Thread {
+    /// Thread id. Both constructors in this file set it to the pid of `process`.
+    pub tid: u32,
+    /// The process this thread belongs to.
+    pub process: Arc<Process>,
+
+    /// Open file table.
+    pub files: Arc<FileArray>,
+    /// Filesystem context of the thread.
+    pub fs_context: Arc<FsContext>,
+
+    /// Signals raised on this thread.
+    pub signal_list: SignalList,
+
+    /// Thread state for scheduler use.
+    pub state: Spin<ThreadState>,
+
+    /// Kernel stack
+    /// Never access this directly.
+    ///
+    /// We can only touch kernel stack when the process is neither running nor sleeping.
+    /// AKA, the process is in the ready queue and will return to `schedule` context.
+    kstack: RefCell<KernelStack>,
+
+    /// Name, TLS descriptor and `set_child_tid`; see [`ThreadInner`].
+    inner: Spin<ThreadInner>,
+}
+
+// Threads are ordered and compared by `tid` only.
+impl Ord for Thread {
+    fn cmp(&self, rhs: &Self) -> cmp::Ordering {
+        Ord::cmp(&self.tid, &rhs.tid)
+    }
+}
+
+impl PartialOrd for Thread {
+    fn partial_cmp(&self, rhs: &Self) -> Option<cmp::Ordering> {
+        Some(self.cmp(rhs))
+    }
+}
+
+impl PartialEq for Thread {
+    fn eq(&self, rhs: &Self) -> bool {
+        matches!(self.cmp(rhs), cmp::Ordering::Equal)
+    }
+}
+
+impl Eq for Thread {}
+
+/// Flag word of a [`UserDescriptor`]; the individual bits are decoded by the
+/// accessor methods of this type.
+#[repr(transparent)]
+#[derive(Debug, Clone, Copy)]
+pub struct UserDescriptorFlags(u32);
+
+/// Segment description passed to `Thread::set_thread_area`.
+///
+/// NOTE(review): the layout presumably mirrors Linux's `struct user_desc` — confirm.
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+pub struct UserDescriptor {
+    /// Requested entry number; `u32::MAX` asks the kernel to pick one.
+    entry: u32,
+    /// Segment base address.
+    base: u32,
+    /// Segment limit.
+    limit: u32,
+    /// Attribute bits, see [`UserDescriptorFlags`].
+    flags: UserDescriptorFlags,
+}
+
+/// Global tables of threads, processes, process groups and sessions.
+///
+/// Threads are held strongly (an entry keeps the thread alive until
+/// `ProcessList::remove` is called); everything else is tracked weakly.
+pub struct ProcessList {
+    /// The init process (pid 1), which adopts orphaned children.
+    init: Arc<Process>,
+    /// All threads, keyed by tid.
+    threads: Spin<BTreeMap<u32, Arc<Thread>>>,
+    /// All processes, keyed by pid.
+    processes: Spin<BTreeMap<u32, Weak<Process>>>,
+    /// All process groups, keyed by pgid.
+    pgroups: Spin<BTreeMap<u32, Weak<ProcessGroup>>>,
+    /// All sessions, keyed by sid.
+    sessions: Spin<BTreeMap<u32, Weak<Session>>>,
+}
+
+impl Session {
+    /// Create a session with id `sid` led by `leader` and register it in the
+    /// global [`ProcessList`].
+    ///
+    /// The new session has no process groups, no foreground group and no
+    /// controlling terminal.
+    fn new(sid: u32, leader: Weak<Process>) -> Arc<Self> {
+        let inner = SessionInner {
+            foreground: Weak::new(),
+            control_terminal: None,
+            groups: BTreeMap::new(),
+        };
+        let session = Arc::new(Self {
+            sid,
+            leader,
+            inner: Spin::new(inner),
+        });
+
+        ProcessList::get().add_session(&session);
+        session
+    }
+
+    /// Record `pgroup` as a member of this session.
+    fn add_member(&self, pgroup: &Arc<ProcessGroup>) {
+        let mut inner = self.inner.lock();
+        inner.groups.insert(pgroup.pgid, Arc::downgrade(pgroup));
+    }
+
+    /// Id of the foreground process group, or `None` if there is none or it is gone.
+    pub fn foreground_pgid(&self) -> Option<u32> {
+        let inner = self.inner.lock();
+        inner.foreground.upgrade().map(|fg| fg.pgid)
+    }
+
+    /// Set the foreground process group.
+    ///
+    /// Fails with `EPERM` when `pgid` is not a process group of this session.
+    pub fn set_foreground_pgid(&self, pgid: u32) -> KResult<()> {
+        let mut inner = self.inner.lock();
+
+        match inner.groups.get(&pgid).cloned() {
+            Some(group) => {
+                inner.foreground = group;
+                Ok(())
+            }
+            // TODO!!!: Check if the process group is valid.
+            //          We assume that the process group is valid for now.
+            None => Err(EPERM),
+        }
+    }
+
+    /// Raise `signal` on the foreground process group, if there is one.
+    pub fn raise_foreground(&self, signal: Signal) {
+        let inner = self.inner.lock();
+        if let Some(fg) = inner.foreground.upgrade() {
+            fg.raise(signal);
+        }
+    }
+}
+
+impl ProcessGroup {
+    /// Build the process group of a process that is still under construction.
+    ///
+    /// Takes weak handles because the leader's `Arc` does not exist yet. The group
+    /// is registered in the global [`ProcessList`], but it is *not* added to
+    /// `session` here.
+    fn new_for_init(pgid: u32, leader: Weak<Process>, session: Weak<Session>) -> Arc<Self> {
+        let mut members = BTreeMap::new();
+        members.insert(pgid, leader.clone());
+
+        let pgroup = Arc::new(Self {
+            pgid,
+            leader,
+            session,
+            processes: Spin::new(members),
+        });
+
+        ProcessList::get().add_pgroup(&pgroup);
+        pgroup
+    }
+
+    /// Create a process group led by `leader` (its pgid is the leader's pid) whose
+    /// only member is the leader, and register it both globally and in `session`.
+    fn new(leader: &Arc<Process>, session: &Arc<Session>) -> Arc<Self> {
+        let weak_leader = Arc::downgrade(leader);
+        let mut members = BTreeMap::new();
+        members.insert(leader.pid, weak_leader.clone());
+
+        let pgroup = Arc::new(Self {
+            pgid: leader.pid,
+            leader: weak_leader,
+            session: Arc::downgrade(session),
+            processes: Spin::new(members),
+        });
+
+        ProcessList::get().add_pgroup(&pgroup);
+        session.add_member(&pgroup);
+        pgroup
+    }
+}
+
+impl Drop for Thread {
+    /// Unlink the thread from its process' thread list and, if the process has a
+    /// parent, from the parent's children list.
+    fn drop(&mut self) {
+        let tid = self.tid;
+
+        let mut inner = self.process.inner.lock();
+        inner.threads.remove(&tid);
+
+        if let Some(parent) = inner.parent.as_ref() {
+            let mut parent_inner = parent.inner.lock();
+            parent_inner.children.remove(&tid);
+        }
+    }
+}
+
+impl Drop for Process {
+    /// Unlink the process from its process group and from the global process table.
+    ///
+    /// # Panics
+    /// Panics if the process still has children.
+    fn drop(&mut self) {
+        let pid = self.pid;
+
+        let inner = self.inner.lock();
+        assert!(inner.children.is_empty());
+
+        inner.pgroup.remove_member(pid);
+        ProcessList::get().processes.lock().remove(&pid);
+    }
+}
+
+impl Drop for ProcessGroup {
+    /// Unlink the group from its session, if the session is still alive.
+    fn drop(&mut self) {
+        if let Some(session) = self.session.upgrade() {
+            let mut session_inner = session.inner.lock();
+            session_inner.groups.remove(&self.pgid);
+        }
+    }
+}
+
+impl ProcessList {
+    /// Access the global process list.
+    ///
+    /// Not implemented yet: every call currently panics through `todo!()`.
+    pub fn get() -> &'static Self {
+        todo!()
+    }
+
+    /// Register `session` under its sid; only a weak reference is stored.
+    pub fn add_session(&self, session: &Arc<Session>) {
+        self.sessions
+            .lock()
+            .insert(session.sid, Arc::downgrade(session));
+    }
+
+    /// Register `pgroup` under its pgid; only a weak reference is stored.
+    pub fn add_pgroup(&self, pgroup: &Arc<ProcessGroup>) {
+        self.pgroups
+            .lock()
+            .insert(pgroup.pgid, Arc::downgrade(pgroup));
+    }
+
+    /// Register `process` under its pid; only a weak reference is stored.
+    pub fn add_process(&self, process: &Arc<Process>) {
+        self.processes
+            .lock()
+            .insert(process.pid, Arc::downgrade(process));
+    }
+
+    /// Register `thread` under its tid. The table keeps a strong reference, so the
+    /// thread stays alive until [`ProcessList::remove`] is called for it.
+    pub fn add_thread(&self, thread: &Arc<Thread>) {
+        self.threads.lock().insert(thread.tid, thread.clone());
+    }
+
+    /// Kill the current process because of `signal` and never return.
+    ///
+    /// The reported status is `((signum + 128) << 8) | (signum & 0xff)`.
+    pub fn kill_current(signal: Signal) -> ! {
+        ProcessList::get().do_kill_process(
+            &Thread::current().process,
+            ((signal.to_signum() + 128) << 8) | (signal.to_signum() & 0xff),
+        );
+
+        Scheduler::schedule_noreturn()
+    }
+
+    /// Build the process list together with the init (pid 1) and idle (pid 0)
+    /// processes, installing their threads as the scheduler's current and idle
+    /// thread respectively.
+    fn new() -> Self {
+        let init_process = Process::new_for_init(1, None);
+        let init_thread = Thread::new_for_init(b"[kernel kinit]".as_slice().into(), &init_process);
+        Scheduler::set_current(init_thread.clone());
+
+        let idle_process = Process::new_for_init(0, None);
+        let idle_thread =
+            Thread::new_for_init(b"[kernel idle#BS]".as_slice().into(), &idle_process);
+        Scheduler::set_idle(idle_thread.clone());
+
+        Self {
+            sessions: Spin::new(BTreeMap::new()),
+            pgroups: Spin::new(BTreeMap::new()),
+            threads: Spin::new(BTreeMap::from([
+                (1, init_thread.clone()),
+                (0, idle_thread.clone()),
+            ])),
+            processes: Spin::new(BTreeMap::from([
+                (1, Arc::downgrade(&init_process)),
+                (0, Arc::downgrade(&idle_process)),
+            ])),
+            init: init_process,
+        }
+    }
+
+    // TODO!!!!!!: Reconsider this
+    /// Drop the table's strong reference to thread `tid`.
+    ///
+    /// # Panics
+    /// Panics if no such thread is registered.
+    fn remove(&self, tid: u32) {
+        if self.threads.lock().remove(&tid).is_none() {
+            panic!("Thread {} not found", tid);
+        }
+    }
+
+    /// Look up a live process by pid.
+    pub fn try_find_process(&self, pid: u32) -> Option<Arc<Process>> {
+        self.processes.lock().get(&pid).and_then(Weak::upgrade)
+    }
+
+    /// Look up a registered thread by tid.
+    pub fn try_find_thread(&self, tid: u32) -> Option<Arc<Thread>> {
+        self.threads.lock().get(&tid).cloned()
+    }
+
+    /// Look up a live process group by pgid.
+    pub fn try_find_pgroup(&self, pgid: u32) -> Option<Arc<ProcessGroup>> {
+        self.pgroups.lock().get(&pgid).and_then(Weak::upgrade)
+    }
+
+    /// Look up a live session by sid.
+    pub fn try_find_session(&self, sid: u32) -> Option<Arc<Session>> {
+        self.sessions.lock().get(&sid).and_then(Weak::upgrade)
+    }
+
+    /// Make the process a zombie and notify the parent.
+    ///
+    /// All threads are marked zombie and their files closed, user memory is
+    /// unmapped, children are handed over to init, pending exit notifications are
+    /// forwarded to init, and finally `status` is queued on the parent's wait list.
+    ///
+    /// # Panics
+    /// Panics if `process` is init, if the process has no parent, or if it has a
+    /// thread other than the current one (multi-threaded kill is not supported yet).
+    pub fn do_kill_process(&self, process: &Arc<Process>, status: u32) {
+        if &self.init == process {
+            panic!("init exited");
+        }
+
+        preempt::disable();
+
+        let mut inner = process.inner.lock();
+        // TODO!!!!!!: When we are killing multiple threads, we need to wait until all
+        // the threads are stopped then proceed.
+        for thread in inner.threads.values().map(|t| t.upgrade().unwrap()) {
+            assert!(&thread == Thread::current());
+            Scheduler::get().lock().set_zombie(&thread);
+            thread.files.close_all();
+        }
+
+        // Unmap all user memory areas
+        process.mm_list.clear_user();
+
+        // Make children orphans (adopted by init)
+        let mut init_inner = self.init.inner.lock();
+
+        inner.children.retain(|_, child| {
+            let child = child.upgrade().unwrap();
+            let mut child_inner = child.process.inner.lock();
+            if child_inner.parent.as_ref().unwrap() == &self.init {
+                return false;
+            }
+
+            child_inner.parent = Some(self.init.clone());
+            init_inner.add_child(&child);
+
+            false
+        });
+
+        // Read the parent through the guard we already hold: `process.parent()`
+        // would lock `process.inner` a second time and spin forever.
+        let parent = inner.parent.clone().unwrap();
+
+        // Release the process locks before touching any wait list. `Process::wait`
+        // takes the wait list lock first and the process lock second, so keeping
+        // the opposite order here could deadlock against it.
+        drop(init_inner);
+        drop(inner);
+
+        let has_waiting = {
+            let mut init_waits = self.init.wait_list.wait_procs.lock();
+            let mut waits = process.wait_list.wait_procs.lock();
+
+            // Exit notifications of our children are inherited by init; stop and
+            // continue notifications are discarded.
+            let mut done_some_work = false;
+            waits.retain(|item| {
+                if !item.is_stopped() && !item.is_continue() {
+                    init_waits.push_back(*item);
+                    done_some_work = true;
+                }
+                false
+            });
+
+            done_some_work
+        };
+
+        if has_waiting {
+            self.init.wait_list.cv_wait_procs.notify_all();
+        }
+
+        {
+            let mut parent_waits = parent.wait_list.wait_procs.lock();
+            parent_waits.push_back(WaitObject {
+                pid: process.pid,
+                code: status,
+            });
+        }
+        parent.wait_list.cv_wait_procs.notify_all();
+
+        preempt::enable();
+    }
+}
+
+impl ProcessGroup {
+    /// Record `process` as a member of this group.
+    fn add_member(&self, process: &Arc<Process>) {
+        let mut members = self.processes.lock();
+        members.insert(process.pid, Arc::downgrade(process));
+    }
+
+    /// Forget the member with the given pid, if any.
+    fn remove_member(&self, pid: u32) {
+        let mut members = self.processes.lock();
+        members.remove(&pid);
+    }
+
+    /// Raise `signal` on every member process.
+    ///
+    /// # Panics
+    /// Panics if a member entry refers to a process that no longer exists.
+    pub fn raise(&self, signal: Signal) {
+        let members = self.processes.lock();
+        for member in members.values() {
+            let process = member.upgrade().unwrap();
+            process.raise(signal);
+        }
+    }
+}
+
+impl ProcessInner {
+    /// Track `child` (weakly) in the children list under its tid.
+    fn add_child(&mut self, child: &Arc<Thread>) {
+        let weak = Arc::downgrade(child);
+        self.children.insert(child.tid, weak);
+    }
+
+    /// Track `thread` (weakly) in the thread list under its tid.
+    fn add_thread(&mut self, thread: &Arc<Thread>) {
+        let weak = Arc::downgrade(thread);
+        self.threads.insert(thread.tid, weak);
+    }
+}
+
+/// PID 0 and 1 are created manually, so allocation starts from 2.
+static NEXT_PID: AtomicU32 = AtomicU32::new(2);
+impl Process {
+    /// Hand out the next unused pid.
+    fn alloc_pid() -> u32 {
+        NEXT_PID.fetch_add(1, atomic::Ordering::Relaxed)
+    }
+
+    /// Create a child process of `other`.
+    ///
+    /// The child gets a fresh pid and a clone of the parent's memory list, joins
+    /// the parent's process group and session, and is registered globally.
+    pub fn new_cloned(other: &Arc<Self>) -> Arc<Self> {
+        let other_inner = other.inner.lock();
+
+        let process = Arc::new(Self {
+            pid: Self::alloc_pid(),
+            wait_list: WaitList::new(),
+            mm_list: MMList::new_cloned(&other.mm_list),
+            inner: Spin::new(ProcessInner {
+                pgroup: other_inner.pgroup.clone(),
+                session: other_inner.session.clone(),
+                children: BTreeMap::new(),
+                threads: BTreeMap::new(),
+                parent: Some(other.clone()),
+            }),
+        });
+
+        ProcessList::get().add_process(&process);
+        other_inner.pgroup.add_member(&process);
+        process
+    }
+
+    /// Create a process that leads its own, brand-new session and process group
+    /// (both using `pid` as their id). Used for the init and idle processes.
+    fn new_for_init(pid: u32, parent: Option<Arc<Self>>) -> Arc<Self> {
+        // The session and the group need a handle to their leader before the
+        // leader exists, hence `new_cyclic`.
+        let process = Arc::new_cyclic(|weak| {
+            let session = Session::new(pid, weak.clone());
+            let pgroup = ProcessGroup::new_for_init(pid, weak.clone(), Arc::downgrade(&session));
+
+            session.add_member(&pgroup);
+            Self {
+                pid,
+                wait_list: WaitList::new(),
+                mm_list: MMList::new(),
+                inner: Spin::new(ProcessInner {
+                    parent,
+                    pgroup,
+                    session,
+                    children: BTreeMap::new(),
+                    threads: BTreeMap::new(),
+                }),
+            }
+        });
+
+        ProcessList::get().add_process(&process);
+        process.inner.lock().pgroup.add_member(&process);
+        process
+    }
+
+    /// Raise `signal` on the threads of this process, stopping at the first thread
+    /// that reports [`RaiseResult::Finished`].
+    pub fn raise(&self, signal: Signal) {
+        let inner = self.inner.lock();
+        for thread in inner.threads.values().map(|t| t.upgrade().unwrap()) {
+            if let RaiseResult::Finished = thread.raise(signal) {
+                break;
+            }
+        }
+    }
+
+    /// Track `child` in this process' children list.
+    fn add_child(&self, child: &Arc<Thread>) {
+        self.inner.lock().add_child(child);
+    }
+
+    /// Track `thread` in this process' thread list.
+    fn add_thread(&self, thread: &Arc<Thread>) {
+        self.inner.lock().add_thread(thread);
+    }
+
+    /// Collect a status change of one of our children.
+    ///
+    /// * `no_block` - return `Ok(None)` instead of sleeping when nothing is pending.
+    /// * `trace_stop` - also report stopped children.
+    /// * `trace_continue` - also report continued children.
+    ///
+    /// Exit notifications are always reported; reporting one also removes the
+    /// child's thread from the global thread table.
+    ///
+    /// # Errors
+    /// * `ECHILD` - nothing is pending and there are no children to wait for.
+    /// * `EINTR` - a signal became pending while sleeping.
+    pub fn wait(
+        &self,
+        no_block: bool,
+        trace_stop: bool,
+        trace_continue: bool,
+    ) -> KResult<Option<WaitObject>> {
+        let mut wait_list = self.wait_list.wait_procs.lock();
+        let wait_object = loop {
+            // `is_continue` is tested first: it is the more specific check, so a
+            // "continued" status can never be mistaken for a stop.
+            let ready = wait_list.iter().position(|item| {
+                if item.is_continue() {
+                    trace_continue
+                } else if item.is_stopped() {
+                    trace_stop
+                } else {
+                    true
+                }
+            });
+
+            if let Some(idx) = ready {
+                break wait_list.remove(idx).unwrap();
+            }
+
+            if self.inner.lock().children.is_empty() {
+                return Err(ECHILD);
+            }
+            if no_block {
+                return Ok(None);
+            }
+            self.wait_list.cv_wait_procs.wait(&mut wait_list);
+            if Thread::current().signal_list.has_pending_signal() {
+                return Err(EINTR);
+            }
+        };
+
+        // The child only goes away for good when it has exited.
+        if !wait_object.is_continue() && !wait_object.is_stopped() {
+            ProcessList::get().remove(wait_object.pid);
+        }
+        Ok(Some(wait_object))
+    }
+
+    /// Create a new session for the process.
+    ///
+    /// On success the process leads a new session and a new process group, both
+    /// using its pid as id, and the new process group id is returned.
+    ///
+    /// # Errors
+    /// `EPERM` if the process already leads a process group, or if a session with
+    /// our pid as sid already exists.
+    pub fn setsid(self: &Arc<Self>) -> KResult<u32> {
+        let mut inner = self.inner.lock();
+
+        // A process group leader must not create a session: the new group would
+        // reuse its pgid and shadow the group that its former members still
+        // belong to.
+        if inner.pgroup.pgid == self.pid {
+            return Err(EPERM);
+        }
+
+        // If there exists a session that has the same sid as our pid, we can't create a new
+        // session. The standard says that we should create a new process group and be the
+        // only process in the new process group and session.
+        if ProcessList::get().try_find_session(self.pid).is_some() {
+            return Err(EPERM);
+        }
+        inner.session = Session::new(self.pid, Arc::downgrade(self));
+        inner.pgroup.remove_member(self.pid);
+        inner.pgroup = ProcessGroup::new(self, &inner.session);
+        Ok(inner.pgroup.pgid)
+    }
+
+    /// Set the process group id of the process to `pgid`.
+    ///
+    /// This function does the actual work. The process always stays in its
+    /// current session.
+    ///
+    /// # Errors
+    /// `EPERM` if the process is a session leader, if `pgid` names a group of
+    /// another session, or if no such group exists and `pgid` is not our own pid.
+    fn do_setpgid(self: &Arc<Self>, pgid: u32) -> KResult<()> {
+        let mut inner = self.inner.lock();
+
+        // Changing the process group of a session leader is not allowed.
+        if inner.session.sid == self.pid {
+            return Err(EPERM);
+        }
+
+        // Move us to an existing process group.
+        if let Some(pgroup) = ProcessList::get().try_find_pgroup(pgid) {
+            // Moving the process to a process group in a different session is not
+            // allowed. A group whose session is already gone cannot be in ours.
+            if pgroup.session.upgrade().map(|session| session.sid) != Some(inner.session.sid) {
+                return Err(EPERM);
+            }
+
+            // If we are already in the process group, we are done.
+            if pgroup.pgid == inner.pgroup.pgid {
+                return Ok(());
+            }
+
+            inner.pgroup.remove_member(self.pid);
+            // Join the member list of the new group as well, otherwise signals
+            // raised on the group would never reach us.
+            pgroup.add_member(self);
+            inner.pgroup = pgroup;
+        } else {
+            // Create a new process group only if `pgid` matches our `pid`.
+            if pgid != self.pid {
+                return Err(EPERM);
+            }
+
+            // The new group is created inside our current session; only `setsid`
+            // may start a new session.
+            inner.pgroup.remove_member(self.pid);
+            inner.pgroup = ProcessGroup::new(self, &inner.session);
+        }
+
+        Ok(())
+    }
+
+    /// Set the process group id of the process `pid` to `pgid`.
+    ///
+    /// This function should be called on the process that issued the syscall in order to do
+    /// permission checks.
+    ///
+    /// # Errors
+    /// `ESRCH` if `pid` is neither us nor one of our children, `EPERM` if the child
+    /// lives in another session or the move itself is refused.
+    pub fn setpgid(self: &Arc<Self>, pid: u32, pgid: u32) -> KResult<()> {
+        // We may set pgid of either the calling process or a child process.
+        if pid == self.pid {
+            self.do_setpgid(pgid)
+        } else {
+            let child = {
+                // If `pid` refers to one of our children, the thread leaders must be
+                // in our children list.
+                let inner = self.inner.lock();
+                let child = {
+                    let child = inner.children.get(&pid);
+                    child.and_then(Weak::upgrade).ok_or(ESRCH)?
+                };
+
+                // Changing the process group of a child is only allowed
+                // if we are in the same session.
+                if child.process.sid() != inner.session.sid {
+                    return Err(EPERM);
+                }
+
+                child
+            };
+
+            // TODO: Check whether we, as a child, have already performed an `execve`.
+            //       If so, we should return `Err(EACCES)`.
+            child.process.do_setpgid(pgid)
+        }
+    }
+
+    /// Id of the session the process belongs to.
+    pub fn sid(&self) -> u32 {
+        self.inner.lock().session.sid
+    }
+
+    /// Id of the process group the process belongs to.
+    pub fn pgid(&self) -> u32 {
+        self.inner.lock().pgroup.pgid
+    }
+
+    /// The session the process belongs to.
+    pub fn session(&self) -> Arc<Session> {
+        self.inner.lock().session.clone()
+    }
+
+    /// The process group the process belongs to.
+    pub fn pgroup(&self) -> Arc<ProcessGroup> {
+        self.inner.lock().pgroup.clone()
+    }
+}
+
+impl UserDescriptorFlags {
+    const SEG_32BIT: u32 = 1 << 0;
+    const CONTENTS: u32 = 0b11 << 1;
+    const READ_EXEC_ONLY: u32 = 1 << 3;
+    const LIMIT_IN_PAGES: u32 = 1 << 4;
+    const SEG_NOT_PRESENT: u32 = 1 << 5;
+    const USABLE: u32 = 1 << 6;
+
+    /// Whether any bit of `mask` is set.
+    fn has(&self, mask: u32) -> bool {
+        self.0 & mask != 0
+    }
+
+    /// Bit 0: the segment is a 32-bit segment.
+    fn is_32bit_segment(&self) -> bool {
+        self.has(Self::SEG_32BIT)
+    }
+
+    /// Bits 1-2, returned in place (not shifted down).
+    fn contents(&self) -> u32 {
+        self.0 & Self::CONTENTS
+    }
+
+    /// Bit 3: the segment is read/execute only.
+    fn is_read_exec_only(&self) -> bool {
+        self.has(Self::READ_EXEC_ONLY)
+    }
+
+    /// Bit 4: the limit is counted in pages.
+    fn is_limit_in_pages(&self) -> bool {
+        self.has(Self::LIMIT_IN_PAGES)
+    }
+
+    /// Bit 5 is a "not present" flag, so the segment is present when it is clear.
+    fn is_present(&self) -> bool {
+        !self.has(Self::SEG_NOT_PRESENT)
+    }
+
+    /// Bit 6: the "usable" bit.
+    fn is_usable(&self) -> bool {
+        self.has(Self::USABLE)
+    }
+}
+
+impl Thread {
+    /// Create the only thread of `process`, reusing the pid as tid, with fresh
+    /// files, filesystem context, signal list and kernel stack.
+    ///
+    /// The thread starts in `Preparing` state and is registered both globally and
+    /// in `process`.
+    fn new_for_init(name: Arc<[u8]>, process: &Arc<Process>) -> Arc<Self> {
+        let inner = ThreadInner {
+            name,
+            tls_desc32: 0,
+            set_child_tid: 0,
+        };
+
+        let thread = Arc::new(Self {
+            tid: process.pid,
+            process: process.clone(),
+            files: FileArray::new_for_init(),
+            fs_context: FsContext::new_for_init(),
+            signal_list: SignalList::new(),
+            kstack: RefCell::new(KernelStack::new()),
+            state: Spin::new(ThreadState::Preparing),
+            inner: Spin::new(inner),
+        });
+
+        ProcessList::get().add_thread(&thread);
+        process.add_thread(&thread);
+        thread
+    }
+
+    /// Create a new process cloned from `other`'s process, together with its
+    /// first thread.
+    ///
+    /// The new thread copies `other`'s files, filesystem context, name, TLS
+    /// descriptor and `set_child_tid`; it inherits the signal list with all pending
+    /// signals cleared and starts in `Preparing` state on a fresh kernel stack.
+    ///
+    /// # Panics
+    /// Panics if `other` is not in `Running` state.
+    pub fn new_cloned(other: &Self) -> Arc<Self> {
+        let process = Process::new_cloned(&other.process);
+
+        let other_state = other.state.lock();
+        let other_inner = other.inner.lock();
+        assert!(matches!(*other_state, ThreadState::Running));
+
+        let signal_list = other.signal_list.clone();
+        signal_list.clear_pending();
+
+        let tid = process.pid;
+        let thread = Arc::new(Self {
+            tid,
+            process: process.clone(),
+            files: FileArray::new_cloned(&other.files),
+            fs_context: FsContext::new_cloned(&other.fs_context),
+            signal_list,
+            kstack: RefCell::new(KernelStack::new()),
+            state: Spin::new(ThreadState::Preparing),
+            inner: Spin::new(ThreadInner {
+                name: other_inner.name.clone(),
+                tls_desc32: other_inner.tls_desc32,
+                set_child_tid: other_inner.set_child_tid,
+            }),
+        });
+
+        ProcessList::get().add_thread(&thread);
+        other.process.add_child(&thread);
+        process.add_thread(&thread);
+        thread
+    }
+
+    /// The thread the scheduler considers current.
+    pub fn current<'lt>() -> &'lt Arc<Self> {
+        Scheduler::current()
+    }
+
+    /// Stop the thread: tell the parent process (if any) about the stop, then put
+    /// the thread to sleep and reschedule.
+    pub fn do_stop(self: &Arc<Self>) {
+        let pid = self.process.pid;
+        if let Some(parent) = self.process.parent() {
+            parent.wait_list.notify_stop(pid);
+        }
+
+        preempt::disable();
+
+        // `SIGSTOP` can only be waken up by `SIGCONT` or `SIGKILL`.
+        // SAFETY: Preempt disabled above.
+        Scheduler::get().lock().usleep(self);
+        Scheduler::schedule();
+    }
+
+    /// Tell the parent process (if any) that this thread's process was continued.
+    pub fn do_continue(self: &Arc<Self>) {
+        let pid = self.process.pid;
+        if let Some(parent) = self.process.parent() {
+            parent.wait_list.notify_continue(pid);
+        }
+    }
+
+    /// Raise `signal` on this thread, waking the thread up when the signal list
+    /// asks for it. A performed wake-up is reported as `RaiseResult::Finished`;
+    /// any other result is passed through unchanged.
+    pub fn raise(self: &Arc<Thread>, signal: Signal) -> RaiseResult {
+        let result = self.signal_list.raise(signal);
+        match result {
+            RaiseResult::ShouldIWakeUp => {
+                Scheduler::get().lock_irq().iwake(self);
+                RaiseResult::Finished
+            }
+            RaiseResult::ShouldUWakeUp => {
+                Scheduler::get().lock_irq().uwake(self);
+                RaiseResult::Finished
+            }
+            other => other,
+        }
+    }
+
+    /// Install this thread's 32-bit TLS descriptor and reload `%gs`.
+    ///
+    /// Does nothing when no descriptor has been set (`tls_desc32 == 0`).
+    pub fn load_thread_area32(&self) {
+        let inner = self.inner.lock();
+        if inner.tls_desc32 == 0 {
+            return;
+        }
+
+        // The descriptor is written to slot 7 (8 bytes per slot) of the table at
+        // physical address 0.
+        // SAFETY: `tls32` should be per cpu.
+        let tls32_addr = CachedPP::new(0x0 + 7 * 8);
+        tls32_addr.as_mut::<u64>().clone_from(&inner.tls_desc32);
+
+        // Writing `%gs` back to itself makes the CPU pick up the new descriptor.
+        unsafe {
+            asm!(
+                "mov %gs, %ax",
+                "mov %ax, %gs",
+                out("ax") _,
+                options(att_syntax)
+            )
+        };
+    }
+
+    /// Set up the 32-bit TLS segment of the thread from `desc`.
+    ///
+    /// A read/exec-only descriptor that is marked not present is treated as a
+    /// request to zero the described user memory instead. Otherwise `desc.entry`
+    /// must be `u32::MAX` and the segment must be 32-bit; entry 7 is then reported
+    /// back through `desc.entry`.
+    ///
+    /// # Errors
+    /// `EINVAL` for an unsupported request, or whatever error checking/zeroing the
+    /// user memory produced.
+    pub fn set_thread_area(&self, desc: &mut UserDescriptor) -> KResult<()> {
+        let mut inner = self.inner.lock();
+
+        // Clear the TLS area if it is not present.
+        if desc.flags.is_read_exec_only() && !desc.flags.is_present() {
+            if desc.limit != 0 && desc.base != 0 {
+                CheckedUserPointer::new(desc.base as _, desc.limit as _)?.zero()?;
+            }
+            return Ok(());
+        }
+
+        if desc.entry != u32::MAX || !desc.flags.is_32bit_segment() {
+            return Err(EINVAL);
+        }
+        desc.entry = 7;
+
+        let base = desc.base as u64;
+        let limit = desc.limit as u64;
+
+        // Scatter base and limit over the descriptor's split bit fields.
+        let mut descriptor = limit & 0xffff;
+        descriptor |= (base & 0xffffff) << 16;
+        descriptor |= 0x4_0_f2_000000_0000;
+        descriptor |= (limit & 0xf_0000) << (48 - 16);
+        if desc.flags.is_limit_in_pages() {
+            descriptor |= 1 << 55;
+        }
+        descriptor |= (base & 0xff_000000) << (56 - 24);
+
+        inner.tls_desc32 = descriptor;
+        Ok(())
+    }
+
+    /// This function is used to prepare the kernel stack for the thread in `Preparing` state.
+    ///
+    /// `func` gets mutable access to the kernel stack. Afterwards the thread is
+    /// put into `USleep` state and has to be woken up manually.
+    ///
+    /// # Panics
+    /// Panics if the thread is not in `Preparing` state.
+    pub fn prepare_kernel_stack<F: FnOnce(&mut KernelStack)>(&self, func: F) {
+        let mut state = self.state.lock();
+        assert!(matches!(*state, ThreadState::Preparing));
+
+        {
+            // SAFETY: We are in the preparing state with `state` locked.
+            let mut kstack = self.kstack.borrow_mut();
+            func(&mut kstack);
+        }
+
+        // Enter USleep state. Await for the thread to be scheduled manually.
+        *state = ThreadState::USleep;
+    }
+
+    /// Load the interrupt stack of this thread's kernel stack.
+    pub fn load_interrupt_stack(&self) {
+        let kstack = self.kstack.borrow();
+        kstack.load_interrupt_stack();
+    }
+
+    /// Get a pointer to `self.sp` so we can use it in `context_switch()`.
+    ///
+    /// # Safety
+    /// Saving the pointer somewhere, or passing it to a function that will use it
+    /// later, is UB.
+    pub unsafe fn get_sp_ptr(&self) -> *mut usize {
+        let kstack = self.kstack.borrow();
+        kstack.get_sp_ptr()
+    }
+
+    /// Replace the thread name.
+    pub fn set_name(&self, name: Arc<[u8]>) {
+        let mut inner = self.inner.lock();
+        inner.name = name;
+    }
+
+    /// A shared handle to the current thread name.
+    pub fn get_name(&self) -> Arc<[u8]> {
+        let inner = self.inner.lock();
+        inner.name.clone()
+    }
+}
+
+// TODO: Maybe we can find a better way instead of using `RefCell` for `KernelStack`?
+// NOTE(review): `RefCell` is not `Sync`, so this impl promises by hand that `kstack`
+// is never borrowed from two contexts at the same time — presumably guaranteed by
+// the scheduler; confirm.
+unsafe impl Sync for Thread {}
+
+impl WaitList {
+    /// Create an empty wait list.
+    pub fn new() -> Self {
+        let wait_procs = Spin::new(VecDeque::new());
+        let cv_wait_procs = CondVar::new();
+
+        Self {
+            wait_procs,
+            cv_wait_procs,
+        }
+    }
+
+    /// Queue `object` and wake up everybody waiting on the list.
+    fn notify(&self, object: WaitObject) {
+        let mut queue = self.wait_procs.lock();
+        queue.push_back(object);
+        self.cv_wait_procs.notify_all();
+    }
+
+    /// Report that process `pid` was continued (status word `0xffff`).
+    pub fn notify_continue(&self, pid: u32) {
+        self.notify(WaitObject { pid, code: 0xffff });
+    }
+
+    /// Report that process `pid` was stopped (status word `0x7f`).
+    pub fn notify_stop(&self, pid: u32) {
+        self.notify(WaitObject { pid, code: 0x7f });
+    }
+}
+
+impl Process {
+    /// The parent process, or `None` if the process has no parent.
+    pub fn parent(&self) -> Option<Arc<Process>> {
+        let inner = self.inner.lock();
+        inner.parent.clone()
+    }
+}
+
+// TODO!!!!!!: impl this
+/// Bring up the process list and prepare the idle task.
+///
+/// Builds the [`ProcessList`], loads the current thread's interrupt stack, switches
+/// to the page table of its process and fills in the idle thread's kernel stack so
+/// that it starts in `idle_task`.
+///
+/// NOTE(review): `process_list` is not stored anywhere, so it is dropped when this
+/// function returns.
+fn init_scheduler() {
+    let process_list = ProcessList::new();
+    Thread::current().load_interrupt_stack();
+    Thread::current().process.mm_list.switch_page_table();
+
+    Scheduler::idle_task().prepare_kernel_stack(|kstack| {
+        let mut writer = kstack.get_writer();
+        // NOTE(review): 0x200 is presumably the x86 interrupt-enable flag — confirm.
+        writer.flags = 0x200;
+        writer.entry = idle_task;
+        writer.finish();
+    });
+}
+
+/// Entry point of the idle thread: calls `arch::task::halt()` in an endless loop.
+extern "C" fn idle_task() {
+    loop {
+        arch::task::halt();
+    }
+}

+ 703 - 0
src/kernel/terminal.rs

@@ -0,0 +1,703 @@
+use core::iter::repeat;
+
+use alloc::{
+    collections::vec_deque::VecDeque,
+    sync::{Arc, Weak},
+};
+use bindings::{EINTR, ENOTTY};
+use bitflags::bitflags;
+
+use crate::{io::Buffer, prelude::*, sync::CondVar};
+
+use super::{
+    task::{Session, Signal, Thread},
+    user::{UserPointer, UserPointerMut},
+};
+
+const BUFFER_SIZE: usize = 4096;
+
+const NCCS: usize = 32;
+
+// taken from linux kernel code
+
+/* c_cc characters */
+const VINTR: usize = 0;
+const VQUIT: usize = 1;
+const VERASE: usize = 2;
+const VKILL: usize = 3;
+const VEOF: usize = 4;
+const VTIME: usize = 5;
+const VMIN: usize = 6;
+const VSWTC: usize = 7;
+const VSTART: usize = 8;
+const VSTOP: usize = 9;
+const VSUSP: usize = 10;
+const VEOL: usize = 11;
+const VREPRINT: usize = 12;
+const VDISCARD: usize = 13;
+const VWERASE: usize = 14;
+const VLNEXT: usize = 15;
+const VEOL2: usize = 16;
+
+/* c_iflag bits */
+
+bitflags! {
+    /// Input mode flags (`c_iflag`).
+    pub struct TermioIFlags: u16 {
+        /// Ignore break condition
+        const IGNBRK = 1 << 0;
+        /// Signal interrupt on break
+        const BRKINT = 1 << 1;
+        /// Ignore characters with parity errors
+        const IGNPAR = 1 << 2;
+        /// Mark parity and framing errors
+        const PARMRK = 1 << 3;
+        /// Enable input parity check
+        const INPCK = 1 << 4;
+        /// Strip 8th bit off characters
+        const ISTRIP = 1 << 5;
+        /// Map NL to CR on input
+        const INLCR = 1 << 6;
+        /// Ignore CR
+        const IGNCR = 1 << 7;
+        /// Map CR to NL on input
+        const ICRNL = 1 << 8;
+        const IUCLC = 1 << 9;
+        const IXON = 1 << 10;
+        /// Any character will restart after stop
+        const IXANY = 1 << 11;
+        const IXOFF = 1 << 12;
+        const IMAXBEL = 1 << 13;
+        const IUTF8 = 1 << 14;
+    }
+
+    /// Output mode flags (`c_oflag`).
+    pub struct TermioOFlags: u16 {
+        /// Perform output processing
+        const OPOST = 1 << 0;
+        const OLCUC = 1 << 1;
+        const ONLCR = 1 << 2;
+        const OCRNL = 1 << 3;
+        const ONOCR = 1 << 4;
+        const ONLRET = 1 << 5;
+        const OFILL = 1 << 6;
+        const OFDEL = 1 << 7;
+    }
+}
+
+bitflags! {
+    /// Local mode flags (`c_lflag`). Bit 13 (`0x2000`) is not assigned here.
+    pub struct TermioLFlags: u16 {
+        const ISIG = 1 << 0;
+        const ICANON = 1 << 1;
+        const XCASE = 1 << 2;
+        const ECHO = 1 << 3;
+        const ECHOE = 1 << 4;
+        const ECHOK = 1 << 5;
+        const ECHONL = 1 << 6;
+        const NOFLSH = 1 << 7;
+        const TOSTOP = 1 << 8;
+        const ECHOCTL = 1 << 9;
+        const ECHOPRT = 1 << 10;
+        const ECHOKE = 1 << 11;
+        const FLUSHO = 1 << 12;
+        const PENDIN = 1 << 14;
+        const IEXTEN = 1 << 15;
+    }
+}
+
+/* c_cflag bit meaning */
+/* Common CBAUD rates */
+const B0: u32 = 0x00000000; /* hang up */
+const B50: u32 = 0x00000001;
+const B75: u32 = 0x00000002;
+const B110: u32 = 0x00000003;
+const B134: u32 = 0x00000004;
+const B150: u32 = 0x00000005;
+const B200: u32 = 0x00000006;
+const B300: u32 = 0x00000007;
+const B600: u32 = 0x00000008;
+const B1200: u32 = 0x00000009;
+const B1800: u32 = 0x0000000a;
+const B2400: u32 = 0x0000000b;
+const B4800: u32 = 0x0000000c;
+const B9600: u32 = 0x0000000d;
+const B19200: u32 = 0x0000000e;
+const B38400: u32 = 0x0000000f;
+const EXTA: u32 = B19200;
+const EXTB: u32 = B38400;
+
+const ADDRB: u32 = 0x20000000; /* address bit */
+const CMSPAR: u32 = 0x40000000; /* mark or space (stick) parity */
+const CRTSCTS: u32 = 0x80000000; /* flow control */
+
+const IBSHIFT: u32 = 16; /* Shift from CBAUD to CIBAUD */
+
+const CBAUD: u32 = 0x0000100f;
+const CSIZE: u32 = 0x00000030;
+const CS5: u32 = 0x00000000;
+const CS6: u32 = 0x00000010;
+const CS7: u32 = 0x00000020;
+const CS8: u32 = 0x00000030;
+const CSTOPB: u32 = 0x00000040;
+const CREAD: u32 = 0x00000080;
+const PARENB: u32 = 0x00000100;
+const PARODD: u32 = 0x00000200;
+const HUPCL: u32 = 0x00000400;
+const CLOCAL: u32 = 0x00000800;
+const CBAUDEX: u32 = 0x00001000;
+const BOTHER: u32 = 0x00001000;
+const B57600: u32 = 0x00001001;
+const B115200: u32 = 0x00001002;
+const B230400: u32 = 0x00001003;
+const B460800: u32 = 0x00001004;
+const B500000: u32 = 0x00001005;
+const B576000: u32 = 0x00001006;
+const B921600: u32 = 0x00001007;
+const B1000000: u32 = 0x00001008;
+const B1152000: u32 = 0x00001009;
+const B1500000: u32 = 0x0000100a;
+const B2000000: u32 = 0x0000100b;
+const B2500000: u32 = 0x0000100c;
+const B3000000: u32 = 0x0000100d;
+const B3500000: u32 = 0x0000100e;
+const B4000000: u32 = 0x0000100f;
+const CIBAUD: u32 = 0x100f0000; /* input baud rate */
+
+// line disciplines
+
+const N_TTY: u8 = 0;
+
+/// Kernel-side terminal settings: mode flags, line discipline and control characters.
+pub struct Termios {
+    /// Input mode flags.
+    iflag: TermioIFlags,
+    /// Output mode flags.
+    oflag: TermioOFlags,
+    /// Control mode flags, kept as a raw 32-bit value.
+    cflag: u32,
+    /// Local mode flags.
+    lflag: TermioLFlags,
+
+    /// Line discipline number (`N_TTY` for the standard settings).
+    line: u8,
+    /// Control characters, indexed by the `V*` constants.
+    cc: [u8; NCCS],
+}
+
+/// Expands to the control-code literal for the given character token, e.g.
+/// `CTRL!('C')` is `0x03`.
+///
+/// Each arm is spelled out as a literal so the macro can also be used in patterns.
+macro_rules! CTRL {
+    ('A') => { 0x01 };
+    ('B') => { 0x02 };
+    ('C') => { 0x03 };
+    ('D') => { 0x04 };
+    ('E') => { 0x05 };
+    ('F') => { 0x06 };
+    ('G') => { 0x07 };
+    ('H') => { 0x08 };
+    ('I') => { 0x09 };
+    ('J') => { 0x0A };
+    ('K') => { 0x0B };
+    ('L') => { 0x0C };
+    ('M') => { 0x0D };
+    ('N') => { 0x0E };
+    ('O') => { 0x0F };
+    ('P') => { 0x10 };
+    ('Q') => { 0x11 };
+    ('R') => { 0x12 };
+    ('S') => { 0x13 };
+    ('T') => { 0x14 };
+    ('U') => { 0x15 };
+    ('V') => { 0x16 };
+    ('W') => { 0x17 };
+    ('X') => { 0x18 };
+    ('Y') => { 0x19 };
+    ('Z') => { 0x1A };
+    ('\\') => { 0x1c };
+}
+
+impl Termios {
+    /// Maps a character to its control code by subtracting `0x40`
+    /// (e.g. `b'C'` becomes `0x03`).
+    ///
+    /// Bytes below `0x40` wrap around instead of tripping the arithmetic
+    /// overflow check.
+    pub fn ctrl(c: u8) -> u8 {
+        c.wrapping_sub(0x40)
+    }
+
+    /// Control character stored at `cc[VEOF]`.
+    pub fn veof(&self) -> u8 {
+        self.cc[VEOF]
+    }
+
+    /// Control character stored at `cc[VEOL]`.
+    pub fn veol(&self) -> u8 {
+        self.cc[VEOL]
+    }
+
+    /// Control character stored at `cc[VEOL2]`.
+    pub fn veol2(&self) -> u8 {
+        self.cc[VEOL2]
+    }
+
+    /// Control character stored at `cc[VINTR]`.
+    pub fn vintr(&self) -> u8 {
+        self.cc[VINTR]
+    }
+
+    /// Control character stored at `cc[VQUIT]`.
+    pub fn vquit(&self) -> u8 {
+        self.cc[VQUIT]
+    }
+
+    /// Control character stored at `cc[VSUSP]`.
+    pub fn vsusp(&self) -> u8 {
+        self.cc[VSUSP]
+    }
+
+    /// Control character stored at `cc[VSTART]`.
+    pub fn vstart(&self) -> u8 {
+        self.cc[VSTART]
+    }
+
+    /// Control character stored at `cc[VSTOP]`.
+    pub fn vstop(&self) -> u8 {
+        self.cc[VSTOP]
+    }
+
+    /// Control character stored at `cc[VERASE]`.
+    pub fn verase(&self) -> u8 {
+        self.cc[VERASE]
+    }
+
+    /// Control character stored at `cc[VKILL]`.
+    pub fn vkill(&self) -> u8 {
+        self.cc[VKILL]
+    }
+
+    /// Whether the `ECHO` local flag is set.
+    pub fn echo(&self) -> bool {
+        self.lflag.contains(TermioLFlags::ECHO)
+    }
+
+    /// Whether the `ECHOE` local flag is set.
+    pub fn echoe(&self) -> bool {
+        self.lflag.contains(TermioLFlags::ECHOE)
+    }
+
+    /// Whether the `ECHOCTL` local flag is set.
+    pub fn echoctl(&self) -> bool {
+        self.lflag.contains(TermioLFlags::ECHOCTL)
+    }
+
+    /// Whether the `ECHOKE` local flag is set.
+    pub fn echoke(&self) -> bool {
+        self.lflag.contains(TermioLFlags::ECHOKE)
+    }
+
+    /// Whether the `ECHOK` local flag is set.
+    pub fn echok(&self) -> bool {
+        self.lflag.contains(TermioLFlags::ECHOK)
+    }
+
+    /// Whether the `ECHONL` local flag is set.
+    pub fn echonl(&self) -> bool {
+        self.lflag.contains(TermioLFlags::ECHONL)
+    }
+
+    /// Whether the `ISIG` local flag is set.
+    pub fn isig(&self) -> bool {
+        self.lflag.contains(TermioLFlags::ISIG)
+    }
+
+    /// Whether the `ICANON` local flag is set.
+    pub fn icanon(&self) -> bool {
+        self.lflag.contains(TermioLFlags::ICANON)
+    }
+
+    /// Whether the `IEXTEN` local flag is set.
+    pub fn iexten(&self) -> bool {
+        self.lflag.contains(TermioLFlags::IEXTEN)
+    }
+
+    /// Whether the `IGNCR` input flag is set.
+    pub fn igncr(&self) -> bool {
+        self.iflag.contains(TermioIFlags::IGNCR)
+    }
+
+    /// Whether the `ICRNL` input flag is set.
+    pub fn icrnl(&self) -> bool {
+        self.iflag.contains(TermioIFlags::ICRNL)
+    }
+
+    /// Whether the `INLCR` input flag is set.
+    pub fn inlcr(&self) -> bool {
+        self.iflag.contains(TermioIFlags::INLCR)
+    }
+
+    /// Whether the `NOFLSH` local flag is set.
+    pub fn noflsh(&self) -> bool {
+        self.lflag.contains(TermioLFlags::NOFLSH)
+    }
+
+    /// Builds the default settings: canonical mode with signals and echo
+    /// enabled, CR mapped to NL on input and NL mapped to CR-NL on output.
+    ///
+    /// Every control character that is not listed below stays `0`.
+    pub fn new_standard() -> Self {
+        let mut cc = [0u8; NCCS];
+        cc[VINTR] = CTRL!('C');
+        cc[VQUIT] = CTRL!('\\');
+        cc[VERASE] = 0x7f;
+        cc[VKILL] = CTRL!('U');
+        cc[VEOF] = CTRL!('D');
+        cc[VSUSP] = CTRL!('Z');
+        cc[VMIN] = 1;
+
+        Self {
+            iflag: TermioIFlags::ICRNL | TermioIFlags::IXOFF,
+            oflag: TermioOFlags::OPOST | TermioOFlags::ONLCR,
+            cflag: B38400 | CS8 | CREAD | HUPCL,
+            lflag: TermioLFlags::ISIG
+                | TermioLFlags::ICANON
+                | TermioLFlags::ECHO
+                | TermioLFlags::ECHOE
+                | TermioLFlags::ECHOK
+                | TermioLFlags::ECHOCTL
+                | TermioLFlags::ECHOKE
+                | TermioLFlags::IEXTEN,
+            line: N_TTY,
+            cc,
+        }
+    }
+
+    /// Converts the settings into the representation copied out to user space.
+    fn get_user(&self) -> UserTermios {
+        UserTermios {
+            iflag: self.iflag.bits(),
+            oflag: self.oflag.bits(),
+            // `UserTermios::cflag` is only 16 bits wide, so the upper half of
+            // the kernel-side value is dropped here.
+            cflag: self.cflag as u16,
+            lflag: self.lflag.bits(),
+            line: self.line,
+            cc: self.cc,
+        }
+    }
+}
+
+/// Output backend that a [`Terminal`] prints through.
+pub trait TerminalDevice: Send + Sync {
+    /// Emits the single byte `ch` on the device.
+    fn putchar(&self, ch: u8);
+}
+
+/// Mutable state of a [`Terminal`], kept behind its spin lock.
+struct TerminalInner {
+    /// Current terminal settings.
+    termio: Termios,
+    /// Session whose foreground group receives terminal signals; starts out as
+    /// an empty `Weak`.
+    session: Weak<Session>,
+    /// Committed input bytes that have not been read yet.
+    buffer: VecDeque<u8>,
+}
+
+/// A terminal: input queue plus settings, bound to an output device.
+pub struct Terminal {
+    /// Lock with IRQ disabled. We might use this in IRQ context.
+    inner: Spin<TerminalInner>,
+    /// Device that receives every character shown on the terminal.
+    device: Arc<dyn TerminalDevice>,
+    /// Signalled when buffered input becomes available to readers.
+    cv: CondVar,
+}
+
+/// Window size record written to user space by `TerminalIORequest::GetWindowSize`.
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+pub struct UserWindowSize {
+    row: u16,
+    col: u16,
+    xpixel: u16,
+    ypixel: u16,
+}
+
+/// Terminal settings as exchanged with user space by the get/set termios requests.
+///
+/// NOTE(review): all four flag fields are 16 bits wide here, while the kernel-side
+/// `Termios::cflag` is 32 bits and several `c_cflag` constants do not fit in 16 bits.
+/// Confirm that this layout matches the `struct termios` the C library passes in.
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+pub struct UserTermios {
+    iflag: u16,
+    oflag: u16,
+    cflag: u16,
+    lflag: u16,
+    line: u8,
+    cc: [u8; NCCS],
+}
+
+/// A decoded terminal ioctl together with its user-space argument.
+pub enum TerminalIORequest<'a> {
+    /// Write the session's foreground process group id to the pointer.
+    GetProcessGroup(UserPointerMut<'a, u32>),
+    /// Read a process group id from the pointer and make it the foreground group.
+    SetProcessGroup(UserPointer<'a, u32>),
+    /// Write the window size to the pointer.
+    GetWindowSize(UserPointerMut<'a, UserWindowSize>),
+    /// Write the current terminal settings to the pointer.
+    GetTermios(UserPointerMut<'a, UserTermios>),
+    /// Replace the terminal settings with the ones read from the pointer.
+    SetTermios(UserPointer<'a, UserTermios>),
+}
+
+/// Prints only the type name; none of the terminal's fields are included.
+impl core::fmt::Debug for Terminal {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.debug_struct("Terminal").finish()
+    }
+}
+
+impl Terminal {
+    /// Creates a terminal with the standard settings that prints through `device`.
+    pub fn new(device: Arc<dyn TerminalDevice>) -> Arc<Self> {
+        Arc::new(Self {
+            inner: Spin::new(TerminalInner {
+                termio: Termios::new_standard(),
+                session: Weak::new(),
+                // Reserve room for input but start out empty: a pre-filled queue
+                // would hand `BUFFER_SIZE` NUL bytes to the first reader.
+                buffer: VecDeque::with_capacity(BUFFER_SIZE),
+            }),
+            cv: CondVar::new(),
+            device,
+        })
+    }
+
+    /// Clear the read buffer.
+    fn clear_read_buffer(&self, inner: &mut TerminalInner) {
+        inner.buffer.clear();
+    }
+
+    /// Sends `ch` straight to the output device.
+    // TODO: Buffer terminal writes.
+    pub fn show_char(&self, ch: u8) {
+        self.device.putchar(ch)
+    }
+
+    /// Removes the most recently buffered byte and returns it.
+    ///
+    /// Nothing is removed (and `None` is returned) when the buffer is empty or
+    /// its last byte is a newline or equals VEOF, VEOL or VEOL2. When `echo` is
+    /// requested and both ECHO and ECHOE are set, the character is rubbed out on
+    /// the device with backspace, space, backspace.
+    fn erase(&self, inner: &mut TerminalInner, echo: bool) -> Option<u8> {
+        match inner.buffer.back().copied() {
+            None | Some(b'\n') => return None,
+            Some(back) if back == inner.termio.veof() => return None,
+            Some(back) if back == inner.termio.veol() => return None,
+            Some(back) if back == inner.termio.veol2() => return None,
+            _ => {}
+        }
+        let erased = inner.buffer.pop_back();
+
+        if echo && inner.termio.echo() && inner.termio.echoe() {
+            self.show_char(CTRL!('H')); // Backspace
+            self.show_char(b' '); // Space
+            self.show_char(CTRL!('H')); // Backspace
+        }
+
+        erased
+    }
+
+    /// Shows `ch` on the device.
+    ///
+    /// Control characters below 32 (other than tab, newline, ^Q and ^S) are
+    /// shown in caret notation (`^C`) when ECHO, ECHOCTL and IEXTEN are all set;
+    /// everything else is shown verbatim.
+    fn echo_char(&self, inner: &mut TerminalInner, ch: u8) {
+        match ch {
+            b'\t' | b'\n' | CTRL!('Q') | CTRL!('S') => self.show_char(ch),
+            c if c >= 32 => self.show_char(ch),
+            _ if !inner.termio.echo() => self.show_char(ch),
+            _ if !inner.termio.echoctl() => self.show_char(ch),
+            _ if !inner.termio.iexten() => self.show_char(ch),
+            _ => {
+                self.show_char(b'^');
+                // `ch` is below 32 here, so the addition cannot overflow.
+                self.show_char(ch + 0x40);
+            }
+        }
+    }
+
+    /// Raises `signal` on the session's foreground group (if a session is still
+    /// alive) and flushes pending input unless NOFLSH is set.
+    fn signal(&self, inner: &mut TerminalInner, signal: Signal) {
+        if let Some(session) = inner.session.upgrade() {
+            session.raise_foreground(signal);
+        }
+        if !inner.termio.noflsh() {
+            self.clear_read_buffer(inner);
+        }
+    }
+
+    /// Echoes the signal character `ch`, then delivers `signal`.
+    fn echo_and_signal(&self, inner: &mut TerminalInner, ch: u8, signal: Signal) {
+        self.echo_char(inner, ch);
+        self.signal(inner, signal);
+    }
+
+    /// Appends `ch` to the input buffer, echoes it if configured and wakes readers.
+    fn do_commit_char(&self, inner: &mut TerminalInner, ch: u8) {
+        inner.buffer.push_back(ch);
+
+        if inner.termio.echo() || (ch == b'\n' && inner.termio.echonl()) {
+            self.echo_char(inner, ch);
+        }
+
+        // If ICANON is set, we notify all waiting processes.
+        // If ICANON is not set but we have a new line, there are data ready, we notify as well.
+        if ch == b'\n' || inner.termio.icanon() {
+            self.cv.notify_all();
+        }
+    }
+
+    /// Feeds one input byte through the line discipline.
+    ///
+    /// Signal characters (with ISIG) and the editing characters EOF, ERASE and
+    /// KILL (with ICANON) are consumed here. Anything else is committed to the
+    /// input buffer after the CR/NL translations selected by IGNCR, ICRNL and
+    /// INLCR. A configured control character of `0xff` never matches.
+    ///
+    /// NOTE(review): KILL erases the line only when ECHOK, or ECHOKE together
+    /// with IEXTEN, is set; otherwise the buffered line is left untouched.
+    /// Confirm that this is intended.
+    // TODO: Find a better way to handle this.
+    pub fn commit_char(&self, ch: u8) {
+        let mut inner = self.inner.lock_irq();
+        if inner.termio.isig() {
+            match ch {
+                0xff => {}
+                ch if ch == inner.termio.vintr() => {
+                    return self.echo_and_signal(&mut inner, ch, Signal::SIGINT)
+                }
+                ch if ch == inner.termio.vquit() => {
+                    return self.echo_and_signal(&mut inner, ch, Signal::SIGQUIT)
+                }
+                ch if ch == inner.termio.vsusp() => {
+                    return self.echo_and_signal(&mut inner, ch, Signal::SIGTSTP)
+                }
+                _ => {}
+            }
+        }
+
+        // If handled, the character is discarded.
+        if inner.termio.icanon() {
+            match ch {
+                0xff => {}
+                ch if ch == inner.termio.veof() => return self.cv.notify_all(),
+                ch if ch == inner.termio.verase() => {
+                    self.erase(&mut inner, true);
+                    return;
+                }
+                ch if ch == inner.termio.vkill() => {
+                    if inner.termio.echok() {
+                        while self.erase(&mut inner, false).is_some() {}
+                        self.show_char(b'\n');
+                    } else if inner.termio.echoke() && inner.termio.iexten() {
+                        while self.erase(&mut inner, true).is_some() {}
+                    }
+                    return;
+                }
+                _ => {}
+            }
+        }
+
+        match ch {
+            b'\r' if inner.termio.igncr() => {}
+            b'\r' if inner.termio.icrnl() => return self.do_commit_char(&mut inner, b'\n'),
+            b'\n' if inner.termio.inlcr() => return self.do_commit_char(&mut inner, b'\r'),
+            _ => self.do_commit_char(&mut inner, ch),
+        }
+    }
+
+    /// Waits until input may be available.
+    ///
+    /// Returns immediately when the buffer already holds data. Otherwise it
+    /// blocks on the condition variable once and reports `EINTR` if the current
+    /// thread has a pending signal after waking up.
+    pub fn poll_in(&self) -> KResult<()> {
+        let mut inner = self.inner.lock_irq();
+        if inner.buffer.is_empty() {
+            self.cv.wait(&mut inner);
+
+            if Thread::current().signal_list.has_pending_signal() {
+                return Err(EINTR);
+            }
+        }
+        Ok(())
+    }
+
+    /// Reads buffered input into `buffer` and returns the number of bytes written.
+    ///
+    /// Blocks once when no input is buffered and fails with `EINTR` if a signal
+    /// is pending after the wake-up. In non-canonical mode a single byte is
+    /// returned. In canonical mode data is returned up to and including the
+    /// first newline, limited by the space left in `buffer`. A wake-up that
+    /// leaves the buffer empty yields `Ok(0)`.
+    pub fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        'block: {
+            if buffer.available() == 0 {
+                break 'block;
+            }
+
+            let mut inner = self.inner.lock_irq();
+            if inner.buffer.is_empty() {
+                self.cv.wait(&mut inner);
+
+                if Thread::current().signal_list.has_pending_signal() {
+                    return Err(EINTR);
+                }
+            }
+
+            if inner.buffer.is_empty() {
+                break 'block;
+            }
+
+            if !inner.termio.icanon() {
+                let ch = inner.buffer.pop_front().unwrap();
+                buffer.fill(&[ch])?;
+                break 'block;
+            }
+
+            // Canonical mode, return data until we see a newline.
+            let length = inner
+                .buffer
+                .iter()
+                .position(|&ch| ch == b'\n')
+                .map(|pos| pos + 1)
+                .unwrap_or(inner.buffer.len());
+
+            // The deque may wrap around, so the line can span both slices.
+            let (first, second) = inner.buffer.as_slices();
+            let first_data = first
+                .split_at_checked(length)
+                .map_or(first, |(data, _)| data);
+            let second_data = second
+                .split_at_checked(length - first_data.len())
+                .map_or(second, |(data, _)| data);
+
+            buffer.fill(first_data)?.allow_partial();
+            buffer.fill(second_data)?.allow_partial();
+
+            // Only drop what actually fit into the caller's buffer.
+            inner.buffer.drain(..buffer.wrote());
+        }
+
+        Ok(buffer.wrote())
+    }
+
+    /// Executes a terminal ioctl.
+    ///
+    /// The terminal lock is taken with interrupts disabled, because
+    /// `commit_char` takes the same lock that way. It is released again before
+    /// any user memory is touched, since user accesses might sleep.
+    ///
+    /// Returns `ENOTTY` for the process-group requests when the terminal has no
+    /// live session (or no foreground group to report). Errors from the user
+    /// pointer accesses are passed through.
+    pub fn ioctl(&self, request: TerminalIORequest) -> KResult<()> {
+        match request {
+            TerminalIORequest::GetProcessGroup(pgid_pointer) => {
+                // The lock guard is a temporary and is gone after this statement.
+                let session = self.inner.lock_irq().session.upgrade();
+
+                match session.and_then(|session| session.foreground_pgid()) {
+                    Some(pgid) => pgid_pointer.write(pgid),
+                    None => Err(ENOTTY),
+                }
+            }
+            TerminalIORequest::SetProcessGroup(pgid) => {
+                // Read from user space first so no lock is held while doing so.
+                let pgid = pgid.read()?;
+                let session = self.inner.lock_irq().session.upgrade();
+
+                match session {
+                    Some(session) => session.set_foreground_pgid(pgid),
+                    None => Err(ENOTTY),
+                }
+            }
+            TerminalIORequest::GetWindowSize(ptr) => {
+                // TODO: Get the actual window size
+                let window_size = UserWindowSize {
+                    row: 40,
+                    col: 80,
+                    xpixel: 0,
+                    ypixel: 0,
+                };
+
+                ptr.write(window_size)
+            }
+            TerminalIORequest::GetTermios(ptr) => {
+                // Snapshot under the lock, copy out after releasing it.
+                let user_termios = self.inner.lock_irq().termio.get_user();
+                ptr.write(user_termios)
+            }
+            TerminalIORequest::SetTermios(ptr) => {
+                let user_termios = ptr.read()?;
+                let mut inner = self.inner.lock_irq();
+
+                // TODO: We ignore unknown bits for now.
+                inner.termio.iflag = TermioIFlags::from_bits_truncate(user_termios.iflag);
+                inner.termio.oflag = TermioOFlags::from_bits_truncate(user_termios.oflag);
+                inner.termio.lflag = TermioLFlags::from_bits_truncate(user_termios.lflag);
+                inner.termio.cflag = user_termios.cflag as u32;
+                inner.termio.line = user_termios.line;
+                inner.termio.cc = user_termios.cc;
+
+                Ok(())
+            }
+        }
+    }
+}

+ 39 - 0
src/kernel/timer.rs

@@ -0,0 +1,39 @@
+use core::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::prelude::*;
+
+use super::interrupt::register_irq_handler;
+
+static TICKS: AtomicUsize = AtomicUsize::new(0);
+
+/// A timer tick count; the conversion methods treat 100 ticks as one second.
+pub struct Ticks(usize);
+
+impl Ticks {
+    /// Whole seconds represented by this tick count (100 ticks per second).
+    pub fn in_secs(&self) -> usize {
+        let &Ticks(count) = self;
+        count / 100
+    }
+
+    /// Milliseconds represented by this tick count (10 per tick).
+    pub fn in_msecs(&self) -> usize {
+        self.scaled(10)
+    }
+
+    /// Microseconds represented by this tick count (10 000 per tick).
+    pub fn in_usecs(&self) -> usize {
+        self.scaled(10_000)
+    }
+
+    /// Nanoseconds represented by this tick count (10 000 000 per tick).
+    pub fn in_nsecs(&self) -> usize {
+        self.scaled(10_000_000)
+    }
+
+    /// Multiplies the raw tick count by `per_tick`.
+    fn scaled(&self, per_tick: usize) -> usize {
+        self.0 * per_tick
+    }
+}
+
+/// Timer interrupt handler: advances the global tick counter by one.
+fn timer_interrupt() {
+    TICKS.fetch_add(1, Ordering::Relaxed);
+}
+
+/// Returns the number of ticks counted so far.
+pub fn ticks() -> Ticks {
+    let elapsed = TICKS.load(Ordering::Relaxed);
+    Ticks(elapsed)
+}
+
+/// Registers `timer_interrupt` as the handler for IRQ 0.
+pub fn init() -> KResult<()> {
+    register_irq_handler(0, timer_interrupt)
+}

+ 0 - 341
src/kernel/tty.cpp

@@ -1,341 +0,0 @@
-#include <algorithm>
-
-#include <bits/ioctl.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <termios.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/log.hpp>
-#include <kernel/process.hpp>
-#include <kernel/tty.hpp>
-#include <kernel/vga.hpp>
-
-#define CTRL(key) ((key)-0x40)
-
-#define TERMIOS_ISET(termios, option) ((option) == ((termios).c_iflag & (option)))
-#define TERMIOS_OSET(termios, option) ((option) == ((termios).c_oflag & (option)))
-#define TERMIOS_CSET(termios, option) ((option) == ((termios).c_cflag & (option)))
-#define TERMIOS_LSET(termios, option) ((option) == ((termios).c_lflag & (option)))
-
-#define TERMIOS_TESTCC(c, termios, cc) ((c != 0xff) && (c == ((termios).c_cc[cc])))
-
-using namespace kernel::tty;
-
-tty::tty(std::string name)
-    : termio {
-        .c_iflag = ICRNL | IXOFF,
-        .c_oflag = OPOST | ONLCR,
-        .c_cflag = B38400 | CS8 | CREAD | HUPCL,
-        .c_lflag = ISIG | ICANON | ECHO | ECHOE |
-            ECHOK | ECHOCTL | ECHOKE | IEXTEN,
-        .c_line = N_TTY,
-        .c_cc {},
-        .c_ispeed = 38400,
-        .c_ospeed = 38400,
-    }
-    , name{name}
-    , buf(BUFFER_SIZE)
-    , fg_pgroup { 0 }
-{
-    memset(this->termio.c_cc, 0x00, sizeof(this->termio.c_cc));
-
-    // other special characters is not supported for now
-    this->termio.c_cc[VINTR] = CTRL('C');
-    this->termio.c_cc[VQUIT] = CTRL('\\');
-    this->termio.c_cc[VERASE] = 0x7f;
-    this->termio.c_cc[VKILL] = CTRL('U');
-    this->termio.c_cc[VEOF] = CTRL('D');
-    this->termio.c_cc[VSUSP] = CTRL('Z');
-    this->termio.c_cc[VMIN] = 1;
-}
-
-void tty::print(const char* str) {
-    while (*str != '\0')
-        this->putchar(*(str++));
-}
-
-int tty::poll() {
-    async::lock_guard_irq lck(this->mtx_buf);
-    if (this->buf.empty()) {
-        bool interrupted = this->waitlist.wait(this->mtx_buf);
-
-        if (interrupted)
-            return -EINTR;
-    }
-
-    return 1;
-}
-
-int tty::ioctl(int request, unsigned long arg3) {
-    switch (request) {
-        case TIOCGPGRP: {
-            auto* pgid = (pid_t __user*)arg3;
-            // TODO: copy_to_user
-            *pgid = this->get_pgrp();
-            break;
-        }
-        case TIOCSPGRP: {
-            // TODO: copy_from_user
-            auto pgid = *(const pid_t __user*)arg3;
-            this->set_pgrp(pgid);
-            break;
-        }
-        case TIOCGWINSZ: {
-            auto* ws = (winsize __user*)arg3;
-            // TODO: copy_to_user
-            ws->ws_col = 80;
-            ws->ws_row = 40;
-            break;
-        }
-        case TCGETS: {
-            auto* argp = (struct termios __user*)arg3;
-            // TODO: use copy_to_user
-            memcpy(argp, &this->termio, sizeof(this->termio));
-            break;
-        }
-        case TCSETS: {
-            auto* argp = (const struct termios __user*)arg3;
-            // TODO: use copy_from_user
-            memcpy(&this->termio, argp, sizeof(this->termio));
-            break;
-        }
-        default: {
-            kmsgf("[kernel:error] ioctl(%x, %x) is not implemented", request, arg3);
-            return -EINVAL;
-        }
-    }
-
-    return 0;
-}
-
-ssize_t tty::read(char* buf, size_t buf_size, size_t n) {
-    n = std::max(buf_size, n);
-    size_t orig_n = n;
-
-    do {
-        if (n == 0)
-            break;
-
-        async::lock_guard_irq lck(this->mtx_buf);
-
-        if (this->buf.empty()) {
-            bool interrupted = this->waitlist.wait(this->mtx_buf);
-
-            if (interrupted)
-                break;
-        }
-
-        if (this->buf.empty())
-            break;
-
-        if (!TERMIOS_LSET(this->termio, ICANON)) {
-            --n, *buf = this->buf.get();
-            break;
-        }
-
-        while (n && !this->buf.empty()) {
-            int c = this->buf.get();
-
-            --n, *(buf++) = c;
-
-            // canonical mode
-            if (c == '\n')
-                break;
-        }
-    } while (false);
-
-    return orig_n - n;
-}
-
-int tty::_do_erase(bool should_echo) {
-    if (buf.empty())
-        return -1;
-
-    int back = buf.back();
-
-    if (back == '\n' || back == this->termio.c_cc[VEOF])
-        return -1;
-
-    if (back == this->termio.c_cc[VEOL] || back == this->termio.c_cc[VEOL2])
-        return -1;
-
-    buf.pop();
-
-    if (should_echo && TERMIOS_LSET(this->termio, ECHO | ECHOE)) {
-        this->show_char('\b'); // backspace
-        this->show_char(' ');  // space
-        this->show_char('\b'); // backspace
-
-        // xterm's way to show backspace
-        // serial_send_data(id, '\b');
-        // serial_send_data(id, CTRL('['));
-        // serial_send_data(id, '[');
-        // serial_send_data(id, 'K');
-    }
-
-    return back;
-}
-
-void tty::_real_commit_char(int c) {
-    switch (c) {
-        case '\n':
-            buf.put(c);
-
-            if (TERMIOS_LSET(this->termio, ECHONL) || TERMIOS_LSET(this->termio, ECHO))
-                this->_echo_char(c);
-
-            // if ICANON is set, we notify all waiting processes
-            // if ICANON is not set, since there are data ready, we notify as
-            // well
-            this->waitlist.notify_all();
-
-            break;
-
-        default:
-            buf.put(c);
-
-            if (TERMIOS_LSET(this->termio, ECHO))
-                this->_echo_char(c);
-
-            if (!TERMIOS_LSET(this->termio, ICANON))
-                this->waitlist.notify_all();
-
-            break;
-    }
-}
-
-void tty::_echo_char(int c) {
-    // ECHOCTL
-    do {
-        if (c < 0 || c >= 32 || !TERMIOS_LSET(this->termio, ECHO | ECHOCTL | IEXTEN))
-            break;
-
-        if (c == '\t' || c == '\n' || c == CTRL('Q') || c == CTRL('S'))
-            break;
-
-        this->show_char('^');
-        this->show_char(c + 0x40);
-
-        return;
-    } while (false);
-
-    this->show_char(c);
-}
-
-// TODO!!!: this function is racy as it acesses this->buf without
-//          acquiring this->mtx_buf or doing any synchronization
-//
-// do some ignore and remapping work
-// real commit operation is in _real_commit_char()
-void tty::commit_char(int c) {
-    // check special control characters
-    // if handled, the character is discarded
-    if (TERMIOS_LSET(this->termio, ISIG)) {
-        if (TERMIOS_TESTCC(c, this->termio, VINTR)) {
-            if (!TERMIOS_LSET(this->termio, NOFLSH))
-                this->clear_read_buf();
-
-            this->_echo_char(c);
-            procs->send_signal_grp(fg_pgroup, SIGINT);
-
-            return;
-        }
-
-        if (TERMIOS_TESTCC(c, this->termio, VSUSP)) {
-            if (!TERMIOS_LSET(this->termio, NOFLSH))
-                this->clear_read_buf();
-
-            this->_echo_char(c);
-            procs->send_signal_grp(fg_pgroup, SIGTSTP);
-
-            return;
-        }
-
-        if (TERMIOS_TESTCC(c, this->termio, VQUIT)) {
-            if (!TERMIOS_LSET(this->termio, NOFLSH))
-                this->clear_read_buf();
-
-            this->_echo_char(c);
-            procs->send_signal_grp(fg_pgroup, SIGQUIT);
-
-            return;
-        }
-    }
-
-    // if handled, the character is discarded
-    if (TERMIOS_LSET(this->termio, ICANON)) {
-        if (TERMIOS_TESTCC(c, this->termio, VEOF)) {
-            this->waitlist.notify_all();
-            return;
-        }
-
-        if (TERMIOS_TESTCC(c, this->termio, VKILL)) {
-            if (TERMIOS_LSET(this->termio, ECHOKE | IEXTEN)) {
-                while (this->_do_erase(true) != -1)
-                    ;
-            } else if (TERMIOS_LSET(this->termio, ECHOK)) {
-                while (this->_do_erase(false) != -1)
-                    ;
-                this->show_char('\n');
-            }
-            return;
-        }
-
-        if (TERMIOS_TESTCC(c, this->termio, VERASE)) {
-            this->_do_erase(true);
-            return;
-        }
-    }
-
-    switch (c) {
-        case '\r':
-            if (TERMIOS_ISET(this->termio, IGNCR))
-                break;
-
-            if (TERMIOS_ISET(this->termio, ICRNL)) {
-                this->_real_commit_char('\n');
-                break;
-            }
-
-            this->_real_commit_char('\r');
-            break;
-
-        case '\n':
-            if (TERMIOS_ISET(this->termio, INLCR)) {
-                this->_real_commit_char('\r');
-                break;
-            }
-
-            this->_real_commit_char('\n');
-            break;
-
-        default:
-            this->_real_commit_char(c);
-            break;
-    }
-}
-
-void tty::show_char(int c) {
-    this->putchar(c);
-}
-
-vga_tty::vga_tty() : tty{"ttyVGA"} {}
-
-void vga_tty::putchar(char c) {
-    static struct vga_char vc = {.c = '\0', .color = VGA_CHAR_COLOR_WHITE};
-    vc.c = c;
-    vga_put_char(&vc);
-}
-
-void tty::clear_read_buf(void) {
-    this->buf.clear();
-}
-
-int kernel::tty::register_tty(tty* tty_dev) {
-    // TODO: manage all ttys
-    if (!console)
-        console = tty_dev;
-
-    return 0;
-}

+ 5 - 0
src/kernel/user.rs

@@ -1 +1,6 @@
 pub mod dataflow;
+
+pub use dataflow::{UserBuffer, UserString};
+
+/// User pointer instantiated with `CONST = true`; `write` is not implemented for it.
+pub type UserPointer<'a, T> = dataflow::UserPointer<'a, T, true>;
+/// User pointer instantiated with `CONST = false`; supports both `read` and `write`.
+pub type UserPointerMut<'a, T> = dataflow::UserPointer<'a, T, false>;

+ 129 - 33
src/kernel/user/dataflow.rs

@@ -25,6 +25,45 @@ pub struct UserString<'lt> {
     _phantom: core::marker::PhantomData<&'lt ()>,
 }
 
+/// Typed pointer to a `T` in user memory, backed by a range-checked raw pointer.
+///
+/// `CONST` selects the flavour: only the `CONST = false` instantiation has `write`.
+pub struct UserPointer<'a, T: Copy, const CONST: bool> {
+    /// Checked user range covering `size_of::<T>()` bytes.
+    pointer: CheckedUserPointer,
+    _phantom: core::marker::PhantomData<&'a T>,
+}
+
+impl<'a, T: Copy, const CONST: bool> UserPointer<'a, T, CONST> {
+    /// Wraps `ptr` after checking a user range of `size_of::<T>()` bytes at that
+    /// address; fails with whatever error the range check reports.
+    pub fn new(ptr: *mut T) -> KResult<Self> {
+        let pointer = CheckedUserPointer::new(ptr as *const u8, core::mem::size_of::<T>())?;
+
+        Ok(Self {
+            pointer,
+            _phantom: core::marker::PhantomData,
+        })
+    }
+
+    /// Same as [`Self::new`], starting from a plain virtual address.
+    pub fn new_vaddr(vaddr: usize) -> KResult<Self> {
+        Self::new(vaddr as *mut T)
+    }
+
+    /// Copies the pointed-to value out of user memory.
+    pub fn read(&self) -> KResult<T> {
+        let mut value = core::mem::MaybeUninit::<T>::uninit();
+        self.pointer
+            .read(value.as_mut_ptr() as *mut (), core::mem::size_of::<T>())?;
+        // SAFETY: the read above succeeded for `size_of::<T>()` bytes, so every
+        // byte of `value` has been written.
+        // NOTE(review): this also relies on any bit pattern being a valid `T`;
+        // `T: Copy` alone does not guarantee that.
+        Ok(unsafe { value.assume_init() })
+    }
+
+    /// Returns a pointer `offset` elements away from this one.
+    ///
+    /// The new address is computed with checked arithmetic and validated again.
+    /// An offset whose byte distance or resulting address does not fit in an
+    /// `isize` is rejected with `EFAULT` instead of wrapping around.
+    pub fn offset(&self, offset: isize) -> KResult<Self> {
+        let byte_offset = offset
+            .checked_mul(core::mem::size_of::<T>() as isize)
+            .ok_or(EFAULT)?;
+        let new_vaddr = (self.pointer.ptr as isize)
+            .checked_add(byte_offset)
+            .ok_or(EFAULT)?;
+
+        Self::new_vaddr(new_vaddr as usize)
+    }
+}
+
+impl<'a, T: Copy> UserPointer<'a, T, false> {
+    /// Copies `value` into user memory at the pointed-to location.
+    pub fn write(&self, value: T) -> KResult<()> {
+        let source = &value as *const T as *mut ();
+        let length = core::mem::size_of::<T>();
+
+        self.pointer.write(source, length)
+    }
+}
+
 impl CheckedUserPointer {
     pub fn new(ptr: *const u8, len: usize) -> KResult<Self> {
         const USER_MAX_ADDR: usize = 0x7ff_fff_fff_fff;
@@ -49,7 +88,10 @@ impl CheckedUserPointer {
         unsafe { core::slice::from_raw_parts(self.ptr, self.len) }
     }
 
+    /// # Might Sleep
     pub fn read(&self, buffer: *mut (), total: usize) -> KResult<()> {
+        might_sleep!();
+
         if total > self.len {
             return Err(EINVAL);
         }
@@ -80,6 +122,79 @@ impl CheckedUserPointer {
             Ok(())
         }
     }
+
+    /// Copies `total` bytes from the buffer at `data` to the start of this user range.
+    ///
+    /// # Errors
+    /// - `EINVAL` if `total` is larger than the length of this range.
+    /// - `EFAULT` if the copy ended with bytes still left to write.
+    ///
+    /// # Might Sleep
+    pub fn write(&self, data: *mut (), total: usize) -> KResult<()> {
+        might_sleep!();
+
+        if total > self.len {
+            return Err(EINVAL);
+        }
+
+        // TODO: align to 8 bytes when doing copy for performance
+        // `rep movsb` copies `rcx` bytes from `[rsi]` to `[rdi]`. The record emitted into
+        // `.fix` names the instruction (label 2, length `3b - 2b`) and label 3 as the
+        // fix-up target. Presumably the fault handler resumes at label 3 when the store
+        // faults, leaving the count of uncopied bytes in `rcx` — confirm against the
+        // code that consumes `.fix`.
+        let error_bytes: usize;
+        unsafe {
+            asm!(
+                "2:",
+                "rep movsb",
+                "3:",
+                "nop",
+                ".pushsection .fix",
+                ".align 32",
+                ".quad 2b",  // instruction address
+                ".quad 3b - 2b",  // instruction length
+                ".quad 3b",  // fix jump address
+                ".quad 0x1", // type: store
+                ".popsection",
+                inout("rcx") total => error_bytes,
+                inout("rsi") data => _,
+                inout("rdi") self.ptr => _,
+            )
+        };
+
+        // A non-zero remaining count means the copy did not run to completion.
+        if error_bytes != 0 {
+            return Err(EFAULT);
+        }
+
+        Ok(())
+    }
+
+    /// Fills the whole user range (`self.len` bytes) with zero bytes.
+    ///
+    /// An empty range succeeds immediately without touching memory.
+    ///
+    /// # Errors
+    /// `EFAULT` if the fill ended with bytes still left to write.
+    ///
+    /// # Might Sleep
+    pub fn zero(&self) -> KResult<()> {
+        might_sleep!();
+
+        if self.len == 0 {
+            return Ok(());
+        }
+
+        // TODO: align to 8 bytes when doing copy for performance
+        // `rep stosb` stores `al` (zero here) to `[rdi]`, `rcx` times. The `.fix` record
+        // has the same layout as in `write`; presumably the fault handler resumes at
+        // label 3 with the number of unwritten bytes left in `rcx` — confirm against the
+        // code that consumes `.fix`.
+        let error_bytes: usize;
+        unsafe {
+            asm!(
+                "2:",
+                "rep stosb",
+                "3:",
+                "nop",
+                ".pushsection .fix",
+                ".align 32",
+                ".quad 2b",  // instruction address
+                ".quad 3b - 2b",  // instruction length
+                ".quad 3b",  // fix jump address
+                ".quad 0x1", // type: store
+                ".popsection",
+                in("rax") 0,
+                inout("rcx") self.len => error_bytes,
+                inout("rdi") self.ptr => _,
+                options(att_syntax)
+            )
+        };
+
+        // A non-zero remaining count means the fill did not run to completion.
+        if error_bytes != 0 {
+            Err(EFAULT)
+        } else {
+            Ok(())
+        }
+    }
 }
 
 impl UserBuffer<'_> {
@@ -108,50 +223,31 @@ impl<'lt> Buffer for UserBuffer<'lt> {
         self.cur
     }
 
+    /// Appends the leading part of `data` that still fits to the user buffer.
+    ///
+    /// Returns `FillResult::Full` when zero bytes would be written (no space left or
+    /// `data` is empty), `FillResult::Done(n)` when all `n` bytes of `data` were written
+    /// and `FillResult::Partial(n)` when only the first `n` bytes fit.
+    ///
+    /// # Errors
+    /// Propagates errors from checking or writing the destination range.
+    ///
+    /// # Might Sleep
     fn fill(&mut self, data: &[u8]) -> KResult<FillResult> {
-        let remaining = self.remaining();
-        if remaining == 0 {
+        might_sleep!();
+
+        let to_write = data.len().min(self.remaining());
+        if to_write == 0 {
             return Ok(FillResult::Full);
         }
 
-        let data = if data.len() > remaining {
-            &data[..remaining]
-        } else {
-            data
-        };
+        // Write at the current fill position. `CheckedUserPointer::write` always copies to
+        // the start of its range, so writing through `self.ptr` directly would overwrite
+        // the bytes stored by earlier calls instead of appending after them.
+        let dest = CheckedUserPointer::new(self.ptr.ptr.wrapping_add(self.cur), to_write)?;
+        dest.write(data.as_ptr() as *mut (), to_write)?;
+        self.cur += to_write;
 
-        // TODO: align to 8 bytes when doing copy for performance
-        let error_bytes: usize;
-        unsafe {
-            asm!(
-                "2:",
-                "rep movsb",
-                "3:",
-                "nop",
-                ".pushsection .fix",
-                ".align 32",
-                ".quad 2b",  // instruction address
-                ".quad 3b - 2b",  // instruction length
-                ".quad 3b",  // fix jump address
-                ".quad 0x1", // type: store
-                ".popsection",
-                inout("rcx") data.len() => error_bytes,
-                inout("rsi") data.as_ptr() => _,
-                inout("rdi") self.ptr.get_mut::<u8>().offset(self.cur as isize) => _,
-            )
-        };
-
-        if error_bytes != 0 {
-            return Err(EFAULT);
+        if to_write == data.len() {
+            Ok(FillResult::Done(to_write))
+        } else {
+            Ok(FillResult::Partial(to_write))
         }
-
-        self.cur += data.len();
-        Ok(FillResult::Done(data.len()))
     }
 }
 
 impl<'lt> UserString<'lt> {
+    /// # Might Sleep
     pub fn new(ptr: *const u8) -> KResult<Self> {
+        might_sleep!();
+
         const MAX_LEN: usize = 4096;
         // TODO
         let ptr = CheckedUserPointer::new(ptr, MAX_LEN)?;

+ 0 - 23
src/kernel/user/thread_local.cc

@@ -1,23 +0,0 @@
-#include <cstddef>
-
-#include <stdint.h>
-
-#include <kernel/mem/phys.hpp>
-#include <kernel/mem/types.hpp>
-#include <kernel/user/thread_local.hpp>
-
-using namespace kernel::user;
-
-void kernel::user::load_thread_area32(uint64_t desc) {
-    if (!desc)
-        return;
-
-    kernel::mem::gdt[7] = desc;
-
-    asm volatile(
-        "mov %%gs, %%ax\n\t"
-        "mov %%ax, %%gs\n\t"
-        :
-        :
-        : "ax");
-}

+ 0 - 1
src/kernel/vfs.cpp

@@ -13,7 +13,6 @@
 
 #include <kernel/log.hpp>
 #include <kernel/process.hpp>
-#include <kernel/tty.hpp>
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/dentry.hpp>
 

+ 9 - 0
src/kernel/vfs/dentry.rs

@@ -48,6 +48,15 @@ pub struct Dentry {
     data: RCUPointer<DentryData>,
 }
 
+impl core::fmt::Debug for Dentry {
+    /// Prints the dentry as `Dentry { name, parent }`, with both names decoded as
+    /// lossy UTF-8.
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        let own_name = String::from_utf8_lossy(&self.name);
+        let parent_name = String::from_utf8_lossy(&self.parent.name);
+
+        let mut builder = f.debug_struct("Dentry");
+        builder.field("name", &own_name);
+        builder.field("parent", &parent_name);
+        builder.finish()
+    }
+}
+
 const D_DIRECTORY: u64 = 1;
 const D_MOUNTPOINT: u64 = 2;
 const D_SYMLINK: u64 = 4;

+ 59 - 70
src/kernel/vfs/file.rs

@@ -1,22 +1,21 @@
-use core::{
-    ffi::{c_int, c_ulong},
-    ops::ControlFlow,
-    ptr::NonNull,
-    sync::atomic::Ordering,
-};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
 
 use crate::{
     io::{Buffer, BufferFill, RawBuffer},
-    kernel::mem::{paging::Page, phys::PhysPtr},
+    kernel::{
+        constants::{TCGETS, TCSETS, TIOCGPGRP, TIOCGWINSZ, TIOCSPGRP},
+        mem::{paging::Page, phys::PhysPtr},
+        task::{Signal, Thread},
+        terminal::{Terminal, TerminalIORequest},
+        user::{UserPointer, UserPointerMut},
+    },
     prelude::*,
-    sync::condvar::CondVar,
+    sync::CondVar,
 };
 
 use alloc::{collections::vec_deque::VecDeque, sync::Arc};
-use bindings::{
-    current_thread, kernel::tty::tty as TTY, EBADF, EFAULT, EINTR, EINVAL, ENOTDIR, ENOTTY,
-    EOVERFLOW, EPIPE, ESPIPE, SIGPIPE, S_IFMT,
-};
+use bindings::{EBADF, EFAULT, EINTR, EINVAL, ENOTDIR, ENOTTY, EOVERFLOW, EPIPE, ESPIPE, S_IFMT};
+use bitflags::bitflags;
 
 use super::{
     dentry::Dentry,
@@ -55,15 +54,15 @@ pub struct PipeWriteEnd {
     pipe: Arc<Pipe>,
 }
 
-pub struct TTYFile {
-    tty: NonNull<TTY>,
+pub struct TerminalFile {
+    terminal: Arc<Terminal>,
 }
 
 pub enum File {
     Inode(InodeFile),
     PipeRead(PipeReadEnd),
     PipeWrite(PipeWriteEnd),
-    TTY(TTYFile),
+    TTY(TerminalFile),
 }
 
 pub enum SeekOption {
@@ -72,6 +71,12 @@ pub enum SeekOption {
     End(isize),
 }
 
+// Bit set of events passed to and returned from the `poll` methods in this file.
+bitflags! {
+    pub struct PollEvent: u16 {
+        // The only event currently defined.
+        // NOTE(review): 0x0001 equals Linux `POLLIN` — presumably intentional; confirm.
+        const Readable = 0x0001;
+    }
+}
+
 impl Drop for PipeReadEnd {
     fn drop(&mut self) {
         self.pipe.close_read();
@@ -85,11 +90,9 @@ impl Drop for PipeWriteEnd {
 }
 
 fn send_sigpipe_to_current() {
-    // Safety: current_thread is always valid.
-    let current = unsafe { current_thread.as_mut().unwrap() };
-
-    // Safety: `signal_list` is `Sync`
-    unsafe { current.send_signal(SIGPIPE) };
+    // Raise `SIGPIPE` on the calling thread; no `unsafe` is involved here.
+    let current = Thread::current();
+    current.raise(Signal::SIGPIPE);
 }
 
 impl Pipe {
@@ -140,8 +143,8 @@ impl Pipe {
         let mut inner = self.inner.lock();
 
         while !inner.write_closed && inner.buffer.is_empty() {
-            let interrupted = self.cv_read.wait(&mut inner, true);
-            if interrupted {
+            self.cv_read.wait(&mut inner);
+            if Thread::current().signal_list.has_pending_signal() {
                 return Err(EINTR);
             }
         }
@@ -163,8 +166,8 @@ impl Pipe {
         }
 
         while inner.buffer.len() + data.len() > Self::PIPE_SIZE {
-            let interrupted = self.cv_write.wait(&mut inner, true);
-            if interrupted {
+            self.cv_write.wait(&mut inner);
+            if Thread::current().signal_list.has_pending_signal() {
                 return Err(EINTR);
             }
 
@@ -204,8 +207,8 @@ impl Pipe {
                 break;
             }
 
-            let interrupted = self.cv_write.wait(&mut inner, true);
-            if interrupted {
+            self.cv_write.wait(&mut inner);
+            if Thread::current().signal_list.has_pending_signal() {
                 if data.len() != remaining.len() {
                     break;
                 }
@@ -260,10 +263,6 @@ struct UserDirent {
     d_name: [u8; 0],
 }
 
-fn has_pending_signal() -> bool {
-    unsafe { current_thread.as_mut().unwrap().signals.pending_signal() != 0 }
-}
-
 impl InodeFile {
     pub fn new(dentry: Arc<Dentry>, rwa: (bool, bool, bool)) -> Arc<File> {
         // SAFETY: `dentry` used to create `InodeFile` is valid.
@@ -397,58 +396,40 @@ impl InodeFile {
     }
 }
 
-impl TTYFile {
-    pub fn new(tty: *mut TTY) -> Arc<File> {
-        Arc::new(File::TTY(TTYFile {
-            tty: NonNull::new(tty).expect("`tty` is null"),
-        }))
+impl TerminalFile {
+    /// Wraps `tty` in a reference-counted `File::TTY` handle.
+    pub fn new(tty: Arc<Terminal>) -> Arc<File> {
+        Arc::new(File::TTY(TerminalFile { terminal: tty }))
     }
 
+    /// Reads into `buffer` by delegating to `Terminal::read`.
     fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
-        // SAFETY: `tty` should always valid.
-        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
-
-        let mut c_buffer: Vec<u8> = vec![0; buffer.total()];
-
-        // SAFETY: `tty` points to a valid `TTY` instance.
-        let nread = unsafe {
-            tty.read(
-                c_buffer.as_mut_ptr() as *mut _,
-                c_buffer.len(),
-                c_buffer.len(),
-            )
-        };
-
-        match nread {
-            n if n < 0 => Err((-n) as u32),
-            0 => Ok(0),
-            n => Ok(buffer.fill(&c_buffer[..n as usize])?.allow_partial()),
-        }
+        self.terminal.read(buffer)
     }
 
+    /// Sends every byte of `buffer` to the terminal with `show_char` and reports the
+    /// whole slice as written.
     fn write(&self, buffer: &[u8]) -> KResult<usize> {
-        // SAFETY: `tty` should always valid.
-        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
-
         for &ch in buffer.iter() {
-            // SAFETY: `tty` points to a valid `TTY` instance.
-            unsafe { tty.show_char(ch as i32) };
+            self.terminal.show_char(ch);
         }
 
         Ok(buffer.len())
     }
 
-    fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
-        // SAFETY: `tty` should always valid.
-        let tty = unsafe { self.tty.as_ptr().as_mut().unwrap() };
+    /// Polls the terminal for input; answers `PollEvent::Readable` once
+    /// `Terminal::poll_in` succeeds.
+    ///
+    /// # Panics
+    /// Panics via `unimplemented!` when `event` does not contain `PollEvent::Readable`.
+    // NOTE(review): if `event` can come straight from a user `poll` request, this
+    // panic is reachable from user space — confirm and consider returning an error.
+    fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
+        if !event.contains(PollEvent::Readable) {
+            unimplemented!("Poll event not supported.")
+        }
 
-        // SAFETY: `tty` points to a valid `TTY` instance.
-        let result = unsafe { tty.ioctl(request as c_int, arg3 as c_ulong) };
+        self.terminal.poll_in().map(|_| PollEvent::Readable)
+    }
 
-        match result {
-            0 => Ok(0),
-            _ => Err((-result) as u32),
-        }
+    /// Translates a raw ioctl `request` and its argument address `arg3` into a
+    /// `TerminalIORequest` and forwards it to the terminal.
+    ///
+    /// # Errors
+    /// `EINVAL` for a request number not listed below; otherwise errors from building
+    /// the user pointer or from `Terminal::ioctl`.
+    // NOTE(review): `request as u32` discards the upper 32 bits before matching.
+    fn ioctl(&self, request: usize, arg3: usize) -> KResult<()> {
+        self.terminal.ioctl(match request as u32 {
+            TCGETS => TerminalIORequest::GetTermios(UserPointerMut::new_vaddr(arg3)?),
+            TCSETS => TerminalIORequest::SetTermios(UserPointer::new_vaddr(arg3)?),
+            TIOCGPGRP => TerminalIORequest::GetProcessGroup(UserPointerMut::new_vaddr(arg3)?),
+            TIOCSPGRP => TerminalIORequest::SetProcessGroup(UserPointer::new_vaddr(arg3)?),
+            TIOCGWINSZ => TerminalIORequest::GetWindowSize(UserPointerMut::new_vaddr(arg3)?),
+            _ => return Err(EINVAL),
+        })
     }
 }
 
@@ -518,7 +499,7 @@ impl File {
         // TODO!!!: zero copy implementation with mmap
         let mut tot = 0usize;
         while tot < count {
-            if has_pending_signal() {
+            if Thread::current().signal_list.has_pending_signal() {
                 if tot == 0 {
                     return Err(EINTR);
                 } else {
@@ -544,8 +525,16 @@ impl File {
 
+    /// Forwards an ioctl to the terminal behind a `File::TTY`; a successful call is
+    /// reported as `0`. Every other file kind answers `ENOTTY`.
     pub fn ioctl(&self, request: usize, arg3: usize) -> KResult<usize> {
         match self {
-            File::TTY(tty) => tty.ioctl(request, arg3),
+            File::TTY(tty) => tty.ioctl(request, arg3).map(|_| 0),
             _ => Err(ENOTTY),
         }
     }
+
+    /// Polls this file for `event`.
+    ///
+    /// Terminals delegate to `TerminalFile::poll`; inode-backed files echo `event`
+    /// straight back.
+    ///
+    /// # Panics
+    /// Panics via `unimplemented!` for either end of a pipe.
+    pub fn poll(&self, event: PollEvent) -> KResult<PollEvent> {
+        match self {
+            File::TTY(terminal_file) => terminal_file.poll(event),
+            File::Inode(_) => Ok(event),
+            File::PipeRead(_) | File::PipeWrite(_) => unimplemented!("Poll event not supported."),
+        }
+    }
 }

+ 26 - 10
src/kernel/vfs/filearray.rs

@@ -1,7 +1,11 @@
 use core::sync::atomic::Ordering;
 
 use crate::{
-    kernel::vfs::{dentry::Dentry, file::Pipe, s_isdir, s_isreg},
+    kernel::{
+        console::CONSOLE,
+        task::Thread,
+        vfs::{dentry::Dentry, file::Pipe, s_isdir, s_isreg},
+    },
     path::Path,
     prelude::*,
 };
@@ -11,8 +15,8 @@ use alloc::{
     sync::Arc,
 };
 use bindings::{
-    current_process, kernel::tty::console, EBADF, EINVAL, EISDIR, ENOTDIR, FD_CLOEXEC, F_DUPFD,
-    F_DUPFD_CLOEXEC, F_GETFD, F_SETFD, O_APPEND, O_CLOEXEC, O_DIRECTORY, O_RDWR, O_TRUNC, O_WRONLY,
+    EBADF, EISDIR, ENOTDIR, FD_CLOEXEC, F_DUPFD, F_DUPFD_CLOEXEC, F_GETFD, F_SETFD, O_APPEND,
+    O_CLOEXEC, O_DIRECTORY, O_RDWR, O_TRUNC, O_WRONLY,
 };
 use itertools::{
     FoldWhile::{Continue, Done},
@@ -20,7 +24,7 @@ use itertools::{
 };
 
 use super::{
-    file::{File, InodeFile, TTYFile},
+    file::{File, InodeFile, TerminalFile},
     inode::Mode,
     s_ischr, FsContext, Spin,
 };
@@ -81,17 +85,24 @@ pub extern "C" fn r_filearray_drop(other: *const FileArray) {
 }
 
 impl FileArray {
-    pub fn get_current<'lt>() -> BorrowedArc<'lt, Self> {
-        // SAFETY: `current_process` is always valid.
-        let current = unsafe { current_process.as_mut().unwrap() };
-        BorrowedArc::from_raw(current.files.m_handle as *const _)
+    /// Borrows the file table stored in `Thread::current()`.
+    // NOTE(review): `'lt` is picked freely by the caller and is not tied to a borrow
+    // visible here — confirm what `Thread::current()` guarantees about its lifetime.
+    pub fn get_current<'lt>() -> &'lt Arc<Self> {
+        &Thread::current().files
+    }
+
+    /// Builds an empty file table: no open files and `fd_min_avail` set to 0.
+    pub fn new_for_init() -> Arc<Self> {
+        let empty_table = FileArrayInner {
+            files: BTreeMap::new(),
+            fd_min_avail: 0,
+        };
+
+        Arc::new(Self {
+            inner: Spin::new(empty_table),
+        })
     }
 
     pub fn new_shared(other: &Arc<Self>) -> Arc<Self> {
         other.clone()
     }
 
-    pub fn new_cloned(other: &Arc<Self>) -> Arc<Self> {
+    pub fn new_cloned(other: &Self) -> Arc<Self> {
         Arc::new(Self {
             inner: Spin::clone(&other.inner),
         })
@@ -217,7 +228,12 @@ impl FileArray {
         let fd = inner.next_fd();
 
         if s_ischr(filemode) && inode.devid()? == 0x0501 {
-            inner.do_insert(fd, fdflag as u64, TTYFile::new(unsafe { console }));
+            // TODO!!!: Get terminal from char device.
+            inner.do_insert(
+                fd,
+                fdflag as u64,
+                TerminalFile::new(CONSOLE.lock_irq().get_terminal().unwrap()),
+            );
         } else {
             inner.do_insert(
                 fd,

+ 23 - 39
src/kernel/vfs/mod.rs

@@ -1,10 +1,12 @@
 use crate::prelude::*;
 
 use alloc::sync::Arc;
-use bindings::{current_process, dev_t, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFMT, S_IFREG};
+use bindings::{dev_t, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFMT, S_IFREG};
 use dentry::Dentry;
 use inode::Mode;
 
+use super::task::Thread;
+
 pub mod dentry;
 pub mod ffi;
 pub mod file;
@@ -50,46 +52,28 @@ pub struct FsContext {
 }
 
 impl FsContext {
-    pub fn get_current() -> BorrowedArc<'static, Self> {
-        // SAFETY: There should always be a current process.
-        let current = unsafe { current_process.as_ref().unwrap() };
-        let ptr = current.fs_context.m_handle as *const _ as *const Self;
-
-        BorrowedArc::from_raw(ptr)
+    /// Borrows the filesystem context stored in `Thread::current()`.
+    // NOTE(review): `'lt` is picked freely by the caller — confirm what
+    // `Thread::current()` guarantees about the lifetime of the returned reference.
+    pub fn get_current<'lt>() -> &'lt Arc<Self> {
+        let current = Thread::current();
+        &current.fs_context
     }
-}
-
-#[no_mangle]
-pub extern "C" fn r_fs_context_drop(other: *const FsContext) {
-    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
-    unsafe { Arc::from_raw(other) };
-}
-
-#[no_mangle]
-pub extern "C" fn r_fs_context_new_cloned(other: *const FsContext) -> *const FsContext {
-    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
-    let other = BorrowedArc::from_raw(other);
 
-    Arc::into_raw(Arc::new(FsContext {
-        fsroot: other.fsroot.clone(),
-        cwd: other.cwd.clone(),
-        umask: other.umask.clone(),
-    }))
-}
-
-#[no_mangle]
-pub extern "C" fn r_fs_context_new_shared(other: *const FsContext) -> *const FsContext {
-    // SAFETY: `other` is a valid pointer from `Arc::into_raw()`.
-    let other = BorrowedArc::from_raw(other);
+    /// Creates the initial context: root and working directory are both
+    /// `Dentry::kernel_root_dentry()` and the umask is `0o022`.
+    pub fn new_for_init() -> Arc<Self> {
+        Arc::new(FsContext {
+            fsroot: Dentry::kernel_root_dentry(),
+            cwd: Spin::new(Dentry::kernel_root_dentry()),
+            umask: Spin::new(0o022),
+        })
+    }
 
-    Arc::into_raw(other.clone())
-}
+    /// Creates a separate context whose root, working directory and umask are
+    /// clones of those in `other`.
+    pub fn new_cloned(other: &Self) -> Arc<Self> {
+        Arc::new(Self {
+            fsroot: other.fsroot.clone(),
+            cwd: other.cwd.clone(),
+            umask: other.umask.clone(),
+        })
+    }
 
-#[no_mangle]
-pub extern "C" fn r_fs_context_new_for_init() -> *const FsContext {
-    Arc::into_raw(Arc::new(FsContext {
-        fsroot: Dentry::kernel_root_dentry(),
-        cwd: Spin::new(Dentry::kernel_root_dentry()),
-        umask: Spin::new(0o022),
-    }))
+    /// Returns another handle to the very same context (an `Arc` clone).
+    pub fn new_shared(other: &Arc<Self>) -> Arc<Self> {
+        other.clone()
+    }
 }

+ 5 - 7
src/kernel/vfs/mount.rs

@@ -162,9 +162,7 @@ pub fn dump_mounts(buffer: &mut dyn core::fmt::Write) {
     }
 }
 
-#[no_mangle]
-#[link_section = ".text.kinit"]
-pub extern "C" fn r_init_vfs() {
+pub fn init_vfs() -> KResult<()> {
     tmpfs::init();
 
     let source = String::from("rootfs");
@@ -173,11 +171,9 @@ pub extern "C" fn r_init_vfs() {
 
     let mount = {
         let creators = MOUNT_CREATORS.lock();
-        let creator = creators.get(&fstype).ok_or(ENODEV).unwrap();
+        let creator = creators.get(&fstype).ok_or(ENODEV)?;
 
-        creator
-            .create_mount(&source, flags, dcache::_looped_droot())
-            .unwrap()
+        creator.create_mount(&source, flags, dcache::_looped_droot())?
     };
 
     let root_dentry = mount.root().clone();
@@ -196,6 +192,8 @@ pub extern "C" fn r_init_vfs() {
     MOUNTS
         .lock()
         .push((dcache::_looped_droot().clone(), mpdata));
+
+    Ok(())
 }
 
 impl Dentry {

+ 3 - 32
src/kinit.cpp

@@ -14,7 +14,6 @@
 #include <kernel/mem/phys.hpp>
 #include <kernel/mem/types.hpp>
 #include <kernel/process.hpp>
-#include <kernel/syscall.hpp>
 #include <kernel/utsname.hpp>
 
 using constructor = void (*)();
@@ -35,7 +34,6 @@ struct PACKED bootloader_data {
 
 namespace kernel::kinit {
 
-SECTION(".text.kinit")
 static inline void enable_sse() {
     asm volatile(
         "mov %%cr0, %%rax\n\t"
@@ -50,20 +48,6 @@ static inline void enable_sse() {
             : "rax");
 }
 
-SECTION(".text.kinit")
-static inline void set_uname() {
-    kernel::sys_utsname = new new_utsname;
-    strcpy(kernel::sys_utsname->sysname, "Linux"); // linux compatible
-    strcpy(kernel::sys_utsname->nodename, "(none)");
-    strcpy(kernel::sys_utsname->release, "1.0.0");
-    strcpy(kernel::sys_utsname->version, "1.0.0");
-    strcpy(kernel::sys_utsname->machine, "x86");
-    strcpy(kernel::sys_utsname->domainname, "(none)");
-}
-
-extern "C" void r_init_vfs();
-
-SECTION(".text.kinit")
 void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn) {
     // call global constructors
     // NOTE: the initializer of global objects MUST NOT contain
@@ -71,8 +55,6 @@ void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn) {
     for (auto* ctor = &start_ctors; ctor != &end_ctors; ++ctor)
         (*ctor)();
 
-    set_uname();
-
     init_interrupt();
     hw::timer::init_pit();
 
@@ -80,14 +62,9 @@ void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn) {
 
     init_pci();
 
-    init_syscall_table();
-
-    r_init_vfs();
-
     init_scheduler(kernel_stack_pfn);
 }
 
-SECTION(".text.kinit")
 static inline void setup_early_kernel_page_table() {
     using namespace kernel::mem::paging;
 
@@ -111,7 +88,6 @@ static inline void setup_early_kernel_page_table() {
 
 extern "C" uintptr_t KIMAGE_PAGES_VALUE;
 
-SECTION(".text.kinit")
 static inline void setup_buddy(uintptr_t addr_max) {
     using namespace kernel::mem;
     using namespace kernel::mem::paging;
@@ -168,7 +144,6 @@ static inline void setup_buddy(uintptr_t addr_max) {
     create_zone(real_start_pfn, saved_start_pfn);
 }
 
-SECTION(".text.kinit")
 static inline void save_memory_info(bootloader_data* data) {
     kernel::mem::info::memory_size = 1ULL * 1024ULL * 1024ULL + // initial 1M
                                      1024ULL * data->meminfo_1k_blocks +
@@ -180,7 +155,6 @@ static inline void save_memory_info(bootloader_data* data) {
            sizeof(kernel::mem::info::e820_entries));
 }
 
-SECTION(".text.kinit")
 void setup_gdt() {
     // user code
     mem::gdt[3] = 0x0020'fa00'0000'0000;
@@ -206,8 +180,7 @@ void setup_gdt() {
     // thread local 64bit
     mem::gdt[13] = 0x0000'0000'0000'0000;
 
-    uint64_t descriptor[] = {0x005f'0000'0000'0000,
-                             (uintptr_t)(uint64_t*)mem::gdt};
+    uint64_t descriptor[] = {0x005f'0000'0000'0000, (uintptr_t)(uint64_t*)mem::gdt};
 
     asm volatile(
         "lgdt (%0)\n\t"
@@ -220,8 +193,7 @@ void setup_gdt() {
         : "ax", "memory");
 }
 
-extern "C" SECTION(".text.kinit") void NORETURN
-    kernel_init(bootloader_data* data) {
+extern "C" void NORETURN kernel_init(bootloader_data* data) {
     enable_sse();
 
     setup_early_kernel_page_table();
@@ -241,8 +213,7 @@ extern "C" SECTION(".text.kinit") void NORETURN
 
     using namespace mem::paging;
     auto kernel_stack_pfn = page_to_pfn(alloc_pages(9));
-    auto kernel_stack_ptr =
-        mem::physaddr<std::byte>{kernel_stack_pfn} + (1 << 9) * 0x1000;
+    auto kernel_stack_ptr = mem::physaddr<std::byte>{kernel_stack_pfn} + (1 << 9) * 0x1000;
 
     asm volatile(
         "mov %1, %%rdi\n\t"

+ 22 - 6
src/lib.rs

@@ -4,6 +4,7 @@
 #![feature(concat_idents)]
 #![feature(arbitrary_self_types)]
 #![feature(get_mut_unchecked)]
+#![feature(macro_metavar_expr)]
 extern crate alloc;
 
 #[allow(warnings)]
@@ -22,10 +23,13 @@ mod sync;
 
 use alloc::{ffi::CString, sync::Arc};
 use bindings::root::types::elf::{elf32_load, elf32_load_data};
-use kernel::vfs::{
-    dentry::Dentry,
-    mount::{do_mount, MS_NOATIME, MS_NODEV, MS_NOSUID, MS_RDONLY},
-    FsContext,
+use kernel::{
+    vfs::{
+        dentry::Dentry,
+        mount::{do_mount, MS_NOATIME, MS_NODEV, MS_NOSUID, MS_RDONLY},
+        FsContext,
+    },
+    CharDevice,
 };
 use path::Path;
 use prelude::*;
@@ -33,8 +37,7 @@ use prelude::*;
+/// Kernel panic handler: prints the panic location and message through
+/// `println_fatal!`, then hands control to `arch::task::freeze()`.
 #[panic_handler]
 fn panic(info: &core::panic::PanicInfo) -> ! {
     println_fatal!("panicked at {:?}\n\t\t{}", info.location(), info.message());
-
-    unsafe { bindings::root::freeze() };
+    arch::task::freeze()
 }
 
 extern "C" {
@@ -69,6 +72,19 @@ static ALLOCATOR: Allocator = Allocator {};
 
 #[no_mangle]
 pub extern "C" fn late_init_rust(out_sp: *mut usize, out_ip: *mut usize) {
+    kernel::timer::init().unwrap();
+
+    // Use the PIT timer for now.
+    driver::timer::init();
+
+    kernel::syscall::register_syscalls();
+    CharDevice::init().unwrap();
+
+    // We might want the serial initialized as soon as possible.
+    driver::serial::init().unwrap();
+
+    kernel::vfs::mount::init_vfs().unwrap();
+
     driver::e1000e::register_e1000e_driver();
     driver::ahci::register_ahci_driver();
 

+ 1 - 1
src/prelude.rs

@@ -32,7 +32,7 @@ pub(crate) use alloc::{boxed::Box, string::String, vec, vec::Vec};
 pub(crate) use core::{any::Any, fmt::Write, marker::PhantomData, str};
 use core::{mem::ManuallyDrop, ops::Deref};
 
-pub use crate::sync::{Mutex, RwSemaphore, Semaphore, Spin};
+pub use crate::sync::{Mutex, RwSemaphore, Semaphore, Spin, Locked};
 
 pub struct BorrowedArc<'lt, T: ?Sized> {
     arc: ManuallyDrop<Arc<T>>,

+ 25 - 18
src/sync.rs

@@ -1,25 +1,25 @@
-pub mod condvar;
+mod condvar;
 pub mod lock;
 pub mod semaphore;
 pub mod spin;
 pub mod strategy;
 
-extern "C" {
-    fn r_preempt_disable();
-    fn r_preempt_enable();
-}
+/// Preemption counter: `disable` increments it and `enable` decrements it.
+pub mod preempt {
+    use core::sync::atomic::{compiler_fence, Ordering};
+
+    /// TODO: This should be per cpu.
+    // NOTE(review): plain `static mut`, updated without atomics or locking.
+    static mut PREEMPT_COUNT: usize = 0;
 
-#[inline(always)]
-fn preempt_disable() {
-    unsafe {
-        r_preempt_disable();
+    /// Increments the counter; the compiler fence afterwards keeps the compiler from
+    /// moving later memory accesses in front of the increment.
+    #[inline(always)]
+    pub fn disable() {
+        unsafe { PREEMPT_COUNT += 1 };
+        compiler_fence(Ordering::SeqCst);
     }
-}
 
-#[inline(always)]
-fn preempt_enable() {
-    unsafe {
-        r_preempt_enable();
+    /// Decrements the counter; the compiler fence beforehand keeps the compiler from
+    /// moving earlier memory accesses past the decrement.
+    ///
+    /// A call without a matching `disable` underflows the `usize` counter.
+    #[inline(always)]
+    pub fn enable() {
+        compiler_fence(Ordering::SeqCst);
+        unsafe { PREEMPT_COUNT -= 1 };
     }
 }
 
@@ -44,8 +44,11 @@ pub type RwSemReadGuard<'lock, T> = lock::Guard<'lock, T, semaphore::RwSemaphore
 #[allow(dead_code)]
 pub type RwSemWriteGuard<'lock, T> = lock::Guard<'lock, T, semaphore::RwSemaphoreStrategy, true>;
 
+pub type CondVar = condvar::CondVar<true>;
+pub type UCondVar = condvar::CondVar<false>;
+
 pub struct Locked<T: Sized, U: ?Sized> {
-    inner: T,
+    inner: UnsafeCell<T>,
     guard: *const U,
 }
 
@@ -55,19 +58,21 @@ unsafe impl<T: Sized + Send + Sync, U: ?Sized> Sync for Locked<T, U> {}
 impl<T: Sized + Sync, U: ?Sized> Locked<T, U> {
     pub fn new(value: T, from: &U) -> Self {
         Self {
-            inner: value,
+            inner: UnsafeCell::new(value),
             guard: from,
         }
     }
 
     pub fn access<'lt>(&'lt self, guard: &'lt U) -> &'lt T {
         assert_eq!(self.guard, guard as *const U, "wrong guard");
-        &self.inner
+        // SAFETY: The guard protects the shared access to the inner value, and
+        // `UnsafeCell::get` never returns null, so the pointer can be dereferenced
+        // directly without a null check.
+        unsafe { &*self.inner.get() }
     }
 
     pub fn access_mut<'lt>(&'lt self, guard: &'lt mut U) -> &'lt mut T {
         assert_eq!(self.guard, guard as *const U, "wrong guard");
-        unsafe { &mut *(&raw const self.inner as *mut T) }
+        // SAFETY: The guard protects the exclusive access to the inner value, and
+        // `UnsafeCell::get` never returns null, so the pointer can be dereferenced
+        // directly without a null check.
+        unsafe { &mut *self.inner.get() }
     }
 }
 
@@ -102,4 +107,6 @@ macro_rules! might_sleep {
     };
 }
 
+use core::cell::UnsafeCell;
+
 pub(crate) use might_sleep;

+ 55 - 79
src/sync/condvar.rs

@@ -1,113 +1,89 @@
-use alloc::collections::vec_deque::VecDeque;
-use bindings::{
-    current_thread,
-    kernel::task::{thread, thread_ISLEEP, thread_READY, thread_USLEEP},
-    schedule_now_preempt_disabled,
+use crate::{
+    kernel::task::{Scheduler, Thread, ThreadState},
+    prelude::*,
+    sync::preempt,
 };
 
-use crate::{prelude::*, sync::preempt_disable};
-
 use super::{lock::Guard, strategy::LockStrategy};
+use alloc::{collections::vec_deque::VecDeque, sync::Arc};
 
-/// `current` should be per CPU, so no sync is needed
-fn current() -> &'static mut *mut thread {
-    #[allow(static_mut_refs)]
-    unsafe {
-        &mut current_thread
-    }
+/// Condition variable that keeps a queue of waiting threads.
+///
+/// `INTERRUPTIBLE` selects between the interruptible (`isleep` / `iwake`) and the
+/// uninterruptible (`usleep` / `uwake`) scheduler calls.
+pub struct CondVar<const INTERRUPTIBLE: bool> {
+    // Threads currently parked on this condition variable, oldest first.
+    waiters: Spin<VecDeque<Arc<Thread>>>,
 }
 
-pub struct CondVar {
-    waiters: Spin<VecDeque<*mut thread>>,
+impl<const I: bool> core::fmt::Debug for CondVar<I> {
+    /// Prints only the flavour of the condition variable; the waiter queue is not shown.
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        if I {
+            f.debug_struct("CondVar").finish()
+        } else {
+            // Spelling fixed: the name was emitted as "CondVarUnintrruptible".
+            f.debug_struct("CondVarUninterruptible").finish()
+        }
+    }
 }
 
-// TODO!!!: acquire dispatcher lock because modifying thread attribute
-//          is racy. But we put this in the future work since that would
-//          require a lot of changes in the kernel task management system.
-unsafe impl Send for CondVar {}
-unsafe impl Sync for CondVar {}
-
-impl CondVar {
+impl<const I: bool> CondVar<I> {
     pub fn new() -> Self {
         Self {
             waiters: Spin::new(VecDeque::new()),
         }
     }
 
-    pub fn notify_one(&self) {
-        // TODO!!!: acquire dispatcher lock
-        let mut waiters = self.waiters.lock();
-
-        if waiters.is_empty() {
-            return;
-        }
-
-        let thread = waiters
-            .pop_front()
-            .map(|ptr| unsafe { ptr.as_mut() }.unwrap());
-
-        if let Some(thread) = thread {
-            unsafe { thread.set_attr(thread_READY, true) };
+    fn wake(schedule: &mut Scheduler, thread: &Arc<Thread>) {
+        if I {
+            schedule.iwake(thread);
+        } else {
+            schedule.uwake(thread);
         }
     }
 
-    pub fn notify_all(&self) {
-        // TODO!!!: acquire dispatcher lock
-        let mut waiters = self.waiters.lock();
-
-        if waiters.is_empty() {
-            return;
+    fn sleep(scheduler: &mut Scheduler) {
+        if I {
+            scheduler.isleep(Thread::current());
+        } else {
+            scheduler.usleep(Thread::current());
         }
+    }
 
-        for item in waiters.iter() {
-            let thread = unsafe { item.as_mut() }.unwrap();
-            unsafe { thread.set_attr(thread_READY, true) };
+    pub fn notify_one(&self) {
+        let mut scheduler = Scheduler::get().lock_irq();
+        if let Some(waiter) = self.waiters.lock().pop_front() {
+            Self::wake(scheduler.as_mut(), &waiter);
         }
+    }
 
-        waiters.clear();
+    pub fn notify_all(&self) {
+        let mut scheduler = Scheduler::get().lock_irq();
+        self.waiters.lock().retain(|waiter| {
+            Self::wake(scheduler.as_mut(), &waiter);
+            false
+        });
     }
 
+    /// Unlock the `guard`, then wait until woken up. Relock the `guard` before returning.
+    ///
     /// # Might Sleep
-    /// This function **might sleep**, so call it in a preemptible context
+    /// This function **might sleep**, so call it in a preemptible context.
     ///
     /// # Return
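-    /// - `true`: a pending signal was received
+    /// Nothing. Callers that care about signals check for a pending signal themselves
+    /// after `wait` returns.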
-    pub fn wait<'a, T, S: LockStrategy>(
-        &self,
-        guard: &mut Guard<'a, T, S>,
-        interruptible: bool,
-    ) -> bool {
-        preempt_disable();
-
-        // TODO!!!: acquire dispatcher lock
-        let current = *current();
-
-        let current_mut = unsafe { current.as_mut() }.unwrap();
-        unsafe {
-            if interruptible {
-                current_mut.set_attr(thread_ISLEEP, false);
-            } else {
-                current_mut.set_attr(thread_USLEEP, false);
-            }
-        }
-
+    pub fn wait<'a, T, S: LockStrategy>(&self, guard: &mut Guard<'a, T, S>) {
+        preempt::disable();
         {
-            let mut waiters = self.waiters.lock();
-            waiters.push_back(current);
-        }
+            let mut scheduler = Scheduler::get().lock_irq();
+            // We have scheduler locked and IRQ disabled. So no one could be waking us up for now.
 
-        unsafe {
-            guard.force_unlock();
+            self.waiters.lock().push_back(Thread::current().clone());
+            Self::sleep(scheduler.as_mut());
         }
 
-        might_sleep!(1);
-
-        let has_signals = unsafe { !schedule_now_preempt_disabled() };
-
-        unsafe {
-            guard.force_relock();
-        }
+        // TODO!!!: Another way to do this:
+        //
+        // Store a flag in our entry in the waiting list.
+        // Check the flag before doing `schedule()` but after we've unlocked the `guard`.
+        // If the flag is already set, we don't need to sleep.
 
-        has_signals
+        unsafe { guard.force_unlock() };
+        Scheduler::schedule();
+        unsafe { guard.force_relock() };
     }
 }

+ 8 - 0
src/sync/lock.rs

@@ -23,6 +23,14 @@ impl<Value, Strategy: LockStrategy> Lock<Value, Strategy> {
     }
 }
 
+/// Formats as a `Lock` struct with one `locked_value` field, using the `Debug` impl of `value`.
+impl<Value: core::fmt::Debug, Strategy: LockStrategy> core::fmt::Debug for Lock<Value, Strategy> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        let mut builder = f.debug_struct("Lock");
+        builder.field("locked_value", &self.value).finish()
+    }
+}
+
 impl<Value: Clone, Strategy: LockStrategy> Clone for Lock<Value, Strategy> {
     fn clone(&self) -> Self {
         Self {

+ 10 - 13
src/sync/semaphore.rs

@@ -1,4 +1,4 @@
-use super::{condvar::CondVar, strategy::LockStrategy, Spin};
+use super::{strategy::LockStrategy, Spin, UCondVar};
 
 pub struct SemaphoreStrategy<const MAX: usize = { core::usize::MAX }>;
 
@@ -12,7 +12,7 @@ impl<const MAX: usize> SemaphoreStrategy<MAX> {
 
 pub struct SemaphoreData {
     counter: Spin<usize>,
-    cv: CondVar,
+    cv: UCondVar,
 }
 
 unsafe impl<const MAX: usize> LockStrategy for SemaphoreStrategy<MAX> {
@@ -23,7 +23,7 @@ unsafe impl<const MAX: usize> LockStrategy for SemaphoreStrategy<MAX> {
     fn data() -> Self::StrategyData {
         SemaphoreData {
             counter: Spin::new(0),
-            cv: CondVar::new(),
+            cv: UCondVar::new(),
         }
     }
 
@@ -41,8 +41,7 @@ unsafe impl<const MAX: usize> LockStrategy for SemaphoreStrategy<MAX> {
                 return;
             }
 
-            // TODO!!!: interruptible wait
-            data.cv.wait(&mut counter, false);
+            data.cv.wait(&mut counter);
         }
     }
 
@@ -79,8 +78,8 @@ impl<const READ_MAX: isize> RwSemaphoreStrategy<READ_MAX> {
 
 pub struct RwSemaphoreData {
     counter: Spin<isize>,
-    read_cv: CondVar,
-    write_cv: CondVar,
+    read_cv: UCondVar,
+    write_cv: UCondVar,
 }
 
 unsafe impl<const READ_MAX: isize> LockStrategy for RwSemaphoreStrategy<READ_MAX> {
@@ -91,8 +90,8 @@ unsafe impl<const READ_MAX: isize> LockStrategy for RwSemaphoreStrategy<READ_MAX
     fn data() -> Self::StrategyData {
         RwSemaphoreData {
             counter: Spin::new(0),
-            read_cv: CondVar::new(),
-            write_cv: CondVar::new(),
+            read_cv: UCondVar::new(),
+            write_cv: UCondVar::new(),
         }
     }
 
@@ -110,8 +109,7 @@ unsafe impl<const READ_MAX: isize> LockStrategy for RwSemaphoreStrategy<READ_MAX
                 return;
             }
 
-            // TODO!!!: interruptible wait
-            data.write_cv.wait(&mut counter, false);
+            data.write_cv.wait(&mut counter);
         }
     }
 
@@ -129,8 +127,7 @@ unsafe impl<const READ_MAX: isize> LockStrategy for RwSemaphoreStrategy<READ_MAX
                 return;
             }
 
-            // TODO!!!: interruptible wait
-            data.read_cv.wait(&mut counter, false);
+            data.read_cv.wait(&mut counter);
         }
     }
 

+ 7 - 17
src/sync/spin.rs

@@ -3,9 +3,7 @@ use core::{
     sync::atomic::{AtomicBool, Ordering},
 };
 
-use crate::sync::preempt_disable;
-
-use super::{preempt_enable, strategy::LockStrategy};
+use super::{preempt, strategy::LockStrategy};
 
 pub struct SpinStrategy;
 
@@ -28,7 +26,7 @@ unsafe impl LockStrategy for SpinStrategy {
     #[inline(always)]
     unsafe fn do_lock(data: &Self::StrategyData) -> Self::GuardContext {
         use Ordering::{Acquire, Relaxed};
-        preempt_disable();
+        preempt::disable();
 
         while data
             .compare_exchange_weak(false, true, Acquire, Relaxed)
@@ -43,7 +41,7 @@ unsafe impl LockStrategy for SpinStrategy {
     #[inline(always)]
     unsafe fn do_unlock(data: &Self::StrategyData, _: &mut Self::GuardContext) {
         data.store(false, Ordering::Release);
-        preempt_enable();
+        preempt::enable();
     }
 }
 
@@ -74,32 +72,24 @@ unsafe impl<Strategy: LockStrategy> LockStrategy for IrqStrategy<Strategy> {
     }
 
     #[inline(always)]
-    unsafe fn do_unlock(
-        data: &Self::StrategyData,
-        context: &mut Self::GuardContext,
-    ) {
+    unsafe fn do_unlock(data: &Self::StrategyData, context: &mut Self::GuardContext) {
         Strategy::do_unlock(data, &mut context.0);
 
         asm!(
             "push {context}",
             "popf",
             context = in(reg) context.1,
+            // No `nomem`: popping the saved flags must stay a compiler barrier for memory accesses.
         )
     }
 
     #[inline(always)]
-    unsafe fn do_temporary_unlock(
-        data: &Self::StrategyData,
-        context: &mut Self::GuardContext,
-    ) {
+    unsafe fn do_temporary_unlock(data: &Self::StrategyData, context: &mut Self::GuardContext) {
         Strategy::do_unlock(data, &mut context.0)
     }
 
     #[inline(always)]
-    unsafe fn do_relock(
-        data: &Self::StrategyData,
-        context: &mut Self::GuardContext,
-    ) {
+    unsafe fn do_relock(data: &Self::StrategyData, context: &mut Self::GuardContext) {
         Strategy::do_relock(data, &mut context.0);
     }
 }