greatbridf пре 9 месеци
родитељ
комит
ac083a7cf0

+ 6 - 3
CMakeLists.txt

@@ -10,6 +10,7 @@ set(C_CXX_FLAGS "-nostdinc -nostdlib -W -Wall -Wextra -Wno-stringop-overflow -Wn
 set(CMAKE_C_FLAGS "${C_CXX_FLAGS} -Werror=implicit-int -Werror=implicit-function-declaration -Werror=strict-aliasing")
 set(CMAKE_CXX_FLAGS "${C_CXX_FLAGS} -fno-use-cxa-atexit -fno-rtti")
 set(CMAKE_CXX_LINK_FLAGS "")
+set(CMAKE_ASM_FLAGS "${CFLAGS} -x assembler-with-cpp") # NOTE(review): CFLAGS is not set in the visible hunk; if undefined it expands to empty - confirm
 set(CMAKE_CXX_STANDARD 20)
 
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -45,9 +46,9 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         # src/kernel/syscall.cpp
                         src/kernel/syscall/fileops.cc
                         src/kernel/syscall/mount.cc
+                        src/kernel/mem/mm_list.cc
                         src/kernel/mem/paging.cc
                         src/kernel/mem/slab.cc
-                        src/kernel/mem.cpp
                         src/kernel/module.cc
                         src/kernel/vfs.cpp
                         src/kernel/vga.cpp
@@ -66,14 +67,15 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         include/kernel/async/waitlist.hpp
                         include/kernel/async/lock.hpp
                         include/kernel/tty.hpp
-                        include/kernel/interrupt.h
+                        include/kernel/interrupt.hpp
                         include/kernel/irq.hpp
                         include/kernel/process.hpp
                         include/kernel/syscall.hpp
+                        include/kernel/mem/mm_list.hpp
                         include/kernel/mem/paging.hpp
                         include/kernel/mem/slab.hpp
                         include/kernel/mem/types.hpp
-                        include/kernel/mm.hpp
+                        include/kernel/mem/vm_area.hpp
                         include/kernel/module.hpp
                         include/kernel/utsname.hpp
                         include/kernel/vfs.hpp
@@ -96,6 +98,7 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         include/types/buffer.hpp
                         include/types/elf.hpp
                         include/types/hash_map.hpp
+                        include/types/list.hpp
                         include/types/types.h
                         include/types/allocator.hpp
                         include/types/cplusplus.hpp

+ 4 - 0
gblibstdc++/include/bits/rbtree

@@ -369,6 +369,8 @@ public:
         root = copy(other.root);
         if (root)
             root->parent = nullptr;
+
+        return *this;
     }
     
     constexpr rbtree& operator=(rbtree&& other) noexcept
@@ -380,6 +382,8 @@ public:
         if constexpr (node_alloc_traits::
             propagate_on_container_move_assignment::value)
             alloc = std::move(other.alloc);
+
+        return *this;
     }
 
     constexpr void rotateleft(node* rt)

+ 10 - 7
include/kernel/async/lock.hpp

@@ -1,11 +1,14 @@
 #pragma once
 
+#include <cstddef>
+
 #include <stdint.h>
 
 namespace kernel::async {
 
-using spinlock_t = uint32_t volatile;
-using preempt_count_t = size_t;
+using spinlock_t = unsigned long volatile;
+using lock_context_t = unsigned long;
+using preempt_count_t = std::size_t;
 
 void preempt_disable();
 void preempt_enable();
@@ -16,8 +19,8 @@ void init_spinlock(spinlock_t& lock);
 void spin_lock(spinlock_t& lock);
 void spin_unlock(spinlock_t& lock);
 
-size_t spin_lock_irqsave(spinlock_t& lock);
-void spin_unlock_irqrestore(spinlock_t& lock, size_t state);
+lock_context_t spin_lock_irqsave(spinlock_t& lock);
+void spin_unlock_irqrestore(spinlock_t& lock, lock_context_t context);
 
 class mutex {
 private:
@@ -31,8 +34,8 @@ public:
     void lock();
     void unlock();
 
-    uint32_t lock_irq();
-    void unlock_irq(uint32_t state);
+    lock_context_t lock_irq();
+    void unlock_irq(lock_context_t state);
 };
 
 class lock_guard {
@@ -50,7 +53,7 @@ public:
 class lock_guard_irq {
 private:
     mutex& m_mtx;
-    uint32_t state;
+    lock_context_t state;
 
 public:
     explicit inline lock_guard_irq(mutex& mtx)

+ 0 - 67
include/kernel/interrupt.h

@@ -1,67 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
-#define USER_INTERRUPT_GATE_TYPE (0xee)
-
-struct regs_64 {
-    uint64_t rax;
-    uint64_t rbx;
-    uint64_t rcx;
-    uint64_t rdx;
-    uint64_t rsi;
-    uint64_t rdi;
-    uint64_t rsp;
-    uint64_t rbp;
-    uint64_t r8;
-    uint64_t r9;
-    uint64_t r10;
-    uint64_t r11;
-    uint64_t r12;
-    uint64_t r13;
-    uint64_t r14;
-    uint64_t r15;
-};
-
-struct interrupt_stack {
-    regs_64 s_regs;
-    void* v_rip;
-    uint64_t cs;
-    uint64_t flags;
-    uint64_t rsp;
-    uint64_t ss;
-};
-
-struct mmx_registers {
-    uint8_t data[512]; // TODO: list of content
-};
-
-// present: When set, the page fault was caused by a page-protection violation.
-//          When not set, it was caused by a non-present page.
-// write:   When set, the page fault was caused by a write access.
-//          When not set, it was caused by a read access.
-// user:    When set, the page fault was caused while CPL = 3.
-//          This does not necessarily mean that the page fault was a privilege violation.
-// from https://wiki.osdev.org/Exceptions#Page_Fault
-struct page_fault_error_code {
-    uint32_t present : 1;
-    uint32_t write : 1;
-    uint32_t user : 1;
-    uint32_t reserved_write : 1;
-    uint32_t instruction_fetch : 1;
-    uint32_t protection_key : 1;
-    uint32_t shadow_stack : 1;
-    uint32_t software_guard_extensions : 1;
-};
-
-void init_idt(void);
-void init_pic(void);
-
-#ifdef __cplusplus
-}
-#endif

+ 74 - 0
include/kernel/interrupt.hpp

@@ -0,0 +1,74 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <types/types.h>
+
+struct saved_regs {
+    unsigned long rax;
+    unsigned long rbx;
+    unsigned long rcx;
+    unsigned long rdx;
+    unsigned long rsi;
+    unsigned long rdi;
+    unsigned long r8;
+    unsigned long r9;
+    unsigned long r10;
+    unsigned long r11;
+    unsigned long r12;
+    unsigned long r13;
+    unsigned long r14;
+    unsigned long r15;
+    unsigned long rbp;
+};
+
+struct PACKED interrupt_stack_head {
+    saved_regs s_regs;
+    unsigned long int_no;
+};
+
+struct PACKED interrupt_stack_normal {
+    interrupt_stack_head head;
+    uintptr_t v_rip;
+    unsigned long cs;
+    unsigned long flags;
+    uintptr_t rsp;
+    unsigned long ss;
+};
+
+struct PACKED interrupt_stack_with_code {
+    interrupt_stack_head head;
+    unsigned long error_code;
+    uintptr_t v_rip;
+    unsigned long cs;
+    unsigned long flags;
+    uintptr_t rsp;
+    unsigned long ss;
+};
+
+struct mmx_registers {
+    uint8_t data[512]; // TODO: list of content
+};
+
+// present: When set, the page fault was caused by a page-protection violation.
+//          When not set, it was caused by a non-present page.
+// write:   When set, the page fault was caused by a write access.
+//          When not set, it was caused by a read access.
+// user:    When set, the page fault was caused while CPL = 3.
+//          This does not necessarily mean that the page fault was a privilege violation.
+// from https://wiki.osdev.org/Exceptions#Page_Fault
+struct page_fault_error_code {
+    unsigned long present : 1;
+    unsigned long write : 1;
+    unsigned long user : 1;
+    unsigned long reserved_write : 1;
+    unsigned long instruction_fetch : 1;
+    unsigned long protection_key : 1;
+    unsigned long shadow_stack : 1;
+    unsigned long software_guard_extensions : 1;
+};
+
+namespace kernel::kinit {
+void init_interrupt();
+
+} // namespace kernel::kinit

+ 107 - 0
include/kernel/mem/mm_list.hpp

@@ -0,0 +1,107 @@
+#pragma once
+
+#include <set>
+
+#include <stdint.h>
+
+#include "vm_area.hpp"
+#include "paging.hpp"
+
+namespace kernel::mem {
+
+constexpr uintptr_t KERNEL_SPACE_START    = 0x8000000000000000ULL;
+constexpr uintptr_t USER_SPACE_MEMORY_TOP = 0x0000800000000000ULL;
+constexpr uintptr_t MMAP_MIN_ADDR         = 0x0000600000000000ULL;
+constexpr uintptr_t STACK_MIN_ADDR        = 0x0000700000000000ULL;
+
+class mm_list {
+private:
+    struct comparator {
+        constexpr bool operator()(const vm_area& lhs, const vm_area& rhs) const noexcept
+        { return lhs < rhs; }
+        constexpr bool operator()(const vm_area& lhs, uintptr_t rhs) const noexcept
+        { return lhs < rhs; }
+        constexpr bool operator()(uintptr_t lhs, const vm_area& rhs) const noexcept
+        { return lhs < rhs; }
+    };
+
+public:
+    using list_type = std::set<vm_area, comparator>;
+    using iterator = list_type::iterator;
+    using const_iterator = list_type::const_iterator;
+
+    struct map_args {
+        // MUST BE aligned to 4kb boundary
+        uintptr_t vaddr;
+        // MUST BE aligned to 4kb boundary
+        std::size_t length;
+
+        unsigned long flags;
+
+        fs::inode* file_inode;
+        // MUST BE aligned to 4kb boundary
+        std::size_t file_offset;
+    };
+
+private:
+    list_type m_areas;
+    paging::pfn_t m_pt;
+    iterator m_brk {};
+
+public:
+    // default constructor copies kernel_mms
+    explicit mm_list();
+    // copies kernel_mms and mirrors user space
+    explicit mm_list(const mm_list& other);
+
+    constexpr mm_list(mm_list&& v)
+        : m_areas(std::move(v.m_areas))
+        , m_pt(std::exchange(v.m_pt, 0))
+        , m_brk{std::move(v.m_brk)} { }
+
+    ~mm_list();
+
+    void switch_pd() const noexcept;
+
+    int register_brk(uintptr_t addr);
+    uintptr_t set_brk(uintptr_t addr);
+
+    void clear();
+
+    // split the memory block at the specified address
+    // return: iterator to the new block
+    iterator split(iterator area, uintptr_t at);
+
+    bool is_avail(uintptr_t addr) const;
+    bool is_avail(uintptr_t start, std::size_t length) const noexcept;
+
+    uintptr_t find_avail(uintptr_t hint, size_t length) const;
+
+    int unmap(iterator area);
+    int unmap(uintptr_t start, std::size_t length);
+
+    int mmap(const map_args& args);
+
+    constexpr vm_area* find(uintptr_t lp) // returns the area with start <= lp < end, or nullptr
+    {
+        auto iter = m_areas.find(lp);
+        if (iter == m_areas.end())
+            return nullptr; // no area covers lp
+        return &*iter; // element address; plain &iter only works if the iterator overloads unary &
+    }
+
+    constexpr const vm_area* find(uintptr_t lp) const // const overload, same lookup
+    {
+        auto iter = m_areas.find(lp);
+        if (iter == m_areas.end())
+            return nullptr; // no area covers lp
+        return &*iter; // element address; plain &iter only works if the iterator overloads unary &
+    }
+
+    constexpr paging::PSE get_page_table() const noexcept
+    {
+        return paging::PSE {m_pt};
+    }
+};
+
+} // namespace kernel::mem

+ 70 - 11
include/kernel/mem/paging.hpp

@@ -6,8 +6,6 @@
 
 #include <stdint.h>
 
-#include <types/types.h>
-
 #include <kernel/mem/phys.hpp>
 
 namespace kernel::mem::paging {
@@ -41,7 +39,7 @@ constexpr psattr_t PA_PS   = 0x0000000000000080ULL;
 constexpr psattr_t PA_G    = 0x0000000000000100ULL;
 constexpr psattr_t PA_COW  = 0x0000000000000200ULL; // copy on write
 constexpr psattr_t PA_MMAP = 0x0000000000000400ULL; // memory mapped
-constexpr psattr_t PA_FRE  = 0x0000000000000800ULL; // unused flag
+constexpr psattr_t PA_ANON = 0x0000000000000800ULL; // anonymous map
 constexpr psattr_t PA_NXE  = 0x8000000000000000ULL;
 constexpr psattr_t PA_MASK = 0xfff0000000000fffULL;
 
@@ -54,6 +52,9 @@ constexpr psattr_t PA_KERNEL_PAGE_TABLE = PA_PAGE_TABLE | PA_G;
 constexpr psattr_t PA_DATA_HUGE = PA_DATA | PA_PS;
 constexpr psattr_t PA_KERNEL_DATA_HUGE = PA_DATA_HUGE | PA_G;
 
+constexpr psattr_t PA_ANONYMOUS_PAGE = PA_P | PA_US | PA_COW | PA_ANON;
+constexpr psattr_t PA_MMAPPED_PAGE = PA_US | PA_COW | PA_ANON | PA_MMAP;
+
 namespace __inner {
     using pse_t = uint64_t;
 
@@ -96,6 +97,10 @@ public:
     }
 };
 
+constexpr pfn_t EMPTY_PAGE_PFN = 0x7f000;
+
+constexpr uintptr_t KERNEL_PAGE_TABLE_ADDR = 0x100000;
+constexpr physaddr<void> KERNEL_PAGE_TABLE_PHYS_ADDR{KERNEL_PAGE_TABLE_ADDR};
 constexpr PSE KERNEL_PAGE_TABLE{0x100000};
 
 constexpr unsigned long PAGE_PRESENT = 0x00000001;
@@ -103,13 +108,12 @@ constexpr unsigned long PAGE_BUDDY   = 0x00000002;
 constexpr unsigned long PAGE_SLAB    = 0x00000004;
 
 struct page {
-    refcount_t refcount;
+    // TODO: use atomic
+    unsigned long refcount;
     unsigned long flags;
 
     page* next;
-
-    // padding
-    uint64_t padding;
+    page* prev;
 };
 
 inline page* PAGE_ARRAY;
@@ -117,14 +121,69 @@ inline page* PAGE_ARRAY;
 void create_zone(uintptr_t start, uintptr_t end);
 void mark_present(uintptr_t start, uintptr_t end);
 
+[[nodiscard]] page* alloc_page();
+// order represents power of 2
+[[nodiscard]] page* alloc_pages(int order);
+
 // order represents power of 2
-page* alloc_page();
-page* alloc_pages(int order);
-void free_page(page* page, int order);
+void free_pages(page* page, int order);
+void free_page(page* page);
 
-pfn_t alloc_page_table();
+// order represents power of 2
+void free_pages(pfn_t pfn, int order);
+void free_page(pfn_t pfn);
+
+// clear the page all zero
+[[nodiscard]] pfn_t alloc_page_table();
 
 pfn_t page_to_pfn(page* page);
 page* pfn_to_page(pfn_t pfn);
 
+void increase_refcount(page* page);
+
+constexpr unsigned long PAGE_FAULT_P   = 0x00000001;
+constexpr unsigned long PAGE_FAULT_W   = 0x00000002;
+constexpr unsigned long PAGE_FAULT_U   = 0x00000004;
+constexpr unsigned long PAGE_FAULT_R   = 0x00000008;
+constexpr unsigned long PAGE_FAULT_I   = 0x00000010;
+constexpr unsigned long PAGE_FAULT_PK  = 0x00000020;
+constexpr unsigned long PAGE_FAULT_SS  = 0x00000040;
+constexpr unsigned long PAGE_FAULT_SGX = 0x00008000;
+
+void handle_page_fault(unsigned long err);
+
+class vaddr_range {
+    std::size_t n;
+
+    int idx4;
+    int idx3;
+    int idx2;
+    int idx1;
+
+    PSE pml4;
+    PSE pdpt;
+    PSE pd;
+    PSE pt;
+
+    uintptr_t m_start;
+    uintptr_t m_end;
+
+    bool is_privilege;
+
+public:
+    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool is_privilege = false);
+    explicit vaddr_range(std::nullptr_t);
+
+    vaddr_range begin() const noexcept;
+    vaddr_range end() const noexcept;
+
+    PSE operator*() const noexcept;
+
+    vaddr_range& operator++();
+    operator bool() const noexcept;
+
+    // compares remaining pages to iterate
+    bool operator==(const vaddr_range& other) const noexcept;
+};
+
 } // namespace kernel::mem::paging

+ 19 - 0
include/kernel/mem/phys.hpp

@@ -7,6 +7,8 @@
 
 #include <types/types.h>
 
+#include <kernel/mem/types.hpp>
+
 namespace kernel::mem {
 
 template <typename T, bool Cached = true>
@@ -43,4 +45,21 @@ public:
     }
 };
 
+//  gdt[0]:  null
+//  gdt[1]:  kernel code
+//  gdt[2]:  kernel data
+//  gdt[3]:  user code
+//  gdt[4]:  user data
+//  gdt[5]:  user code compatibility mode
+//  gdt[6]:  user data compatibility mode
+//  gdt[7]:  reserved
+//  gdt[8]:  tss descriptor low
+//  gdt[9]:  tss descriptor high
+//  gdt[10]: ldt descriptor low
+//  gdt[11]: ldt descriptor high
+//  gdt[12]: thread local(in ldt)
+//  gdt[13]: thread local(in ldt)
+// &gdt[14]: tss of 0x68 bytes from here
+constexpr physaddr<uint64_t> gdt{0x00000000 + 1 - 1};
+
 } // namespace kernel::mem

+ 46 - 0
include/kernel/mem/vm_area.hpp

@@ -0,0 +1,46 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <kernel/vfs.hpp>
+
+namespace kernel::mem {
+
+constexpr unsigned long MM_WRITE         = 0x00000000'00000001;
+constexpr unsigned long MM_EXECUTE       = 0x00000000'00000002;
+constexpr unsigned long MM_MAPPED        = 0x00000000'00000004;
+constexpr unsigned long MM_ANONYMOUS     = 0x00000000'00000008;
+constexpr unsigned long MM_INTERNAL_MASK = 0xffffffff'00000000;
+constexpr unsigned long MM_BREAK         = 0x80000000'00000000;
+
+struct vm_area {
+    uintptr_t start;
+    uintptr_t end;
+
+    unsigned long flags;
+
+    fs::inode* mapped_file;
+    std::size_t file_offset;
+
+    constexpr bool is_avail(uintptr_t ostart, uintptr_t oend) const noexcept
+    {
+        return (ostart >= end || oend <= start);
+    }
+
+    constexpr bool operator<(const vm_area& rhs) const noexcept
+    { return end <= rhs.start; }
+    constexpr bool operator<(uintptr_t rhs) const noexcept
+    { return end <= rhs; }
+    friend constexpr bool operator<(uintptr_t lhs, const vm_area& rhs) noexcept
+    { return lhs < rhs.start; }
+
+    constexpr vm_area(uintptr_t start, unsigned long flags, uintptr_t end,
+            fs::inode* mapped_file = nullptr, std::size_t offset = 0)
+        : start{start}, end{end}, flags{flags}, mapped_file{mapped_file}, file_offset{offset} { }
+
+    constexpr vm_area(uintptr_t start, unsigned long flags,
+            fs::inode* mapped_file = nullptr, std::size_t offset = 0)
+        : start{start}, end{start}, flags{flags}, mapped_file{mapped_file}, file_offset{offset} { }
+};
+
+} // namespace kernel::mem

+ 0 - 260
include/kernel/mm.hpp

@@ -1,260 +0,0 @@
-#pragma once
-
-#include <set>
-#include <vector>
-#include <bit>
-#include <cstddef>
-#include <utility>
-
-#include <kernel/mem/paging.hpp>
-#include <kernel/vfs.hpp>
-#include <stdint.h>
-#include <types/allocator.hpp>
-#include <types/cplusplus.hpp>
-#include <types/types.h>
-
-#define invalidate_tlb(addr) asm volatile("invlpg (%0)": : "r"(addr) : "memory")
-
-// private memory mapping
-// changes won't be neither written back to file nor shared between processes
-// TODO: shared mapping
-// @param len is aligned to 4kb boundary automatically, exceeding part will
-// be filled with '0's and not written back to the file
-// @param offset MUST be aligned to 4kb
-int mmap(
-    void* hint,
-    size_t len,
-    fs::inode* file,
-    size_t offset,
-    int write,
-    int priv);
-
-template <int N>
-constexpr std::size_t align_down(std::size_t v)
-{
-    return v & ~((1 << N) - 1);
-}
-template <int N>
-constexpr void* align_down(void* v)
-{
-    return std::bit_cast<void*>(align_down<N>(std::bit_cast<std::size_t>(v)));
-}
-template <int N>
-constexpr std::size_t align_up(std::size_t v)
-{
-    return align_down<N>(v + (1 << N) - 1);
-}
-template <int N>
-constexpr void* align_up(void* v)
-{
-    return std::bit_cast<void*>(align_up<N>(std::bit_cast<std::size_t>(v)));
-}
-
-constexpr size_t vptrdiff(void* p1, void* p2)
-{
-    auto* _p1 = static_cast<std::byte*>(p1);
-    auto* _p2 = static_cast<std::byte*>(p2);
-    return _p1 - _p2;
-}
-
-constexpr void* vptradd(void* p, std::size_t off)
-{
-    auto* _p = static_cast<std::byte*>(p);
-    return _p + off;
-}
-
-// TODO: LONG MODE
-// void dealloc_pd(page_t pd);
-
-// allocate a struct page together with the raw page
-kernel::mem::paging::page allocate_page(void);
-void free_page(kernel::mem::paging::page* pg);
-
-namespace kernel {
-
-namespace mem {
-
-struct mm {
-public:
-    void* start {};
-    struct mm_attr {
-        uint32_t write : 1;
-        uint32_t system : 1;
-        uint32_t mapped : 1;
-    } attr {};
-    fs::inode* mapped_file {};
-    size_t file_offset {};
-    std::size_t page_count;
-
-public:
-    constexpr void* end() const noexcept
-    { return vptradd(start, page_count * 4096); } // TODO: LONG MODE
-    constexpr bool is_kernel_space() const noexcept
-    { return attr.system; }
-    constexpr bool is_avail(void* ostart, void* oend) const noexcept
-    {
-        void* m_start = start;
-        void* m_end = end();
-
-        return (ostart >= m_end || oend <= m_start);
-    }
-
-    // void append_page(pd_t pd, const page& pg, uint32_t attr, bool priv); TODO: LONG MODE
-
-    /**
-     * @brief Splits the memory block at the specified address.
-     * 
-     * @param addr The address at which the memory block will be split.
-     * @return The new memory block created after splitting.
-     */
-    mm split(void* addr);
-
-    constexpr bool operator<(const mm& rhs) const noexcept
-    { return end() <= rhs.start; }
-    constexpr bool operator<(void* rhs) const noexcept
-    { return end() <= rhs; }
-    friend constexpr bool operator<(void* lhs, const mm& rhs) noexcept
-    { return lhs < rhs.start; }
-};
-
-class mm_list {
-private:
-    struct comparator {
-        constexpr bool operator()(const mm& lhs, const mm& rhs) const noexcept
-        { return lhs < rhs; }
-        constexpr bool operator()(const mm& lhs, void* rhs) const noexcept
-        { return lhs < rhs; }
-        constexpr bool operator()(void* lhs, const mm& rhs) const noexcept
-        { return lhs < rhs; }
-    };
-
-public:
-    // TODO: LONG MODE: use slab allocator
-    using list_type = std::set<mm, comparator>;
-    using iterator = list_type::iterator;
-    using const_iterator = list_type::const_iterator;
-
-public:
-    static inline mm_list* s_kernel_mms;
-
-private:
-    list_type m_areas;
-    kernel::mem::paging::pfn_t m_pd;
-    mm* m_brk {};
-
-public:
-    // for system initialization only
-    explicit constexpr mm_list(kernel::mem::paging::pfn_t pd)
-        : m_pd(pd) { }
-
-    // default constructor copies kernel_mms
-    explicit mm_list();
-    // copies kernel_mms and mirrors user space
-    explicit mm_list(const mm_list& other);
-
-    constexpr mm_list(mm_list&& v)
-        : m_areas(std::move(v.m_areas))
-        , m_pd(std::exchange(v.m_pd, 0)) { }
-
-    ~mm_list();
-    void switch_pd() const;
-
-    int register_brk(void* addr);
-    void* set_brk(void* addr);
-
-    void* find_avail(void* hint, size_t len, bool priv) const;
-
-    int unmap(void* start, size_t len, bool priv);
-
-    constexpr mm& addarea(void* start, bool w, bool system)
-    {
-        auto [ iter, inserted ] = m_areas.emplace(mm {
-            .start = start,
-            .attr {
-                .write = w,
-                .system = system,
-                .mapped = 0,
-            },
-        });
-        assert(inserted);
-        return *iter;
-    }
-
-    mm& add_empty_area(void* start, std::size_t page_count,
-        uint32_t page_attr, bool w, bool system);
-
-    constexpr void clear_user()
-    {
-        for (auto iter = m_areas.begin(); iter != m_areas.end(); ) {
-            if (iter->is_kernel_space()) {
-                ++iter;
-                continue;
-            }
-
-            // TODO: LONG MODE
-            // this->unmap(*iter);
-            iter = m_areas.erase(iter);
-        }
-        m_brk = nullptr;
-    }
-
-    // TODO: LONG MODE
-    // inline void unmap(mm& area)
-    // {
-    //     int i = 0;
-
-    //     // TODO:
-    //     // if there are more than 4 pages, calling invlpg
-    //     // should be faster. otherwise, we use movl cr3
-    //     // bool should_invlpg = (area->pgs->size() > 4);
-
-    //     for (auto& pg : *area.pgs) {
-    //         kernel::paccess pa(pg.pg_pteidx >> 12);
-    //         auto pt = (pt_t)pa.ptr();
-    //         assert(pt);
-    //         auto* pte = *pt + (pg.pg_pteidx & 0xfff);
-    //         pte->v = 0;
-
-    //         free_page(&pg);
-
-    //         invalidate_tlb((std::size_t)area.start + (i++) * PAGE_SIZE);
-    //     }
-    //     types::memory::kidelete<mm::pages_vector>(area.pgs);
-    // }
-
-    constexpr mm* find(void* lp)
-    {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &*iter;
-    }
-    constexpr const mm* find(void* lp) const
-    {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &*iter;
-    }
-
-    constexpr bool is_avail(void* start, size_t len) const noexcept
-    {
-        start = align_down<12>(start);
-        len = vptrdiff(align_up<12>(vptradd(start, len)), start);
-        for (const auto& area : m_areas) {
-            if (!area.is_avail(start, vptradd(start, len)))
-                return false;
-        }
-        return true;
-    }
-
-    constexpr bool is_avail(void* addr) const
-    {
-        auto iter = m_areas.find(addr);
-        return iter == m_areas.end();
-    }
-};
-
-} // namespace memory
-
-} // namespace kernel

+ 5 - 15
include/kernel/process.hpp

@@ -1,7 +1,7 @@
 #pragma once
 
-#include <map>
 #include <list>
+#include <map>
 #include <memory>
 #include <queue>
 #include <set>
@@ -13,8 +13,8 @@
 #include <stdint.h>
 #include <sys/types.h>
 
-#include <kernel/task/thread.hpp>
 #include <kernel/task/current.hpp>
+#include <kernel/task/thread.hpp>
 
 #include <types/allocator.hpp>
 #include <types/cplusplus.hpp>
@@ -22,12 +22,11 @@
 #include <types/types.h>
 
 #include <kernel/async/waitlist.hpp>
-#include <kernel/interrupt.h>
-#include <kernel/mm.hpp>
-#include <kernel/user/thread_local.hpp>
+#include <kernel/interrupt.hpp>
+#include <kernel/mem/mm_list.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/task.h>
 #include <kernel/tty.hpp>
+#include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
 
 class process;
@@ -37,8 +36,6 @@ class proclist;
 inline process* volatile current_process;
 inline proclist* procs;
 
-inline tss32_t tss;
-
 struct process_attr {
     uint16_t system : 1;
     uint16_t zombie : 1 = 0;
@@ -297,13 +294,6 @@ void NORETURN init_scheduler(void);
 bool schedule(void);
 void NORETURN schedule_noreturn(void);
 
-constexpr uint32_t push_stack(uint32_t** stack, uint32_t val)
-{
-    --*stack;
-    **stack = val;
-    return val;
-}
-
 void k_new_thread(void (*func)(void*), void* data);
 
 void NORETURN freeze(void);

+ 2 - 2
include/kernel/signal.hpp

@@ -9,7 +9,7 @@
 
 #include <types/cplusplus.hpp>
 
-#include <kernel/interrupt.h>
+#include <kernel/interrupt.hpp>
 
 namespace kernel {
 
@@ -57,7 +57,7 @@ public:
 
     // return value: whether the thread should wake up
     bool raise(signo_type signal);
-    void handle(interrupt_stack* context, mmx_registers* mmxregs);
+    void handle(interrupt_stack_normal* context, mmx_registers* mmxregs);
     void after_signal(signo_type signal);
 };
 

+ 10 - 9
include/kernel/syscall.hpp

@@ -1,16 +1,17 @@
 #pragma once
 
-#include <kernel/interrupt.h>
 #include <types/types.h>
 
-#define SYSCALL_ARG1(type, name) type name = (type)((data)->s_regs.rdi)
-#define SYSCALL_ARG2(type, name) type name = (type)((data)->s_regs.rsi)
-#define SYSCALL_ARG3(type, name) type name = (type)((data)->s_regs.rdx)
-#define SYSCALL_ARG4(type, name) type name = (type)((data)->s_regs.r10)
-#define SYSCALL_ARG5(type, name) type name = (type)((data)->s_regs.r8)
-#define SYSCALL_ARG6(type, name) type name = (type)((data)->s_regs.r9)
+#include <kernel/interrupt.hpp>
 
-// return value is stored in %eax and %edx
-typedef int (*syscall_handler)(interrupt_stack* data);
+#define SYSCALL_ARG1(type, name) type name = (type)((data)->head.s_regs.rdi)
+#define SYSCALL_ARG2(type, name) type name = (type)((data)->head.s_regs.rsi)
+#define SYSCALL_ARG3(type, name) type name = (type)((data)->head.s_regs.rdx)
+#define SYSCALL_ARG4(type, name) type name = (type)((data)->head.s_regs.r10)
+#define SYSCALL_ARG5(type, name) type name = (type)((data)->head.s_regs.r8)
+#define SYSCALL_ARG6(type, name) type name = (type)((data)->head.s_regs.r9)
+
+// return value is stored in %rax
+typedef long (*syscall_handler)(interrupt_stack_normal* data);
 
 void init_syscall(void);

+ 0 - 18
include/kernel/task.h

@@ -1,18 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct tss32_t {
-    uint32_t backlink, esp0, ss0, esp1, ss1, esp2, ss2, cr3;
-    uint32_t eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
-    uint32_t es, cs, ss, ds, fs, gs;
-    uint32_t ldtr, iomap;
-};
-
-#ifdef __cplusplus
-}
-#endif

+ 7 - 4
include/kernel/task/thread.hpp

@@ -8,6 +8,7 @@
 
 #include <types/types.h>
 
+#include <kernel/mem/paging.hpp>
 #include <kernel/signal.hpp>
 #include <kernel/user/thread_local.hpp>
 
@@ -27,13 +28,16 @@ public:
 
 private:
     struct kernel_stack {
-        std::byte* stack_base;
-        uint32_t* esp;
+        mem::paging::pfn_t pfn;
+        uintptr_t sp;
 
         kernel_stack();
         kernel_stack(const kernel_stack& other);
         kernel_stack(kernel_stack&& other);
         ~kernel_stack();
+
+        uint64_t pushq(uint64_t val);
+        uint32_t pushl(uint32_t val);
     };
 
 public:
@@ -47,8 +51,7 @@ public:
 
     std::string name {};
 
-    // TODO: LONG MODE
-    // segment_descriptor tls_desc {};
+    uint64_t tls_desc[2] {};
 
     explicit thread(std::string name, pid_t owner);
     thread(const thread& val, pid_t owner);

+ 4 - 2
include/kernel/user/thread_local.hpp

@@ -16,7 +16,9 @@ struct user_desc {
     uint32_t useable : 1;
 };
 
-// TODO: LONG MODE
-// void load_thread_area(const segment_descriptor& desc);
+void load_thread_area32(uint64_t desc);
+void load_thread_area64(uint64_t desc_lo, uint64_t desc_hi);
+
+void load_thread_area(uint64_t desc_lo, uint64_t desc_hi);
 
 } // namespace kernel::user

+ 157 - 17
include/types/elf.hpp

@@ -1,20 +1,23 @@
 #pragma once
+
 #include <errno.h>
-#include <kernel/interrupt.h>
+#include <stdint.h>
+
+#include <kernel/interrupt.hpp>
 #include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
-#include <stdint.h>
 
 namespace types::elf {
+
 using elf32_addr_t = uint32_t;
 using elf32_off_t = uint32_t;
 
-using elf_addr_t = elf32_addr_t;
-using elf_off_t = elf32_off_t;
+using elf64_addr_t = uint64_t;
+using elf64_off_t = uint64_t;
 
-constexpr elf32_addr_t ELF_STACK_BOTTOM = 0xbffff000;
-constexpr elf32_off_t ELF_STACK_SIZE = 8 * 1024 * 1024;
-constexpr elf32_addr_t ELF_STACK_TOP = ELF_STACK_BOTTOM - ELF_STACK_SIZE;
+constexpr elf32_addr_t ELF32_STACK_BOTTOM = 0xbffff000;
+constexpr elf32_off_t ELF32_STACK_SIZE = 8 * 1024 * 1024;
+constexpr elf32_addr_t ELF32_STACK_TOP = ELF32_STACK_BOTTOM - ELF32_STACK_SIZE;
 
 struct PACKED elf32_header {
     // 0x7f, "ELF"
@@ -102,7 +105,11 @@ struct PACKED elf32_program_header_entry {
     elf32_off_t filesz;
     elf32_off_t memsz;
     // segment dependent
-    uint32_t flags;
+    enum : uint32_t {
+        PF_X = 0x1,
+        PF_W = 0x2,
+        PF_R = 0x4,
+    } flags;
     // 0 and 1 for no alignment, otherwise power of 2
     uint32_t align;
 };
@@ -129,21 +136,154 @@ struct PACKED elf32_section_header_entry {
     } sh_flags;
     elf32_addr_t sh_addr;
     elf32_off_t sh_offset;
-    uint32_t sh_size;
-    char _[16];
+    elf32_off_t sh_size;
+    uint32_t sh_link;
+    uint32_t sh_info;
+    elf32_off_t sh_addralign;
+    elf32_off_t sh_entsize;
 };
 
 struct elf32_load_data {
     const fs::dentry* exec_dent;
-    const char* const* argv;
-    const char* const* envp;
-    int errcode;
-    void* eip;
-    uint32_t* sp;
-    bool system;
+    std::vector<std::string> argv;
+    std::vector<std::string> envp;
+    uintptr_t ip;
+    uintptr_t sp;
 };
 
 // TODO: environment variables
-int elf32_load(elf32_load_data* data);
+int elf32_load(elf32_load_data& data);
+
+struct PACKED elf64_header { // ELF file header, 64-bit layout (entry/phoff/shoff are 64-bit wide)
+    // 0x7f, "ELF"
+    char magic[4];
+
+    enum : uint8_t {
+        FORMAT_32 = 1,
+        FORMAT_64 = 2,
+    } format; // file class: selects the 32-bit or the 64-bit layout
+    enum : uint8_t {
+        ENDIAN_LITTLE = 1,
+        ENDIAN_BIG = 2,
+    } endian; // byte order of the file contents
+    // should be 1
+    uint8_t _version1;
+    enum : uint8_t {
+        ABI_SYSTEM_V = 0x00,
+        // TODO:
+        ABI_LINUX = 0x03,
+    } abi; // target OS ABI identifier
+    uint8_t abi_version;
+    uint8_t _reserved[7]; // padding that completes the 16-byte identification block
+    enum : uint16_t {
+        ET_NONE = 0x00,
+        ET_REL = 0x01,
+        ET_EXEC = 0x02,
+        ET_DYN = 0x03,
+        ET_CORE = 0x04,
+        ET_LOOS = 0xfe00,
+        ET_HIOS = 0xfeff,
+        ET_LOPROC = 0xff00,
+        ET_HIPROC = 0xffff,
+    } type; // object file type (relocatable, executable, shared object, core, ...)
+    enum : uint16_t {
+        ARCH_NONE = 0x00,
+        ARCH_X86 = 0x03,
+        ARCH_ARM = 0x28,
+        ARCH_IA64 = 0x32,
+        ARCH_X86_64 = 0x3e,
+        ARCH_ARM64 = 0xb7,
+        ARCH_RISCV = 0xf3,
+    } arch; // target machine architecture
+    // should be 1
+    uint32_t _version2;
+    // entry address
+    elf64_addr_t entry;
+    // program header table offset
+    elf64_off_t phoff;
+    // section header table offset
+    elf64_off_t shoff;
+    // architecture dependent flags
+    uint32_t flags;
+    // elf header size
+    uint16_t ehsize;
+    // program header table entry size
+    uint16_t phentsize;
+    // program header table entries number
+    uint16_t phnum;
+    // section header table entry size
+    uint16_t shentsize;
+    // section header table entries number
+    uint16_t shnum;
+    // section header table entry index that contains section names
+    uint16_t shstrndx;
+};
+
+struct PACKED elf64_program_header_entry { // ELF64 program header: one segment description
+    enum : uint32_t {
+        PT_NULL = 0x00,
+        PT_LOAD = 0x01,
+        PT_DYNAMIC = 0x02,
+        PT_INTERP = 0x03,
+        PT_NOTE = 0x04,
+        PT_SHLIB = 0x05,
+        PT_PHDR = 0x06,
+        PT_TLS = 0x07,
+        PT_LOOS = 0x60000000,
+        PT_HIOS = 0x6fffffff,
+        PT_LOPROC = 0x70000000, PT_LIPROC = PT_LOPROC, // PT_LIPROC is a misspelling kept as an alias for existing users
+        PT_HIPROC = 0x7fffffff,
+    } type; // segment type
+    // segment dependent
+    enum : uint32_t {
+        PF_X = 0x1,
+        PF_W = 0x2,
+        PF_R = 0x4,
+    } flags; // permission bits; in the 64-bit layout they directly follow the type
+    elf64_off_t offset; // file offset of the segment
+    elf64_addr_t vaddr; // virtual address of the segment in memory
+    elf64_addr_t paddr; // physical address, where relevant
+    elf64_off_t filesz; // number of bytes of the segment stored in the file
+    elf64_off_t memsz; // number of bytes the segment occupies in memory
+    // 0 and 1 for no alignment, otherwise power of 2
+    uint64_t align;
+};
+
+struct PACKED elf64_section_header_entry { // ELF64 section header: one section description
+    uint32_t sh_name; // offset of the section name inside the section name string table
+    enum : uint32_t {
+        SHT_NULL = 0x00,
+        SHT_PROGBITS = 0x01,
+        SHT_RELA = 0x04,
+        SHT_DYNAMIC = 0x06,
+        SHT_NOTE = 0x07,
+        SHT_NOBITS = 0x08,
+        SHT_REL = 0x09,
+        SHT_DYNSYM = 0x0b,
+        SHT_INIT_ARRAY = 0x0e,
+        SHT_FINI_ARRAY = 0x0f,
+        SHT_PREINIT_ARRAY = 0x10, // distinct from SHT_FINI_ARRAY; the ELF specification assigns 16
+    } sh_type; // section type
+    enum : uint64_t {
+        SHF_WRITE = 0x01,
+        SHF_ALLOC = 0x02,
+        SHF_EXECINSTR = 0x04,
+    } sh_flags; // attribute bits; 64 bits wide in this layout
+    elf64_addr_t sh_addr; // virtual address of the section when it is loaded
+    elf64_off_t sh_offset; // file offset of the section contents
+    elf64_off_t sh_size; // size of the section in bytes
+    uint32_t sh_link; // index of an associated section, meaning depends on sh_type
+    uint32_t sh_info; // extra information, meaning depends on sh_type
+    elf64_off_t sh_addralign; // required alignment of the section
+    elf64_off_t sh_entsize; // entry size for sections that hold a table of fixed-size entries
+};
+
+struct elf64_load_data { // 64-bit counterpart of elf32_load_data, with the same set of fields
+    const fs::dentry* exec_dent; // presumably the dentry of the executable to load - confirm against the loader
+    std::vector<std::string> argv; // argument strings for the new program
+    std::vector<std::string> envp; // environment strings for the new program
+    unsigned long ip; // NOTE(review): looks like the instruction pointer to start at - confirm who writes it
+    unsigned long sp; // NOTE(review): looks like the initial stack pointer - confirm who writes it
+};
 
 } // namespace types::elf

+ 55 - 0
include/types/list.hpp

@@ -0,0 +1,55 @@
+#pragma once
+
+namespace types::list {
+
+// Helpers for doubly-linked lists whose nodes carry their own `prev` and
+// `next` pointers. `head` always points at the list's head pointer.
+
+// Push `node` onto the front of the list at `*head`.
+template <typename ListNode>
+void list_insert(ListNode** head, ListNode* node)
+{
+    node->prev = nullptr;
+    node->next = *head;
+    if (*head)
+        (*head)->prev = node;
+    *head = node;
+}
+
+// Detach and return the first node of the list at `*head`, or nullptr if
+// the list is empty. The links of the returned node are cleared.
+template <typename ListNode>
+ListNode* list_get(ListNode** head)
+{
+    ListNode* node = *head;
+    if (node) {
+        *head = node->next;
+        // The new head must not keep a back-link to the popped node,
+        // otherwise a later list_remove() on it would not update *head.
+        if (*head)
+            (*head)->prev = nullptr;
+
+        node->next = nullptr;
+        node->prev = nullptr;
+    }
+    return node;
+}
+
+// Unlink `node` from the list at `*head` and clear its links. `node` is
+// expected to be a member of that list.
+template <typename ListNode>
+void list_remove(ListNode** head, ListNode* node)
+{
+    if (node->prev)
+        node->prev->next = node->next;
+    else
+        *head = node->next;
+
+    if (node->next)
+        node->next->prev = node->prev;
+
+    node->next = nullptr;
+    node->prev = nullptr;
+}
+
+} // namespace types::list

+ 0 - 2
include/types/types.h

@@ -30,8 +30,6 @@
 #define unlikely(expr) (!!(expr))
 #endif
 
-typedef size_t refcount_t;
-
 #ifdef __cplusplus
 #include <types/cplusplus.hpp>
 #endif

+ 77 - 195
src/asm/interrupt.s

@@ -1,189 +1,64 @@
 .text
 
-# TODO: LONG MODE
-# rewrite interrupt handlers
-
-# TODO: stack alignment
-.globl int6
-.type  int6 @function
-int6:
-# pushal
-    call int6_handler
-# popal
-
-    iret
-
-# TODO: stack alignment
-.globl int8
-.type  int8 @function
-int8:
-    nop
-    iret
-
-# TODO: stack alignment
-.globl int13
-.type  int13 @function
-int13:
-# pushal
-    call int13_handler
-# popal
-
-# remove the 32bit error code from stack
-    addl $4, %esp
-    iret
-
-.globl int14
-.type  int14 @function
-int14:
-    # push general purpose registers
-# pushal
-
-    # save %cr2
-    mov %cr2, %rax
-    push %rax
-
-    # save current esp (also pointer to struct int14_data)
-    mov %esp, %ebx
-
-    # allocate space for mmx registers and argument
-    subl $0x210, %esp
-
-    # align stack to 16byte boundary
-    and $0xfffffff0, %esp
-
-    # save mmx registers
-    fxsave 16(%esp)
-
-    # push (interrupt_stack*)data
-    mov %ebx, (%esp)
-
-    call int14_handler
-
-    # restore mmx registers
-    fxrstor 16(%esp)
-
-    # restore stack and general purpose registers
-    leal 4(%ebx), %esp
-# popal
-
-# remove the 32bit error code from stack
-    addl $4, %esp
-    iret
-
-.globl irq0
-irq0:
-# pushal
-    mov $0, %eax
-    jmp irqstub
-.globl irq1
-irq1:
-# pushal
-    mov $1, %eax
-    jmp irqstub
-.globl irq2
-irq2:
-# pushal
-    mov $2, %eax
-    jmp irqstub
-.globl irq3
-irq3:
-# pushal
-    mov $3, %eax
-    jmp irqstub
-.globl irq4
-irq4:
-# pushal
-    mov $4, %eax
-    jmp irqstub
-.globl irq5
-irq5:
-# pushal
-    mov $5, %eax
-    jmp irqstub
-.globl irq6
-irq6:
-# pushal
-    mov $6, %eax
-    jmp irqstub
-.globl irq7
-irq7:
-# pushal
-    mov $7, %eax
-    jmp irqstub
-.globl irq8
-irq8:
-# pushal
-    mov $8, %eax
-    jmp irqstub
-.globl irq9
-irq9:
-# pushal
-    mov $9, %eax
-    jmp irqstub
-.globl irq10
-irq10:
-# pushal
-    mov $10, %eax
-    jmp irqstub
-.globl irq11
-irq11:
-# pushal
-    mov $11, %eax
-    jmp irqstub
-.globl irq12
-irq12:
-# pushal
-    mov $12, %eax
-    jmp irqstub
-.globl irq13
-irq13:
-# pushal
-    mov $13, %eax
-    jmp irqstub
-.globl irq14
-irq14:
-# pushal
-    mov $14, %eax
-    jmp irqstub
-.globl irq15
-irq15:
-# pushal
-    mov $15, %eax
-    jmp irqstub
-
-.globl irqstub
-irqstub:
-    # save current esp
-    mov %esp, %ebx
-
-    # align stack to 16byte boundary
-    and $0xfffffff0, %esp
-
-    # save mmx registers
-    sub $(512 + 16), %esp
-    fxsave 16(%esp)
-
-    # save irq number and pointers to context and mmx registers
-    mov %eax, (%esp)  # irq number
-    mov %ebx, 4(%esp) # pointer to context
-    lea 16(%esp), %eax
-    mov %eax, 8(%esp) # pointer to mmx registers
-
-    call irq_handler
-
-    # restore mmx registers
-    fxrstor 16(%esp)
-
-    # restore stack and general purpose registers
-    mov %ebx, %esp
-# popal
-
-    iret
+ISR_stub: # common body shared by every ISR<n> entry; reached through call, so a return address is on top of the stack
+	sub $0x78, %rsp # room for 15 general purpose registers (15 * 8 bytes)
+	mov %rax,  0x00(%rsp)
+	mov %rbx,  0x08(%rsp)
+	mov %rcx,  0x10(%rsp)
+	mov %rdx,  0x18(%rsp)
+	mov %rdi,  0x20(%rsp)
+	mov %rsi,  0x28(%rsp)
+	mov %r8,   0x30(%rsp)
+	mov %r9,   0x38(%rsp)
+	mov %r10,  0x40(%rsp)
+	mov %r11,  0x48(%rsp)
+	mov %r12,  0x50(%rsp)
+	mov %r13,  0x58(%rsp)
+	mov %r14,  0x60(%rsp)
+	mov %r15,  0x68(%rsp)
+	mov %rbp,  0x70(%rsp)
+
+	mov 0x78(%rsp), %rax # return address pushed by the call inside ISR<n>
+	sub $ISR0, %rax # byte offset from the first entry stub
+	shr $3, %rax # entry stubs are 8-byte aligned, so offset / 8 is the vector number
+	mov %rax, 0x78(%rsp) # replace the return address slot with the vector number
+
+	mov %rsp, %rbx # remember the frame address; rbx survives the C call
+	and $~0xf, %rsp # align the stack to 16 bytes, as fxsave requires
+
+	sub $512, %rsp # 512-byte fxsave area
+	fxsave (%rsp)
+
+	mov %rbx, %rdi # 1st argument: pointer to the saved register frame
+	mov %rsp, %rsi # 2nd argument: pointer to the fxsave area
+	call interrupt_handler
+
+	fxrstor (%rsp)
+	mov %rbx, %rsp # back to the saved register frame
+
+	mov 0x00(%rsp), %rax
+	mov 0x08(%rsp), %rbx
+	mov 0x10(%rsp), %rcx
+	mov 0x18(%rsp), %rdx
+	mov 0x20(%rsp), %rdi
+	mov 0x28(%rsp), %rsi
+	mov 0x30(%rsp), %r8
+	mov 0x38(%rsp), %r9
+	mov 0x40(%rsp), %r10
+	mov 0x48(%rsp), %r11
+	mov 0x50(%rsp), %r12
+	mov 0x58(%rsp), %r13
+	mov 0x60(%rsp), %r14
+	mov 0x68(%rsp), %r15
+	mov 0x70(%rsp), %rbp
+
+	mov 0x78(%rsp), %rsp # NOTE(review): interrupt_handler appears to store the stack address for iretq in this slot - confirm the offsets it writes
+	iretq
 
 .globl syscall_stub
 .type  syscall_stub @function
 syscall_stub:
-# pushal
+    # pushal
 
     # save current esp
     mov %esp, %ebx
@@ -212,8 +87,8 @@ syscall_stub:
 .globl _syscall_stub_fork_return
 .type  _syscall_stub_fork_return @function
 _syscall_stub_fork_return:
-# popal
-    iret
+    # popal
+    iretq
 
 # parameters
 # #1: esp* curr_esp
@@ -251,16 +126,23 @@ asm_ctx_switch:
 _ctx_switch_return:
     ret
 
-.section .text.kinit
-
-.globl asm_load_idt
-.type  asm_load_idt @function
-asm_load_idt:
-    movl 4(%esp), %edx
-    lidt (%edx)
-    movl 8(%esp), %edx
-    cmpl $0, %edx
-    je asm_load_idt_skip
-    sti
-asm_load_idt_skip:
-    ret
+.altmacro # alternate macro mode: lets %i below expand to the numeric value of i
+.macro build_isr name
+	.align 8 # every entry stub starts on an 8-byte boundary; ISR_stub divides by 8 to recover the index
+	ISR\name:
+		call ISR_stub # the pushed return address identifies which entry was taken
+.endm
+
+.set i, 0
+.rept 48 # emits ISR0 through ISR47
+	build_isr %i
+	.set i, i+1
+.endr
+
+.section .rodata
+
+.align 8
+.globl ISR_START_ADDR
+.type  ISR_START_ADDR @object
+ISR_START_ADDR:
+	.quad ISR0 # address of the first entry stub; entry n is expected at ISR0 + 8 * n

+ 6 - 6
src/boot.s

@@ -117,14 +117,14 @@ _fill_loop3:
 
     # create gdt
 	xor %eax, %eax # at 0x0000
-	mov %eax,   (%eax)
-	mov %eax,  4(%eax) # null descriptor
-	mov %eax,  8(%eax) # code segment lower
-	mov %eax, 16(%eax) # data segment lower
+	mov %eax, 0x00(%eax)
+	mov %eax, 0x04(%eax) # null descriptor
+	mov %eax, 0x08(%eax) # code segment lower
+	mov %eax, 0x10(%eax) # data segment lower
 	mov $0x00209a00, %ecx
-	mov %ecx, 12(%eax) # code segment higher
+	mov %ecx, 0x0c(%eax) # code segment higher
 	mov $0x00009200, %ecx
-	mov %ecx, 20(%eax) # data segment higher
+	mov %ecx, 0x14(%eax) # data segment higher
 
     # gdt descriptor
 	push %eax

+ 1 - 2
src/fs/fat.cpp

@@ -9,7 +9,6 @@
 #include <types/allocator.hpp>
 
 #include <fs/fat.hpp>
-#include <kernel/mm.hpp>
 #include <kernel/module.hpp>
 #include <kernel/vfs.hpp>
 
@@ -266,7 +265,7 @@ int fat32::inode_statx(dentry* ent, statx* st, unsigned int mask)
     }
 
     if (mask & STATX_BLOCKS) {
-        st->stx_blocks = align_up<12>(ent->ind->size) / 512;
+        st->stx_blocks = ((ent->ind->size + 0xfff) & ~0xfff) / 512;
         st->stx_blksize = 4096;
         st->stx_mask |= STATX_BLOCKS;
     }

+ 13 - 13
src/kernel/async/lock.cc

@@ -9,8 +9,8 @@ static inline void _raw_spin_lock(spinlock_t* lock_addr)
 {
     asm volatile(
         "%=:\n\t\
-         movl $1, %%eax\n\t\
-         xchgl %%eax, (%0)\n\t\
+         mov $1, %%eax\n\t\
+         xchg %%eax, (%0)\n\t\
          cmp $0, %%eax\n\t\
          jne %=b\n\t\
         "
@@ -22,18 +22,18 @@ static inline void _raw_spin_lock(spinlock_t* lock_addr)
 static inline void _raw_spin_unlock(spinlock_t* lock_addr)
 {
     asm volatile(
-        "movl $0, %%eax\n\
-         xchgl %%eax, (%0)"
+        "mov $0, %%eax\n\
+         xchg %%eax, (%0)"
         :
         : "r"(lock_addr)
         : "eax", "memory");
 }
 
-static inline size_t _save_interrupt_state()
+static inline lock_context_t _save_interrupt_state()
 {
-    size_t retval;
+    lock_context_t retval;
     asm volatile(
-        "pushfq\n\t"
+        "pushf\n\t"
         "pop %0\n\t"
         "cli"
         : "=g"(retval)
@@ -44,13 +44,13 @@ static inline size_t _save_interrupt_state()
     return retval;
 }
 
-static inline void _restore_interrupt_state(size_t flags)
+static inline void _restore_interrupt_state(lock_context_t context)
 {
     asm volatile(
         "push %0\n\t"
         "popf"
         :
-        : "g"(flags)
+        : "g"(context)
         :
         );
 }
@@ -90,7 +90,7 @@ void spin_unlock(spinlock_t& lock)
     preempt_enable();
 }
 
-size_t spin_lock_irqsave(spinlock_t& lock)
+lock_context_t spin_lock_irqsave(spinlock_t& lock)
 {
     auto state = _save_interrupt_state();
     preempt_disable();
@@ -100,7 +100,7 @@ size_t spin_lock_irqsave(spinlock_t& lock)
     return state;
 }
 
-void spin_unlock_irqrestore(spinlock_t& lock, size_t state)
+void spin_unlock_irqrestore(spinlock_t& lock, lock_context_t state)
 {
     _raw_spin_unlock(&lock);
     preempt_enable();
@@ -122,12 +122,12 @@ void mutex::unlock()
     spin_unlock(m_lock);
 }
 
-uint32_t mutex::lock_irq()
+lock_context_t mutex::lock_irq()
 {
     return spin_lock_irqsave(m_lock);
 }
 
-void mutex::unlock_irq(uint32_t state)
+void mutex::unlock_irq(lock_context_t state)
 {
     spin_unlock_irqrestore(m_lock, state);
 }

+ 26 - 20
src/kernel/hw/ahci.cc

@@ -5,8 +5,8 @@
 #include <kernel/hw/pci.hpp>
 #include <kernel/irq.hpp>
 #include <kernel/log.hpp>
+#include <kernel/mem/paging.hpp>
 #include <kernel/mem/phys.hpp>
-#include <kernel/mm.hpp>
 #include <kernel/module.hpp>
 #include <kernel/vfs.hpp>
 
@@ -20,6 +20,7 @@
 
 using namespace kernel::module;
 using namespace kernel::hw::pci;
+using namespace kernel::mem::paging;
 
 using kernel::mem::physaddr;
 
@@ -292,7 +293,7 @@ private:
     received_fis* fis { };
     std::size_t sectors { -1U };
 
-    int send_command(char* buf, uint64_t lba, uint32_t count, uint8_t cmd, bool write)
+    int send_command(physaddr<void> buf, uint64_t lba, uint32_t count, uint8_t cmd, bool write)
     {
         // count must be a multiple of 512
         if (count & (512 - 1))
@@ -302,9 +303,10 @@ private:
         int n = 0;
         // auto n = qu.pop();
 
-        // for now, we read 3.5KB at most at a time
         // command fis and prdt will take up the lower 128+Bytes
-        physaddr<command_table> cmdtable{nullptr}; // TODO: LONG MODE allocate a page
+        // TODO: buffer array
+        pfn_t command_table_pfn = page_to_pfn(alloc_page());
+        physaddr<command_table, false> cmdtable{command_table_pfn};
 
         // construct command header
         memset(cmd_header + n, 0x00, sizeof(command_header));
@@ -334,7 +336,7 @@ private:
 
         // fill in prdt
         auto* pprdt = cmdtable->prdt;
-        pprdt->data_base = cmdtable.phys() + 512;
+        pprdt->data_base = buf.phys();
         pprdt->byte_count = count;
         pprdt->interrupt = 1;
 
@@ -353,16 +355,17 @@ private:
         SPIN(port->command_issue & (1 << n), spins)
             return -1;
 
-        memcpy(buf, cmdtable.cast_to<char*>() + 512, count);
-
-        // TODO: free cmdtable
+        free_page(command_table_pfn);
         return 0;
     }
 
     int identify()
     {
-        char buf[512];
-        int ret = send_command(buf, 0, 512, 0xEC, false);
+        pfn_t buffer_page = page_to_pfn(alloc_page());
+        int ret = send_command(physaddr<void>{buffer_page},
+                0, 512, 0xEC, false);
+
+        free_page(buffer_page);
         if (ret != 0)
             return -1;
         return 0;
@@ -370,40 +373,43 @@ private:
 
 public:
     explicit ahci_port(hba_port* port)
-        // TODO: LONG MODE
-        : cmd_header{nullptr}, port(port) { }
+        : cmd_header{page_to_pfn(alloc_page())}, port(port) { }
 
     ~ahci_port()
     {
         if (!cmd_header)
             return;
-        // TODO: free cmd_header
+        free_page(cmd_header.phys());
     }
 
     ssize_t read(char* buf, std::size_t buf_size, std::size_t offset, std::size_t cnt)
     {
         cnt = std::min(buf_size, cnt);
 
-        constexpr size_t READ_BUF_SECTORS = 6;
+        pfn_t buffer_page = page_to_pfn(alloc_page());
+        physaddr<void> buffer_ptr{buffer_page};
 
-        char b[READ_BUF_SECTORS * 512] {};
         char* orig_buf = buf;
         size_t start = offset / 512;
         size_t end = std::min((offset + cnt + 511) / 512, sectors);
 
         offset -= start * 512;
-        for (size_t i = start; i < end; i += READ_BUF_SECTORS) {
-            size_t n_read = std::min(end - i, READ_BUF_SECTORS) * 512;
-            int status = send_command(b, i, n_read, 0xC8, false);
-            if (status != 0)
+        for (size_t i = start; i < end; i += 4096UL / 512) {
+            size_t n_read = std::min(end - i, 4096UL / 512) * 512;
+            int status = send_command(buffer_ptr, i, n_read, 0xC8, false);
+            if (status != 0) {
+                free_page(buffer_page);
                 return -EIO;
+            }
 
             size_t to_copy = std::min(cnt, n_read - offset);
-            memcpy(buf, b + offset, to_copy);
+            memcpy(buf, (std::byte*)(void*)buffer_ptr + offset, to_copy);
             offset = 0;
             buf += to_copy;
             cnt -= to_copy;
         }
+
+        free_page(buffer_page);
         return buf - orig_buf;
     }
 

+ 59 - 247
src/kernel/interrupt.cpp

@@ -9,13 +9,15 @@
 
 #include <kernel/hw/port.hpp>
 #include <kernel/hw/timer.hpp>
-#include <kernel/interrupt.h>
+#include <kernel/interrupt.hpp>
 #include <kernel/irq.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
 #include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
-#include <kernel/vga.hpp>
+
+#define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
+#define USER_INTERRUPT_GATE_TYPE (0xee)
 
 constexpr kernel::hw::p8 port_pic1_command{0x20};
 constexpr kernel::hw::p8 port_pic1_data{0x21};
@@ -24,103 +26,48 @@ constexpr kernel::hw::p8 port_pic2_data{0xa1};
 
 struct IDT_entry {
     uint16_t offset_low;
-    uint16_t selector;
-    uint8_t zero;
-    uint8_t type_attr;
-    uint16_t offset_high;
-};
+    uint16_t segment;
 
-// interrupt stubs
-extern "C" void irq0(); extern "C" void irq1(); extern "C" void irq2();
-extern "C" void irq3(); extern "C" void irq4(); extern "C" void irq5();
-extern "C" void irq6(); extern "C" void irq7(); extern "C" void irq8();
-extern "C" void irq9(); extern "C" void irq10(); extern "C" void irq11();
-extern "C" void irq12(); extern "C" void irq13(); extern "C" void irq14();
-extern "C" void irq15(); extern "C" void int6(); extern "C" void int8();
-extern "C" void int13(); extern "C" void int14();
-extern "C" void syscall_stub();
+    uint8_t IST;
+    uint8_t attributes;
+
+    uint16_t offset_mid;
+    uint32_t offset_high;
+    uint32_t reserved;
+};
 
-#define SET_UP_IRQ(N, SELECTOR)                   \
-    uintptr_t addr_irq##N = (uintptr_t)irq##N;            \
-    set_idt_entry(IDT, 0x20 + (N), (addr_irq##N), \
-        (SELECTOR), KERNEL_INTERRUPT_GATE_TYPE);
+static struct IDT_entry IDT[256];
 
-#define SET_IDT_ENTRY_FN(N, FUNC_NAME, SELECTOR, TYPE) \
-    uintptr_t addr_##FUNC_NAME = (uintptr_t)FUNC_NAME;         \
-    set_idt_entry(IDT, (N), (addr_##FUNC_NAME), (SELECTOR), (TYPE));
+extern "C" uintptr_t ISR_START_ADDR;
 
 SECTION(".text.kinit")
-static void set_idt_entry(IDT_entry (&idt)[256], int n,
+static inline void set_idt_entry(IDT_entry (&idt)[256], int n,
     uintptr_t offset, uint16_t selector, uint8_t type)
 {
     idt[n].offset_low = offset & 0xffff;
-    idt[n].selector = selector;
-    idt[n].zero = 0;
-    idt[n].type_attr = type;
-    idt[n].offset_high = (offset >> 16) & 0xffff;
+    idt[n].segment = selector;
+    idt[n].IST = 0;
+    idt[n].attributes = type;
+    idt[n].offset_mid = (offset >> 16) & 0xffff;
+    idt[n].offset_high = (offset >> 32) & 0xffffffff;
+    idt[n].reserved = 0;
 }
 
-// idt_descriptor: uint16_t[3]
-// [0] bit 0 :15 => limit
-// [1] bit 16:47 => address
-extern "C" void asm_load_idt(uint16_t idt_descriptor[3], int sti);
-
-static struct IDT_entry IDT[256];
-
-static inline void NORETURN die(regs_64& regs, void* rip)
-{
-    kmsgf( "***** KERNEL PANIC *****\n"
-           "rax: %llx, rbx: %llx, rcx: %llx, rdx: %llx\n"
-           "rsp: %llx, rbp: %llx, rsi: %llx, rdi: %llx\n"
-           "r8 : %llx, r9 : %llx, r10: %llx, r11: %llx\n"
-           "r12: %llx, r13: %llx, r14: %llx, r15: %llx\n"
-           "rip: %llx\n",
-           regs.rax, regs.rbx, regs.rcx, regs.rdx,
-           regs.rsp, regs.rbp, regs.rsi, regs.rdi,
-           regs.r8 , regs.r9 , regs.r10, regs.r11,
-           regs.r12, regs.r13, regs.r14, regs.r15, rip);
-    freeze();
-}
+using kernel::irq::irq_handler_t;
+static std::vector<std::list<irq_handler_t>> s_irq_handlers;
 
 SECTION(".text.kinit")
-void init_idt()
+void kernel::kinit::init_interrupt()
 {
-    memset(IDT, 0x00, sizeof(IDT));
-
-    // invalid opcode
-    SET_IDT_ENTRY_FN(6, int6, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // double fault
-    SET_IDT_ENTRY_FN(8, int8, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // general protection
-    SET_IDT_ENTRY_FN(13, int13, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // page fault
-    SET_IDT_ENTRY_FN(14, int14, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // system call
-    SET_IDT_ENTRY_FN(0x80, syscall_stub, 0x08, USER_INTERRUPT_GATE_TYPE);
+    for (int i = 0; i < 0x30; ++i)
+        set_idt_entry(IDT, i, ISR_START_ADDR+8*i, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
 
     uint64_t idt_descriptor[2];
     idt_descriptor[0] = (sizeof(IDT_entry) * 256) << 48;
     idt_descriptor[1] = (uintptr_t)IDT;
 
-    asm volatile(
-            "lidt (%0)"
-            :
-            : "r"((uintptr_t)idt_descriptor + 6)
-            :
-            );
-}
-
-using kernel::irq::irq_handler_t;
-static std::vector<std::list<irq_handler_t>> s_irq_handlers;
-
-void kernel::irq::register_handler(int irqno, irq_handler_t handler)
-{
-    s_irq_handlers[irqno].emplace_back(std::move(handler));
-}
-
-SECTION(".text.kinit")
-void init_pic(void)
-{
+    // initialize PIC
+    asm volatile("lidt (%0)": :"r"((uintptr_t)idt_descriptor + 6): );
     s_irq_handlers.resize(16);
 
     // TODO: move this to timer driver
@@ -142,176 +89,42 @@ void init_pic(void)
     // allow all the interrupts
     port_pic1_data = 0x00;
     port_pic2_data = 0x00;
-
-    // 0x08 stands for kernel code segment
-    SET_UP_IRQ(0, 0x08);
-    SET_UP_IRQ(1, 0x08);
-    SET_UP_IRQ(2, 0x08);
-    SET_UP_IRQ(3, 0x08);
-    SET_UP_IRQ(4, 0x08);
-    SET_UP_IRQ(5, 0x08);
-    SET_UP_IRQ(6, 0x08);
-    SET_UP_IRQ(7, 0x08);
-    SET_UP_IRQ(8, 0x08);
-    SET_UP_IRQ(9, 0x08);
-    SET_UP_IRQ(10, 0x08);
-    SET_UP_IRQ(11, 0x08);
-    SET_UP_IRQ(12, 0x08);
-    SET_UP_IRQ(13, 0x08);
-    SET_UP_IRQ(14, 0x08);
-    SET_UP_IRQ(15, 0x08);
-}
-
-extern "C" void int6_handler(
-    regs_64 s_regs,
-    void* rip,
-    uint64_t cs,
-    uint64_t rflags,
-    uint64_t rsp,
-    uint64_t ss)
-{
-    if (!current_process->attr.system)
-        kill_current(SIGSEGV); // noreturn
-
-    kmsgf("[kernel] int6: cs: %llx, rflags: %llx, rsp: %llx, ss: %llx",
-            cs, rflags, rsp, ss);
-    die(s_regs, rip); // noreturn
-}
-
-// general protection
-extern "C" void int13_handler(
-    regs_64 s_regs,
-    uint64_t error_code,
-    void* rip,
-    uint64_t cs,
-    uint64_t rflags,
-    uint64_t rsp,
-    uint64_t ss)
-{
-    if (!current_process->attr.system)
-        kill_current(SIGILL); // noreturn
-
-    kmsgf("[kernel] int13: error_code: %llx, cs: %llx, rflags: %llx, rsp: %llx, ss: %llx",
-            error_code, cs, rflags, rsp, ss);
-
-    die(s_regs, rip); // noreturn
 }
 
-struct PACKED int14_data {
-    void* l_addr;
-    regs_64 s_regs;
-    page_fault_error_code error_code;
-    void* v_eip;
-    uint32_t cs;
-    uint32_t eflags;
-};
-
-static inline void _int14_panic(
-        void* rip, void* cr2,
-        struct page_fault_error_code error_code)
-{
-    kmsgf("[kernel] int14: rip: %p, cr2: %p, error_code: %llx",
-          rip, cr2, error_code);
-    freeze();
-}
-
-static inline void NORETURN _int14_kill_user(void)
+void kernel::irq::register_handler(int irqno, irq_handler_t handler)
 {
-    kill_current(SIGSEGV);
+    s_irq_handlers[irqno].emplace_back(std::move(handler));
 }
 
-// page fault
-extern "C" void int14_handler(int14_data* d)
+extern "C" void interrupt_handler(
+        interrupt_stack_head* context,
+        mmx_registers* mmxregs)
 {
-    kernel::mem::mm_list* mms = nullptr;
-    if (current_process) [[likely]]
-        mms = &current_process->mms;
-    else
-        mms = kernel::mem::mm_list::s_kernel_mms;
-
-    auto* mm_area = mms->find(d->l_addr);
-    if (!mm_area) [[unlikely]] {
-        if (d->error_code.user) {
-            // user access of address that does not exist
-            _int14_kill_user();
-        } else {
-            _int14_panic(d->v_eip, d->l_addr, d->error_code);
+    // interrupt is a fault
+    if (context->int_no < 0x20) {
+        auto* with_code = (interrupt_stack_with_code*)context;
+
+        switch (context->int_no) {
+        case 6:
+        case 8: {
+            if (!current_process->attr.system)
+                kill_current(SIGSEGV); // noreturn
+        } break;
+        case 13: {
+            if (!current_process->attr.system)
+                kill_current(SIGILL); // noreturn
+        } break;
+        case 14: {
+            kernel::mem::paging::handle_page_fault(with_code->error_code);
+            context->int_no = (unsigned long)context + 0x80;
+        }
         }
+        freeze();
     }
-    if (d->error_code.user && mm_area->attr.system)
-        _int14_kill_user();
+    auto* real_context = (interrupt_stack_normal*)context;
 
-    // TODO: LONG MODE
-    // kernel::mem::paging::page* page = &(*mm_area->pgs)[vptrdiff(d->l_addr, mm_area->start) >> 12];
-    // kernel::paccess pa(page->pg_pteidx >> 12);
-    // auto pt = (pt_t)pa.ptr();
-    // assert(pt);
-    // pte_t* pte = *pt + (page->pg_pteidx & 0xfff);
+    int irqno = context->int_no - 0x20;
 
-    // if (unlikely(d->error_code.present == 0 && !mm_area->mapped_file))
-    //     _int14_panic(d->v_eip, d->l_addr, d->error_code);
-
-    // if (page->attr & PAGE_COW) {
-    //     // if it is a dying page
-    //     if (*page->ref_count == 1) {
-    //         page->attr &= ~PAGE_COW;
-    //         pte->in.p = 1;
-    //         pte->in.a = 0;
-    //         pte->in.rw = mm_area->attr.write;
-    //         return;
-    //     }
-    //     // duplicate the page
-    //     page_t new_page = __alloc_raw_page();
-
-    //     {
-    //         kernel::paccess pdst(new_page), psrc(page->phys_page_id);
-    //         auto* new_page_data = (char*)pdst.ptr();
-    //         auto* src = psrc.ptr();
-    //         assert(new_page_data && src);
-    //         memcpy(new_page_data, src, PAGE_SIZE);
-    //     }
-
-    //     pte->in.page = new_page;
-    //     pte->in.rw = mm_area->attr.write;
-    //     pte->in.a = 0;
-
-    //     --*page->ref_count;
-
-    //     page->ref_count = types::memory::kinew<size_t>(1);
-    //     page->attr &= ~PAGE_COW;
-    //     page->phys_page_id = new_page;
-    // }
-
-    // if (page->attr & PAGE_MMAP) {
-    //     pte->in.p = 1;
-
-    //     size_t offset = align_down<12>((std::size_t)d->l_addr);
-    //     offset -= (std::size_t)mm_area->start;
-
-    //     kernel::paccess pa(page->phys_page_id);
-    //     auto* data = (char*)pa.ptr();
-    //     assert(data);
-
-    //     int n = vfs_read(
-    //         mm_area->mapped_file,
-    //         data,
-    //         PAGE_SIZE,
-    //         mm_area->file_offset + offset,
-    //         PAGE_SIZE);
-
-    //     // TODO: send SIGBUS if offset is greater than real size
-    //     if (n != PAGE_SIZE)
-    //         memset(data + n, 0x00, PAGE_SIZE - n);
-
-    //     page->attr &= ~PAGE_MMAP;
-    // }
-}
-
-extern "C" void irq_handler(
-    int irqno,
-    interrupt_stack* context,
-    mmx_registers* mmxregs)
-{
     constexpr uint8_t PIC_EOI = 0x20;
 
     port_pic1_command = PIC_EOI;
@@ -321,10 +134,9 @@ extern "C" void irq_handler(
     for (const auto& handler : s_irq_handlers[irqno])
         handler();
 
-    // TODO: LONG MODE
-    // if (context->cs != USER_CODE_SEGMENT)
-    //     return;
+    if (real_context->cs == 0x1b && current_thread->signals.pending_signal())
+        current_thread->signals.handle(real_context, mmxregs);
 
-    if (current_thread->signals.pending_signal())
-        current_thread->signals.handle(context, mmxregs);
+    context->int_no = (unsigned long)context + 0x78;
+    return;
 }

+ 0 - 346
src/kernel/mem.cpp

@@ -1,346 +0,0 @@
-#include <cstddef>
-
-#include <assert.h>
-#include <errno.h>
-#include <stdint.h>
-#include <stdio.h>
-
-#include <kernel/mem/paging.hpp>
-#include <kernel/mm.hpp>
-#include <kernel/process.hpp>
-#include <kernel/task.h>
-#include <kernel/vga.hpp>
-
-#include <types/allocator.hpp>
-
-void dealloc_pd(kernel::mem::paging::pfn_t pd)
-{
-    // TODO: LONG MODE
-    // {
-    //     kernel::paccess pa(pd);
-    //     auto p_pd = (pd_t)pa.ptr();
-    //     assert(p_pd);
-    //     for (pde_t* ent = (*p_pd); ent < (*p_pd) + 768; ++ent) {
-    //         if (!ent->in.p)
-    //             continue;
-    //         __free_raw_page(ent->in.pt_page);
-    //     }
-    // }
-    // __free_raw_page(pd);
-}
-
-using kernel::mem::mm_list;
-using kernel::mem::mm;
-
-mm_list::mm_list()
-    : m_areas(s_kernel_mms->m_areas)
-{
-    // TODO: LONG MODE
-    // m_pd = __alloc_raw_page();
-    // kernel::paccess pdst(m_pd), psrc(s_kernel_mms->m_pd);
-    // auto* dst = pdst.ptr();
-    // auto* src = psrc.ptr();
-    // assert(dst && src);
-    // memcpy(dst, src, PAGE_SIZE);
-}
-
-mm_list::mm_list(const mm_list& other)
-    : mm_list()
-{
-    m_brk = other.m_brk;
-    for (auto& src : other.m_areas) {
-        if (src.is_kernel_space() || src.attr.system)
-            continue;
-
-        auto& area = this->addarea(
-            src.start, src.attr.write, src.attr.system);
-
-        if (src.attr.mapped) {
-            area.attr.mapped = 1;
-            area.mapped_file = src.mapped_file;
-            area.file_offset = src.file_offset;
-        }
-
-        // TODO: LONG MODE
-        // paccess pa(m_pd);
-        // pd_t pd = (pd_t)pa.ptr();
-
-        // for (const auto& pg : *src.pgs) {
-        //     area.append_page(pd, pg,
-        //             PAGE_COW | (pg.attr & PAGE_MMAP),
-        //             src.attr.system);
-        // }
-    }
-}
-
-mm_list::~mm_list()
-{
-    if (!m_pd)
-        return;
-
-    clear_user();
-    dealloc_pd(m_pd);
-}
-
-void mm_list::switch_pd() const
-{
-    // TODO: LONG MODE
-    // asm_switch_pd(m_pd);
-}
-
-int mm_list::register_brk(void* addr)
-{
-    if (!is_avail(addr))
-        return -ENOMEM;
-    m_brk = &addarea(addr, true, false);
-    return 0;
-}
-
-void* mm_list::set_brk(void* addr)
-{
-    assert(m_brk);
-    void* curbrk = m_brk->end();
-
-    if (addr <= curbrk || !is_avail(curbrk, vptrdiff(addr, curbrk)))
-        return curbrk;
-
-    // TODO: LONG MODE
-    // kernel::paccess pa(m_pd);
-    // pd_t pd = (pd_t)pa.ptr();
-
-    // while (curbrk < addr) {
-    //     m_brk->append_page(pd, empty_page, PAGE_COW, false);
-    //     curbrk = (char*)curbrk + PAGE_SIZE;
-    // }
-
-    return curbrk;
-}
-
-void* mm_list::find_avail(void* hint, size_t len, bool priv) const
-{
-    void* addr = hint;
-    if (!addr) {
-        // default value of mmapp'ed area
-        if (!priv)
-            addr = (void*)0x40000000;
-        else
-            addr = (void*)0xe0000000;
-    }
-
-    while (!is_avail(addr, len)) {
-        auto iter = m_areas.lower_bound(addr);
-        if (iter == m_areas.end())
-            return nullptr;
-
-        addr = iter->end();
-    }
-
-    if (!priv && addr >= (void*)0xc0000000)
-        return nullptr;
-
-    return addr;
-}
-
-// TODO: write dirty pages to file
-int mm_list::unmap(void* start, size_t len, bool system)
-{
-    uintptr_t addr = (uintptr_t)start;
-    void* end = vptradd(start, align_up<12>(len));
-
-    // standard says that addr and len MUST be
-    // page-aligned or the call is invalid
-    if ((addr & 0xfff) != 0)
-        return -EINVAL;
-
-    // if doing user mode unmapping, check area privilege
-    if (!system) {
-        if (addr >= 0xc0000000 || end > (void*)0xc0000000)
-            return -EINVAL;
-    }
-
-    auto iter = m_areas.lower_bound(start);
-
-    // TODO: LONG MODE
-    for ( ; iter != m_areas.end() && *iter < end; ) {
-        if (!(start < *iter) && start != iter->start) {
-            mm newmm = iter->split(start);
-            // unmap(newmm);
-            ++iter;
-            continue;
-        }
-        else if (!(*iter < end)) {
-            mm newmm = iter->split(end);
-            // unmap(*iter);
-            m_areas.erase(iter);
-
-            bool inserted;
-            std::tie(std::ignore, inserted) = m_areas.emplace(std::move(newmm));
-            assert(inserted);
-            break;
-        }
-        else {
-            // unmap(*iter);
-            iter = m_areas.erase(iter);
-        }
-    }
-
-    return 0;
-}
-
-mm& mm_list::add_empty_area(void *start, std::size_t page_count,
-    uint32_t page_attr, bool w, bool system)
-{
-    // TODO: LONG MODE
-    // auto& area = addarea(start, w, system);
-    // kernel::paccess pa(m_pd);
-    // pd_t pd = (pd_t)pa.ptr();
-
-    // while (page_count--)
-    //     area.append_page(pd, empty_page, page_attr, system);
-
-    // return area;
-}
-
-// TODO: LONG MODE
-// constexpr void map_raw_page_to_pte(
-//     pte_t* pte, kernel::mem::paging::pfn_t page,
-//     bool present, bool write, bool priv)
-// {
-//     // set P bit
-//     pte->v = 0;
-//     pte->in.p = present;
-//     pte->in.rw = write;
-//     pte->in.us = !priv;
-//     pte->in.page = page;
-// }
-
-// TODO: LONG MODE
-// void mm::append_page(pd_t pd, const page& pg, uint32_t attr, bool priv)
-// {
-//     assert(pd);
-// 
-//     void* addr = this->end();
-//     pde_t* pde = *pd + v_to_pdi(addr);
-// 
-//     kernel::mem::paging::pfn_t pt_pg = 0;
-//     pte_t* pte = nullptr;
-//     // page table not exist
-//     if (!pde->in.p) [[unlikely]] {
-//         // allocate a page for the page table
-//         pt_pg = __alloc_raw_page();
-//         pde->in.p = 1;
-//         pde->in.rw = 1;
-//         pde->in.us = 1;
-//         pde->in.pt_page = pt_pg;
-// 
-//         auto pt = (pt_t)kernel::pmap(pt_pg);
-//         assert(pt);
-//         pte = *pt;
-// 
-//         memset(pt, 0x00, PAGE_SIZE);
-//     } else {
-//         pt_pg = pde->in.pt_page;
-//         auto pt = (pt_t)kernel::pmap(pt_pg);
-//         assert(pt);
-//         pte = *pt;
-//     }
-// 
-//     // map the page in the page table
-//     int pti = v_to_pti(addr);
-//     pte += pti;
-// 
-//     map_raw_kernel::mem::paging::pfn_to_pte(
-//         pte,
-//         pg.phys_page_id,
-//         !(attr & PAGE_MMAP),
-//         false,
-//         priv);
-// 
-//     kernel::pfree(pt_pg);
-// 
-//     if (unlikely((attr & PAGE_COW) && !(pg.attr & PAGE_COW))) {
-//         kernel::paccess pa(pg.pg_pteidx >> 12);
-//         auto* pg_pte = (pte_t*)pa.ptr();
-//         assert(pg_pte);
-//         pg_pte += (pg.pg_pteidx & 0xfff);
-//         pg.attr |= PAGE_COW;
-//         pg_pte->in.rw = 0;
-//         pg_pte->in.a = 0;
-//         invalidate_tlb(addr);
-//     }
-// 
-//     ++*pg.ref_count;
-// 
-//     this->pgs->emplace_back(pg);
-//     auto& emplaced = this->pgs->back();
-//     emplaced.pg_pteidx = (pt_pg << 12) + pti;
-//     emplaced.attr = attr;
-// }
-
-mm mm::split(void *addr)
-{
-    assert(addr > start && addr < end());
-    assert((uintptr_t)addr % 4096 == 0);
-
-    size_t this_count = vptrdiff(addr, start) / 4096;
-    size_t new_count = page_count - this_count;
-
-    mm newmm {
-        .start = addr,
-        .attr { attr },
-        .mapped_file = mapped_file,
-        .file_offset = attr.mapped ? file_offset + this_count * 4096 : 0,
-        .page_count = 0,
-    };
-
-    // TODO:
-    // for (size_t i = 0; i < new_count; ++i) {
-    //     newmm.pgs->emplace_back(pgs->back());
-    //     pgs->pop_back();
-    // }
-
-    return newmm;
-}
-
-int mmap(
-    void* hint,
-    size_t len,
-    fs::inode* file,
-    size_t offset,
-    int write,
-    int priv)
-{
-    auto& mms = current_process->mms;
-
-    if (file && !S_ISREG(file->mode) && !S_ISBLK(file->mode)) [[unlikely]]
-        return -EINVAL;
-
-    // TODO: find another address
-    assert(((uintptr_t)hint & 0xfff) == 0);
-    // TODO: return failed
-    assert((offset & 0xfff) == 0);
-
-    size_t n_pgs = align_up<12>(len) >> 12;
-
-    if (!mms.is_avail(hint, len))
-        return -EEXIST;
-
-    // TODO: LONG MODE
-    using namespace kernel::mem::paging;
-
-    if (file) {
-        auto& mm = mms.add_empty_area(hint, n_pgs, PA_MMAP | PA_COW, write, priv);
-
-        mm.attr.mapped = 1;
-        mm.mapped_file = file;
-        mm.file_offset = offset;
-    }
-    else {
-        // private mapping of zero-filled pages
-        auto& mm = mms.add_empty_area(hint, n_pgs, PA_COW, write, priv);
-
-        mm.attr.mapped = 0;
-    }
-
-    return 0;
-}

+ 322 - 0
src/kernel/mem/mm_list.cc

@@ -0,0 +1,322 @@
+#include <assert.h>
+#include <stdint.h>
+
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/vm_area.hpp>
+
+using namespace kernel::mem;
+
+// Recursively release the paging structure rooted at `pt`.
+//
+// depth:    level of `pt` in the hierarchy; a table at depth 1 has no
+//           children to descend into.
+// from, to: half-open range of entry indices of `pt` to descend into.
+//           Child tables are always walked in full (0..512).
+//
+// Only entries that have PA_P set are followed. `pt` itself is freed last.
+static inline void __dealloc_page_table_all(
+        paging::pfn_t pt, int depth, int from, int to)
+{
+    using namespace paging;
+
+    if (depth > 1) {
+        for (int i = from; i < to; ++i) {
+            auto pse = PSE{pt}[i];
+            if (!(pse.attributes() & PA_P))
+                continue;
+
+            // keep the full width of pfn_t: pfn values are frame index * 0x1000
+            // (see page_to_pfn), so an `int` would truncate high frames
+            pfn_t pfn = pse.pfn();
+            __dealloc_page_table_all(pfn, depth-1, 0, 512);
+        }
+    }
+
+    free_page(pt);
+}
+
+// Release the top-level table `pt` together with every lower-level table
+// reachable from its entries in [idx_p4(0), idx_p4(KERNEL_SPACE_START)).
+static inline void __dealloc_page_table(paging::pfn_t pt)
+{
+    using namespace paging;
+
+    // depth 4: `pt` is the top of a four-level hierarchy
+    __dealloc_page_table_all(
+            pt, 4, idx_p4(0), idx_p4(KERNEL_SPACE_START));
+}
+
+// Create an address space with no areas: allocate a fresh top-level table
+// and fill it with a copy of the 4 KiB found at KERNEL_PAGE_TABLE_PHYS_ADDR.
+// m_brk starts out as end(), i.e. no break area is registered.
+mm_list::mm_list()
+    : m_pt{paging::alloc_page_table()}
+    , m_brk{m_areas.end()}
+{
+    physaddr<void> new_table{m_pt};
+    memcpy(new_table, paging::KERNEL_PAGE_TABLE_PHYS_ADDR, 0x1000);
+}
+
+// Build a new address space that shares every page of `other` copy-on-write.
+//
+// The area list is copied, then for every page of every area:
+//  - this table gets the same pfn with PA_RW/PA_A/PA_D cleared and PA_COW set,
+//  - the entry in `other` loses PA_RW and gains PA_COW as well, so a write
+//    from either side goes through the page fault handler,
+//  - the page's reference count is increased.
+mm_list::mm_list(const mm_list& other): mm_list{}
+{
+    m_areas = other.m_areas;
+
+    using namespace paging;
+    for (auto iter = m_areas.begin(); iter != m_areas.end(); ++iter) {
+        const auto& area = *iter;
+
+        // the delegated constructor left m_brk at end(); point it at our own
+        // copy of the break area so that set_brk() keeps working on the copy
+        if (area.flags & MM_BREAK)
+            m_brk = iter;
+
+        auto this_iter = vaddr_range{m_pt, area.start, area.end};
+        auto other_iter = vaddr_range{other.m_pt, area.start, area.end};
+
+        while (this_iter) {
+            auto this_pte = *this_iter, other_pte = *other_iter;
+            auto attributes = other_pte.attributes();
+            auto pfn = other_pte.pfn();
+
+            // the source keeps its accessed/dirty bits but must stop being
+            // writable, otherwise its writes would show up in the copy
+            other_pte.set((attributes & ~PA_RW) | PA_COW, pfn);
+
+            attributes &= ~(PA_RW | PA_A | PA_D);
+            attributes |= PA_COW;
+            this_pte.set(attributes, pfn);
+
+            increase_refcount(pfn_to_page(pfn));
+
+            ++this_iter, ++other_iter;
+        }
+    }
+
+    // entries of `other` were downgraded above; reload cr3 so that no stale
+    // writable translation survives if `other` is the active address space
+    asm volatile("mov %%cr3, %%rax\n\t mov %%rax, %%cr3": : : "rax", "memory");
+}
+
+// Unmap every area and release the page tables. Does nothing when m_pt
+// is zero.
+mm_list::~mm_list()
+{
+    if (m_pt) {
+        clear();
+        __dealloc_page_table(m_pt);
+    }
+}
+
+// Tell whether the page-aligned range covering [start, start + len) lies
+// below USER_SPACE_MEMORY_TOP and is reported available by every
+// registered area.
+//
+// start: first byte of the range, may be unaligned (rounded down).
+// len:   length in bytes; the end is rounded up to a page boundary.
+bool mm_list::is_avail(uintptr_t start, std::size_t len) const noexcept
+{
+    // round the end up from the *original* start so that an unaligned
+    // start does not shorten the range that gets checked
+    uintptr_t end = (start + len + 0xfff) & ~0xfff;
+    start &= ~0xfff;
+
+    // start + len wrapped around the address space
+    if (end < start)
+        return false;
+
+    if (end > USER_SPACE_MEMORY_TOP)
+        return false;
+
+    for (const auto& area : m_areas) {
+        if (!area.is_avail(start, end))
+            return false;
+    }
+    return true;
+}
+
+// Tell whether `addr` is below USER_SPACE_MEMORY_TOP and m_areas.find()
+// yields no area for it.
+bool mm_list::is_avail(uintptr_t addr) const
+{
+    const bool below_top = addr < USER_SPACE_MEMORY_TOP;
+    return below_top && m_areas.find(addr) == m_areas.end();
+}
+
+// Search for an available range of `len` bytes.
+//
+// hint: preferred start address; 0 selects MMAP_MIN_ADDR.
+// Returns the start of the first suitable range found, or 0 if there is
+// no area at or after the current candidate to skip over.
+uintptr_t mm_list::find_avail(uintptr_t hint, size_t len) const
+{
+    uintptr_t candidate = hint;
+    if (candidate == 0)
+        candidate = MMAP_MIN_ADDR;
+
+    for (;;) {
+        if (is_avail(candidate, len))
+            return candidate;
+
+        // retry right behind the area found by lower_bound()
+        auto blocking = m_areas.lower_bound(candidate);
+        if (blocking == m_areas.end())
+            return 0;
+
+        candidate = blocking->end;
+    }
+}
+
+// Activate this address space by loading m_pt into cr3.
+void mm_list::switch_pd() const noexcept
+{
+    asm volatile("mov %0, %%cr3": : "r"(m_pt): "memory");
+}
+
+// Register the program break area starting at `addr`.
+//
+// Returns 0 on success or -ENOMEM if `addr` is not available. On success
+// m_brk refers to the newly inserted area, which is flagged
+// MM_ANONYMOUS | MM_WRITE | MM_BREAK.
+int mm_list::register_brk(uintptr_t addr)
+{
+    if (!is_avail(addr))
+        return -ENOMEM;
+
+    auto [ brk_area, inserted ] = m_areas.emplace(
+            addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
+    m_brk = brk_area;
+
+    assert(inserted);
+    return 0;
+}
+
+// Grow the program break up to `addr` (rounded up to a page boundary).
+//
+// Requires that a break area has been registered (asserted).
+// Returns the resulting break: the unchanged current break if the request
+// does not grow it or the additional range is not available, otherwise the
+// new page-aligned break. New pages are mapped to EMPTY_PAGE_PFN with
+// PA_ANONYMOUS_PAGE | PA_NXE.
+uintptr_t mm_list::set_brk(uintptr_t addr)
+{
+    using namespace paging;
+    assert(m_brk != m_areas.end());
+    uintptr_t curbrk = m_brk->end;
+
+    addr += 4096-1;
+    addr &= ~0xfff;
+
+    if (addr <= curbrk || !is_avail(curbrk, addr - curbrk))
+        return curbrk;
+
+    for (auto pte : vaddr_range{m_pt, curbrk, addr})
+        pte.set(PA_ANONYMOUS_PAGE | PA_NXE, EMPTY_PAGE_PFN);
+
+    m_brk->end = addr;
+
+    // report the break that is now in effect; returning the old value here
+    // would make a successful extension look like a failure to the caller
+    return addr;
+}
+
+// Unmap every area and empty the area list. Afterwards no break area is
+// registered (m_brk == m_areas.end()).
+void mm_list::clear()
+{
+    for (auto iter = m_areas.begin(); iter != m_areas.end(); ++iter)
+        unmap(iter);
+
+    m_areas.clear();
+
+    // the break area was destroyed with the rest; do not keep an iterator
+    // to it, set_brk() relies on end() meaning "not registered"
+    m_brk = m_areas.end();
+}
+
+// Split `area` at `addr` into [area->start, addr) and [addr, old end).
+//
+// addr must be page aligned and strictly inside the area (asserted).
+// The new upper area inherits the flags and mapped file; if a file is
+// mapped, its file offset is advanced by the size of the lower part.
+// Returns an iterator to the new upper area.
+mm_list::iterator mm_list::split(iterator area, uintptr_t addr)
+{
+    assert(!(addr & 0xfff));
+    assert(addr > area->start && addr < area->end);
+
+    std::size_t old_len = addr - area->start;
+    std::size_t new_file_offset = 0;
+
+    if (area->mapped_file)
+        new_file_offset = area->file_offset + old_len;
+
+    // shrink the existing area before inserting the new one so that the two
+    // never overlap inside the container, whatever ordering vm_area uses
+    const auto old_end = area->end;
+    area->end = addr;
+
+    auto [ iter, inserted ] =
+        m_areas.emplace(addr, area->flags, old_end,
+                area->mapped_file, new_file_offset);
+
+    assert(inserted);
+    return iter;
+}
+
+// Release every page of `area` and clear its page table entries.
+// The area itself stays in m_areas; removing it is up to the caller.
+//
+// Areas of at most 0x4000 bytes are flushed with one invlpg per page,
+// larger ones by reloading cr3 once at the end. Always returns 0.
+int mm_list::unmap(iterator area)
+{
+    using namespace paging;
+
+    const auto first = area->start;
+    const auto last = area->end;
+
+    const bool flush_per_page = last - first <= 0x4000;
+    uintptr_t flush_addr = first;
+
+    // TODO: write back dirty pages
+    for (auto pte : vaddr_range{m_pt, first, last}) {
+        free_page(pte.pfn());
+        pte.clear();
+
+        if (!flush_per_page)
+            continue;
+
+        asm volatile("invlpg (%0)": :"r"(flush_addr) :"memory");
+        flush_addr += 0x1000;
+    }
+
+    if (!flush_per_page)
+        asm volatile("mov %%cr3, %%rax\n\t mov %%rax, %%cr3": : : "rax", "memory");
+
+    return 0;
+}
+
+// Unmap the range [start, start + length), with the end rounded up to a
+// page boundary. Areas that only partly overlap the range are split first
+// so that exactly the covered part is unmapped and erased.
+//
+// Returns 0 on success, -EINVAL if start is not page aligned or the end
+// lies above KERNEL_SPACE_START, -ENOMEM if the end lies above
+// USER_SPACE_MEMORY_TOP, or the first non-zero result of unmap(iterator).
+int mm_list::unmap(uintptr_t start, std::size_t length)
+{
+    // standard says that addr and len MUST be
+    // page-aligned or the call is invalid
+    if (start & 0xfff)
+        return -EINVAL;
+
+    uintptr_t end = (start + length + 0xfff) & ~0xfff;
+
+    // check address validity
+    if (end > KERNEL_SPACE_START)
+        return -EINVAL;
+    if (end > USER_SPACE_MEMORY_TOP)
+        return -ENOMEM;
+
+    // candidate areas: everything from lower_bound(start) up to, but not
+    // including, upper_bound(end)
+    auto iter = m_areas.lower_bound(start);
+    auto iter_end = m_areas.upper_bound(end);
+
+    // start <= iter <= end a.k.a. !(start > *iter) && !(*iter > end)
+    while (iter != iter_end) {
+        // start == iter:
+        // start is between (iter->start, iter->end)
+        //
+        // strip out the area before start
+        // (split() returns the upper part, which begins at start)
+        if (!(start < *iter) && start != iter->start)
+            iter = split(iter, start);
+
+        // iter.end <= end
+        // it is safe to unmap the area directly
+        if (*iter < end) {
+            if (int ret = unmap(iter); ret != 0)
+                return ret;
+
+            iter = m_areas.erase(iter);
+            continue;
+        }
+
+        // end == iter:
+        // end is between [iter->start, iter->end)
+        //
+        // if end == iter->start, no need to strip the area
+        if (end == iter->start) {
+            ++iter;
+            continue;
+        }
+
+        // keep the part from end onwards as a separate area; iter still
+        // refers to the lower part, which is unmapped and erased below
+        (void)split(iter, end);
+        if (int ret = unmap(iter); ret != 0)
+            return ret;
+
+        iter = m_areas.erase(iter);
+
+        // no need to check areas after this
+        break;
+    }
+
+    return 0;
+}
+
+// Create a new area described by `args` and point all of its pages at
+// EMPTY_PAGE_PFN.
+//
+// args.vaddr, args.file_offset and args.length must be page aligned and
+// the length non-zero (asserted). For MM_MAPPED a regular or block file
+// inode is required (asserted).
+//
+// Returns 0 on success, -EEXIST if the range is not available, -EINVAL if
+// neither MM_MAPPED nor MM_ANONYMOUS is requested.
+int mm_list::mmap(const map_args& args)
+{
+    auto& vaddr = args.vaddr;
+    auto& length = args.length;
+    auto& finode = args.file_inode;
+    auto& foff = args.file_offset;
+    auto& flags = args.flags;
+
+    assert((vaddr & 0xfff) == 0 && (foff & 0xfff) == 0);
+    assert((length & 0xfff) == 0 && length != 0);
+
+    if (!is_avail(vaddr, length))
+        return -EEXIST;
+
+    const bool file_backed = flags & MM_MAPPED;
+    if (!file_backed && !(flags & MM_ANONYMOUS))
+        return -EINVAL;
+
+    using namespace kernel::mem::paging;
+
+    // PA_RW is set during page fault while PA_NXE is preserved
+    // so we set PA_NXE now
+    psattr_t attributes = (flags & MM_EXECUTE) ? 0 : PA_NXE;
+
+    if (file_backed) {
+        assert(finode);
+        assert(S_ISREG(finode->mode) || S_ISBLK(finode->mode));
+
+        auto [ area, inserted ] = m_areas.emplace(
+                vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, finode, foff);
+        assert(inserted);
+
+        attributes |= PA_MMAPPED_PAGE;
+    }
+    else {
+        // private mapping of zero-filled pages
+        // TODO: shared mapping
+        auto [ area, inserted ] =
+            m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
+        assert(inserted);
+
+        attributes |= PA_ANONYMOUS_PAGE;
+    }
+
+    // both kinds of mapping start out pointing at the shared empty page
+    for (auto pte : vaddr_range{m_pt, vaddr, vaddr + length})
+        pte.set(attributes, EMPTY_PAGE_PFN);
+
+    return 0;
+}

+ 284 - 18
src/kernel/mem/paging.cc

@@ -1,16 +1,43 @@
 #include <assert.h>
 #include <string.h>
 
+#include <types/list.hpp>
+
+#include <kernel/async/lock.hpp>
+#include <kernel/log.hpp>
+#include <kernel/mem/mm_list.hpp>
 #include <kernel/mem/paging.hpp>
 #include <kernel/mem/slab.hpp>
+#include <kernel/mem/vm_area.hpp>
+#include <kernel/process.hpp>
+
+using namespace types::list;
 
+using namespace kernel::async;
 using namespace kernel::mem::paging;
 
+// Log the faulting address and halt via freeze().
+static inline void __page_fault_die(uintptr_t vaddr)
+{
+    // %p expects a pointer argument, not an integer
+    kmsgf("[kernel] kernel panic: invalid memory access to %p", (void*)vaddr);
+    freeze();
+}
+
+// Return the table that `pse` refers to. If the entry does not have PA_P
+// set, a new page table is allocated and installed first, using
+// PA_KERNEL_PAGE_TABLE when `priv` is true and PA_PAGE_TABLE otherwise.
+static inline PSE __parse_pse(PSE pse, bool priv)
+{
+    const bool present = pse.attributes() & PA_P;
+    if (!present) {
+        auto attr = priv ? PA_KERNEL_PAGE_TABLE : PA_PAGE_TABLE;
+        pse.set(attr, alloc_page_table());
+    }
+
+    return pse.parse();
+}
+
 static struct zone_info {
     page* next;
     std::size_t count;
 } zones[52];
 
+static mutex zone_lock;
+
 constexpr int _msb(std::size_t x)
 {
     int n = 0;
@@ -29,22 +56,45 @@ constexpr pfn_t parent(pfn_t pfn, int order)
     return pfn & ~(1 << (order + 12));
 }
 
+// Add `zone` to the free list of `order` and count it.
+// call with zone_lock held
+static inline void _zone_list_insert(int order, page* zone)
+{
+    auto& bucket = zones[order];
+    ++bucket.count;
+    list_insert(&bucket.next, zone);
+}
+
+// Take `zone` out of the free list of `order` and uncount it.
+// call with zone_lock held
+static inline void _zone_list_remove(int order, page* zone)
+{
+    auto& bucket = zones[order];
+    --bucket.count;
+    list_remove(&bucket.next, zone);
+}
+
+// Pop one zone from the free list of `order`; nullptr if the list's
+// count is zero.
+// call with zone_lock held
+static inline page* _zone_list_get(int order)
+{
+    auto& bucket = zones[order];
+    if (bucket.count == 0)
+        return nullptr;
+
+    --bucket.count;
+    return list_get(&bucket.next);
+}
+
 // where order represents power of 2
-page* _create_zone(pfn_t pfn, int order)
+// call with zone_lock held
+static inline page* _create_zone(pfn_t pfn, int order)
 {
     page* zone = pfn_to_page(pfn);
 
     assert(zone->flags & PAGE_PRESENT);
     zone->flags |= PAGE_BUDDY;
 
-    zone->next = zones[order].next;
-    zones[order].next = zone;
-    zones[order].count++;
-
+    _zone_list_insert(order, zone);
     return zone;
 }
 
-void _split_zone(page* zone, int order, int target_order)
+// call with zone_lock held
+static inline void _split_zone(page* zone, int order, int target_order)
 {
     while (order > target_order) {
         pfn_t pfn = page_to_pfn(zone);
@@ -54,18 +104,15 @@ void _split_zone(page* zone, int order, int target_order)
     }
 }
 
-page* _alloc_zone(int order)
+// call with zone_lock held
+static inline page* _alloc_zone(int order)
 {
     for (int i = order; i < 52; ++i) {
-        if (zones[i].count == 0)
+        auto zone = _zone_list_get(i);
+        if (!zone)
             continue;
 
-        auto* zone = zones[i].next;
-        zones[i].next = zone->next;
-        zones[i].count--;
-
-        // TODO: set free bitmap
-        zone->refcount++;
+        increase_refcount(zone);
 
         if (i > order)
             _split_zone(zone, i, order);
@@ -86,6 +133,8 @@ void kernel::mem::paging::create_zone(uintptr_t start, uintptr_t end)
     if (start >= end)
         return;
 
+    lock_guard_irq lock{zone_lock};
+
     unsigned long low = start;
     for (int i = 0; i < _msb(end); ++i, low >>= 1) {
         if (!(low & 1))
@@ -115,11 +164,10 @@ void kernel::mem::paging::mark_present(uintptr_t start, uintptr_t end)
 
 page* kernel::mem::paging::alloc_pages(int order)
 {
+    lock_guard_irq lock{zone_lock};
     auto* zone = _alloc_zone(order);
-    if (!zone) {
-        // TODO: die
-        return nullptr;
-    }
+    if (!zone)
+        freeze();
 
     return zone;
 }
@@ -139,6 +187,48 @@ pfn_t kernel::mem::paging::alloc_page_table()
     return pfn;
 }
 
+// Drop one reference to the block of 2^order pages starting at `pg`.
+//
+// Nothing else happens unless `pg` carries PAGE_BUDDY and its reference
+// count reaches zero. In that case the block is put back on the free
+// lists, merging it with its buddy for as long as that buddy also has
+// PAGE_BUDDY set and a zero reference count.
+void kernel::mem::paging::free_pages(page* pg, int order)
+{
+    // TODO: atomic
+    // note: the decrement below happens before zone_lock is taken
+    if (!(pg->flags & PAGE_BUDDY) || --pg->refcount)
+        return;
+
+    lock_guard_irq lock{zone_lock};
+    while (order < 52) {
+        pfn_t pfn = page_to_pfn(pg);
+        pfn_t buddy_pfn = buddy(pfn, order);
+        page* buddy_page = pfn_to_page(buddy_pfn);
+
+        // stop merging as soon as the buddy is not a free zone head
+        if (!(buddy_page->flags & PAGE_BUDDY) || buddy_page->refcount)
+            break;
+
+        // NOTE(review): this assumes the free buddy sits on the list of the
+        // same order -- confirm, the order of a zone is not checked here
+        _zone_list_remove(order, buddy_page);
+
+        // keep `pg` pointing at the lower of the two halves; the upper
+        // half stops being a zone head
+        if (buddy_page < pg)
+            std::swap(buddy_page, pg);
+
+        buddy_page->flags &= ~PAGE_BUDDY;
+        order++;
+    }
+
+    _zone_list_insert(order, pg);
+}
+
+// Drop one reference to a single page (order 0).
+void kernel::mem::paging::free_page(page* page)
+{
+    free_pages(page, 0);
+}
+
+// Same as free_pages(page*, int), with the block given by its pfn.
+void kernel::mem::paging::free_pages(pfn_t pfn, int order)
+{
+    page* head = pfn_to_page(pfn);
+    free_pages(head, order);
+}
+
+// Same as free_page(page*), with the page given by its pfn.
+void kernel::mem::paging::free_page(pfn_t pfn)
+{
+    page* pg = pfn_to_page(pfn);
+    free_page(pg);
+}
+
 pfn_t kernel::mem::paging::page_to_pfn(page* _page)
 {
     return (pfn_t)(_page - PAGE_ARRAY) * 0x1000;
@@ -148,3 +238,179 @@ page* kernel::mem::paging::pfn_to_page(pfn_t pfn)
 {
     return PAGE_ARRAY + pfn / 0x1000;
 }
+
+// Add one reference to `pg`.
+void kernel::mem::paging::increase_refcount(page* pg)
+{
+    ++pg->refcount;
+}
+
+// Page fault handler for the current process.
+//
+// err: the page fault error code (tested against PAGE_FAULT_P and
+//      PAGE_FAULT_U below).
+//
+// The faulting address is read from cr2. Faults outside any area, and
+// user-mode faults on present pages, end in kill_current(SIGSEGV) or a
+// kernel panic. Otherwise the entry is resolved in two steps:
+//  1. PA_COW:  give the page back its final permissions; unless this is the
+//              last reference, a private copy (or a zeroed page for PA_ANON)
+//              is installed instead.
+//  2. PA_MMAP: fill the page from the mapped file and mark it present.
+void kernel::mem::paging::handle_page_fault(unsigned long err)
+{
+    using namespace kernel::mem;
+    using namespace paging;
+
+    // control registers can only be moved to a general purpose register,
+    // so the output must use "r" (a memory operand would not assemble)
+    uintptr_t vaddr;
+    asm volatile("mov %%cr2, %0": "=r"(vaddr): : );
+    auto& mms = current_process->mms;
+
+    auto* mm_area = mms.find(vaddr);
+    if (!mm_area) [[unlikely]] {
+        // user access of address that does not exist
+        if (err & PAGE_FAULT_U)
+            kill_current(SIGSEGV);
+
+        __page_fault_die(vaddr);
+    }
+
+    if (err & PAGE_FAULT_U && err & PAGE_FAULT_P)
+        kill_current(SIGSEGV);
+
+    auto idx = idx_all(vaddr);
+
+    // walk down to the lowest-level entry; the intermediate tables of a
+    // registered area are expected to exist already
+    auto pe = mms.get_page_table()[std::get<1>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<2>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<3>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<4>(idx)];
+
+    bool mmapped = mm_area->flags & MM_MAPPED;
+    assert(!mmapped || mm_area->mapped_file);
+
+    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]]
+        __page_fault_die(vaddr);
+
+    pfn_t pfn = pe.pfn();
+    auto attr = pe.attributes();
+
+    page* pg = pfn_to_page(pfn);
+
+    if (attr & PA_COW) {
+        attr &= ~PA_COW;
+        if (mm_area->flags & MM_WRITE)
+            attr |= PA_RW;
+        else
+            attr &= ~PA_RW;
+
+        // if it is a dying page
+        // TODO: use atomic
+        if (pg->refcount == 1) {
+            pe.set(attr, pfn);
+            return;
+        }
+
+        // duplicate the page
+        page* new_page = alloc_page();
+        pfn_t new_pfn = page_to_pfn(new_page);
+        physaddr<void> new_page_addr{new_pfn};
+
+        if (attr & PA_ANON)
+            memset(new_page_addr, 0x00, 0x1000);
+        else
+            memcpy(new_page_addr, physaddr<void>{pfn}, 0x1000);
+
+        attr &= ~(PA_A | PA_ANON);
+        --pg->refcount;
+
+        pe.set(attr, new_pfn);
+        pfn = new_pfn;
+    }
+
+    if (attr & PA_MMAP) {
+        attr |= PA_P;
+
+        size_t offset = (vaddr & ~0xfff) - mm_area->start;
+        char* data = physaddr<char>{pfn};
+
+        int n = vfs_read(
+            mm_area->mapped_file,
+            data,
+            4096,
+            mm_area->file_offset + offset,
+            4096);
+
+        // a failed read returns a negative value: treat it as "nothing read"
+        // so that the padding below stays inside the page
+        if (n < 0)
+            n = 0;
+
+        // TODO: send SIGBUS if offset is greater than real size
+        if (n != 4096)
+            memset(data + n, 0x00, 4096 - n);
+
+        // TODO: shared mapping
+        attr &= ~PA_MMAP;
+
+        pe.set(attr, pfn);
+    }
+}
+
+// Iterate over the lowest-level entries that map [start, end) in the
+// hierarchy rooted at `pt`.
+//
+// n is the number of pages in the range (0 when start >= end, in which case
+// every member is zeroed and nothing is touched). Otherwise the tables on
+// the path to `start` are looked up through __parse_pse(), which allocates
+// the ones that are not present; `priv` selects the attributes used for
+// such newly created tables.
+vaddr_range::vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool priv)
+    : n {start >= end ? 0 : ((end - start) >> 12)}
+    , idx4{!n ? 0 : idx_p4(start)}
+    , idx3{!n ? 0 : idx_p3(start)}
+    , idx2{!n ? 0 : idx_p2(start)}
+    , idx1{!n ? 0 : idx_p1(start)}
+    , pml4{!n ? PSE{0} : PSE{pt}}
+    , pdpt{!n ? PSE{0} : __parse_pse(pml4[idx4], priv)}
+    , pd{!n ? PSE{0} : __parse_pse(pdpt[idx3], priv)}
+    , pt{!n ? PSE{0} : __parse_pse(pd[idx2], priv)}
+    , m_start{!n ? 0 : start}, m_end{!n ? 0 : end}
+    , is_privilege{!n ? false : priv} { }
+
+// Construct an empty range (n == 0, all members zeroed); this is the value
+// returned by end().
+vaddr_range::vaddr_range(std::nullptr_t)
+    : n{}
+    , idx4{}, idx3{}, idx2{}, idx1{}
+    , pml4{0}, pdpt{0}
+    , pd{0}, pt{0}
+    , m_start{}, m_end{}, is_privilege{} { }
+
+// The range acts as its own iterator: begin() is a copy of the current state.
+vaddr_range vaddr_range::begin() const noexcept
+{
+    return *this;
+}
+
+// The end iterator is an empty range (n == 0).
+vaddr_range vaddr_range::end() const noexcept
+{
+    return vaddr_range {nullptr};
+}
+
+// Return the entry for the current page: slot idx1 of the current
+// lowest-level table.
+PSE vaddr_range::operator*() const noexcept
+{
+    return pt[idx1];
+}
+
+// Advance to the next page of the range.
+//
+// When an index wraps around to 0 the next table of the level above is
+// entered. Like in the constructor, tables are obtained via __parse_pse(),
+// so operator*() always indexes a real table and missing tables are
+// allocated on the way.
+vaddr_range& vaddr_range::operator++()
+{
+    --n;
+
+    if ((idx1 = (idx1+1)%512) != 0)
+        return *this;
+
+    // the range is exhausted: do not look up (or allocate) tables for
+    // addresses beyond its end
+    if (!n)
+        return *this;
+
+    do {
+        if ((idx2 = (idx2+1)%512) != 0)
+            break;
+        do {
+            if ((idx3 = (idx3+1)%512) != 0)
+                break;
+
+            idx4 = (idx4+1) % 512;
+
+            // if idx4 is 0 after update, we have an overflow
+            assert(idx4 != 0);
+
+            pdpt = __parse_pse(pml4[idx4], is_privilege);
+        } while (false);
+
+        pd = __parse_pse(pdpt[idx3], is_privilege);
+    } while (false);
+
+    pt = __parse_pse(pd[idx2], is_privilege);
+    return *this;
+}
+
+// True while there are pages left to visit.
+vaddr_range::operator bool() const noexcept
+{
+    return n != 0;
+}
+
+// Two ranges compare equal when they have the same number of pages left;
+// this is what makes comparison against end() (n == 0) terminate a
+// range-based for loop.
+bool vaddr_range::operator==(const vaddr_range& other) const noexcept
+{
+    return n == other.n;
+}

+ 3 - 37
src/kernel/mem/slab.cc

@@ -2,50 +2,16 @@
 
 #include <assert.h>
 
+#include <types/list.hpp>
+
 #include <kernel/mem/paging.hpp>
 #include <kernel/mem/slab.hpp>
 
 using namespace kernel::mem;
+using namespace types::list;
 
 constexpr std::size_t SLAB_PAGE_SIZE = 0x1000; // 4K
 
-template <typename ListNode>
-void list_insert(ListNode** head, ListNode* node)
-{
-    node->next = *head;
-    if (*head)
-        (*head)->prev = node;
-    *head = node;
-}
-
-template <typename ListNode>
-ListNode* list_get(ListNode** head)
-{
-    ListNode* node = *head;
-    if (node) {
-        *head = node->next;
-
-        node->next = nullptr;
-        node->prev = nullptr;
-    }
-    return node;
-}
-
-template <typename ListNode>
-void list_remove(ListNode** head, ListNode* node)
-{
-    if (node->prev)
-        node->prev->next = node->next;
-    else
-        *head = node->next;
-
-    if (node->next)
-        node->next->prev = node->prev;
-
-    node->next = nullptr;
-    node->prev = nullptr;
-}
-
 std::ptrdiff_t _slab_data_start_offset(std::size_t size)
 {
     return (sizeof(slab_head) + size - 1) & ~(size - 1);

+ 21 - 21
src/kernel/process.cpp

@@ -15,9 +15,7 @@
 #include <types/types.h>
 
 #include <kernel/async/lock.hpp>
-#include <kernel/interrupt.h>
 #include <kernel/log.hpp>
-#include <kernel/mm.hpp>
 #include <kernel/module.hpp>
 #include <kernel/process.hpp>
 #include <kernel/signal.hpp>
@@ -338,7 +336,7 @@ void proclist::kill(pid_t pid, int exit_code)
     proc.files.close_all();
 
     // unmap all user memory areas
-    proc.mms.clear_user();
+    proc.mms.clear();
 
     // init should never exit
     if (proc.ppid == 0) {
@@ -450,21 +448,20 @@ void NORETURN _kernel_init(void)
     current_process->attr.system = 0;
     current_thread->attr |= kernel::task::thread::SYSTEM;
 
-    const char* argv[] = { "/mnt/busybox", "sh", "/mnt/initsh" };
-    const char* envp[] = { "LANG=C", "HOME=/root", "PATH=/mnt", "PWD=/", nullptr };
+    types::elf::elf32_load_data d{
+        .exec_dent{},
+        .argv{ "/mnt/busybox", "sh", "/mnt/initsh" },
+        .envp{ "LANG=C", "HOME=/root", "PATH=/mnt", "PWD=/" },
+        .ip{}, .sp{}
+    };
 
-    types::elf::elf32_load_data d;
-    d.argv = argv;
-    d.envp = envp;
-    d.system = false;
-
-    d.exec_dent = fs::vfs_open(*fs::fs_root, types::path{argv[0]});
+    d.exec_dent = fs::vfs_open(*fs::fs_root, types::path{d.argv[0].c_str()});
     if (!d.exec_dent) {
         kmsg("kernel panic: init not found!\n");
         freeze();
     }
 
-    int ret = types::elf::elf32_load(&d);
+    int ret = types::elf::elf32_load(d);
     assert(ret == 0);
 
     asm volatile(
@@ -480,10 +477,8 @@ void NORETURN _kernel_init(void)
         "push $0x1b\n"
         "push %1\n"
 
-        "iret\n"
-        :
-        : "c"(d.sp), "d"(d.eip)
-        : "eax", "memory");
+        "iretq\n"
+        : : "g"(d.sp), "g"(d.ip) : "eax", "memory");
 
     freeze();
 }
@@ -502,7 +497,13 @@ void NORETURN init_scheduler(void)
 
     asm volatile(
         "mov %0, %%rsp\n"
-        "push %=f\n"
+        "sub $16, %%rsp\n"
+        "mov %=f, %%rbx\n"
+        "mov %%rbx, 8(%%rsp)\n" // return address
+        "xor %%rbx, %%rbx\n"
+        "mov %%rbx, (%%rsp)\n"  // previous rbp
+        "mov %%rsp, %%rbp\n"
+
         "push %1\n"
 
         "mov $0x10, %%ax\n"
@@ -512,9 +513,6 @@ void NORETURN init_scheduler(void)
         "mov %%ax, %%fs\n"
         "mov %%ax, %%gs\n"
 
-        "xor %%ebp, %%ebp\n"
-        "xor %%edx, %%edx\n"
-
         "push $0x0\n"
         "popf\n"
 
@@ -523,7 +521,7 @@ void NORETURN init_scheduler(void)
         "%=:\n"
         "ud2"
         :
-        : "a"(current_thread->kstack.esp), "c"(_kernel_init)
+        : "a"(current_thread->kstack.sp), "c"(_kernel_init)
         : "memory");
 
     freeze();
@@ -532,6 +530,8 @@ void NORETURN init_scheduler(void)
 extern "C" void asm_ctx_switch(uint32_t** curr_esp, uint32_t** next_esp);
 bool schedule()
 {
+    freeze();
+
     if (kernel::async::preempt_count() != 0)
         return true;
 

+ 21 - 23
src/kernel/signal.cpp

@@ -1,7 +1,7 @@
 #include <kernel/task/thread.hpp>
 #include <kernel/process.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/interrupt.h>
+#include <kernel/interrupt.hpp>
 
 #include <signal.h>
 
@@ -142,11 +142,11 @@ signo_type signal_list::pending_signal()
 
         return *iter;
     }
-    
+
     return 0;
 }
 
-void signal_list::handle(interrupt_stack* context, mmx_registers* mmxregs)
+void signal_list::handle(interrupt_stack_normal* context, mmx_registers* mmxregs)
 {
     // assume that the pending signal is at the front of the list
     auto signal = m_list.front();
@@ -178,30 +178,28 @@ void signal_list::handle(interrupt_stack* context, mmx_registers* mmxregs)
     if (!(handler.sa_flags & SA_RESTORER))
         raise(SIGSYS);
 
-    // TODO: LONG MODE
-    // uint32_t esp = (uint32_t)context->esp;
-    // esp -= (sizeof(mmx_registers) + sizeof(interrupt_stack) + 16);
-    // esp &= 0xfffffff0;
-
-    // auto tmpesp = esp;
-    // *(uint32_t*)tmpesp = signal; // signal handler argument: int signo
-    // tmpesp += 4;
-    // *(uint32_t*)tmpesp = context->esp; // original esp
-    // tmpesp += 4;
+    // save current interrupt context to 128 bytes above current user stack
+    uintptr_t sp = (uintptr_t)context->rsp;
+    sp -= (128 + sizeof(mmx_registers) + sizeof(interrupt_stack_normal) + 16);
+    sp &= ~0xf;
 
-    // tmpesp += 8; // padding to align to 16 bytes
+    auto tmpsp = sp;
+    *(uint64_t*)tmpsp = signal; // signal handler argument: int signo
+    tmpsp += 8;
+    *(uintptr_t*)tmpsp = context->rsp; // original rsp
+    tmpsp += 8;
 
-    // memcpy((void*)tmpesp, mmxregs, sizeof(mmx_registers));
-    // tmpesp += sizeof(mmx_registers); // mmx registers
-    // memcpy((void*)tmpesp, context, sizeof(interrupt_stack));
-    // tmpesp += sizeof(interrupt_stack); // context
+    memcpy((void*)tmpsp, mmxregs, sizeof(mmx_registers));
+    tmpsp += sizeof(mmx_registers); // mmx registers
+    memcpy((void*)tmpsp, context, sizeof(interrupt_stack_normal));
+    tmpsp += sizeof(interrupt_stack_normal); // context
 
-    // esp -= sizeof(void*);
-    // // signal handler return address: restorer
-    // *(uint32_t*)esp = (uint32_t)handler.sa_restorer;
+    sp -= sizeof(void*);
+    // signal handler return address: restorer
+    *(uintptr_t*)sp = (uintptr_t)handler.sa_restorer;
 
-    // context->esp = esp;
-    // context->v_eip = (void*)handler.sa_handler;
+    context->rsp = sp;
+    context->v_rip = (uintptr_t)handler.sa_handler;
 }
 
 void signal_list::after_signal(signo_type signal)

+ 152 - 154
src/kernel/syscall.cpp

@@ -1,45 +1,40 @@
-#include <asm/port_io.h>
-
 #include <assert.h>
+#include <bits/alltypes.h>
+#include <bits/ioctl.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <poll.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
-#include <time.h>
-#include <termios.h>
-#include <unistd.h>
-#include <bits/alltypes.h>
-#include <bits/ioctl.h>
-#include <sys/types.h>
-#include <sys/prctl.h>
 #include <sys/mman.h>
+#include <sys/prctl.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #include <sys/utsname.h>
 #include <sys/wait.h>
+#include <termios.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <types/allocator.hpp>
+#include <types/elf.hpp>
+#include <types/path.hpp>
+#include <types/types.h>
 
 #include <kernel/async/lock.hpp>
-#include <kernel/user/thread_local.hpp>
-#include <kernel/task/readyqueue.hpp>
-#include <kernel/task/thread.hpp>
-#include <kernel/interrupt.h>
+#include <kernel/hw/timer.hpp>
+#include <kernel/interrupt.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
 #include <kernel/process.hpp>
 #include <kernel/signal.hpp>
 #include <kernel/syscall.hpp>
+#include <kernel/task/readyqueue.hpp>
+#include <kernel/task/thread.hpp>
 #include <kernel/tty.hpp>
+#include <kernel/user/thread_local.hpp>
 #include <kernel/utsname.hpp>
 #include <kernel/vfs.hpp>
-#include <kernel/hw/timer.h>
-
-#include <types/allocator.hpp>
-#include <types/elf.hpp>
-#include <types/path.hpp>
-#include <types/status.h>
-#include <types/types.h>
 
 #define SYSCALL_NO ((data)->s_regs.eax)
 #define SYSCALL_RETVAL ((data)->s_regs.eax)
@@ -57,7 +52,7 @@ static void not_implemented(const char* pos, int line)
 #define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
 
 extern "C" void _syscall_stub_fork_return(void);
-int _syscall_fork(interrupt_stack* data)
+long _syscall_fork(interrupt_stack* data)
 {
     auto& newproc = procs->copy_from(*current_process);
     auto [ iter_newthd, inserted ] = newproc.thds.emplace(*current_thread, newproc.pid);
@@ -66,7 +61,7 @@ int _syscall_fork(interrupt_stack* data)
 
     kernel::task::dispatcher::enqueue(newthd);
 
-    uint32_t newthd_oldesp = (uint32_t)newthd->kstack.esp;
+    uint32_t newthd_oldesp = (uintptr_t)newthd->kstack.esp;
     auto esp = &newthd->kstack.esp;
 
     // create fake interrupt stack
@@ -90,7 +85,7 @@ int _syscall_fork(interrupt_stack* data)
 
     // ctx_switch stack
     // return address
-    push_stack(esp, (uint32_t)_syscall_stub_fork_return);
+    push_stack(esp, (uintptr_t)_syscall_stub_fork_return);
     // ebx
     push_stack(esp, 0);
     // edi
@@ -107,7 +102,7 @@ int _syscall_fork(interrupt_stack* data)
     return newproc.pid;
 }
 
-int _syscall_write(interrupt_stack* data)
+long _syscall_write(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(const char* __user, buf);
@@ -120,7 +115,7 @@ int _syscall_write(interrupt_stack* data)
     return file->write(buf, n);
 }
 
-int _syscall_read(interrupt_stack* data)
+long _syscall_read(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(char* __user, buf);
@@ -134,7 +129,7 @@ int _syscall_read(interrupt_stack* data)
 }
 
 // TODO: sleep seconds
-int _syscall_sleep(interrupt_stack*)
+long _syscall_sleep(interrupt_stack*)
 {
     current_thread->set_attr(kernel::task::thread::USLEEP);
 
@@ -142,7 +137,7 @@ int _syscall_sleep(interrupt_stack*)
     return 0;
 }
 
-int _syscall_chdir(interrupt_stack* data)
+long _syscall_chdir(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char*, path);
 
@@ -164,7 +159,7 @@ int _syscall_chdir(interrupt_stack* data)
 // @param exec: the path of program to execute
 // @param argv: arguments end with nullptr
 // @param envp: environment variables end with nullptr
-int _syscall_execve(interrupt_stack* data)
+long _syscall_execve(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char*, exec);
     SYSCALL_ARG2(char* const*, argv);
@@ -184,11 +179,11 @@ int _syscall_execve(interrupt_stack* data)
     current_process->files.onexec();
 
     int ret = types::elf::elf32_load(&d);
-    if (ret != GB_OK)
+    if (ret != 0)
         return -d.errcode;
 
-    data->v_eip = d.eip;
-    data->esp = (uint32_t)d.sp;
+    data->v_rip = d.eip;
+    data->rsp = (uintptr_t)d.sp;
 
     current_thread->signals.on_exec();
 
@@ -196,7 +191,7 @@ int _syscall_execve(interrupt_stack* data)
 }
 
 // @param exit_code
-int NORETURN _syscall_exit(interrupt_stack* data)
+long NORETURN _syscall_exit(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, exit_code);
 
@@ -211,11 +206,17 @@ int NORETURN _syscall_exit(interrupt_stack* data)
     schedule_noreturn();
 }
 
+long NORETURN _syscall_exit_group(interrupt_stack* data)
+{
+    // we implement exit_group as exit for now
+    _syscall_exit(data);
+}
+
 // @param pid: pid of the process to wait
 // @param status: the exit code of the exited process
 // @param options: options for waitpid
 // @return pid of the exited process
-int _syscall_waitpid(interrupt_stack* data)
+long _syscall_waitpid(interrupt_stack* data)
 {
     SYSCALL_ARG1(pid_t, pid_to_wait);
     SYSCALL_ARG2(int*, arg1);
@@ -267,7 +268,7 @@ int _syscall_waitpid(interrupt_stack* data)
     return -EINVAL;
 }
 
-int _syscall_wait4(interrupt_stack* data)
+long _syscall_wait4(interrupt_stack* data)
 {
     SYSCALL_ARG4(void* __user, rusage);
 
@@ -278,7 +279,7 @@ int _syscall_wait4(interrupt_stack* data)
     return _syscall_waitpid(data);
 }
 
-int _syscall_getdents(interrupt_stack* data)
+long _syscall_getdents(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(char* __user, buf);
@@ -291,7 +292,7 @@ int _syscall_getdents(interrupt_stack* data)
     return dir->getdents(buf, cnt);
 }
 
-int _syscall_open(interrupt_stack* data)
+long _syscall_open(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char* __user, path);
     SYSCALL_ARG2(int, flags);
@@ -303,7 +304,7 @@ int _syscall_open(interrupt_stack* data)
         current_process->pwd + path, flags, mode);
 }
 
-int _syscall_getcwd(interrupt_stack* data)
+long _syscall_getcwd(interrupt_stack* data)
 {
     SYSCALL_ARG1(char*, buf);
     SYSCALL_ARG2(size_t, bufsize);
@@ -313,10 +314,10 @@ int _syscall_getcwd(interrupt_stack* data)
     strncpy(buf, path.c_str(), bufsize);
     buf[bufsize - 1] = 0;
 
-    return (uint32_t)buf;
+    return (uintptr_t)buf;
 }
 
-int _syscall_setsid(interrupt_stack*)
+long _syscall_setsid(interrupt_stack*)
 {
     if (current_process->pid == current_process->pgid)
         return -EPERM;
@@ -331,7 +332,7 @@ int _syscall_setsid(interrupt_stack*)
     return current_process->pid;
 }
 
-int _syscall_getsid(interrupt_stack* data)
+long _syscall_getsid(interrupt_stack* data)
 {
     SYSCALL_ARG1(pid_t, pid);
 
@@ -344,33 +345,33 @@ int _syscall_getsid(interrupt_stack* data)
     return pproc->sid;
 }
 
-int _syscall_close(interrupt_stack* data)
+long _syscall_close(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     current_process->files.close(fd);
     return 0;
 }
 
-int _syscall_dup(interrupt_stack* data)
+long _syscall_dup(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, old_fd);
     return current_process->files.dup(old_fd);
 }
 
-int _syscall_dup2(interrupt_stack* data)
+long _syscall_dup2(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, old_fd);
     SYSCALL_ARG2(int, new_fd);
     return current_process->files.dup2(old_fd, new_fd);
 }
 
-int _syscall_pipe(interrupt_stack* data)
+long _syscall_pipe(interrupt_stack* data)
 {
     SYSCALL_ARG1(int* __user, pipefd);
     return current_process->files.pipe(pipefd);
 }
 
-int _syscall_setpgid(interrupt_stack* data)
+long _syscall_setpgid(interrupt_stack* data)
 {
     SYSCALL_ARG1(pid_t, pid);
     SYSCALL_ARG2(pid_t, pgid);
@@ -396,7 +397,7 @@ int _syscall_setpgid(interrupt_stack* data)
     return 0;
 }
 
-int _syscall_ioctl(interrupt_stack* data)
+long _syscall_ioctl(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(unsigned long, request);
@@ -472,17 +473,17 @@ int _syscall_ioctl(interrupt_stack* data)
     return 0;
 }
 
-int _syscall_getpid(interrupt_stack*)
+long _syscall_getpid(interrupt_stack*)
 {
     return current_process->pid;
 }
 
-int _syscall_getppid(interrupt_stack*)
+long _syscall_getppid(interrupt_stack*)
 {
     return current_process->ppid;
 }
 
-int _syscall_set_thread_area(interrupt_stack* data)
+long _syscall_set_thread_area(interrupt_stack* data)
 {
     SYSCALL_ARG1(kernel::user::user_desc* __user, ptr);
 
@@ -494,14 +495,14 @@ int _syscall_set_thread_area(interrupt_stack* data)
     return 0;
 }
 
-int _syscall_set_tid_address(interrupt_stack* data)
+long _syscall_set_tid_address(interrupt_stack* data)
 {
     SYSCALL_ARG1(int* __user, tidptr);
     current_thread->set_child_tid = tidptr;
     return current_thread->tid();
 }
 
-int _syscall_readv(interrupt_stack* data)
+long _syscall_readv(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(const iovec* __user, iov);
@@ -534,7 +535,7 @@ int _syscall_readv(interrupt_stack* data)
 }
 
 // TODO: this operation SHOULD be atomic
-int _syscall_writev(interrupt_stack* data)
+long _syscall_writev(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(const iovec* __user, iov);
@@ -558,7 +559,7 @@ int _syscall_writev(interrupt_stack* data)
     return totn;
 }
 
-int _syscall_prctl(interrupt_stack* data)
+long _syscall_prctl(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, option);
 
@@ -583,7 +584,7 @@ int _syscall_prctl(interrupt_stack* data)
     return 0;
 }
 
-int _syscall_clock_gettime64(interrupt_stack* data)
+long _syscall_clock_gettime64(interrupt_stack* data)
 {
     SYSCALL_ARG1(clockid_t, clk_id);
     SYSCALL_ARG2(timespec* __user, tp);
@@ -594,31 +595,31 @@ int _syscall_clock_gettime64(interrupt_stack* data)
         return -EINVAL;
     }
 
-    int time = current_ticks();
+    auto time = kernel::hw::timer::current_ticks();
     tp->tv_sec = time / 100;
     tp->tv_nsec = 10000000 * (time % 100);
 
     return 0;
 }
 
-int _syscall_getuid(interrupt_stack*)
+long _syscall_getuid(interrupt_stack*)
 {
     return 0; // all user are root for now
 }
 
-int _syscall_geteuid(interrupt_stack*)
+long _syscall_geteuid(interrupt_stack*)
 {
     return 0; // all user are root for now
 }
 
-int _syscall_brk(interrupt_stack* data)
+long _syscall_brk(interrupt_stack* data)
 {
     SYSCALL_ARG1(void*, addr);
 
-    return (int)current_process->mms.set_brk(addr);
+    return (uintptr_t)current_process->mms.set_brk(addr);
 }
 
-int _syscall_mmap_pgoff(interrupt_stack* data)
+long _syscall_mmap_pgoff(interrupt_stack* data)
 {
     SYSCALL_ARG1(void*, addr);
     SYSCALL_ARG2(size_t, len);
@@ -627,7 +628,7 @@ int _syscall_mmap_pgoff(interrupt_stack* data)
     SYSCALL_ARG5(int, fd);
     SYSCALL_ARG6(off_t, pgoffset);
 
-    if ((ptr_t)addr % PAGE_SIZE != 0)
+    if ((uintptr_t)addr % 4096 != 0)
         return -EINVAL;
     if (len == 0)
         return -EINVAL;
@@ -652,7 +653,7 @@ int _syscall_mmap_pgoff(interrupt_stack* data)
         // do unmapping, equal to munmap, MAP_FIXED set
         if (prot == PROT_NONE) {
             auto ret = mms.unmap(addr, len, false);
-            if (ret != GB_OK)
+            if (ret != 0)
                 return ret;
         }
         else {
@@ -664,26 +665,26 @@ int _syscall_mmap_pgoff(interrupt_stack* data)
             }
 
             // TODO: append pages to the end of area if possible
-            mms.add_empty_area(addr, len / PAGE_SIZE,
+            mms.add_empty_area(addr, len / 4096,
                 PAGE_COW, prot & PROT_WRITE, false);
         }
     }
 
-    return (int)addr;
+    return (uintptr_t)addr;
 }
 
-int _syscall_munmap(interrupt_stack* data)
+long _syscall_munmap(interrupt_stack* data)
 {
     SYSCALL_ARG1(void*, addr);
     SYSCALL_ARG2(size_t, len);
 
-    if ((ptr_t)addr % PAGE_SIZE != 0)
+    if ((uintptr_t)addr % 4096 != 0)
         return -EINVAL;
 
     return current_process->mms.unmap(addr, len, false);
 }
 
-int _syscall_sendfile64(interrupt_stack* data)
+long _syscall_sendfile64(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, out_fd);
     SYSCALL_ARG2(int, in_fd);
@@ -733,7 +734,7 @@ int _syscall_sendfile64(interrupt_stack* data)
     return totn;
 }
 
-int _syscall_statx(interrupt_stack* data)
+long _syscall_statx(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, dirfd);
     SYSCALL_ARG2(const char* __user, path);
@@ -765,7 +766,7 @@ int _syscall_statx(interrupt_stack* data)
     return ret;
 }
 
-int _syscall_fcntl64(interrupt_stack* data)
+long _syscall_fcntl64(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(int, cmd);
@@ -788,7 +789,7 @@ int _syscall_fcntl64(interrupt_stack* data)
     }
 }
 
-int _syscall_getdents64(interrupt_stack* data)
+long _syscall_getdents64(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(char* __user, buf);
@@ -802,7 +803,7 @@ int _syscall_getdents64(interrupt_stack* data)
 }
 
 /* TODO: implement vfs_stat(stat*)
-int _syscall_stat(interrupt_stack* data)
+long _syscall_stat(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char* __user, pathname);
     SYSCALL_ARG2(struct stat* __user, buf);
@@ -818,7 +819,7 @@ int _syscall_stat(interrupt_stack* data)
 */
 
 /* TODO: implement vfs_stat(stat*)
-int _syscall_fstat(interrupt_stack* data)
+long _syscall_fstat(interrupt_stack* data)
 {
     SYSCALL_ARG1(int, fd);
     SYSCALL_ARG2(struct stat* __user, buf);
@@ -831,7 +832,7 @@ int _syscall_fstat(interrupt_stack* data)
 }
 */
 
-int _syscall_gettimeofday(interrupt_stack* data)
+long _syscall_gettimeofday(interrupt_stack* data)
 {
     SYSCALL_ARG1(timeval* __user, tv);
     SYSCALL_ARG2(void* __user, tz);
@@ -842,14 +843,15 @@ int _syscall_gettimeofday(interrupt_stack* data)
 
     if (likely(tv)) {
         // TODO: use copy_to_user
-        tv->tv_sec = current_ticks() / 100;
-        tv->tv_usec = current_ticks() * 10 * 1000;
+        auto ticks = kernel::hw::timer::current_ticks();
+        tv->tv_sec = ticks / 100;
+        tv->tv_usec = ticks * 10 * 1000;
     }
 
     return 0;
 }
 
-int _syscall_umask(interrupt_stack* data)
+long _syscall_umask(interrupt_stack* data)
 {
     SYSCALL_ARG1(mode_t, mask);
 
@@ -859,7 +861,7 @@ int _syscall_umask(interrupt_stack* data)
     return old;
 }
 
-int _syscall_kill(interrupt_stack* data)
+long _syscall_kill(interrupt_stack* data)
 {
     SYSCALL_ARG1(pid_t, pid);
     SYSCALL_ARG2(int, sig);
@@ -880,7 +882,7 @@ int _syscall_kill(interrupt_stack* data)
     return 0;
 }
 
-int _syscall_rt_sigprocmask(interrupt_stack* data)
+long _syscall_rt_sigprocmask(interrupt_stack* data)
 {
     using kernel::sigmask_type;
 
@@ -917,7 +919,7 @@ int _syscall_rt_sigprocmask(interrupt_stack* data)
     return 0;
 }
 
-int _syscall_rt_sigaction(interrupt_stack* data)
+long _syscall_rt_sigaction(interrupt_stack* data)
 {
     using kernel::sigaction;
     using kernel::sigmask_type;
@@ -946,7 +948,7 @@ int _syscall_rt_sigaction(interrupt_stack* data)
     return 0;
 }
 
-int _syscall_newuname(interrupt_stack* data)
+long _syscall_newuname(interrupt_stack* data)
 {
     SYSCALL_ARG1(new_utsname* __user, buf);
 
@@ -959,7 +961,7 @@ int _syscall_newuname(interrupt_stack* data)
     return 0;
 }
 
-pid_t _syscall_getpgid(interrupt_stack* data)
+long _syscall_getpgid(interrupt_stack* data)
 {
     SYSCALL_ARG1(pid_t, pid);
 
@@ -973,14 +975,14 @@ pid_t _syscall_getpgid(interrupt_stack* data)
     return pproc->pgid;
 }
 
-int _syscall_gettid(interrupt_stack* data)
+long _syscall_gettid(interrupt_stack* data)
 {
     // TODO: real tid
     (void)data;
     return current_process->pid;
 }
 
-int _syscall_mkdir(interrupt_stack* data)
+long _syscall_mkdir(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char* __user, pathname);
     SYSCALL_ARG2(mode_t, mode);
@@ -1006,13 +1008,13 @@ int _syscall_mkdir(interrupt_stack* data)
 
     auto ret = fs::vfs_mkdir(dent, dirname.c_str(), mode);
 
-    if (ret != GB_OK)
+    if (ret != 0)
         return ret;
 
     return 0;
 }
 
-int _syscall_truncate(interrupt_stack* data)
+long _syscall_truncate(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char* __user, pathname);
     SYSCALL_ARG2(long, length);
@@ -1028,13 +1030,13 @@ int _syscall_truncate(interrupt_stack* data)
 
     auto ret = fs::vfs_truncate(dent->ind, length);
 
-    if (ret != GB_OK)
+    if (ret != 0)
         return ret;
 
     return 0;
 }
 
-int _syscall_unlink(interrupt_stack* data)
+long _syscall_unlink(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char* __user, pathname);
 
@@ -1050,7 +1052,7 @@ int _syscall_unlink(interrupt_stack* data)
     return fs::vfs_rmfile(dent->parent, dent->name.c_str());
 }
 
-int _syscall_access(interrupt_stack* data)
+long _syscall_access(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char* __user, pathname);
     SYSCALL_ARG2(int, mode);
@@ -1074,7 +1076,7 @@ int _syscall_access(interrupt_stack* data)
     }
 }
 
-int _syscall_mknod(interrupt_stack* data)
+long _syscall_mknod(interrupt_stack* data)
 {
     SYSCALL_ARG1(const char* __user, pathname);
     SYSCALL_ARG2(mode_t, mode);
@@ -1096,7 +1098,7 @@ int _syscall_mknod(interrupt_stack* data)
     return fs::vfs_mknode(dent, filename.c_str(), mode, dev);
 }
 
-int _syscall_poll(interrupt_stack* data)
+long _syscall_poll(interrupt_stack* data)
 {
     SYSCALL_ARG1(struct pollfd* __user, fds);
     SYSCALL_ARG2(nfds_t, nfds);
@@ -1137,7 +1139,7 @@ int _syscall_poll(interrupt_stack* data)
     // return 0;
 }
 
-int _syscall_llseek(interrupt_stack* data)
+long _syscall_llseek(interrupt_stack* data)
 {
     SYSCALL_ARG1(unsigned int, fd);
     SYSCALL_ARG2(unsigned long, offset_high);
@@ -1191,68 +1193,64 @@ extern "C" void syscall_entry(
         current_thread->signals.handle(data, mmxregs);
 }
 
+#define REGISTER_SYSCALL_HANDLER(no, name) if (1) { extern long _syscall_ ## name (interrupt_stack*); syscall_handlers[(no)] = _syscall_ ## name; }
+
 SECTION(".text.kinit")
 void init_syscall(void)
 {
-    memset(syscall_handlers, 0x00, sizeof(syscall_handlers));
-
-    syscall_handlers[0x01] = _syscall_exit;
-    syscall_handlers[0x02] = _syscall_fork;
-    syscall_handlers[0x03] = _syscall_read;
-    syscall_handlers[0x04] = _syscall_write;
-    syscall_handlers[0x05] = _syscall_open;
-    syscall_handlers[0x06] = _syscall_close;
-    syscall_handlers[0x07] = _syscall_waitpid;
-    syscall_handlers[0x0a] = _syscall_unlink;
-    syscall_handlers[0x0b] = _syscall_execve;
-    syscall_handlers[0x0c] = _syscall_chdir;
-    syscall_handlers[0x0e] = _syscall_mknod;
-    syscall_handlers[0x14] = _syscall_getpid;
-    extern int _syscall_mount(interrupt_stack*);
-    syscall_handlers[0x15] = _syscall_mount;
-    syscall_handlers[0x21] = _syscall_access;
-    syscall_handlers[0x25] = _syscall_kill;
-    syscall_handlers[0x27] = _syscall_mkdir;
-    syscall_handlers[0x29] = _syscall_dup;
-    syscall_handlers[0x2a] = _syscall_pipe;
-    syscall_handlers[0x2d] = _syscall_brk;
-    syscall_handlers[0x36] = _syscall_ioctl;
-    syscall_handlers[0x39] = _syscall_setpgid;
-    syscall_handlers[0x3c] = _syscall_umask;
-    syscall_handlers[0x3f] = _syscall_dup2;
-    syscall_handlers[0x40] = _syscall_getppid;
-    syscall_handlers[0x42] = _syscall_setsid;
-    syscall_handlers[0x4e] = _syscall_gettimeofday;
-    extern int _syscall_symlink(interrupt_stack*);
-    syscall_handlers[0x53] = _syscall_symlink;
-    extern int _syscall_readlink(interrupt_stack*);
-    syscall_handlers[0x55] = _syscall_readlink;
-    syscall_handlers[0x5b] = _syscall_munmap;
-    syscall_handlers[0x5c] = _syscall_truncate;
-    syscall_handlers[0x72] = _syscall_wait4;
-    syscall_handlers[0x7a] = _syscall_newuname;
-    syscall_handlers[0x84] = _syscall_getpgid;
-    syscall_handlers[0x8c] = _syscall_llseek;
-    syscall_handlers[0x8d] = _syscall_getdents;
-    syscall_handlers[0x91] = _syscall_readv;
-    syscall_handlers[0x92] = _syscall_writev;
-    syscall_handlers[0x93] = _syscall_getsid;
-    syscall_handlers[0xa8] = _syscall_poll;
-    syscall_handlers[0xac] = _syscall_prctl;
-    syscall_handlers[0xae] = _syscall_rt_sigaction;
-    syscall_handlers[0xaf] = _syscall_rt_sigprocmask;
-    syscall_handlers[0xb7] = _syscall_getcwd;
-    syscall_handlers[0xc0] = _syscall_mmap_pgoff;
-    syscall_handlers[0xc7] = _syscall_getuid;
-    syscall_handlers[0xc9] = _syscall_geteuid;
-    syscall_handlers[0xdc] = _syscall_getdents64;
-    syscall_handlers[0xdd] = _syscall_fcntl64;
-    syscall_handlers[0xe0] = _syscall_gettid;
-    syscall_handlers[0xef] = _syscall_sendfile64;
-    syscall_handlers[0xf3] = _syscall_set_thread_area;
-    syscall_handlers[0xfc] = _syscall_exit; // we implement exit_group as exit for now
-    syscall_handlers[0x102] = _syscall_set_tid_address;
-    syscall_handlers[0x17f] = _syscall_statx;
-    syscall_handlers[0x193] = _syscall_clock_gettime64;
-    // syscall_handlers[35] = _syscall_sleep;
+    REGISTER_SYSCALL_HANDLER(0x01, exit); // (no, name): stores _syscall_<name> into syscall_handlers[no]
+    REGISTER_SYSCALL_HANDLER(0x02, fork);
+    REGISTER_SYSCALL_HANDLER(0x03, read);
+    REGISTER_SYSCALL_HANDLER(0x04, write);
+    REGISTER_SYSCALL_HANDLER(0x05, open);
+    REGISTER_SYSCALL_HANDLER(0x06, close);
+    REGISTER_SYSCALL_HANDLER(0x07, waitpid);
+    REGISTER_SYSCALL_HANDLER(0x0a, unlink);
+    REGISTER_SYSCALL_HANDLER(0x0b, execve);
+    REGISTER_SYSCALL_HANDLER(0x0c, chdir);
+    REGISTER_SYSCALL_HANDLER(0x0e, mknod);
+    REGISTER_SYSCALL_HANDLER(0x14, getpid);
+    REGISTER_SYSCALL_HANDLER(0x15, mount);
+    REGISTER_SYSCALL_HANDLER(0x21, access);
+    REGISTER_SYSCALL_HANDLER(0x25, kill);
+    REGISTER_SYSCALL_HANDLER(0x27, mkdir);
+    REGISTER_SYSCALL_HANDLER(0x29, dup);
+    REGISTER_SYSCALL_HANDLER(0x2a, pipe);
+    REGISTER_SYSCALL_HANDLER(0x2d, brk);
+    REGISTER_SYSCALL_HANDLER(0x36, ioctl);
+    REGISTER_SYSCALL_HANDLER(0x39, setpgid);
+    REGISTER_SYSCALL_HANDLER(0x3c, umask);
+    REGISTER_SYSCALL_HANDLER(0x3f, dup2);
+    REGISTER_SYSCALL_HANDLER(0x40, getppid);
+    REGISTER_SYSCALL_HANDLER(0x42, setsid);
+    REGISTER_SYSCALL_HANDLER(0x4e, gettimeofday);
+    REGISTER_SYSCALL_HANDLER(0x53, symlink);
+    REGISTER_SYSCALL_HANDLER(0x55, readlink);
+    REGISTER_SYSCALL_HANDLER(0x5b, munmap);
+    REGISTER_SYSCALL_HANDLER(0x5c, truncate);
+    REGISTER_SYSCALL_HANDLER(0x72, wait4);
+    REGISTER_SYSCALL_HANDLER(0x7a, newuname);
+    REGISTER_SYSCALL_HANDLER(0x84, getpgid);
+    REGISTER_SYSCALL_HANDLER(0x8c, llseek);
+    REGISTER_SYSCALL_HANDLER(0x8d, getdents);
+    REGISTER_SYSCALL_HANDLER(0x91, readv);
+    REGISTER_SYSCALL_HANDLER(0x92, writev);
+    REGISTER_SYSCALL_HANDLER(0x93, getsid);
+    REGISTER_SYSCALL_HANDLER(0xa8, poll);
+    REGISTER_SYSCALL_HANDLER(0xac, prctl);
+    REGISTER_SYSCALL_HANDLER(0xae, rt_sigaction);
+    REGISTER_SYSCALL_HANDLER(0xaf, rt_sigprocmask);
+    REGISTER_SYSCALL_HANDLER(0xb7, getcwd);
+    REGISTER_SYSCALL_HANDLER(0xc0, mmap_pgoff);
+    REGISTER_SYSCALL_HANDLER(0xc7, getuid);
+    REGISTER_SYSCALL_HANDLER(0xc9, geteuid);
+    REGISTER_SYSCALL_HANDLER(0xdc, getdents64);
+    REGISTER_SYSCALL_HANDLER(0xdd, fcntl64);
+    REGISTER_SYSCALL_HANDLER(0xe0, gettid);
+    REGISTER_SYSCALL_HANDLER(0xef, sendfile64);
+    REGISTER_SYSCALL_HANDLER(0xf3, set_thread_area);
+    REGISTER_SYSCALL_HANDLER(0xfc, exit_group);
+    REGISTER_SYSCALL_HANDLER(0x102, set_tid_address); // three-digit numbers: keep all hex digits, name without leading '_'
+    REGISTER_SYSCALL_HANDLER(0x17f, statx);
+    REGISTER_SYSCALL_HANDLER(0x193, clock_gettime64);
 }

+ 2 - 2
src/kernel/syscall/fileops.cc

@@ -6,7 +6,7 @@
 #include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
 
-int _syscall_symlink(interrupt_stack* data)
+long _syscall_symlink(interrupt_stack_normal* data)
 {
     SYSCALL_ARG1(const char __user*, target);
     SYSCALL_ARG2(const char __user*, linkpath);
@@ -28,7 +28,7 @@ int _syscall_symlink(interrupt_stack* data)
     return dent->ind->fs->symlink(dent, linkname.c_str(), target);
 }
 
-int _syscall_readlink(interrupt_stack* data)
+long _syscall_readlink(interrupt_stack_normal* data)
 {
     SYSCALL_ARG1(const char __user*, pathname);
     SYSCALL_ARG2(char __user*, buf);

+ 1 - 1
src/kernel/syscall/mount.cc

@@ -6,7 +6,7 @@
 #include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
 
-int _syscall_mount(interrupt_stack* data)
+long _syscall_mount(interrupt_stack_normal* data)
 {
     SYSCALL_ARG1(const char __user*, source);
     SYSCALL_ARG2(const char __user*, target);

+ 71 - 76
src/kernel/task/thread.cc

@@ -1,28 +1,29 @@
-#include <kernel/task/thread.hpp>
-
 #include <queue>
 
+#include <stdint.h>
+
+#include <kernel/async/lock.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/async/lock.hpp>
 #include <kernel/task/readyqueue.hpp>
+#include <kernel/task/thread.hpp>
+
+constexpr std::size_t KERNEL_STACK_ORDER = 3; // 2^3 * 4096 = 32KB
 
 using namespace kernel::task;
+using namespace kernel::mem::paging;
 
 thread::thread(std::string name, pid_t owner)
-    : owner { owner }, attr { READY | SYSTEM }, name { name }
-{
-}
+    : owner { owner }, attr { READY | SYSTEM }, name { name } { }
 
 thread::thread(const thread& val, pid_t owner)
-    : owner { owner }, attr { val.attr }, name { val.name }
-{
-}
+    : owner { owner }, attr { val.attr }, name { val.name } { }
 
 tid_t thread::tid() const
 {
-    return (tid_t)kstack.stack_base;
+    return (tid_t)kstack.pfn;
 }
 
 bool thread::operator<(const thread& rhs) const
@@ -35,57 +36,50 @@ bool thread::operator==(const thread& rhs) const
     return tid() == rhs.tid();
 }
 
-static std::priority_queue<std::byte*> s_kstacks;
-static kernel::async::mutex s_mtx_kstacks;
+static inline uintptr_t __stack_bottom(pfn_t pfn)
+{
+    return (uintptr_t)(void*)
+        kernel::mem::physaddr<void>{pfn + (1 << KERNEL_STACK_ORDER) * 0x1000};
+}
 
 thread::kernel_stack::kernel_stack()
 {
-    static int allocated;
-    kernel::async::lock_guard_irq lck(s_mtx_kstacks);
-
-    if (!s_kstacks.empty()) {
-        stack_base = s_kstacks.top();
-        esp = (uint32_t*)stack_base;
-        s_kstacks.pop();
-        return;
-    }
-
-    // TODO: LONG MODE
-    // kernel stack pt is at page#0x00005
-    // kernel::paccess pa(0x00005);
-    // auto pt = (pt_t)pa.ptr();
-    // assert(pt);
-
-    // int cnt = THREAD_KERNEL_STACK_SIZE / PAGE_SIZE;
-    // pte_t* pte = *pt + allocated * cnt;
-
-    // for (int i = 0; i < cnt; ++i) {
-    //     pte[i].v = 0x3;
-    //     pte[i].in.page = __alloc_raw_page();
-    // }
-
-    // stack_base = (std::byte*)(0xffc00000 + THREAD_KERNEL_STACK_SIZE * (allocated + 1));
-    // esp = (uint32_t*)stack_base;
-
-    // ++allocated;
+    pfn = page_to_pfn(alloc_pages(KERNEL_STACK_ORDER));
+    sp = __stack_bottom(pfn);
 }
 
 thread::kernel_stack::kernel_stack(const kernel_stack& other)
     : kernel_stack()
 {
-    auto offset = vptrdiff(other.stack_base, other.esp);
-    esp = (uint32_t*)(stack_base - offset);
-    memcpy(esp, other.esp, offset);
+    auto offset = __stack_bottom(other.pfn) - other.sp;
+
+    sp -= offset;
+    memcpy((void*)sp, (void*)other.sp, offset);
 }
 
 thread::kernel_stack::kernel_stack(kernel_stack&& other)
-    : stack_base(std::exchange(other.stack_base, nullptr))
-    , esp(std::exchange(other.esp, nullptr)) { }
+    : pfn(std::exchange(other.pfn, 0))
+    , sp(std::exchange(other.sp, 0)) { }
 
 thread::kernel_stack::~kernel_stack()
 {
-    kernel::async::lock_guard_irq lck(s_mtx_kstacks);
-    s_kstacks.push(stack_base);
+    if (!pfn)
+        return;
+    free_pages(pfn, KERNEL_STACK_ORDER);
+}
+
+uint64_t thread::kernel_stack::pushq(uint64_t val) // push a 64-bit value onto this kernel stack; returns val
+{
+    sp -= 8; // make room first: a fresh stack has sp one past the end of its pages, so storing before decrementing writes out of bounds
+    *(uint64_t*)sp = val; // sp now addresses the value just pushed, as the copy constructor's memcpy from other.sp expects
+    return val;
+}
+
+uint32_t thread::kernel_stack::pushl(uint32_t val) // push a 32-bit value onto this kernel stack; returns val
+{
+    sp -= 4; // make room first: a fresh stack has sp one past the end of its pages, so storing before decrementing writes out of bounds
+    *(uint32_t*)sp = val; // sp now addresses the value just pushed
+    return val;
+}
 
 void thread::set_attr(thd_attr_t new_attr)
@@ -142,36 +136,37 @@ void thread::send_signal(signal_list::signo_type signal)
 
 int thread::set_thread_area(kernel::user::user_desc* ptr)
 {
-    // TODO: LONG MODE
-    // if (ptr->read_exec_only && ptr->seg_not_present) {
-    //     void* dst = (void*)ptr->base_addr;
-    //     std::size_t len = ptr->limit;
-    //     if (len > 0 && dst)
-    //         memset(dst, 0x00, len);
-    //     return 0;
-    // }
-
-    // if (ptr->entry_number == -1U)
-    //     ptr->entry_number = 6;
-    // else
-    //     return -1;
-
-    // tls_desc.limit_low = ptr->limit & 0xFFFF;
-    // tls_desc.base_low = ptr->base_addr & 0xFFFF;
-    // tls_desc.base_mid = (ptr->base_addr >> 16) & 0xFF;
-    // tls_desc.access = SD_TYPE_DATA_USER;
-    // tls_desc.limit_high = (ptr->limit >> 16) & 0xF;
-    // tls_desc.flags = (ptr->limit_in_pages << 3) | (ptr->seg_32bit << 2);
-    // tls_desc.base_high = (ptr->base_addr >> 24) & 0xFF;
-
-    // return 0;
+    if (ptr->read_exec_only && ptr->seg_not_present) { // this flag pair means: zero `limit` bytes at base_addr, install nothing
+        // TODO: use copy_to_user
+        auto* dst = (void*)(uintptr_t)ptr->base_addr;
+        std::size_t len = ptr->limit;
+        if (len > 0 && dst)
+            memset(dst, 0x00, len);
+        return 0;
+    }
+
+    if (ptr->entry_number == -1U) // only "pick an entry for me" requests are accepted
+        ptr->entry_number = 6; // entry number reported back through the caller's struct
+    else
+        return -1;
+
+    if (!ptr->seg_32bit) // only 32-bit segments are supported
+        return -1;
+
+    tls_desc[0]  = ptr->limit & 0x0000'ffff; // bits 15:0  <- limit[15:0]
+    tls_desc[0] |= (ptr->base_addr & 0x00ff'ffffULL) << 16; // bits 39:16 <- base[23:0]
+    tls_desc[0] |= 0xe2'00'0000'0000; // bits 47:40 <- constant byte 0xe2
+    tls_desc[0] |= (ptr->limit & 0x000f'0000ULL) << (48-16); // bits 51:48 <- limit[19:16]
+    tls_desc[0] |= ((ptr->limit_in_pages + 0ULL) << 55); // bit 55 <- limit_in_pages
+    tls_desc[0] |= (ptr->base_addr & 0xff00'0000ULL) << (56-24); // bits 63:56 <- base[31:24]; ULL mask widens before the shift
+
+    tls_desc[1]  = 0; // 63:32: all 0, 31:0: ptr->base_addr[63:32]
+
+    return 0;
 }
 
 int thread::load_thread_area() const
 {
-    // TODO: LONG MODE
-    // if (tls_desc.flags == 0)
-    //     return -1;
-    // kernel::user::load_thread_area(tls_desc);
-    // return 0;
+    kernel::user::load_thread_area(tls_desc[0], tls_desc[1]);
+    return 0;
 }

+ 33 - 17
src/kernel/user/thread_local.cc

@@ -1,22 +1,38 @@
-#include <kernel/process.hpp>
+#include <cstddef>
+
+#include <stdint.h>
+
+#include <kernel/mem/phys.hpp>
+#include <kernel/mem/types.hpp>
 #include <kernel/user/thread_local.hpp>
 
-#include <string.h>
-#include <cstddef>
+using namespace kernel::user;
+
+void kernel::user::load_thread_area32(uint64_t desc)
+{
+    mem::gdt[7] = desc;
+    asm volatile(
+        "mov %%gs, %%ax\n\t"
+        "mov %%ax, %%gs\n\t"
+        : : : "ax"
+    );
+}
 
-namespace kernel::user {
+void kernel::user::load_thread_area64(uint64_t desc_lo, uint64_t desc_hi)
+{
+    mem::gdt[12] = desc_lo;
+    mem::gdt[13] = desc_hi;
 
-// TODO: LONG MODE
-// void load_thread_area(const segment_descriptor& desc)
-// {
-//     gdt[6] = desc;
-//     asm volatile(
-//         "mov %%gs, %%ax\n\t"
-//         "mov %%ax, %%gs\n\t"
-//         :
-//         :
-//         : "ax"
-//     );
-// }
+    asm volatile(
+        "mov %%fs, %%ax\n\t"
+        "mov %%ax, %%fs\n\t"
+        "mov %%gs, %%ax\n\t"
+        "mov %%ax, %%gs\n\t"
+        : : : "ax"
+    );
+}
 
-} // namespace kernel::user
+void kernel::user::load_thread_area(uint64_t desc_lo, uint64_t desc_hi)
+{
+    load_thread_area64(desc_lo, desc_hi);
+}

+ 2 - 1
src/kernel/vfs.cpp

@@ -155,7 +155,8 @@ fs::inode* vfs::cache_inode(size_t size, ino_t ino,
 
 void vfs::free_inode(ino_t ino)
 {
-    assert(_inodes.erase(ino) == 1);
+    int n = _inodes.erase(ino);
+    assert(n == 1);
 }
 
 fs::inode* vfs::get_inode(ino_t ino)

+ 1 - 2
src/kernel/vfs/tmpfs.cc

@@ -5,7 +5,6 @@
 #include <stdint.h>
 
 #include <kernel/log.hpp>
-#include <kernel/mm.hpp>
 #include <kernel/vfs.hpp>
 
 using fs::vfs, fs::inode, fs::dentry;
@@ -275,7 +274,7 @@ public:
         }
 
         if (mask & STATX_BLOCKS) {
-            st->stx_blocks = align_up<9>(ind->size) / 512;
+            st->stx_blocks = ((ind->size + 0x1ff) & ~0x1ff) / 512;
             st->stx_blksize = 4096;
             st->stx_mask |= STATX_BLOCKS;
         }

+ 49 - 6
src/kinit.cpp

@@ -7,9 +7,10 @@
 
 #include <kernel/hw/pci.hpp>
 #include <kernel/hw/timer.hpp>
-#include <kernel/interrupt.h>
+#include <kernel/interrupt.hpp>
 #include <kernel/log.hpp>
 #include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
 #include <kernel/mem/types.hpp>
 #include <kernel/process.hpp>
 #include <kernel/syscall.hpp>
@@ -76,15 +77,13 @@ void NORETURN real_kernel_init()
 
     set_uname();
 
-    init_idt();
-    init_pic();
+    init_interrupt();
     hw::timer::init_pit();
 
-    kernel::kinit::init_pci();
+    init_pci();
 
     // TODO: remove this
     init_vfs();
-    // TODO: LONG MODE
     // init_syscall();
 
     init_scheduler();
@@ -108,6 +107,9 @@ static inline void setup_early_kernel_page_table()
 
     // clear kernel bss
     memset((void*)BSS_ADDR, 0x00, BSS_LENGTH);
+
+    // clear empty page
+    memset(mem::physaddr<void>{EMPTY_PAGE_PFN}, 0x00, 0x1000);
 }
 
 SECTION(".text.kinit")
@@ -134,6 +136,7 @@ static inline void setup_buddy(uintptr_t addr_max)
     using namespace kernel::mem::paging;
     constexpr auto idx = idx_all(0xffffff8040000000ULL);
 
+    addr_max += 0xfff;
     addr_max >>= 12;
     int count = (addr_max * sizeof(page) + 0x200000 - 1) / 0x200000;
 
@@ -191,15 +194,55 @@ static inline void save_memory_info(bootloader_data* data)
         sizeof(kernel::mem::info::e820_entries));
 }
 
+SECTION(".text.kinit")
+void setup_gdt() // fills GDT slots 3..13, then loads GDTR, LDTR and TR
+{
+    // user code
+    mem::gdt[3]  = 0x0020'fa00'0000'0000;
+    // user data
+    mem::gdt[4]  = 0x0000'f200'0000'0000;
+    // user code32
+    mem::gdt[5]  = 0x00cf'fa00'0000'ffff;
+    // user data32
+    mem::gdt[6]  = 0x00cf'f200'0000'ffff;
+    // reserved (load_thread_area32 later stores a descriptor here)
+    mem::gdt[7]  = 0x0000'0000'0000'0000;
+
+    // TSS descriptor (two slots; selector 0x40 is loaded into TR below)
+    mem::gdt[8]  = 0x0000'8900'0070'0067;
+    mem::gdt[9]  = 0x0000'0000'0000'0000;
+
+    // LDT descriptor (two slots; selector 0x50 is loaded into LDTR below)
+    mem::gdt[10] = 0x0000'8200'0060'000f;
+    mem::gdt[11] = 0x0000'0000'0000'0000;
+
+    // thread local (two slots, rewritten by load_thread_area64)
+    mem::gdt[12] = 0x0000'0000'0000'0000;
+    mem::gdt[13] = 0x0000'0000'0000'0000;
+
+    uint64_t descriptor[] = { // lgdt operand: 16-bit limit followed by the 64-bit base
+        0x005f'0000'0000'0000, (uintptr_t)(uint64_t*)mem::gdt // limit 0x5f sits in the top 16 bits
+    }; // NOTE(review): limit 0x5f covers slots 0..11 only; slots 12-13 lie beyond it - confirm
+
+    asm volatile(
+            "lgdt (%0)\n\t" // %0 = descriptor + 6 bytes, where the 10-byte operand begins
+            "mov $0x50, %%ax\n\t"
+            "lldt %%ax\n\t"
+            "mov $0x40, %%ax\n\t"
+            "ltr %%ax\n\t"
+            : : "r"((uintptr_t)descriptor+6): "ax", "memory" // "memory": descriptor[] and the GDT stores must be emitted before this asm
+    );
+}
+
 extern "C" SECTION(".text.kinit")
 void NORETURN kernel_init(bootloader_data* data)
 {
     enable_sse();
 
     setup_early_kernel_page_table();
+    setup_gdt();
     save_memory_info(data);
 
-    // create struct pages
     uintptr_t addr_max = 0;
     for (int i = 0; i < (int)kernel::mem::info::e820_entry_count; ++i) {
         auto& ent = kernel::mem::info::e820_entries[i];

+ 97 - 88
src/types/elf.cpp

@@ -9,118 +9,121 @@
 
 #include <types/elf.hpp>
 
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/vm_area.hpp>
 #include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
 
-#define align16_down(sp) (sp = ((char*)((uint32_t)(sp)&0xfffffff0)))
-
-template <typename T>
-inline void _user_push(char** sp, T d)
+static inline void __user_push32(uintptr_t* sp, uint32_t d)
 {
-    *sp -= sizeof(T);
-    *(T*)*sp = d;
+    // TODO: use copy_to_user; moves *sp down 4 bytes and stores d there (no aliasing cast on sp)
+    *(uint32_t*)(*sp -= sizeof(uint32_t)) = d;
 }
-template <>
-inline void _user_push(char** sp, const char* str)
+
+static inline void __user_push_string32(uintptr_t* sp, const char* str)
 {
     size_t len = strlen(str);
+
     *sp -= (len + 1);
-    size_t nsp = (size_t)sp;
-    size_t mask = 0xf;
-    nsp &= ~mask;
-    *sp = std::bit_cast<char*>(nsp);
-    memcpy(*sp, str, len + 1);
+    *sp &= ~(uintptr_t)0xf; // align down to 16 bytes: clear the low 4 bits, keep the rest
+
+    memcpy((void*)*sp, str, len + 1); // copies the terminating NUL as well; *sp now addresses the string
 }
 
-int types::elf::elf32_load(types::elf::elf32_load_data* d)
+int types::elf::elf32_load(types::elf::elf32_load_data& d) // 0 on success, negative errno if rejected before the old image is cleared
 {
-    auto* ent_exec = d->exec_dent;
-    if (!ent_exec) {
-        d->errcode = ENOENT;
-        return -1;
-    }
+    auto& exec = d.exec_dent;
+    if (!exec)
+        return -ENOENT;
 
-    // TODO: detect file format
     types::elf::elf32_header hdr {};
     auto n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)&hdr,
         sizeof(types::elf::elf32_header),
         0, sizeof(types::elf::elf32_header));
 
-    if (n_read != sizeof(types::elf::elf32_header)) {
-        d->errcode = EINVAL;
-        return -1;
-    }
+    if (n_read != sizeof(types::elf::elf32_header))
+        return -EINVAL;
+
+    if (hdr.magic[0] != 0x7f || hdr.magic[1] != 'E'
+            || hdr.magic[2] != 'L' || hdr.magic[3] != 'F')
+        return -EINVAL; // header does not start with the ELF magic
 
     size_t phents_size = hdr.phentsize * hdr.phnum;
     size_t shents_size = hdr.shentsize * hdr.shnum;
     std::vector<types::elf::elf32_program_header_entry> phents(hdr.phnum);
     n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)phents.data(),
         phents_size,
         hdr.phoff, phents_size);
 
     // broken file or I/O error
-    if (n_read != phents_size) {
-        d->errcode = EINVAL;
-        return -1;
-    }
+    if (n_read != phents_size)
+        return -EINVAL;
 
     std::vector<types::elf::elf32_section_header_entry> shents(hdr.shnum);
     n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)shents.data(),
         shents_size,
         hdr.shoff, shents_size);
 
     // broken file or I/O error
-    if (n_read != shents_size) {
-        d->errcode = EINVAL;
-        return -1;
-    }
-
-    // copy argv and envp
-    std::vector<std::string> argv, envp;
-    for (const char* const* p = d->argv; *p; ++p)
-        argv.emplace_back(*p);
-    for (const char* const* p = d->envp; *p; ++p)
-        envp.emplace_back(*p);
+    if (n_read != shents_size)
+        return -EINVAL;
 
-    // from now on, caller process is recycled.
+    // from now on, caller process is gone.
     // so we can't just simply return to it on error.
-    current_process->mms.clear_user();
+    auto& mms = current_process->mms;
+    mms.clear();
 
-    uint32_t data_segment_end = 0;
+    uintptr_t data_segment_end = 0;
 
     for (const auto& phent : phents) {
         if (phent.type != types::elf::elf32_program_header_entry::PT_LOAD)
             continue;
 
-        auto vaddr = align_down<12>(phent.vaddr);
-        auto vlen = align_up<12>(phent.vaddr + phent.memsz) - vaddr;
-        auto flen = align_up<12>(phent.vaddr + phent.filesz) - vaddr;
-        auto fileoff = align_down<12>(phent.offset);
+        auto vaddr = phent.vaddr & ~0xfff;
+        auto vlen = ((phent.vaddr + phent.memsz + 0xfff) & ~0xfff) - vaddr;
+        auto flen = ((phent.vaddr + phent.filesz + 0xfff) & ~0xfff) - vaddr;
+        auto fileoff = phent.offset & ~0xfff;
 
+        using namespace kernel::mem;
         if (flen) {
-            auto ret = mmap(
-                (char*)vaddr,
-                phent.filesz + (phent.vaddr & 0xfff),
-                ent_exec->ind,
-                fileoff,
-                1,
-                d->system);
-
-            if (ret != 0)
+            mm_list::map_args args{};
+
+            args.vaddr = vaddr;
+            args.length = phent.filesz + (phent.vaddr & 0xfff);
+            args.file_inode = exec->ind;
+            args.file_offset = fileoff;
+
+            args.flags = MM_MAPPED;
+            if (phent.flags & elf32_program_header_entry::PF_W)
+                args.flags |= MM_WRITE;
+
+            if (phent.flags & elf32_program_header_entry::PF_X)
+                args.flags |= MM_EXECUTE;
+
+            if (auto ret = mms.mmap(args); ret != 0)
                 kill_current(SIGSEGV);
         }
 
         if (vlen > flen) {
-            auto ret = mmap((char*)vaddr + flen, vlen - flen,
-                nullptr, 0, true, d->system);
+            mm_list::map_args args{};
+
+            args.vaddr = vaddr + flen;
+            args.length = vlen - flen;
 
-            if (ret != 0)
+            args.flags = MM_ANONYMOUS;
+            if (phent.flags & elf32_program_header_entry::PF_W)
+                args.flags |= MM_WRITE;
+
+            if (phent.flags & elf32_program_header_entry::PF_X)
+                args.flags |= MM_EXECUTE;
+
+            if (auto ret = mms.mmap(args); ret != 0)
                 kill_current(SIGSEGV);
         }
 
@@ -128,60 +131,66 @@ int types::elf::elf32_load(types::elf::elf32_load_data* d)
             data_segment_end = vaddr + vlen;
     }
 
-    current_process->mms.register_brk((char*)data_segment_end + 0x10000);
+    current_process->mms.register_brk(data_segment_end + 0x10000);
 
     for (const auto& shent : shents) {
         if (shent.sh_type == elf32_section_header_entry::SHT_NOBITS)
-            memset((char*)shent.sh_addr, 0x00, shent.sh_size);
+            memset((char*)(uintptr_t)shent.sh_addr, 0x00, shent.sh_size);
     }
 
     // map stack area
-    auto ret = mmap((void*)types::elf::ELF_STACK_TOP,
-        types::elf::ELF_STACK_SIZE, nullptr, 0, true, false);
+    if (1) {
+        using namespace kernel::mem;
+        mm_list::map_args args{};
 
-    // TODO: destruct local variables before calling kill_current
-    if (ret != 0)
-        kill_current(SIGSEGV);
+        args.vaddr = ELF32_STACK_TOP;
+        args.length = ELF32_STACK_SIZE;
+        args.flags = MM_ANONYMOUS | MM_WRITE;
+
+        if (auto ret = mms.mmap(args); ret != 0)
+            kill_current(SIGSEGV);
+        // TODO: destruct local variables before calling kill_current
+    }
 
-    d->eip = (void*)hdr.entry;
-    d->sp = reinterpret_cast<uint32_t*>(types::elf::ELF_STACK_BOTTOM);
+    d.ip = hdr.entry;
+    d.sp = ELF32_STACK_BOTTOM;
 
-    auto* sp = (char**)&d->sp;
+    auto* sp = &d.sp;
 
     // fill information block area
-    std::vector<char*> args, envs;
-    for (const auto& env : envp) {
-        _user_push(sp, env.c_str());
-        envs.push_back(*sp);
+    std::vector<elf32_addr_t> args, envs;
+    for (const auto& env : d.envp) {
+        __user_push_string32(sp, env.c_str());
+        envs.push_back((uintptr_t)*sp);
     }
-    for (const auto& arg : argv) {
-        _user_push(sp, arg.c_str());
-        args.push_back(*sp);
+    for (const auto& arg : d.argv) {
+        __user_push_string32(sp, arg.c_str());
+        args.push_back((uintptr_t)*sp);
     }
 
     // push null auxiliary vector entry
-    _user_push(sp, 0);
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
+    __user_push32(sp, 0);
 
     // push 0 for envp
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
 
     // push envp
-    *sp -= sizeof(void*) * envs.size();
-    memcpy(*sp, envs.data(), sizeof(void*) * envs.size());
+    for (size_t i = envs.size(); i > 0; --i)
+        __user_push32(sp, envs[i - 1]); // last first: each push lands lower, so envp[0] ends up lowest
 
     // push 0 for argv
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
 
     // push argv
-    *sp -= sizeof(void*) * args.size();
-    memcpy(*sp, args.data(), sizeof(void*) * args.size());
+    for (size_t i = args.size(); i > 0; --i)
+        __user_push32(sp, args[i - 1]); // last first: argv[0] ends up directly above argc
 
     // push argc
-    _user_push(sp, args.size());
+    __user_push32(sp, args.size());
 
     // rename current thread
-    current_thread->name = ent_exec->name;
+    current_thread->name = exec->name;
 
     return 0;
 }