Browse Source

Merge branch 'long-mode' into vfs-refactor

kernel is now in long mode

TODO: we still have no real 64bit elf executable support
greatbridf 11 tháng trước
mục cha
commit
e526dc0708
89 tập tin đã thay đổi với 4861 bổ sung và 4450 xóa
  1. 21 28
      CMakeLists.txt
  2. 3 2
      Makefile.src
  3. 3 8
      configure
  4. 29 8
      doc/mem_layout.txt
  5. 34 1
      gblibc/CMakeLists.txt
  6. 0 1
      gblibc/include/bits/alltypes.h
  7. 1 1
      gblibc/include/stdint.h
  8. 6 6
      gblibc/include/sys/types.h
  9. 1 1
      gblibc/src/fcntl.c
  10. 4 0
      gblibstdc++/include/bits/rbtree
  11. 1 1
      gblibstdc++/include/functional
  12. 2 2
      gblibstdc++/include/string
  13. 0 36
      include/asm/port_io.h
  14. 0 27
      include/asm/sys.h
  15. 1 4
      include/fs/fat.hpp
  16. 10 7
      include/kernel/async/lock.hpp
  17. 0 15
      include/kernel/hw/keyboard.h
  18. 0 1
      include/kernel/hw/serial.hpp
  19. 0 17
      include/kernel/hw/timer.h
  20. 11 0
      include/kernel/hw/timer.hpp
  21. 0 61
      include/kernel/interrupt.h
  22. 74 0
      include/kernel/interrupt.hpp
  23. 2 2
      include/kernel/log.hpp
  24. 0 141
      include/kernel/mem.h
  25. 107 0
      include/kernel/mem/mm_list.hpp
  26. 192 0
      include/kernel/mem/paging.hpp
  27. 65 0
      include/kernel/mem/phys.hpp
  28. 40 0
      include/kernel/mem/slab.hpp
  29. 36 0
      include/kernel/mem/types.hpp
  30. 46 0
      include/kernel/mem/vm_area.hpp
  31. 0 400
      include/kernel/mm.hpp
  32. 1 1
      include/kernel/module.hpp
  33. 12 20
      include/kernel/process.hpp
  34. 2 2
      include/kernel/signal.hpp
  35. 111 10
      include/kernel/syscall.hpp
  36. 0 18
      include/kernel/task.h
  37. 12 6
      include/kernel/task/thread.hpp
  38. 11 13
      include/kernel/tty.hpp
  39. 1 3
      include/kernel/user/thread_local.hpp
  40. 1 1
      include/kernel/vfs.hpp
  41. 10 47
      include/types/allocator.hpp
  42. 157 19
      include/types/elf.hpp
  43. 19 8
      include/types/hash_map.hpp
  44. 43 0
      include/types/list.hpp
  45. 0 22
      include/types/size.h
  46. 0 4
      include/types/status.h
  47. 6 2
      include/types/types.h
  48. 16 0
      include/types/user_types.hpp
  49. 1 1
      init_script.sh
  50. 7 22
      pretty-print.py
  51. 105 247
      src/asm/interrupt.s
  52. 0 56
      src/asm/port_io.s
  53. 0 53
      src/asm/sys.s
  54. 178 287
      src/boot.s
  55. 29 51
      src/fs/fat.cpp
  56. 26 3
      src/fs/procfs.cc
  57. 62 52
      src/kernel.ld
  58. 121 36
      src/kernel/allocator.cc
  59. 16 16
      src/kernel/async/lock.cc
  60. 43 52
      src/kernel/hw/ahci.cc
  61. 0 31
      src/kernel/hw/keyboard.cpp
  62. 124 0
      src/kernel/hw/serial.cc
  63. 0 71
      src/kernel/hw/serial.cpp
  64. 0 26
      src/kernel/hw/timer.c
  65. 31 0
      src/kernel/hw/timer.cc
  66. 100 268
      src/kernel/interrupt.cpp
  67. 0 586
      src/kernel/mem.cpp
  68. 348 0
      src/kernel/mem/mm_list.cc
  69. 448 0
      src/kernel/mem/paging.cc
  70. 125 0
      src/kernel/mem/slab.cc
  71. 122 148
      src/kernel/process.cpp
  72. 20 21
      src/kernel/signal.cpp
  73. 265 1079
      src/kernel/syscall.cpp
  74. 538 8
      src/kernel/syscall/fileops.cc
  75. 51 0
      src/kernel/syscall/infoops.cc
  76. 6 7
      src/kernel/syscall/mount.cc
  77. 391 0
      src/kernel/syscall/procops.cc
  78. 17 0
      src/kernel/task/readyqueue.cc
  79. 78 60
      src/kernel/task/thread.cc
  80. 14 17
      src/kernel/tty.cpp
  81. 14 13
      src/kernel/user/thread_local.cc
  82. 11 11
      src/kernel/vfs.cpp
  83. 17 16
      src/kernel/vfs/tmpfs.cc
  84. 208 96
      src/kinit.cpp
  85. 152 59
      src/mbr.S
  86. 0 15
      src/mbr.ld
  87. 98 87
      src/types/elf.cpp
  88. 1 6
      src/types/libstdcpp.cpp
  89. 3 3
      user-space-program/CMakeLists.txt

+ 21 - 28
CMakeLists.txt

@@ -6,11 +6,11 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_CXX_LINK_EXECUTABLE
     "<CMAKE_LINKER> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
 
-set(CMAKE_ASM_FLAGS "-m32")
-set(C_CXX_FLAGS "-nostdinc -m32 -nostdlib -W -Wall -Wextra -Wno-stringop-overflow -Wno-builtin-declaration-mismatch -Wno-format -fverbose-asm -fno-exceptions -ffreestanding -fno-pic -mstack-protector-guard=global")
+set(C_CXX_FLAGS "-nostdinc -nostdlib -W -Wall -Wextra -Wno-stringop-overflow -Wno-builtin-declaration-mismatch -Wno-format -fverbose-asm -fno-exceptions -ffreestanding -fno-pic -mno-red-zone -mstack-protector-guard=global -mcmodel=kernel")
 set(CMAKE_C_FLAGS "${C_CXX_FLAGS} -Werror=implicit-int -Werror=implicit-function-declaration -Werror=strict-aliasing")
 set(CMAKE_CXX_FLAGS "${C_CXX_FLAGS} -fno-use-cxa-atexit -fno-rtti")
 set(CMAKE_CXX_LINK_FLAGS "")
+set(CMAKE_ASM_FLAGS "${CFLAGS} -x assembler-with-cpp") # assemble through the C preprocessor; NOTE(review): CFLAGS is not set in the visible hunks and expands to empty if undefined - confirm
 set(CMAKE_CXX_STANDARD 20)
 
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -30,9 +30,8 @@ add_subdirectory(gblibstdc++)
 add_subdirectory(user-space-program)
 
 set(BOOTLOADER_SOURCES src/boot.s
+                       src/mbr.S
                        src/asm/interrupt.s
-                       src/asm/port_io.s
-                       src/asm/sys.s
                        )
 
 set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
@@ -46,16 +45,19 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         src/kernel/tty.cpp
                         src/kernel/syscall.cpp
                         src/kernel/syscall/fileops.cc
+                        src/kernel/syscall/infoops.cc
                         src/kernel/syscall/mount.cc
-                        src/kernel/mem.cpp
+                        src/kernel/syscall/procops.cc
+                        src/kernel/mem/mm_list.cc
+                        src/kernel/mem/paging.cc
+                        src/kernel/mem/slab.cc
                         src/kernel/module.cc
                         src/kernel/vfs.cpp
                         src/kernel/vga.cpp
                         src/kernel/hw/ahci.cc
-                        src/kernel/hw/keyboard.cpp
                         src/kernel/hw/pci.cc
-                        src/kernel/hw/serial.cpp
-                        src/kernel/hw/timer.c
+                        src/kernel/hw/serial.cc
+                        src/kernel/hw/timer.cc
                         src/kernel/task/thread.cc
                         src/kernel/task/readyqueue.cc
                         src/kernel/user/thread_local.cc
@@ -63,18 +65,19 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         src/kernel/signal.cpp
                         src/types/elf.cpp
                         src/types/libstdcpp.cpp
-                        include/asm/port_io.h
-                        include/asm/sys.h
                         include/fs/fat.hpp
                         include/kernel/async/waitlist.hpp
                         include/kernel/async/lock.hpp
                         include/kernel/tty.hpp
-                        include/kernel/interrupt.h
+                        include/kernel/interrupt.hpp
                         include/kernel/irq.hpp
                         include/kernel/process.hpp
                         include/kernel/syscall.hpp
-                        include/kernel/mem.h
-                        include/kernel/mm.hpp
+                        include/kernel/mem/mm_list.hpp
+                        include/kernel/mem/paging.hpp
+                        include/kernel/mem/slab.hpp
+                        include/kernel/mem/types.hpp
+                        include/kernel/mem/vm_area.hpp
                         include/kernel/module.hpp
                         include/kernel/utsname.hpp
                         include/kernel/vfs.hpp
@@ -87,20 +90,18 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         include/kernel/task/forward.hpp
                         include/kernel/task/thread.hpp
                         include/kernel/task/readyqueue.hpp
-                        include/kernel/hw/keyboard.h
                         include/kernel/hw/pci.hpp
                         include/kernel/hw/port.hpp
-                        include/kernel/hw/serial.h
-                        include/kernel/hw/timer.h
+                        include/kernel/hw/serial.hpp
+                        include/kernel/hw/timer.hpp
                         include/kernel/input/keycodes.h
                         include/kernel/user/thread_local.hpp
                         include/types/bitmap.hpp
                         include/types/buffer.hpp
                         include/types/elf.hpp
                         include/types/hash_map.hpp
+                        include/types/list.hpp
                         include/types/types.h
-                        include/types/size.h
-                        include/types/status.h
                         include/types/allocator.hpp
                         include/types/cplusplus.hpp
                         include/kernel/log.hpp
@@ -110,26 +111,18 @@ add_executable(kernel.out ${KERNEL_MAIN_SOURCES} ${BOOTLOADER_SOURCES})
 target_link_libraries(kernel.out gblibc gblibstdc++)
 target_include_directories(kernel.out PRIVATE ${PROJECT_SOURCE_DIR}/include)
 target_link_options(kernel.out PRIVATE
-    -T ${CMAKE_SOURCE_DIR}/src/kernel.ld -melf_i386 -lgblibc -L${CMAKE_BINARY_DIR}/gblibc)
+    -T ${CMAKE_SOURCE_DIR}/src/kernel.ld -lgblibc -L${CMAKE_BINARY_DIR}/gblibc)
 set_target_properties(kernel.out PROPERTIES LINK_DEPENDS ${CMAKE_SOURCE_DIR}/src/kernel.ld)
 
-add_custom_command(OUTPUT mbr.bin
-    DEPENDS ${PROJECT_SOURCE_DIR}/src/mbr.S ${PROJECT_SOURCE_DIR}/src/mbr.ld
-    COMMAND ${CMAKE_ASM_COMPILER} -m32 -c ${PROJECT_SOURCE_DIR}/src/mbr.S -o mbr.o
-    COMMAND ${CMAKE_LINKER} -T ${PROJECT_SOURCE_DIR}/src/mbr.ld mbr.o -o mbr.bin
-)
-
 add_custom_command(OUTPUT mbr_hole.bin
     DEPENDS kernel.out
     COMMAND ${CMAKE_OBJCOPY} --strip-debug -O binary ${CMAKE_BINARY_DIR}/kernel.out mbr_hole.bin
 )
 
 add_custom_target(boot.img
-    DEPENDS mbr.bin
     DEPENDS mbr_hole.bin
     DEPENDS user_space_programs
-    COMMAND dd if=mbr.bin of=boot.img
-    COMMAND cat mbr_hole.bin >> boot.img
+    COMMAND dd if=mbr_hole.bin of=boot.img
     COMMAND dd if=/dev/zero of=boot.img bs=`expr 512 \\* 1024 \\* 1024` count=0 seek=1
     COMMAND sh -c \"echo n\; echo\; echo\; echo\; echo\; echo a\; echo w\" | ${FDISK_BIN} boot.img
     COMMAND mkfs.fat --offset=2048 -v -n SYSTEM boot.img

+ 3 - 2
Makefile.src

@@ -6,7 +6,7 @@ QEMU_DEBUG_FLAG=#-d cpu_reset,int
 QEMU_ARGS=-machine q35 -drive id=disk,file=build/boot.img,format=raw,if=none \
 	-device ahci,id=ahci -device ide-hd,drive=disk,bus=ahci.0 \
 	-no-reboot -no-shutdown $(QEMU_ACCELERATION_FLAG) $(QEMU_DEBUG_FLAG)
-	
+
 CROSS_COMPILE=##PLACEHOLDER_4##
 .PHONY: run
 run: build
@@ -42,7 +42,8 @@ clean-all: clean
 
 .PHONY: debug
 debug:
-	$(GDB_BIN) --symbols=build/kernel.out --init-eval-command 'source pretty-print.py' --init-eval-command 'set pagination off' --init-eval-command 'target remote:1234' --eval-command 'hbr _kernel_init' --eval-command 'c'
+	-$(GDB_BIN) --symbols=build/kernel.out --init-eval-command 'source pretty-print.py' --init-eval-command 'set pagination off' --init-eval-command 'target remote:1234' --init-eval-command 'layout regs' --eval-command 'hbr _kernel_init' --eval-command 'c'
+	-killall $(QEMU_BIN)
 
 build/boot.vdi: build/boot.img
 	-rm build/boot.vdi

+ 3 - 8
configure

@@ -1,5 +1,5 @@
 #!/bin/sh
-QEMU_EXECUTABLES="qemu-system-i386 qemu-system-x86_64"
+QEMU_EXECUTABLES="qemu-system-x86_64"
 GDB_EXECUTABLES="gdb x86_64-elf-gdb"
 
 event() {
@@ -77,13 +77,8 @@ case "$OS" in
         QEMU_ACCEL='-enable-kvm'
         ;;
     "Darwin")
-        if [ "$QEMU" = "qemu-system-x86_64" ]; then
-            echo "hvf"
-            QEMU_ACCEL='-accel hvf'
-        else
-            echo "tcg"
-            QEMU_ACCEL='-accel tcg'
-        fi
+        echo "tcg"
+        QEMU_ACCEL='-accel tcg'
         ;;
 esac
 

+ 29 - 8
doc/mem_layout.txt

@@ -1,12 +1,33 @@
-0x00000000 - 0x00001000 kernel pd
-0x00001000 - 0x00005000 kernel pt
-0x00005000 - 0x00006000 empty page
+physical memory
 
-....
+0x0000 - 0x1000: GDT, TSS, LDT and some early kernel data
+0x1000 - 0x2000: kernel stage1
+0x2000 - ?     : kernel image
 
-0x00100000 - 0x???????? kernel code, data, bss
-0x???????? - 0x01000000 kernel early stack
+0x100000 - 0x101000 : kernel PML4
+0x101000 - 0x102000 : kernel PDPT for physical memory mappings
+0x102000 - 0x103000 : kernel PDPT for kernel space
+0x103000 - 0x104000 : kernel PD for kernel image
+0x104000 - 0x105000 : kernel PT for kernel image
+0x105000 - 0x106000 : kernel PD for struct page array#1
 
-....
+0x106000 - 0x200000 : unused empty pages
+0x200000 - 0x400000 : first kernel bss page (2MB)
 
-0x30000000 - 0x40000000 kernel heap
+
+virtual address space
+
+0xffff ff0 000 000 000 - 0xffff ff3 fff fff fff  256GB physical memory (cached)
+0xffff ff4 000 000 000 - 0xffff ff7 fff fff fff  256GB physical memory (not cached)
+0xffff ff8 000 000 000 - 0xffff ff8 03f fff fff    1GB unused
+0xffff ff8 040 000 000 - 0xffff ff8 13f fff fff    4GB struct page array
+0xffff ff8 140 000 000 - 0xffff ff8 17f fff fff    1GB unused
+0xffff ff8 180 000 000 - 0xffff ffb fff fff fff  250GB kernel heap
+
+0xffff ffc 000 000 000 - 0xffff fff fbf fff fff  255GB unused
+
+0xffff fff fc0 000 000 - 0xffff fff fc0 1ff fff    2MB unused
+0xffff fff fc0 200 000 - 0xffff fff fff 9ff fff 1016MB kernel bss
+0xffff fff fff a00 000 - 0xffff fff fff bff fff    2MB unused
+0xffff fff fff c00 000 - 0xffff fff fff dff fff    2MB kernel image
+0xffff fff fff e00 000 - 0xffff fff fff fff fff    2MB unused

+ 34 - 1
gblibc/CMakeLists.txt

@@ -21,10 +21,36 @@ add_library(gblibc STATIC
     src/platform-independent.s
 )
 
-add_library(crt0 OBJECT
+add_library(gblibc_32 STATIC
+    src/stdio.c
+    src/arithmetic.c
+    src/string.c
+    src/fcntl.c
+    src/unistd.c
+    src/wait.c
+    src/assert.c
+    src/dirent.c
+    src/ctype.c
+    src/stdlib.c
+    src/errno.c
+    src/init.c
+    src/internal.c
+    src/stat.c
+    src/time.c
+    src/signal.c
+    src/platform-independent.s
+)
+
+add_library(crt0_32 OBJECT
     src/crt0.s
 )
 
+target_compile_options(gblibc_32 PRIVATE "-m32") # 32-bit variant of the same source list as gblibc
+target_compile_options(gblibc_32 PRIVATE "-mcmodel=32")
+target_compile_options(crt0_32 PRIVATE "-m32")
+target_link_options(gblibc_32 PRIVATE "LINKER:-melf_i386") # NOTE(review): CMake documents that target_link_options does not apply to static library targets (no link step) - confirm intent
+target_link_options(crt0_32 PRIVATE "LINKER:-melf_i386") # NOTE(review): crt0_32 is an OBJECT library; presumably also never linked on its own - verify
+
 file(GLOB_RECURSE GBLIBC_PUBLIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
 target_include_directories(gblibc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
@@ -33,3 +59,10 @@ target_include_directories(gblibc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
 set_target_properties(gblibc PROPERTIES PRIVATE_HEADER
     "private-include/devutil.h,private-include/syscall.h")
 set_target_properties(gblibc PROPERTIES PUBLIC_HEADER "${GBLIBC_PUBLIC_HEADERS}")
+
+target_include_directories(gblibc_32 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
+                                  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/private-include)
+
+set_target_properties(gblibc_32 PROPERTIES PRIVATE_HEADER
+    "private-include/devutil.h,private-include/syscall.h")
+set_target_properties(gblibc_32 PROPERTIES PUBLIC_HEADER "${GBLIBC_PUBLIC_HEADERS}")

+ 0 - 1
gblibc/include/bits/alltypes.h

@@ -13,7 +13,6 @@ typedef size_t blkcnt_t;
 struct timespec {
     time_t tv_sec;
     long tv_nsec;
-    int : 32; // padding
 };
 
 struct timeval {

+ 1 - 1
gblibc/include/stdint.h

@@ -22,7 +22,7 @@ typedef __UINTPTR_TYPE__ uintptr_t;
 typedef __INTPTR_TYPE__ intptr_t;
 
 typedef __SIZE_TYPE__ size_t;
-typedef int32_t ssize_t;
+typedef int64_t ssize_t;
 
 typedef uint64_t time_t;
 typedef int64_t time_diff_t;

+ 6 - 6
gblibc/include/sys/types.h

@@ -8,16 +8,16 @@ extern "C" {
 #endif
 
 typedef int pid_t;
-typedef uint32_t ino_t;
-typedef int32_t off_t;
-typedef uint32_t dev_t;
+typedef unsigned long ino_t;
+typedef long off_t;
+typedef unsigned dev_t;
 typedef unsigned uid_t;
 typedef unsigned gid_t;
-typedef unsigned mode_t;
+typedef unsigned short mode_t;
 typedef unsigned long nlink_t;
 
-typedef uint64_t ino64_t;
-typedef int64_t off64_t;
+typedef unsigned long long ino64_t;
+typedef long long off64_t;
 
 typedef off64_t loff_t;
 

+ 1 - 1
gblibc/src/fcntl.c

@@ -12,7 +12,7 @@ int open(const char* filename, int flags, ...)
         va_list vl;
         va_start(vl, flags);
 
-        ret = syscall3(SYS_open, (uint32_t)filename, flags, va_arg(vl, mode_t));
+        ret = syscall3(SYS_open, (uint32_t)filename, flags, va_arg(vl, int));
 
         va_end(vl);
     }

+ 4 - 0
gblibstdc++/include/bits/rbtree

@@ -369,6 +369,8 @@ public:
         root = copy(other.root);
         if (root)
             root->parent = nullptr;
+
+        return *this;
     }
     
     constexpr rbtree& operator=(rbtree&& other) noexcept
@@ -380,6 +382,8 @@ public:
         if constexpr (node_alloc_traits::
             propagate_on_container_move_assignment::value)
             alloc = std::move(other.alloc);
+
+        return *this;
     }
 
     constexpr void rotateleft(node* rt)

+ 1 - 1
gblibstdc++/include/functional

@@ -147,7 +147,7 @@ public:
     using result_type = Ret;
 
 private:
-    static constexpr std::size_t STACK_ALLOCATED_SIZE = 12;
+    static constexpr std::size_t STACK_ALLOCATED_SIZE = 24;
 
     char _data[STACK_ALLOCATED_SIZE];
     using fb_t = __inner::_function_base<Ret, Args...>;

+ 2 - 2
gblibstdc++/include/string

@@ -493,12 +493,12 @@ public:
 
     constexpr int compare(const basic_string& str) const noexcept
     {
-        return traits_type::compare(c_str(), str.c_str(), size());
+        return traits_type::compare(c_str(), str.c_str(), size()+1);
     }
 
     constexpr int compare(const Char* str) const
     {
-        return traits_type::compare(c_str(), str, size());
+        return traits_type::compare(c_str(), str, size()+1);
     }
 };
 

+ 0 - 36
include/asm/port_io.h

@@ -1,36 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-typedef uint16_t port_id_t;
-
-#define PORT_PIC1 (0x20)
-#define PORT_PIC2 (0xa0)
-#define PORT_PIC1_COMMAND (PORT_PIC1)
-#define PORT_PIC1_DATA ((PORT_PIC1) + 1)
-#define PORT_PIC2_COMMAND (PORT_PIC2)
-#define PORT_PIC2_DATA ((PORT_PIC2) + 1)
-
-#define PORT_KEYBOARD_COMMAND (0x64)
-#define PORT_KEYBOARD_DATA (0x60)
-
-#define PORT_PIT_CONTROL (0x43)
-#define PORT_PIT_COUNT (0x40)
-
-#define PORT_KEYDATA 0x0060u
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern void asm_outb(port_id_t port_number, uint8_t data);
-extern uint8_t asm_inb(port_id_t port_number);
-
-extern void asm_hlt(void);
-extern void asm_cli(void);
-extern void asm_sti(void);
-extern void asm_enable_sse(void);
-
-#ifdef __cplusplus
-}
-#endif

+ 0 - 27
include/asm/sys.h

@@ -1,27 +0,0 @@
-#pragma once
-
-#include <kernel/mem.h>
-#include <types/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void asm_switch_pd(page_t pd_addr);
-void asm_enable_paging(pd_t pd_addr);
-
-pptr_t current_pd(void);
-
-// the limit should be set on the higher 16bit
-// e.g. (n * sizeof(segment_descriptor) - 1) << 16
-void asm_load_gdt(uint32_t limit, pptr_t addr);
-
-void asm_load_tr(uint16_t index);
-
-extern const uint32_t kernel_size;
-extern char* const bss_addr;
-extern const uint32_t bss_len;
-
-#ifdef __cplusplus
-}
-#endif

+ 1 - 4
include/fs/fat.hpp

@@ -7,9 +7,6 @@
 #include <string.h>
 #include <sys/types.h>
 
-#include <types/size.h>
-
-#include <kernel/mem.h>
 #include <kernel/vfs.hpp>
 
 namespace fs::fat {
@@ -129,10 +126,10 @@ private:
     char label[12];
     std::vector<cluster_t> fat;
 
+    // TODO: dirty flag
     struct buf_object {
         char* data;
         int ref;
-        // bool dirty;
     };
     std::map<cluster_t, buf_object> buf;
 

+ 10 - 7
include/kernel/async/lock.hpp

@@ -1,11 +1,14 @@
 #pragma once
 
+#include <cstddef>
+
 #include <stdint.h>
 
 namespace kernel::async {
 
-using spinlock_t = uint32_t volatile;
-using preempt_count_t = size_t;
+using spinlock_t = unsigned long volatile;
+using lock_context_t = unsigned long;
+using preempt_count_t = std::size_t;
 
 void preempt_disable();
 void preempt_enable();
@@ -16,8 +19,8 @@ void init_spinlock(spinlock_t& lock);
 void spin_lock(spinlock_t& lock);
 void spin_unlock(spinlock_t& lock);
 
-uint32_t spin_lock_irqsave(spinlock_t& lock);
-void spin_unlock_irqrestore(spinlock_t& lock, uint32_t state);
+lock_context_t spin_lock_irqsave(spinlock_t& lock);
+void spin_unlock_irqrestore(spinlock_t& lock, lock_context_t context);
 
 class mutex {
 private:
@@ -31,8 +34,8 @@ public:
     void lock();
     void unlock();
 
-    uint32_t lock_irq();
-    void unlock_irq(uint32_t state);
+    lock_context_t lock_irq();
+    void unlock_irq(lock_context_t state);
 };
 
 class lock_guard {
@@ -50,7 +53,7 @@ public:
 class lock_guard_irq {
 private:
     mutex& m_mtx;
-    uint32_t state;
+    lock_context_t state;
 
 public:
     explicit inline lock_guard_irq(mutex& mtx)

+ 0 - 15
include/kernel/hw/keyboard.h

@@ -1,15 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-// TODO: this whole thing needs rewriting
-
-int32_t keyboard_has_data(void);
-
-void process_keyboard_data(void);
-
-#ifdef __cplusplus
-extern "C" void handle_keyboard_interrupt(void);
-#else
-void handle_keyboard_interrupt(void);
-#endif

+ 0 - 1
include/kernel/hw/serial.h → include/kernel/hw/serial.hpp

@@ -1,5 +1,4 @@
 #pragma once
-#include <asm/port_io.h>
 
 #ifdef __cplusplus
 extern "C" {

+ 0 - 17
include/kernel/hw/timer.h

@@ -1,17 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void init_pit(void);
-
-void inc_tick(void);
-
-size_t current_ticks(void);
-
-#ifdef __cplusplus
-}
-#endif

+ 11 - 0
include/kernel/hw/timer.hpp

@@ -0,0 +1,11 @@
+#pragma once
+
+#include <cstddef>
+
+namespace kernel::hw::timer {
+void init_pit();
+void inc_tick();
+
+std::size_t current_ticks(); // presumably the counter advanced by inc_tick() - confirm in timer.cc
+
+} // namespace kernel::hw::timer

+ 0 - 61
include/kernel/interrupt.h

@@ -1,61 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
-#define USER_INTERRUPT_GATE_TYPE (0xee)
-
-#define PIC_EOI (0x20)
-
-struct regs_32 {
-    uint32_t edi;
-    uint32_t esi;
-    uint32_t ebp;
-    uint32_t esp;
-    uint32_t ebx;
-    uint32_t edx;
-    uint32_t ecx;
-    uint32_t eax;
-};
-
-struct interrupt_stack {
-    struct regs_32 s_regs;
-    void* v_eip;
-    uint32_t cs;
-    uint32_t eflags;
-    uint32_t esp;
-    uint32_t ss;
-};
-
-struct mmx_registers {
-    uint8_t data[512]; // TODO: list of content
-};
-
-// present: When set, the page fault was caused by a page-protection violation.
-//          When not set, it was caused by a non-present page.
-// write:   When set, the page fault was caused by a write access.
-//          When not set, it was caused by a read access.
-// user:    When set, the page fault was caused while CPL = 3.
-//          This does not necessarily mean that the page fault was a privilege violation.
-// from https://wiki.osdev.org/Exceptions#Page_Fault
-struct page_fault_error_code {
-    uint32_t present : 1;
-    uint32_t write : 1;
-    uint32_t user : 1;
-    uint32_t reserved_write : 1;
-    uint32_t instruction_fetch : 1;
-    uint32_t protection_key : 1;
-    uint32_t shadow_stack : 1;
-    uint32_t software_guard_extensions : 1;
-};
-
-void init_idt(void);
-void init_pic(void);
-
-#ifdef __cplusplus
-}
-#endif

+ 74 - 0
include/kernel/interrupt.hpp

@@ -0,0 +1,74 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <types/types.h>
+
+struct saved_regs { // fifteen unsigned long register slots; rsp is not stored here (it appears in the interrupt_stack_* frames)
+    unsigned long rax;
+    unsigned long rbx;
+    unsigned long rcx;
+    unsigned long rdx;
+    unsigned long rdi;
+    unsigned long rsi;
+    unsigned long r8;
+    unsigned long r9;
+    unsigned long r10;
+    unsigned long r11;
+    unsigned long r12;
+    unsigned long r13;
+    unsigned long r14;
+    unsigned long r15;
+    unsigned long rbp; // NOTE(review): field order presumably mirrors the push order in src/asm/interrupt.s - confirm before reordering
+};
+
+struct PACKED interrupt_stack_head { // common prefix of both interrupt_stack_* layouts
+    saved_regs s_regs;
+    unsigned long int_no; // presumably the interrupt vector number - confirm against the entry stubs
+};
+
+struct PACKED interrupt_stack_normal { // frame layout without an error_code slot
+    interrupt_stack_head head;
+    uintptr_t v_rip; // NOTE(review): rip/cs/flags/rsp/ss presumably mirror the frame pushed by the CPU - confirm
+    unsigned long cs;
+    unsigned long flags;
+    uintptr_t rsp;
+    unsigned long ss;
+};
+
+struct PACKED interrupt_stack_with_code { // same fields as interrupt_stack_normal plus error_code
+    interrupt_stack_head head;
+    unsigned long error_code; // sits between head and v_rip; the only difference from interrupt_stack_normal
+    uintptr_t v_rip;
+    unsigned long cs;
+    unsigned long flags;
+    uintptr_t rsp;
+    unsigned long ss;
+};
+
+struct mmx_registers {
+    uint8_t data[512]; // TODO: list of content
+};
+
+// present: When set, the page fault was caused by a page-protection violation.
+//          When not set, it was caused by a non-present page.
+// write:   When set, the page fault was caused by a write access.
+//          When not set, it was caused by a read access.
+// user:    When set, the page fault was caused while CPL = 3.
+//          This does not necessarily mean that the page fault was a privilege violation.
+// from https://wiki.osdev.org/Exceptions#Page_Fault
+struct page_fault_error_code {
+    unsigned long present : 1;
+    unsigned long write : 1;
+    unsigned long user : 1;
+    unsigned long reserved_write : 1;
+    unsigned long instruction_fetch : 1;
+    unsigned long protection_key : 1;
+    unsigned long shadow_stack : 1;
+    unsigned long software_guard_extensions : 1;
+};
+
+namespace kernel::kinit {
+void init_interrupt();
+
+} // namespace kernel::kinit

+ 2 - 2
include/kernel/log.hpp

@@ -8,7 +8,7 @@
     if (1) {\
         char buf[512]; \
         snprintf(buf, sizeof(buf), fmt "\n" __VA_OPT__(,) __VA_ARGS__); \
-        console->print(buf); \
+        if (kernel::tty::console) kernel::tty::console->print(buf); \
     }
 
-#define kmsg(msg) console->print(msg)
+#define kmsg(msg) do { if (kernel::tty::console) kernel::tty::console->print(msg "\n"); } while (0) // do/while(0) keeps a caller's else from binding to this if

+ 0 - 141
include/kernel/mem.h

@@ -1,141 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-#include <types/size.h>
-
-#define PAGE_SIZE (0x1000)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// don't forget to add the initial 1m to the total
-struct mem_size_info {
-    uint16_t n_1k_blks; // memory between 1m and 16m in 1k blocks
-    uint16_t n_64k_blks; // memory above 16m in 64k blocks
-};
-
-struct e820_mem_map_entry_20 {
-    uint64_t base;
-    uint64_t len;
-    uint32_t type;
-};
-
-struct e820_mem_map_entry_24 {
-    struct e820_mem_map_entry_20 in;
-    uint32_t acpi_extension_attr;
-};
-
-/*
- * page directory entry
- *
- * p   : present (1)
- * rw  : allow write (1)
- * us  : allow user access (1)
- * pwt : todo
- * pcd : todo
- * a   : accessed for linear address translation (1)
- * d   : dirty (1) (ignored)
- * ps  : use 4MiB pages (ignored)
- * addr: page table address
- */
-typedef union pde_t {
-    uint32_t v;
-    struct {
-        uint32_t p : 1;
-        uint32_t rw : 1;
-        uint32_t us : 1;
-        uint32_t pwt : 1;
-        uint32_t pcd : 1;
-        uint32_t a : 1;
-        uint32_t d : 1;
-        uint32_t ps : 1;
-        uint32_t ignored : 4;
-        page_t pt_page : 20;
-    } in;
-} pde_t;
-typedef pde_t (*pd_t)[1024];
-
-/*
- * page table entry
- *
- * p   : present (1)
- * rw  : allow write (1)
- * us  : allow user access (1)
- * pwt : todo
- * pcd : todo
- * a   : accessed for linear address translation (1)
- * d   : dirty (1)
- * pat : todo (ignored)
- * g   : used in cr4 mode (ignored)
- * addr: physical memory address
- */
-typedef union pte_t {
-    uint32_t v;
-    struct {
-        uint32_t p : 1;
-        uint32_t rw : 1;
-        uint32_t us : 1;
-        uint32_t pwt : 1;
-        uint32_t pcd : 1;
-        uint32_t a : 1;
-        uint32_t d : 1;
-        uint32_t pat : 1;
-        uint32_t g : 1;
-        uint32_t ignored : 3;
-        page_t page : 20;
-    } in;
-} pte_t;
-typedef pte_t (*pt_t)[1024];
-
-// in mem.cpp
-extern uint8_t e820_mem_map[1024];
-extern uint32_t e820_mem_map_count;
-extern uint32_t e820_mem_map_entry_size;
-extern struct mem_size_info mem_size_info;
-
-#define KERNEL_HEAP_START ((void*)0xd0000000)
-#define KERNEL_HEAP_LIMIT ((void*)0xd4000000)
-
-#define EARLY_KERNEL_PD_PAGE ((page_t)0x000001)
-
-void init_mem(void);
-
-#define KERNEL_CODE_SEGMENT (0x08)
-#define KERNEL_DATA_SEGMENT (0x10)
-#define USER_CODE_SEGMENT (0x18)
-#define USER_DATA_SEGMENT (0x20)
-#define USER_CODE_SELECTOR (USER_CODE_SEGMENT | 3)
-#define USER_DATA_SELECTOR (USER_DATA_SEGMENT | 3)
-
-#define SD_TYPE_CODE_SYSTEM (0x9a)
-#define SD_TYPE_DATA_SYSTEM (0x92)
-
-#define SD_TYPE_CODE_USER (0xfa)
-#define SD_TYPE_DATA_USER (0xf2)
-
-#define SD_TYPE_TSS (0x89)
-
-typedef struct segment_descriptor_struct {
-    uint64_t limit_low : 16;
-    uint64_t base_low : 16;
-    uint64_t base_mid : 8;
-    uint64_t access : 8;
-    uint64_t limit_high : 4;
-    uint64_t flags : 4;
-    uint64_t base_high : 8;
-} segment_descriptor;
-
-// in mem.cpp
-extern segment_descriptor gdt[7];
-
-void create_segment_descriptor(
-    segment_descriptor* sd,
-    uint32_t base,
-    uint32_t limit,
-    uint32_t flags,
-    uint32_t access);
-
-#ifdef __cplusplus
-}
-#endif

+ 107 - 0
include/kernel/mem/mm_list.hpp

@@ -0,0 +1,107 @@
+#pragma once
+
+#include <set>
+
+#include <stdint.h>
+
+#include "vm_area.hpp"
+#include "paging.hpp"
+
+namespace kernel::mem {
+
+constexpr uintptr_t KERNEL_SPACE_START    = 0x8000000000000000ULL;
+constexpr uintptr_t USER_SPACE_MEMORY_TOP = 0x0000800000000000ULL;
+constexpr uintptr_t MMAP_MIN_ADDR         = 0x0000000000001000ULL;
+constexpr uintptr_t STACK_MIN_ADDR        = 0x0000700000000000ULL;
+
+class mm_list { // a set of vm_area entries (m_areas) together with a page-table root (m_pt)
+private:
+    struct comparator { // orders vm_area against vm_area and against a bare address, in both argument orders
+        constexpr bool operator()(const vm_area& lhs, const vm_area& rhs) const noexcept
+        { return lhs < rhs; }
+        constexpr bool operator()(const vm_area& lhs, uintptr_t rhs) const noexcept
+        { return lhs < rhs; }
+        constexpr bool operator()(uintptr_t lhs, const vm_area& rhs) const noexcept
+        { return lhs < rhs; }
+    };
+
+public:
+    using list_type = std::set<vm_area, comparator>;
+    using iterator = list_type::iterator;
+    using const_iterator = list_type::const_iterator;
+
+    struct map_args { // argument bundle taken by mmap()
+        // MUST BE aligned to 4kb boundary
+        uintptr_t vaddr;
+        // MUST BE aligned to 4kb boundary
+        std::size_t length;
+
+        unsigned long flags;
+
+        fs::inode* file_inode;
+        // MUST BE aligned to 4kb boundary
+        std::size_t file_offset;
+    };
+
+private:
+    list_type m_areas;
+    paging::pfn_t m_pt;
+    iterator m_brk {};
+
+public:
+    // default constructor copies kernel_mms
+    explicit mm_list();
+    // copies kernel_mms and mirrors user space
+    explicit mm_list(const mm_list& other);
+
+    constexpr mm_list(mm_list&& v)
+        : m_areas(std::move(v.m_areas))
+        , m_pt(std::exchange(v.m_pt, 0)) // the moved-from list is left with m_pt == 0
+        , m_brk{std::move(v.m_brk)} { }
+
+    ~mm_list();
+
+    void switch_pd() const noexcept;
+
+    int register_brk(uintptr_t addr);
+    uintptr_t set_brk(uintptr_t addr);
+
+    void clear();
+
+    // split the memory block at the specified address
+    // return: iterator to the new block
+    iterator split(iterator area, uintptr_t at);
+
+    bool is_avail(uintptr_t addr) const;
+    bool is_avail(uintptr_t start, std::size_t length) const noexcept;
+
+    uintptr_t find_avail(uintptr_t hint, std::size_t length) const;
+
+    int unmap(iterator area, bool should_invalidate_tlb);
+    int unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb);
+
+    int mmap(const map_args& args);
+
+    constexpr vm_area* find(uintptr_t lp) // nullptr when m_areas.find(lp) reaches end()
+    {
+        auto iter = m_areas.find(lp);
+        if (iter == m_areas.end())
+            return nullptr;
+        return &iter; // NOTE(review): address-of on the iterator itself; presumably gblibstdc++ iterators overload unary operator& - confirm (portable spelling is &*iter)
+    }
+
+    constexpr const vm_area* find(uintptr_t lp) const // const overload, same lookup
+    {
+        auto iter = m_areas.find(lp);
+        if (iter == m_areas.end())
+            return nullptr;
+        return &iter; // NOTE(review): same reliance on an overloaded operator& as the non-const overload - confirm
+    }
+
+    constexpr paging::PSE get_page_table() const noexcept
+    {
+        return paging::PSE {m_pt};
+    }
+};
+
+} // namespace kernel::mem

+ 192 - 0
include/kernel/mem/paging.hpp

@@ -0,0 +1,192 @@
+#pragma once
+
+#include <bit>
+#include <tuple>
+#include <cstddef>
+
+#include <stdint.h>
+
+#include <kernel/mem/phys.hpp>
+
+namespace kernel::mem::paging {
+
+// Table index helpers: each one extracts a 9-bit field (mask 0x1ff) of the
+// virtual address, starting at bit 48, 39, 30, 21 and 12 respectively.
+constexpr int idx_p5(uintptr_t vaddr) noexcept
+{
+    const uintptr_t shifted = vaddr >> 48;
+    return shifted & 0x1ff;
+}
+constexpr int idx_p4(uintptr_t vaddr) noexcept
+{
+    const uintptr_t shifted = vaddr >> 39;
+    return shifted & 0x1ff;
+}
+constexpr int idx_p3(uintptr_t vaddr) noexcept
+{
+    const uintptr_t shifted = vaddr >> 30;
+    return shifted & 0x1ff;
+}
+constexpr int idx_p2(uintptr_t vaddr) noexcept
+{
+    const uintptr_t shifted = vaddr >> 21;
+    return shifted & 0x1ff;
+}
+constexpr int idx_p1(uintptr_t vaddr) noexcept
+{
+    const uintptr_t shifted = vaddr >> 12;
+    return shifted & 0x1ff;
+}
+
+// Split a virtual address into all five table indices at once.
+// return: (idx_p5, idx_p4, idx_p3, idx_p2, idx_p1), highest level first
+constexpr std::tuple<int, int, int, int, int> idx_all(uintptr_t vaddr) noexcept
+{
+    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr), idx_p1(vaddr)};
+}
+
+// page frame number
+// since we have large pages now, pfns are not shifted right
+using pfn_t = uintptr_t;
+
+// paging structure attributes
+using psattr_t = uintptr_t;
+
+// Single-bit attributes of a paging structure entry.
+// The names presumably follow the x86-64 entry format (P = present,
+// RW = writable, US = user accessible, ...) — confirm against the Intel SDM.
+constexpr psattr_t PA_P    = 0x0000000000000001ULL;
+constexpr psattr_t PA_RW   = 0x0000000000000002ULL;
+constexpr psattr_t PA_US   = 0x0000000000000004ULL;
+constexpr psattr_t PA_PWT  = 0x0000000000000008ULL;
+constexpr psattr_t PA_PCD  = 0x0000000000000010ULL;
+constexpr psattr_t PA_A    = 0x0000000000000020ULL;
+constexpr psattr_t PA_D    = 0x0000000000000040ULL;
+constexpr psattr_t PA_PS   = 0x0000000000000080ULL;
+constexpr psattr_t PA_G    = 0x0000000000000100ULL;
+constexpr psattr_t PA_COW  = 0x0000000000000200ULL; // copy on write
+constexpr psattr_t PA_MMAP = 0x0000000000000400ULL; // memory mapped
+constexpr psattr_t PA_ANON = 0x0000000000000800ULL; // anonymous map
+constexpr psattr_t PA_NXE  = 0x8000000000000000ULL;
+// all attribute bits: the low 12 bits and the high 12 bits of an entry;
+// the bits in between are what PSE treats as the page frame number
+constexpr psattr_t PA_MASK = 0xfff0000000000fffULL;
+
+// attribute sets for data pages
+constexpr psattr_t PA_DATA = PA_P | PA_RW | PA_NXE;
+constexpr psattr_t PA_KERNEL_DATA = PA_DATA | PA_G;
+// NOTE(review): PA_USER_DATA includes PA_G, unlike PA_USER_PAGE_TABLE and
+// PA_USER_DATA_HUGE below — confirm that this is intended for user pages
+constexpr psattr_t PA_USER_DATA = PA_DATA | PA_G | PA_US;
+
+// attribute sets for entries that point to a lower level table
+constexpr psattr_t PA_PAGE_TABLE = PA_P | PA_RW;
+constexpr psattr_t PA_KERNEL_PAGE_TABLE = PA_PAGE_TABLE | PA_G;
+constexpr psattr_t PA_USER_PAGE_TABLE = PA_PAGE_TABLE | PA_US;
+
+// attribute sets for large pages (PA_PS set)
+constexpr psattr_t PA_DATA_HUGE = PA_DATA | PA_PS;
+constexpr psattr_t PA_KERNEL_DATA_HUGE = PA_DATA_HUGE | PA_G;
+constexpr psattr_t PA_USER_DATA_HUGE = PA_DATA_HUGE | PA_US;
+
+// NOTE: PA_MMAPPED_PAGE does not set PA_P while PA_ANONYMOUS_PAGE does —
+// presumably the mapped page is filled in on first access (TODO confirm)
+constexpr psattr_t PA_ANONYMOUS_PAGE = PA_P | PA_US | PA_COW | PA_ANON;
+constexpr psattr_t PA_MMAPPED_PAGE = PA_US | PA_COW | PA_ANON | PA_MMAP;
+
+namespace __inner {
+    using pse_t = uint64_t;
+
+} // namespace __inner
+
+// Handle to a single paging structure entry, identified by the physical
+// address of the entry. All accessors read or write the entry in place
+// through the physaddr mapping.
+class PSE {
+    physaddr<__inner::pse_t> m_ptrbase;
+
+public:
+    // pptr: physical address of the entry this handle refers to
+    explicit constexpr PSE(uintptr_t pptr) noexcept : m_ptrbase{pptr} {}
+
+    // zero the whole entry (attributes and page frame number)
+    constexpr void clear() noexcept
+    {
+        *m_ptrbase = 0;
+    }
+
+    // overwrite the entry; bits of `attributes` outside PA_MASK and bits of
+    // `pfn` inside PA_MASK are discarded
+    constexpr void set(psattr_t attributes, pfn_t pfn) noexcept
+    {
+        *m_ptrbase = (attributes & PA_MASK) | (pfn & ~PA_MASK);
+    }
+
+    // page frame number stored in the entry (not shifted right)
+    constexpr pfn_t pfn() const noexcept
+    {
+        return *m_ptrbase & ~PA_MASK;
+    }
+
+    // attribute bits stored in the entry
+    constexpr psattr_t attributes() const noexcept
+    {
+        return *m_ptrbase & PA_MASK;
+    }
+
+    // handle to the nth entry counted from this one
+    constexpr PSE operator[](std::size_t nth) const noexcept
+    {
+        // step by the size of one entry instead of a hard-coded 8
+        return PSE{m_ptrbase.phys() + sizeof(__inner::pse_t) * nth};
+    }
+
+    // follow the entry: handle to the first entry of the structure located
+    // at the page frame this entry points to
+    constexpr PSE parse() const noexcept
+    {
+        return PSE{pfn()};
+    }
+};
+
+constexpr pfn_t EMPTY_PAGE_PFN = 0x7f000;
+
+// Physical address of the kernel page table, exposed in three forms: the
+// raw address, a physaddr wrapper and a PSE handle. All three are derived
+// from KERNEL_PAGE_TABLE_ADDR so that they cannot drift apart.
+constexpr uintptr_t KERNEL_PAGE_TABLE_ADDR = 0x100000;
+constexpr physaddr<void> KERNEL_PAGE_TABLE_PHYS_ADDR{KERNEL_PAGE_TABLE_ADDR};
+constexpr PSE KERNEL_PAGE_TABLE{KERNEL_PAGE_TABLE_ADDR};
+
+// Page state bits — presumably stored in page::flags (TODO confirm).
+constexpr unsigned long PAGE_PRESENT = 0x00010000;
+constexpr unsigned long PAGE_BUDDY   = 0x00020000;
+constexpr unsigned long PAGE_SLAB    = 0x00040000;
+
+// Bookkeeping record for a page; instances are reachable through PAGE_ARRAY
+// and pfn_to_page() / page_to_pfn().
+struct page {
+    // TODO: use atomic
+    unsigned long refcount;
+    unsigned long flags;
+
+    // list links — presumably used by the allocator's free lists (TODO confirm)
+    page* next;
+    page* prev;
+};
+
+inline page* PAGE_ARRAY;
+
+// Range based setup of the page allocator. Whether `end` is inclusive is not
+// visible from this header — check the implementation.
+void create_zone(uintptr_t start, uintptr_t end);
+void mark_present(uintptr_t start, uintptr_t end);
+
+// The results must not be ignored ([[nodiscard]]).
+[[nodiscard]] page* alloc_page();
+// order represents power of 2
+[[nodiscard]] page* alloc_pages(unsigned order);
+
+// order represents power of 2
+void free_pages(page* page, unsigned order);
+void free_page(page* page);
+
+// same as above, but identified by page frame number
+// order represents power of 2
+void free_pages(pfn_t pfn, unsigned order);
+void free_page(pfn_t pfn);
+
+// allocate a page for use as a page table; the page is cleared to all zero
+[[nodiscard]] pfn_t alloc_page_table();
+
+// conversions between a page record and its page frame number
+pfn_t page_to_pfn(page* page);
+page* pfn_to_page(pfn_t pfn);
+
+void increase_refcount(page* page);
+
+// Bits of the page fault error code passed to handle_page_fault().
+// The values presumably mirror the x86 #PF error code layout (P, W/R, U/S,
+// RSVD, I/D, PK, SS, SGX) — confirm against the Intel SDM.
+constexpr unsigned long PAGE_FAULT_P   = 0x00000001;
+constexpr unsigned long PAGE_FAULT_W   = 0x00000002;
+constexpr unsigned long PAGE_FAULT_U   = 0x00000004;
+constexpr unsigned long PAGE_FAULT_R   = 0x00000008;
+constexpr unsigned long PAGE_FAULT_I   = 0x00000010;
+constexpr unsigned long PAGE_FAULT_PK  = 0x00000020;
+constexpr unsigned long PAGE_FAULT_SS  = 0x00000040;
+constexpr unsigned long PAGE_FAULT_SGX = 0x00008000;
+
+// err: error code made of the PAGE_FAULT_* bits above
+void handle_page_fault(unsigned long err);
+
+// Iterator-like walker over the paging entries of a virtual address range.
+// It exposes begin()/end(), operator*, operator++ and operator==, so it can
+// presumably be used in a range-based for loop (TODO confirm in the .cc file).
+class vaddr_range {
+    // remaining pages to iterate (see operator==)
+    std::size_t n;
+
+    // current index in each table level
+    int idx4;
+    int idx3;
+    int idx2;
+    int idx1;
+
+    // current entry handle for each table level
+    PSE pml4;
+    PSE pdpt;
+    PSE pd;
+    PSE pt;
+
+    uintptr_t m_start;
+    uintptr_t m_end;
+
+    bool is_privilege;
+
+public:
+    // pt: page frame of the top level table to walk
+    // start, end: bounds of the virtual address range
+    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool is_privilege = false);
+    // presumably constructs the end sentinel (TODO confirm)
+    explicit vaddr_range(std::nullptr_t);
+
+    vaddr_range begin() const noexcept;
+    vaddr_range end() const noexcept;
+
+    // handle to the entry at the current position
+    PSE operator*() const noexcept;
+
+    vaddr_range& operator++();
+    // NOTE(review): this conversion is implicit (not marked explicit)
+    operator bool() const noexcept;
+
+    // compares remaining pages to iterate
+    bool operator==(const vaddr_range& other) const noexcept;
+};
+
+} // namespace kernel::mem::paging

+ 65 - 0
include/kernel/mem/phys.hpp

@@ -0,0 +1,65 @@
+#pragma once
+
+#include <bit>
+#include <cstddef>
+#include <type_traits>
+
+#include <stdint.h>
+
+#include <types/types.h>
+
+#include <kernel/mem/types.hpp>
+
+namespace kernel::mem {
+
+// Typed wrapper around a physical address. The address is turned into a
+// usable pointer by adding a fixed offset, which is selected at compile
+// time by the `Cached` template parameter.
+template <typename T, bool Cached = true>
+class physaddr {
+    static constexpr uintptr_t PHYS_OFFSET =
+        Cached ? 0xffffff0000000000ULL : 0xffffff4000000000ULL;
+
+    uintptr_t m_ptr;
+
+public:
+    explicit constexpr physaddr(uintptr_t ptr) : m_ptr{ptr} {}
+    explicit constexpr physaddr(std::nullptr_t) : m_ptr{} {}
+
+    // translate to a pointer of type U
+    // cast to non-pointer types is prohibited
+    template <typename U, typename = std::enable_if_t<std::is_pointer_v<U>>>
+    constexpr U cast_to() const noexcept
+    {
+        const uintptr_t translated = m_ptr + PHYS_OFFSET;
+        return std::bit_cast<U>(translated);
+    }
+
+    // implicit conversion to the translated T*
+    constexpr operator T*() const noexcept
+    {
+        return cast_to<T*>();
+    }
+
+    // member access through the translated pointer
+    constexpr T* operator->() const noexcept
+    {
+        return cast_to<T*>();
+    }
+
+    // the untranslated physical address
+    constexpr uintptr_t phys() const noexcept
+    {
+        return m_ptr;
+    }
+};
+
+//  gdt[0]:  null
+//  gdt[1]:  kernel code
+//  gdt[2]:  kernel data
+//  gdt[3]:  user code
+//  gdt[4]:  user data
+//  gdt[5]:  user code compatibility mode
+//  gdt[6]:  user data compatibility mode
+//  gdt[7]:  thread local 32bit
+//  gdt[8]:  tss descriptor low
+//  gdt[9]:  tss descriptor high
+//  gdt[10]: ldt descriptor low
+//  gdt[11]: ldt descriptor high
+//  gdt[12]: null segment(in ldt)
+//  gdt[13]: thread local 64bit(in ldt)
+// &gdt[14]: tss of 0x68 bytes from here
+//
+// NOTE: the address is written as `0 + 1 - 1` rather than a plain zero
+// literal — presumably because a literal 0 is a null pointer constant and
+// would be ambiguous between the uintptr_t and std::nullptr_t constructors
+// of physaddr (TODO confirm).
+constexpr physaddr<uint64_t> gdt{0x00000000 + 1 - 1};
+
+} // namespace kernel::mem

+ 40 - 0
include/kernel/mem/slab.hpp

@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+#include <stdint.h>
+
+#include "paging.hpp"
+#include "phys.hpp"
+
+namespace kernel::mem {
+
+struct slab_cache;
+
+// Header describing one slab.
+struct slab_head {
+    // the cache this slab belongs to (presumably — TODO confirm)
+    slab_cache* cache;
+
+    // list links — presumably into one of the slab_cache lists (TODO confirm)
+    slab_head* next;
+    slab_head* prev;
+
+    // presumably the first free object in this slab (TODO confirm)
+    void* free;
+
+    unsigned int free_count;
+    unsigned int obj_size;
+};
+
+// A cache of equally sized objects, holding three slab lists.
+// NOTE(review): confirm in the implementation whether "empty" means
+// "no free objects" or "no allocated objects".
+struct slab_cache {
+    slab_head* slabs_empty;
+    slab_head* slabs_partial;
+    slab_head* slabs_full;
+
+    // size of the objects handed out by this cache
+    std::size_t obj_size;
+};
+
+// Initialize `cache` to serve objects of `obj_size`.
+void init_slab_cache(slab_cache* cache, std::size_t obj_size);
+
+// Allocate one object from `cache`.
+void* slab_alloc(slab_cache* cache);
+// Release an object; note that only the pointer is passed, not its cache.
+void slab_free(void* ptr);
+
+} // namespace kernel::mem

+ 36 - 0
include/kernel/mem/types.hpp

@@ -0,0 +1,36 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <cstddef>
+
+namespace kernel::mem {
+
+// Bit-field view of one descriptor table entry.
+// The field widths add up to 64 bits (16 + 16 + 8 + 8 + 4 + 4 + 8).
+struct gdt_entry {
+    uint64_t limit_low : 16;
+    uint64_t base_low : 16;
+    uint64_t base_mid : 8;
+    uint64_t access : 8;
+    uint64_t limit_high : 4;
+    uint64_t flags : 4;
+    uint64_t base_high : 8;
+};
+
+// One memory map entry (e820 format): 8 + 8 + 4 + 4 = 24 bytes of fields.
+struct e820_mem_map_entry {
+    uint64_t base;
+    uint64_t len;
+    uint32_t type;
+
+    // might not be valid
+    uint32_t acpi_extension_attr;
+};
+
+// Memory information globals — presumably filled in during early boot
+// (TODO confirm where they are written).
+namespace info {
+    inline std::size_t memory_size;
+    inline std::size_t e820_entry_count;
+    inline std::size_t e820_entry_length;
+    // 24 matches the field sizes of e820_mem_map_entry;
+    // NOTE(review): the 1024 and 16 presumably describe the layout of a
+    // boot-time buffer — confirm against the boot code
+    inline e820_mem_map_entry e820_entries[(1024-16)/24];
+
+} // namespace info
+
+} // namespace kernel::mem

+ 46 - 0
include/kernel/mem/vm_area.hpp

@@ -0,0 +1,46 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <kernel/vfs.hpp>
+
+namespace kernel::mem {
+
+// Flag bits for vm_area::flags.
+// MM_INTERNAL_MASK covers the upper 32 bits; MM_BREAK lies inside that mask.
+constexpr unsigned long MM_WRITE         = 0x00000000'00000001;
+constexpr unsigned long MM_EXECUTE       = 0x00000000'00000002;
+constexpr unsigned long MM_MAPPED        = 0x00000000'00000004;
+constexpr unsigned long MM_ANONYMOUS     = 0x00000000'00000008;
+constexpr unsigned long MM_INTERNAL_MASK = 0xffffffff'00000000;
+constexpr unsigned long MM_BREAK         = 0x80000000'00000000;
+
+// One contiguous virtual memory area, treated as the half-open range
+// [start, end) by the comparison operators below.
+struct vm_area {
+    uintptr_t start;
+    uintptr_t end;
+
+    unsigned long flags;
+
+    fs::inode* mapped_file;
+    std::size_t file_offset;
+
+    // true if [ostart, oend) does not overlap this area
+    constexpr bool is_avail(uintptr_t ostart, uintptr_t oend) const noexcept
+    {
+        const bool overlaps = ostart < end && oend > start;
+        return !overlaps;
+    }
+
+    // ordering: an area sorts before anything that begins at or after its end;
+    // an address sorts before an area only if it lies below the area's start
+    constexpr bool operator<(const vm_area& rhs) const noexcept
+    {
+        return end <= rhs.start;
+    }
+    constexpr bool operator<(uintptr_t rhs) const noexcept
+    {
+        return end <= rhs;
+    }
+    friend constexpr bool operator<(uintptr_t lhs, const vm_area& rhs) noexcept
+    {
+        return lhs < rhs.start;
+    }
+
+    // NOTE: the parameter order is (start, flags, end), not (start, end, flags)
+    constexpr vm_area(uintptr_t start, unsigned long flags, uintptr_t end,
+            fs::inode* mapped_file = nullptr, std::size_t offset = 0)
+        : start{start}
+        , end{end}
+        , flags{flags}
+        , mapped_file{mapped_file}
+        , file_offset{offset}
+    {
+    }
+
+    // creates an empty area: end == start
+    constexpr vm_area(uintptr_t start, unsigned long flags,
+            fs::inode* mapped_file = nullptr, std::size_t offset = 0)
+        : vm_area(start, flags, start, mapped_file, offset)
+    {
+    }
+};
+
+} // namespace kernel::mem

+ 0 - 400
include/kernel/mm.hpp

@@ -1,400 +0,0 @@
-#pragma once
-
-#include <set>
-#include <vector>
-#include <bit>
-#include <cstddef>
-#include <utility>
-
-#include <kernel/mem.h>
-#include <kernel/vfs.hpp>
-#include <stdint.h>
-#include <types/allocator.hpp>
-#include <types/cplusplus.hpp>
-#include <types/size.h>
-#include <types/status.h>
-#include <types/types.h>
-
-#define invalidate_tlb(addr) asm volatile("invlpg (%0)" \
-                                 :             \
-                                 : "r"(addr)   \
-                                 : "memory")
-
-constexpr size_t THREAD_KERNEL_STACK_SIZE = 8 * PAGE_SIZE;
-
-constexpr uint32_t PAGE_COW = (1 << 0);
-constexpr uint32_t PAGE_MMAP = (1 << 1);
-#define PAGE_COW PAGE_COW
-#define PAGE_MMAP PAGE_MMAP
-
-struct page {
-    page_t phys_page_id;
-    size_t* ref_count;
-    // 0 :11 : pte_index
-    // 12:31 : pt_page
-    uint32_t pg_pteidx;
-    mutable uint32_t attr;
-};
-
-// private memory mapping
-// changes won't be neither written back to file nor shared between processes
-// TODO: shared mapping
-// @param len is aligned to 4kb boundary automatically, exceeding part will
-// be filled with '0's and not written back to the file
-// @param offset MUST be aligned to 4kb
-int mmap(
-    void* hint,
-    size_t len,
-    fs::inode* file,
-    size_t offset,
-    int write,
-    int priv);
-
-template <uint32_t base, uint32_t expo>
-constexpr uint32_t pow()
-{
-    if constexpr (expo == 0)
-        return 1;
-    if constexpr (expo == 1)
-        return base;
-    if constexpr (expo % 2 == 0)
-        return pow<base, expo / 2>() * pow<base, expo / 2>();
-    else
-        return pow<base, expo / 2>() * pow<base, expo / 2 + 1>();
-}
-
-template <int N>
-constexpr uint32_t align_down(uint32_t v)
-{
-    return v & ~(pow<2, N>() - 1);
-}
-template <int N>
-constexpr void* align_down(void* v)
-{
-    return std::bit_cast<void*>(align_down<N>(std::bit_cast<uint32_t>(v)));
-}
-template <int N>
-constexpr uint32_t align_up(uint32_t v)
-{
-    return align_down<N>(v + pow<2, N>() - 1);
-}
-template <int N>
-constexpr void* align_up(void* v)
-{
-    return std::bit_cast<void*>(align_up<N>(std::bit_cast<uint32_t>(v)));
-}
-
-constexpr size_t vptrdiff(void* p1, void* p2)
-{
-    auto* _p1 = static_cast<std::byte*>(p1);
-    auto* _p2 = static_cast<std::byte*>(p2);
-    return _p1 - _p2;
-}
-
-constexpr void* vptradd(void* p, std::size_t off)
-{
-    auto* _p = static_cast<std::byte*>(p);
-    return _p + off;
-}
-
-void dealloc_pd(page_t pd);
-
-// allocate a struct page together with the raw page
-page allocate_page(void);
-void free_page(page* pg);
-
-// TODO: this is for alloc_kstack()
-// CHANGE THIS
-page_t __alloc_raw_page(void);
-void __free_raw_page(page_t pg);
-
-namespace kernel {
-
-void* pmap(page_t pg, bool cached = true);
-void pfree(page_t pg);
-
-class paccess : public types::non_copyable {
-private:
-    page_t m_pg;
-    void* m_ptr;
-
-public:
-    paccess(void) = delete;
-    paccess(paccess&&) = delete;
-    paccess& operator=(paccess&&) = delete;
-
-    inline explicit paccess(page_t pg, bool cached = true)
-        : m_pg(pg)
-    {
-        m_ptr = pmap(pg, cached);
-    }
-
-    constexpr void* ptr(void) const { return m_ptr; }
-
-    ~paccess()
-    {
-        pfree(m_pg);
-    }
-};
-
-namespace memory {
-
-struct mm {
-public:
-    using pages_vector = std::vector<page, types::memory::ident_allocator<page>>;
-
-public:
-    void* start {};
-    struct mm_attr {
-        uint32_t write : 1;
-        uint32_t system : 1;
-        uint32_t mapped : 1;
-    } attr {};
-    pages_vector* pgs {};
-    fs::inode* mapped_file {};
-    size_t file_offset {};
-
-public:
-    constexpr void* end() const noexcept
-    { return vptradd(start, pgs->size() * PAGE_SIZE); }
-    constexpr bool is_kernel_space() const noexcept
-    { return attr.system; }
-    constexpr bool is_avail(void* ostart, void* oend) const noexcept
-    {
-        void* m_start = start;
-        void* m_end = end();
-
-        return (ostart >= m_end || oend <= m_start);
-    }
-
-    void append_page(pd_t pd, const page& pg, uint32_t attr, bool priv);
-
-    /**
-     * @brief Splits the memory block at the specified address.
-     * 
-     * @param addr The address at which the memory block will be split.
-     * @return The new memory block created after splitting.
-     */
-    mm split(void* addr);
-
-    constexpr bool operator<(const mm& rhs) const noexcept
-    { return end() <= rhs.start; }
-    constexpr bool operator<(void* rhs) const noexcept
-    { return end() <= rhs; }
-    friend constexpr bool operator<(void* lhs, const mm& rhs) noexcept
-    { return lhs < rhs.start; }
-};
-
-class mm_list {
-private:
-    struct comparator {
-        constexpr bool operator()(const mm& lhs, const mm& rhs) const noexcept
-        { return lhs < rhs; }
-        constexpr bool operator()(const mm& lhs, void* rhs) const noexcept
-        { return lhs < rhs; }
-        constexpr bool operator()(void* lhs, const mm& rhs) const noexcept
-        { return lhs < rhs; }
-    };
-
-public:
-    using list_type = std::set<mm, comparator, types::memory::ident_allocator<mm>>;
-    using iterator = list_type::iterator;
-    using const_iterator = list_type::const_iterator;
-
-public:
-    static inline mm_list* s_kernel_mms;
-
-private:
-    list_type m_areas;
-    page_t m_pd;
-    mm* m_brk {};
-
-public:
-    // for system initialization only
-    explicit constexpr mm_list(page_t pd)
-        : m_pd(pd) { }
-
-    // default constructor copies kernel_mms
-    explicit mm_list();
-    // copies kernel_mms and mirrors user space
-    explicit mm_list(const mm_list& other);
-
-    constexpr mm_list(mm_list&& v)
-        : m_areas(std::move(v.m_areas))
-        , m_pd(std::exchange(v.m_pd, 0)) { }
-
-    ~mm_list();
-    void switch_pd() const;
-
-    int register_brk(void* addr);
-    void* set_brk(void* addr);
-
-    void* find_avail(void* hint, size_t len, bool priv) const;
-
-    int unmap(void* start, size_t len, bool priv);
-
-    constexpr mm& addarea(void* start, bool w, bool system)
-    {
-        auto [ iter, inserted ] = m_areas.emplace(mm {
-            .start = start,
-            .attr {
-                .write = w,
-                .system = system,
-                .mapped = 0,
-            },
-            .pgs = types::memory::kinew<mm::pages_vector>(),
-        });
-        assert(inserted);
-        return *iter;
-    }
-
-    mm& add_empty_area(void* start, std::size_t page_count,
-        uint32_t page_attr, bool w, bool system);
-
-    constexpr void clear_user()
-    {
-        for (auto iter = m_areas.begin(); iter != m_areas.end(); ) {
-            if (iter->is_kernel_space()) {
-                ++iter;
-                continue;
-            }
-
-            this->unmap(*iter);
-            iter = m_areas.erase(iter);
-        }
-        m_brk = nullptr;
-    }
-
-    inline void unmap(mm& area)
-    {
-        int i = 0;
-
-        // TODO:
-        // if there are more than 4 pages, calling invlpg
-        // should be faster. otherwise, we use movl cr3
-        // bool should_invlpg = (area->pgs->size() > 4);
-
-        for (auto& pg : *area.pgs) {
-            kernel::paccess pa(pg.pg_pteidx >> 12);
-            auto pt = (pt_t)pa.ptr();
-            assert(pt);
-            auto* pte = *pt + (pg.pg_pteidx & 0xfff);
-            pte->v = 0;
-
-            free_page(&pg);
-
-            invalidate_tlb((uint32_t)area.start + (i++) * PAGE_SIZE);
-        }
-        types::memory::kidelete<mm::pages_vector>(area.pgs);
-    }
-
-    constexpr mm* find(void* lp)
-    {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &*iter;
-    }
-    constexpr const mm* find(void* lp) const
-    {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &*iter;
-    }
-
-    constexpr bool is_avail(void* start, size_t len) const noexcept
-    {
-        start = align_down<12>(start);
-        len = vptrdiff(align_up<12>(vptradd(start, len)), start);
-        for (const auto& area : m_areas) {
-            if (!area.is_avail(start, vptradd(start, len)))
-                return false;
-        }
-        return true;
-    }
-
-    constexpr bool is_avail(void* addr) const
-    {
-        auto iter = m_areas.find(addr);
-        return iter == m_areas.end();
-    }
-};
-
-} // namespace memory
-
-} // namespace kernel
-
-// global variables
-inline page empty_page;
-// --------------------------------
-
-// inline constexpr page* lto_page(mm* mm_area, void* l_ptr)
-// {
-//     size_t offset = vptrdiff(l_ptr, mm_area->start);
-//     return &mm_area->pgs->at(offset / PAGE_SIZE);
-// }
-// inline constexpr page_t to_page(pptr_t ptr)
-// {
-//     return ptr >> 12;
-// }
-// inline constexpr size_t to_pdi(page_t pg)
-// {
-//     return pg >> 10;
-// }
-// inline constexpr size_t to_pti(page_t pg)
-// {
-//     return pg & (1024 - 1);
-// }
-// inline constexpr pptr_t to_pp(page_t p)
-// {
-//     return p << 12;
-// }
-constexpr size_t v_to_pdi(void* addr)
-{
-    return std::bit_cast<uint32_t>(addr) >> 22;
-}
-constexpr size_t v_to_pti(void* addr)
-{
-    return (std::bit_cast<uint32_t>(addr) >> 12) & 0x3ff;
-}
-// inline constexpr pte_t* to_pte(pt_t pt, page_t pg)
-// {
-//     return *pt + to_pti(pg);
-// }
-// inline void* to_vp(page_t pg)
-// {
-//     return ptovp(to_pp(pg));
-// }
-// inline pd_t to_pd(page_t pg)
-// {
-//     return reinterpret_cast<pd_t>(to_vp(pg));
-// }
-// inline pt_t to_pt(page_t pg)
-// {
-//     return reinterpret_cast<pt_t>(to_vp(pg));
-// }
-// inline pt_t to_pt(pde_t* pde)
-// {
-//     return to_pt(pde->in.pt_page);
-// }
-// inline pde_t* to_pde(pd_t pd, void* addr)
-// {
-//     return *pd + lto_pdi((pptr_t)addr);
-// }
-// inline pte_t* to_pte(pt_t pt, void* addr)
-// {
-//     return *pt + lto_pti((pptr_t)addr);
-// }
-// inline pte_t* to_pte(pde_t* pde, void* addr)
-// {
-//     return to_pte(to_pt(pde), addr);
-// }
-// inline pte_t* to_pte(pd_t pd, void* addr)
-// {
-//     return to_pte(to_pde(pd, addr), addr);
-// }
-// inline pte_t* to_pte(pde_t* pde, page_t pg)
-// {
-//     return to_pte(to_pt(pde), pg);
-// }

+ 1 - 1
include/kernel/module.hpp

@@ -29,6 +29,6 @@ constexpr int MODULE_DELAYED = 2;
 // TODO: unique_ptr and Deleter
 int insmod(module* mod);
 
-extern "C" module_loader kmod_loaders_start[];
+extern "C" module_loader KMOD_LOADERS_START[];
 
 } // namespace kernel::module

+ 12 - 20
include/kernel/process.hpp

@@ -1,7 +1,7 @@
 #pragma once
 
-#include <map>
 #include <list>
+#include <map>
 #include <memory>
 #include <queue>
 #include <set>
@@ -13,23 +13,21 @@
 #include <stdint.h>
 #include <sys/types.h>
 
-#include <kernel/task/thread.hpp>
 #include <kernel/task/current.hpp>
+#include <kernel/task/thread.hpp>
 
 #include <types/allocator.hpp>
 #include <types/cplusplus.hpp>
 #include <types/path.hpp>
-#include <types/status.h>
 #include <types/types.h>
 
 #include <kernel/async/waitlist.hpp>
-#include <kernel/interrupt.h>
-#include <kernel/mm.hpp>
-#include <kernel/mem.h>
-#include <kernel/user/thread_local.hpp>
+#include <kernel/interrupt.hpp>
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/paging.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/task.h>
 #include <kernel/tty.hpp>
+#include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
 
 class process;
@@ -39,8 +37,6 @@ class proclist;
 inline process* volatile current_process;
 inline proclist* procs;
 
-inline tss32_t tss;
-
 struct process_attr {
     uint16_t system : 1;
     uint16_t zombie : 1 = 0;
@@ -175,7 +171,7 @@ public:
     };
 
 public:
-    kernel::memory::mm_list mms {};
+    kernel::mem::mm_list mms {};
     std::set<kernel::task::thread> thds;
     kernel::async::wait_list waitlist;
 
@@ -192,7 +188,7 @@ public:
     pid_t pgid {};
     pid_t sid {};
 
-    tty* control_tty {};
+    kernel::tty::tty* control_tty {};
     fs::dentry* root { fs::fs_root };
     std::set<pid_t> children;
 
@@ -292,20 +288,16 @@ public:
     }
 
     void kill(pid_t pid, int exit_code);
+
+    constexpr auto begin() const { return m_procs.begin(); }
+    constexpr auto end() const { return m_procs.end(); }
 };
 
-void NORETURN init_scheduler(void);
+void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn);
 /// @return true if returned normally, false if being interrupted
 bool schedule(void);
 void NORETURN schedule_noreturn(void);
 
-constexpr uint32_t push_stack(uint32_t** stack, uint32_t val)
-{
-    --*stack;
-    **stack = val;
-    return val;
-}
-
 void k_new_thread(void (*func)(void*), void* data);
 
 void NORETURN freeze(void);

+ 2 - 2
include/kernel/signal.hpp

@@ -9,7 +9,7 @@
 
 #include <types/cplusplus.hpp>
 
-#include <kernel/interrupt.h>
+#include <kernel/interrupt.hpp>
 
 namespace kernel {
 
@@ -57,7 +57,7 @@ public:
 
     // return value: whether the thread should wake up
     bool raise(signo_type signal);
-    void handle(interrupt_stack* context, mmx_registers* mmxregs);
+    void handle(interrupt_stack_normal* context, mmx_registers* mmxregs);
     void after_signal(signo_type signal);
 };
 

+ 111 - 10
include/kernel/syscall.hpp

@@ -1,16 +1,117 @@
 #pragma once
 
-#include <kernel/interrupt.h>
+#include <string>
+#include <vector>
+
+#include <bits/alltypes.h>
+#include <poll.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <time.h>
+
 #include <types/types.h>
 
-#define SYSCALL_ARG1(type, name) type name = (type)((data)->s_regs.ebx)
-#define SYSCALL_ARG2(type, name) type name = (type)((data)->s_regs.ecx)
-#define SYSCALL_ARG3(type, name) type name = (type)((data)->s_regs.edx)
-#define SYSCALL_ARG4(type, name) type name = (type)((data)->s_regs.esi)
-#define SYSCALL_ARG5(type, name) type name = (type)((data)->s_regs.edi)
-#define SYSCALL_ARG6(type, name) type name = (type)((data)->s_regs.ebp)
+#include <kernel/interrupt.hpp>
+#include <kernel/signal.hpp>
+#include <kernel/user/thread_local.hpp>
+
+#define SYSCALL64_ARG1(type, name) type name = (type)((data)->head.s_regs.rdi)
+#define SYSCALL64_ARG2(type, name) type name = (type)((data)->head.s_regs.rsi)
+#define SYSCALL64_ARG3(type, name) type name = (type)((data)->head.s_regs.rdx)
+#define SYSCALL64_ARG4(type, name) type name = (type)((data)->head.s_regs.r10)
+#define SYSCALL64_ARG5(type, name) type name = (type)((data)->head.s_regs.r8)
+#define SYSCALL64_ARG6(type, name) type name = (type)((data)->head.s_regs.r9)
+
+namespace kernel {
+void init_syscall_table();
+
+void handle_syscall32(int no, interrupt_stack_normal* data, mmx_registers* mmxregs);
+void handle_syscall64(int no, interrupt_stack_normal* data, mmx_registers* mmxregs);
+
+namespace syscall {
+// in fileops.cc
+ssize_t do_write(int fd, const char __user* buf, size_t n);
+ssize_t do_read(int fd, char __user* buf, size_t n);
+int do_close(int fd);
+int do_dup(int old_fd);
+int do_dup2(int old_fd, int new_fd);
+int do_pipe(int __user* pipefd);
+ssize_t do_getdents(int fd, char __user* buf, size_t cnt);
+ssize_t do_getdents64(int fd, char __user* buf, size_t cnt);
+int do_open(const char __user* path, int flags, mode_t mode);
+int do_symlink(const char __user* target, const char __user* linkpath);
+int do_readlink(const char __user* pathname, char __user* buf, size_t buf_size);
+int do_ioctl(int fd, unsigned long request, uintptr_t arg3);
+ssize_t do_readv(int fd, const iovec* iov, int iovcnt);
+ssize_t do_writev(int fd, const iovec* iov, int iovcnt);
+off_t do_lseek(int fd, off_t offset, int whence);
+uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len,
+        int prot, int flags, int fd, off_t pgoffset);
+int do_munmap(uintptr_t addr, size_t len);
+ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset, size_t count);
+int do_statx(int dirfd, const char __user* path,
+        int flags, unsigned int mask, statx __user* statxbuf);
+int do_fcntl(int fd, int cmd, unsigned long arg);
+int do_poll(pollfd __user* fds, nfds_t nfds, int timeout);
+int do_mknod(const char __user* pathname, mode_t mode, dev_t dev);
+int do_access(const char __user* pathname, int mode);
+int do_unlink(const char __user* pathname);
+int do_truncate(const char __user* pathname, long length);
+int do_mkdir(const char __user* pathname, mode_t mode);
+
+// in procops.cc
+int do_chdir(const char __user* path);
+[[noreturn]] int do_exit(int status);
+int do_waitpid(pid_t waitpid, int __user* arg1, int options);
+pid_t do_getsid(pid_t pid);
+pid_t do_setsid();
+pid_t do_getpgid(pid_t pid);
+int do_setpgid(pid_t pid, pid_t pgid);
+int do_set_thread_area(user::user_desc __user* ptr);
+pid_t do_set_tid_address(int __user* tidptr);
+int do_prctl(int option, uintptr_t arg2);
+int do_arch_prctl(int option, uintptr_t arg2);
+pid_t do_getpid();
+pid_t do_getppid();
+uid_t do_getuid();
+uid_t do_geteuid();
+gid_t do_getgid();
+pid_t do_gettid();
+char __user* do_getcwd(char __user* buf, size_t buf_size);
+uintptr_t do_brk(uintptr_t addr);
+int do_umask(mode_t mask);
+int do_kill(pid_t pid, int sig);
+int do_rt_sigprocmask(int how, const kernel::sigmask_type __user* set,
+        kernel::sigmask_type __user* oldset, size_t sigsetsize);
+int do_rt_sigaction(int signum, const sigaction __user* act,
+        sigaction __user* oldact, size_t sigsetsize);
+int do_newuname(new_utsname __user* buf);
+
+struct execve_retval {
+    uintptr_t ip;
+    uintptr_t sp;
+    int status;
+};
+
+execve_retval do_execve(
+        const std::string& exec,
+        const std::vector<std::string>& args,
+        const std::vector<std::string>& envs);
+
+// in mount.cc
+int do_mount(
+        const char __user* source,
+        const char __user* target,
+        const char __user* fstype,
+        unsigned long flags,
+        const void __user* _fsdata);
+
+// in infoops.cc
+int do_clock_gettime(clockid_t clk_id, timespec __user* tp);
+int do_gettimeofday(timeval __user* tv, void __user* tz);
 
-// return value is stored in %eax and %edx
-typedef int (*syscall_handler)(interrupt_stack* data);
+} // namespace kernel::syscall
 
-void init_syscall(void);
+} // namespace kernel

+ 0 - 18
include/kernel/task.h

@@ -1,18 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct tss32_t {
-    uint32_t backlink, esp0, ss0, esp1, ss1, esp2, ss2, cr3;
-    uint32_t eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
-    uint32_t es, cs, ss, ds, fs, gs;
-    uint32_t ldtr, iomap;
-};
-
-#ifdef __cplusplus
-}
-#endif

+ 12 - 6
include/kernel/task/thread.hpp

@@ -8,12 +8,13 @@
 
 #include <types/types.h>
 
+#include <kernel/mem/paging.hpp>
 #include <kernel/signal.hpp>
 #include <kernel/user/thread_local.hpp>
 
 namespace kernel::task {
 
-using tid_t = uint32_t;
+using tid_t = std::size_t;
 
 struct thread {
 public:
@@ -27,13 +28,18 @@ public:
 
 private:
     struct kernel_stack {
-        std::byte* stack_base;
-        uint32_t* esp;
+        mem::paging::pfn_t pfn;
+        uintptr_t sp;
 
         kernel_stack();
         kernel_stack(const kernel_stack& other);
         kernel_stack(kernel_stack&& other);
         ~kernel_stack();
+
+        uint64_t pushq(uint64_t val);
+        uint32_t pushl(uint32_t val);
+
+        void load_interrupt_stack() const;
     };
 
 public:
@@ -46,14 +52,14 @@ public:
     int* __user clear_child_tid {};
 
     std::string name {};
-
-    segment_descriptor tls_desc {};
+    uint64_t tls_desc32 {};
+    std::size_t elected_times {};
 
     explicit thread(std::string name, pid_t owner);
     thread(const thread& val, pid_t owner);
 
     int set_thread_area(user::user_desc* ptr);
-    int load_thread_area() const;
+    int load_thread_area32() const;
 
     void set_attr(thd_attr_t new_attr);
 

+ 11 - 13
include/kernel/tty.hpp

@@ -1,5 +1,7 @@
 #pragma once
 
+#include <string>
+
 #include <stdint.h>
 #include <sys/types.h>
 #include <termios.h>
@@ -11,10 +13,11 @@
 #include <kernel/async/waitlist.hpp>
 #include <kernel/async/lock.hpp>
 
+namespace kernel::tty {
+
 class tty : public types::non_copyable {
 public:
     static constexpr size_t BUFFER_SIZE = 4096;
-    static constexpr size_t NAME_SIZE = 32;
 
 private:
     void _real_commit_char(int c);
@@ -23,7 +26,7 @@ private:
     int _do_erase(bool should_echo);
 
 public:
-    tty();
+    explicit tty(std::string name);
     virtual void putchar(char c) = 0;
     void print(const char* str);
     ssize_t read(char* buf, size_t buf_size, size_t n);
@@ -52,13 +55,13 @@ public:
         return fg_pgroup;
     }
 
-    char name[NAME_SIZE];
     termios termio;
+    std::string name;
 
 protected:
-    kernel::async::mutex mtx_buf;
+    async::mutex mtx_buf;
     types::buffer buf;
-    kernel::async::wait_list waitlist;
+    async::wait_list waitlist;
 
     pid_t fg_pgroup;
 };
@@ -69,13 +72,8 @@ public:
     virtual void putchar(char c) override;
 };
 
-class serial_tty : public virtual tty {
-public:
-    serial_tty(int id);
-    virtual void putchar(char c) override;
+inline tty* console;
 
-public:
-    uint16_t id;
-};
+int register_tty(tty* tty_dev);
 
-inline tty* console;
+} // namespace kernel::tty

+ 1 - 3
include/kernel/user/thread_local.hpp

@@ -1,7 +1,5 @@
 #pragma once
 
-#include <kernel/mem.h>
-
 #include <stdint.h>
 
 namespace kernel::user {
@@ -18,6 +16,6 @@ struct user_desc {
     uint32_t useable : 1;
 };
 
-void load_thread_area(const segment_descriptor& desc);
+void load_thread_area32(uint64_t desc);
 
 } // namespace kernel::user

+ 1 - 1
include/kernel/vfs.hpp

@@ -52,7 +52,7 @@ struct PACKED user_dirent {
     // uint8_t d_type; // file type, with offset of (d_reclen - 1)
 };
 
-struct user_dirent64 {
+struct PACKED user_dirent64 {
     ino64_t d_ino; // inode number
     uint64_t d_off; // implementation-defined field, ignored
     uint16_t d_reclen; // length of this struct user_dirent

+ 10 - 47
include/types/allocator.hpp

@@ -11,12 +11,6 @@
 
 #include <kernel/async/lock.hpp>
 
-namespace kernel::kinit {
-
-void init_kernel_heap(void* start, std::size_t size);
-
-} // namespace kernel::kinit
-
 namespace types::memory {
 
 class brk_memory_allocator {
@@ -28,17 +22,14 @@ private:
     byte* p_start;
     byte* p_limit;
     byte* p_break;
+    byte* p_allocated;
     kernel::async::mutex mtx;
 
-    constexpr byte* brk(byte* addr)
-    {
-        if (addr >= p_limit) [[unlikely]]
-            return nullptr;
-        return p_break = addr;
-    }
+    byte* brk(byte* addr);
+    byte* sbrk(size_type increment);
 
-    constexpr byte* sbrk(size_type increment)
-    { return brk(p_break + increment); }
+    constexpr byte* sbrk() const noexcept
+    { return p_break; }
 
 public:
     explicit brk_memory_allocator(byte* start, size_type size);
@@ -46,41 +37,13 @@ public:
 
     void* allocate(size_type size);
     void deallocate(void* ptr);
-};
-
-void* kimalloc(std::size_t size);
-void kifree(void* ptr);
 
-template <typename T>
-struct ident_allocator {
-    using value_type = T;
-    using propagate_on_container_move_assignment = std::true_type;
-
-    constexpr ident_allocator() = default;
-
-    template <typename U>
-    constexpr ident_allocator(const ident_allocator<U>&) noexcept {}
-    
-    inline T* allocate(std::size_t n)
-    { return (T*)kimalloc(n * sizeof(T)); }
-    inline void deallocate(T* ptr, std::size_t) { return kifree(ptr); }
+    bool allocated(void* ptr) const noexcept;
 };
 
-template <typename T, typename... Args>
-constexpr T* kinew(Args&&... args)
-{
-    ident_allocator<T> alloc { };
-    T* ptr = std::allocator_traits<ident_allocator<T>>::allocate(alloc, 1);
-    std::allocator_traits<ident_allocator<T>>::construct(alloc, ptr, std::forward<Args>(args)...);
-    return ptr;
-}
+} // namespace types::memory
 
-template <typename T>
-constexpr void kidelete(T* ptr)
-{
-    ident_allocator<T> alloc { };
-    std::allocator_traits<ident_allocator<T>>::destroy(alloc, ptr);
-    std::allocator_traits<ident_allocator<T>>::deallocate(alloc, ptr, 1);
-}
+namespace kernel::kinit {
+void init_allocator();
 
-} // namespace types::memory
+} // namespace kernel::kinit

+ 157 - 19
include/types/elf.hpp

@@ -1,22 +1,23 @@
 #pragma once
+
 #include <errno.h>
-#include <kernel/interrupt.h>
+#include <stdint.h>
+
+#include <kernel/interrupt.hpp>
 #include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
-#include <stdint.h>
-#include <types/size.h>
-#include <types/status.h>
 
 namespace types::elf {
+
 using elf32_addr_t = uint32_t;
 using elf32_off_t = uint32_t;
 
-using elf_addr_t = elf32_addr_t;
-using elf_off_t = elf32_off_t;
+using elf64_addr_t = uint64_t;
+using elf64_off_t = uint64_t;
 
-constexpr elf32_addr_t ELF_STACK_BOTTOM = 0xbffff000;
-constexpr elf32_off_t ELF_STACK_SIZE = 8 * 1024 * 1024;
-constexpr elf32_addr_t ELF_STACK_TOP = ELF_STACK_BOTTOM - ELF_STACK_SIZE;
+constexpr elf32_addr_t ELF32_STACK_BOTTOM = 0xbffff000;
+constexpr elf32_off_t ELF32_STACK_SIZE = 8 * 1024 * 1024;
+constexpr elf32_addr_t ELF32_STACK_TOP = ELF32_STACK_BOTTOM - ELF32_STACK_SIZE;
 
 struct PACKED elf32_header {
     // 0x7f, "ELF"
@@ -104,7 +105,11 @@ struct PACKED elf32_program_header_entry {
     elf32_off_t filesz;
     elf32_off_t memsz;
     // segment dependent
-    uint32_t flags;
+    enum : uint32_t {
+        PF_X = 0x1,
+        PF_W = 0x2,
+        PF_R = 0x4,
+    } flags;
     // 0 and 1 for no alignment, otherwise power of 2
     uint32_t align;
 };
@@ -131,21 +136,154 @@ struct PACKED elf32_section_header_entry {
     } sh_flags;
     elf32_addr_t sh_addr;
     elf32_off_t sh_offset;
-    uint32_t sh_size;
-    char _[16];
+    elf32_off_t sh_size;
+    uint32_t sh_link;
+    uint32_t sh_info;
+    elf32_off_t sh_addralign;
+    elf32_off_t sh_entsize;
 };
 
 struct elf32_load_data {
     const fs::dentry* exec_dent;
-    const char* const* argv;
-    const char* const* envp;
-    int errcode;
-    void* eip;
-    uint32_t* sp;
-    bool system;
+    const std::vector<std::string>& argv;
+    const std::vector<std::string>& envp;
+    uintptr_t ip;
+    uintptr_t sp;
 };
 
 // TODO: environment variables
-int elf32_load(elf32_load_data* data);
+int elf32_load(elf32_load_data& data);
+
+struct PACKED elf64_header {
+    // 0x7f, "ELF"
+    char magic[4];
+
+    enum : uint8_t {
+        FORMAT_32 = 1,
+        FORMAT_64 = 2,
+    } format;
+    enum : uint8_t {
+        ENDIAN_LITTLE = 1,
+        ENDIAN_BIG = 2,
+    } endian;
+    // should be 1
+    uint8_t _version1;
+    enum : uint8_t {
+        ABI_SYSTEM_V = 0x00,
+        // TODO:
+        ABI_LINUX = 0x03,
+    } abi;
+    uint8_t abi_version;
+    uint8_t _reserved[7];
+    enum : uint16_t {
+        ET_NONE = 0x00,
+        ET_REL = 0x01,
+        ET_EXEC = 0x02,
+        ET_DYN = 0x03,
+        ET_CORE = 0x04,
+        ET_LOOS = 0xfe00,
+        ET_HIOS = 0xfeff,
+        ET_LOPROC = 0xff00,
+        ET_HIPROC = 0xffff,
+    } type;
+    enum : uint16_t {
+        ARCH_NONE = 0x00,
+        ARCH_X86 = 0x03,
+        ARCH_ARM = 0x28,
+        ARCH_IA64 = 0x32,
+        ARCH_X86_64 = 0x3e,
+        ARCH_ARM64 = 0xb7,
+        ARCH_RISCV = 0xf3,
+    } arch;
+    // should be 1
+    uint32_t _version2;
+    // entry address
+    elf64_addr_t entry;
+    // program header table offset
+    elf64_off_t phoff;
+    // section header table offset
+    elf64_off_t shoff;
+    // architecture dependent flags
+    uint32_t flags;
+    // elf header size
+    uint16_t ehsize;
+    // program header table entry size
+    uint16_t phentsize;
+    // program header table entries number
+    uint16_t phnum;
+    // section header table entry size
+    uint16_t shentsize;
+    // section header table entries number
+    uint16_t shnum;
+    // section header table entry index that contains section names
+    uint16_t shstrndx;
+};
+
+struct PACKED elf64_program_header_entry {
+    enum : uint32_t {
+        PT_NULL = 0x00,
+        PT_LOAD = 0x01,
+        PT_DYNAMIC = 0x02,
+        PT_INTERP = 0x03,
+        PT_NOTE = 0x04,
+        PT_SHLIB = 0x05,
+        PT_PHDR = 0x06,
+        PT_TLS = 0x07,
+        PT_LOOS = 0x60000000,
+        PT_HIOS = 0x6fffffff,
+        PT_LIPROC = 0x70000000,
+        PT_HIPROC = 0x7fffffff,
+    } type;
+    // segment dependent
+    enum : uint32_t {
+        PF_X = 0x1,
+        PF_W = 0x2,
+        PF_R = 0x4,
+    } flags;
+    elf64_off_t offset;
+    elf64_addr_t vaddr;
+    elf64_addr_t paddr;
+    elf64_off_t filesz;
+    elf64_off_t memsz;
+    // 0 and 1 for no alignment, otherwise power of 2
+    uint64_t align;
+};
+
+// ELF64 section header table entry (Elf64_Shdr)
+struct PACKED elf64_section_header_entry {
+    uint32_t sh_name;
+    enum : uint32_t {
+        SHT_NULL = 0x00,
+        SHT_PROGBITS = 0x01,
+        SHT_RELA = 0x04,
+        SHT_DYNAMIC = 0x06,
+        SHT_NOTE = 0x07,
+        SHT_NOBITS = 0x08,
+        SHT_REL = 0x09,
+        SHT_DYNSYM = 0x0b,
+        SHT_INIT_ARRAY = 0x0e,
+        SHT_FINI_ARRAY = 0x0f,
+        // must differ from SHT_FINI_ARRAY: the ELF spec assigns 16 here
+        SHT_PREINIT_ARRAY = 0x10,
+    } sh_type;
+    enum : uint64_t {
+        SHF_WRITE = 0x01,
+        SHF_ALLOC = 0x02,
+        SHF_EXECINSTR = 0x04,
+    } sh_flags;
+    elf64_addr_t sh_addr;
+    elf64_off_t sh_offset;
+    elf64_off_t sh_size;
+    uint32_t sh_link;
+    uint32_t sh_info;
+    elf64_off_t sh_addralign;
+    elf64_off_t sh_entsize;
+};
+
+struct elf64_load_data {
+    const fs::dentry* exec_dent;
+    std::vector<std::string> argv;
+    std::vector<std::string> envp;
+    unsigned long ip;
+    unsigned long sp;
+};
 
 } // namespace types::elf

+ 19 - 8
include/types/hash_map.hpp

@@ -16,19 +16,30 @@ namespace types {
 
 // taken from linux
 constexpr uint32_t GOLDEN_RATIO_32 = 0x61C88647;
-// constexpr uint64_t GOLDEN_RATIO_64 = 0x61C8864680B583EBull;
+constexpr uint64_t GOLDEN_RATIO_64 = 0x61C8864680B583EBull;
 
-using hash_t = size_t;
+using hash_t = std::size_t;
 
 static inline constexpr hash_t _hash32(uint32_t val)
 {
     return val * GOLDEN_RATIO_32;
 }
 
-static inline constexpr hash_t hash32(uint32_t val, uint32_t bits)
+static inline constexpr hash_t hash32(uint32_t val, std::size_t bits)
 {
     // higher bits are more random
-    return _hash32(val) >> (32 - bits);
+    // the product computed by _hash32() is only 32 bits wide no matter how
+    // wide hash_t is, so the top `bits` bits are taken relative to 32;
+    // shifting by the width of hash_t would discard every significant bit
+    return static_cast<uint32_t>(_hash32(val)) >> (32 - bits);
+}
+
+static inline constexpr hash_t _hash64(uint64_t val)
+{
+    return val * GOLDEN_RATIO_64;
+}
+
+static inline constexpr hash_t hash64(uint64_t val, std::size_t bits)
+{
+    // higher bits are more random
+    return _hash64(val) >> (8 * sizeof(hash_t) - bits);
 }
 
 template <typename T>
@@ -36,17 +47,17 @@ constexpr bool is_c_string_v = std::is_same_v<std::decay_t<T>, char*>
     || std::is_same_v<std::decay_t<T>, const char*>;
 
 template <typename T,
-    std::enable_if_t<std::is_convertible_v<T, uint32_t>, bool> = true>
+    std::enable_if_t<std::is_convertible_v<T, uint64_t>, bool> = true>
 inline hash_t hash(T val, std::size_t bits)
 {
-    return hash32(static_cast<uint32_t>(val), bits);
+    return hash64(static_cast<uint64_t>(val), bits);
 }
 
 template <typename T,
     std::enable_if_t<std::is_pointer_v<T> && !is_c_string_v<T>, bool> = true>
 inline hash_t hash(T val, std::size_t bits)
 {
-    return hash32(std::bit_cast<uint32_t>(val), bits);
+    return hash(std::bit_cast<uintptr_t>(val), bits);
 }
 
 inline hash_t hash(const char* str, std::size_t bits)
@@ -57,7 +68,7 @@ inline hash_t hash(const char* str, std::size_t bits)
         while (*str)
             hash = hash * seed + (*str++);
 
-        return hash32(hash, bits);
+        return hash64(hash, bits);
 };
 
 template <template <typename, typename, typename> typename String,

+ 43 - 0
include/types/list.hpp

@@ -0,0 +1,43 @@
+#pragma once
+
+namespace types::list {
+
+template <typename ListNode>
+void list_insert(ListNode** head, ListNode* node)
+{
+    node->prev = nullptr;
+    node->next = *head;
+    if (*head)
+        (*head)->prev = node;
+    *head = node;
+}
+
+// Detach the first node of the list and return it, or return nullptr
+// if the list is empty.
+template <typename ListNode>
+ListNode* list_get(ListNode** head)
+{
+    ListNode* node = *head;
+    if (node) {
+        *head = node->next;
+
+        // the new head must not keep pointing back at the detached node:
+        // list_remove() treats prev == nullptr as "this node is the head"
+        if (*head)
+            (*head)->prev = nullptr;
+
+        node->next = nullptr;
+        node->prev = nullptr;
+    }
+    return node;
+}
+
+template <typename ListNode>
+void list_remove(ListNode** head, ListNode* node)
+{
+    if (node->prev)
+        node->prev->next = node->next;
+    else
+        *head = node->next;
+
+    if (node->next)
+        node->next->prev = node->prev;
+
+    node->next = nullptr;
+    node->prev = nullptr;
+}
+
+} // namespace types::list

+ 0 - 22
include/types/size.h

@@ -1,22 +0,0 @@
-#pragma once
-
-#include "stdint.h"
-
-#ifdef __GNUC__
-#define PACKED __attribute__((__packed__))
-#else
-#error "no definition for ((PACKED))"
-#endif
-
-#define __32bit_system
-
-#ifdef __32bit_system
-typedef uint32_t ptr_t;
-typedef int32_t diff_t;
-#elif
-typedef uint64_t ptr_t;
-typedef int64_t diff_t;
-#endif
-
-typedef ptr_t pptr_t;
-typedef ssize_t page_t;

+ 0 - 4
include/types/status.h

@@ -1,4 +0,0 @@
-#pragma once
-
-#define GB_OK (0)
-#define GB_FAILED (1)

+ 6 - 2
include/types/types.h

@@ -1,7 +1,5 @@
 #pragma once
 
-#include "size.h"
-#include "status.h"
 #include "stdint.h"
 
 #define __user
@@ -18,6 +16,12 @@
 #error "no definition for ((SECTION))"
 #endif
 
+#ifdef __GNUC__
+#define PACKED __attribute__((__packed__))
+#else
+#error "no definition for ((PACKED))"
+#endif
+
 #ifdef __GNUC__
 #define likely(expr) (__builtin_expect(!!(expr), 1))
 #define unlikely(expr) (__builtin_expect(!!(expr), 0))

+ 16 - 0
include/types/user_types.hpp

@@ -0,0 +1,16 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <types/types.h>
+
+namespace types {
+
+using ptr32_t = uint32_t;
+
+struct iovec32 {
+    ptr32_t iov_base;
+    ptr32_t iov_len;
+};
+
+} // namespace types

+ 1 - 1
init_script.sh

@@ -19,7 +19,7 @@ export PATH="/bin"
 echo ok > /dev/console
 
 mkdir -p /etc /root /proc
-mount -t proc proc proc
+mount -t procfs proc proc
 
 cat > /etc/passwd <<EOF
 root:x:0:0:root:/root:/mnt/busybox sh

+ 7 - 22
pretty-print.py

@@ -94,31 +94,16 @@ class stringPrinter:
         self.val = val
 
     def to_string(self):
-        return self.val['m_data']
+        if self.val['m_data']['stackdata']['end'] == 0:
+            return self.val['m_data']['stackdata']['str'].string()
+        return self.val['m_data']['heapdata']['m_ptr'].string()
+
+    def num_children(self):
+        return 0
 
     def display_hint(self):
         return 'string'
 
-    def children(self):
-        return
-
-        yield 'str', self.val['m_data']
-
-        if self.val['m_data'] == 0:
-            return
-
-        yield 'length', self.val['m_size'] - 1
-
-        ptr = self.val['m_data']
-        i = 0
-
-        while ptr.dereference() != 0:
-            yield '[%d]' % i, ptr.dereference()
-            ptr += 1
-            i += 1
-
-        yield '[%d]' % i, 0
-
 class listPrinter:
     def __init__(self, val):
         self.val: gdb.Field = val
@@ -292,7 +277,7 @@ def build_pretty_printer(val):
     if re.compile(r"^std::impl::rbtree<.*, .*, .*>::_iterator<.*?>$").match(typename):
         return rbtreeIteratorPrinter(val)
 
-    if re.compile(r"^types::string<.*>$").match(typename):
+    if re.compile(r"^std::basic_string<.*>$").match(typename):
         return stringPrinter(val)
 
     return None

+ 105 - 247
src/asm/interrupt.s

@@ -1,264 +1,122 @@
-.code32
-
 .text
 
-# TODO: stack alignment
-.globl int6
-.type  int6 @function
-int6:
-    pushal
-    call int6_handler
-    popal
-
-    iret
-
-# TODO: stack alignment
-.globl int8
-.type  int8 @function
-int8:
-    nop
-    iret
-
-# TODO: stack alignment
-.globl int13
-.type  int13 @function
-int13:
-    pushal
-    call int13_handler
-    popal
-
-# remove the 32bit error code from stack
-    addl $4, %esp
-    iret
-
-.globl int14
-.type  int14 @function
-int14:
-    # push general purpose registers
-    pushal
-
-    # save %cr2
-    movl %cr2, %eax
-    pushl %eax
-
-    # save current esp (also pointer to struct int14_data)
-    mov %esp, %ebx
-
-    # allocate space for mmx registers and argument
-    subl $0x210, %esp
-
-    # align stack to 16byte boundary
-    and $0xfffffff0, %esp
-
-    # save mmx registers
-    fxsave 16(%esp)
-
-    # push (interrupt_stack*)data
-    mov %ebx, (%esp)
-
-    call int14_handler
-
-    # restore mmx registers
-    fxrstor 16(%esp)
-
-    # restore stack and general purpose registers
-    leal 4(%ebx), %esp
-    popal
-
-# remove the 32bit error code from stack
-    addl $4, %esp
-    iret
-
-.globl irq0
-irq0:
-    pushal
-    mov $0, %eax
-    jmp irqstub
-.globl irq1
-irq1:
-    pushal
-    mov $1, %eax
-    jmp irqstub
-.globl irq2
-irq2:
-    pushal
-    mov $2, %eax
-    jmp irqstub
-.globl irq3
-irq3:
-    pushal
-    mov $3, %eax
-    jmp irqstub
-.globl irq4
-irq4:
-    pushal
-    mov $4, %eax
-    jmp irqstub
-.globl irq5
-irq5:
-    pushal
-    mov $5, %eax
-    jmp irqstub
-.globl irq6
-irq6:
-    pushal
-    mov $6, %eax
-    jmp irqstub
-.globl irq7
-irq7:
-    pushal
-    mov $7, %eax
-    jmp irqstub
-.globl irq8
-irq8:
-    pushal
-    mov $8, %eax
-    jmp irqstub
-.globl irq9
-irq9:
-    pushal
-    mov $9, %eax
-    jmp irqstub
-.globl irq10
-irq10:
-    pushal
-    mov $10, %eax
-    jmp irqstub
-.globl irq11
-irq11:
-    pushal
-    mov $11, %eax
-    jmp irqstub
-.globl irq12
-irq12:
-    pushal
-    mov $12, %eax
-    jmp irqstub
-.globl irq13
-irq13:
-    pushal
-    mov $13, %eax
-    jmp irqstub
-.globl irq14
-irq14:
-    pushal
-    mov $14, %eax
-    jmp irqstub
-.globl irq15
-irq15:
-    pushal
-    mov $15, %eax
-    jmp irqstub
-
-.globl irqstub
-irqstub:
-    # save current esp
-    mov %esp, %ebx
-
-    # align stack to 16byte boundary
-    and $0xfffffff0, %esp
-
-    # save mmx registers
-    sub $(512 + 16), %esp
-    fxsave 16(%esp)
-
-    # save irq number and pointers to context and mmx registers
-    mov %eax, (%esp)  # irq number
-    mov %ebx, 4(%esp) # pointer to context
-    lea 16(%esp), %eax
-    mov %eax, 8(%esp) # pointer to mmx registers
-
-    call irq_handler
-
-    # restore mmx registers
-    fxrstor 16(%esp)
-
-    # restore stack and general purpose registers
-    mov %ebx, %esp
-    popal
-
-    iret
-
-.globl syscall_stub
-.type  syscall_stub @function
-syscall_stub:
-    pushal
-
-    # save current esp
-    mov %esp, %ebx
-
-    # stack alignment
-    and $0xfffffff0, %esp
-
-    # save mmx registers
-    sub $(512 + 16), %esp
-    fxsave 16(%esp)
-
-    # save pointers to context and mmx registers
-    mov %ebx, (%esp) # pointer to context
-    lea 16(%esp), %eax
-    mov %eax, 4(%esp) # pointer to mmx registers
-
-    call syscall_entry
-
-    # restore mmx registers
-    fxrstor 16(%esp)
-
-    # restore stack
-    mov %ebx, %esp
-
-.globl _syscall_stub_fork_return
-.type  _syscall_stub_fork_return @function
-_syscall_stub_fork_return:
-    popal
-    iret
+.extern after_ctx_switch
+.globl ISR_stub_restore
+
+ISR_stub:
+	sub $0x78, %rsp
+	mov %rax,  0x00(%rsp)
+	mov %rbx,  0x08(%rsp)
+	mov %rcx,  0x10(%rsp)
+	mov %rdx,  0x18(%rsp)
+	mov %rdi,  0x20(%rsp)
+	mov %rsi,  0x28(%rsp)
+	mov %r8,   0x30(%rsp)
+	mov %r9,   0x38(%rsp)
+	mov %r10,  0x40(%rsp)
+	mov %r11,  0x48(%rsp)
+	mov %r12,  0x50(%rsp)
+	mov %r13,  0x58(%rsp)
+	mov %r14,  0x60(%rsp)
+	mov %r15,  0x68(%rsp)
+	mov %rbp,  0x70(%rsp)
+
+	mov 0x78(%rsp), %rax
+	sub $ISR0, %rax
+	shr $3, %rax
+	mov %rax, 0x78(%rsp)
+
+	mov %rsp, %rbx
+	and $~0xf, %rsp
+
+	sub $512, %rsp
+	fxsave (%rsp)
+
+	mov %rbx, %rdi
+	mov %rsp, %rsi
+	call interrupt_handler
+
+ISR_stub_restore:
+	fxrstor (%rsp)
+	mov %rbx, %rsp
+
+	mov 0x00(%rsp), %rax
+	mov 0x08(%rsp), %rbx
+	mov 0x10(%rsp), %rcx
+	mov 0x18(%rsp), %rdx
+	mov 0x20(%rsp), %rdi
+	mov 0x28(%rsp), %rsi
+	mov 0x30(%rsp), %r8
+	mov 0x38(%rsp), %r9
+	mov 0x40(%rsp), %r10
+	mov 0x48(%rsp), %r11
+	mov 0x50(%rsp), %r12
+	mov 0x58(%rsp), %r13
+	mov 0x60(%rsp), %r14
+	mov 0x68(%rsp), %r15
+	mov 0x70(%rsp), %rbp
+
+	mov 0x78(%rsp), %rsp
+	iretq
 
 # parameters
-# #1: esp* curr_esp
-# #2: esp* next_esp
+# #1: sp* current_task_sp
+# #2: sp* target_task_sp
 .globl asm_ctx_switch
 .type  asm_ctx_switch @function
 asm_ctx_switch:
-    movl 4(%esp), %ecx
-    movl 8(%esp), %eax
+    pushf
+	sub $0x38, %rsp  # extra 8 bytes to align to 16 bytes
 
-    push $_ctx_switch_return
-    push %ebx
-    push %edi
-    push %esi
-    push %ebp
-    pushfl
+    mov %rbx, 0x08(%rsp)
+	mov %rbp, 0x10(%rsp)
+	mov %r12, 0x18(%rsp)
+	mov %r13, 0x20(%rsp)
+	mov %r14, 0x28(%rsp)
+	mov %r15, 0x30(%rsp)
 
-    # push esp to restore
-    pushl (%ecx)
+    push (%rdi) 	 # save sp of previous stack frame of current
+	                 # acts as saving bp
 
-    mov %esp, (%ecx)
-    mov (%eax), %esp
+    mov %rsp, (%rdi) # save sp of current stack
+    mov (%rsi), %rsp # load sp of target stack
 
-    # restore esp
-    popl (%eax)
+    pop (%rsi)       # load sp of previous stack frame of target
+	                 # acts as restoring previous bp
 
-    popfl
-    pop %ebp
-    pop %esi
-    pop %edi
-    pop %ebx
+	pop %rax         # align to 16 bytes
 
-    ret
+	call after_ctx_switch
 
-_ctx_switch_return:
-    ret
+	mov 0x28(%rsp), %r15
+	mov 0x20(%rsp), %r14
+	mov 0x18(%rsp), %r13
+	mov 0x10(%rsp), %r12
+	mov 0x08(%rsp), %rbp
+    mov 0x00(%rsp), %rbx
 
-.section .text.kinit
+	add $0x30, %rsp
+    popf
 
-.globl asm_load_idt
-.type  asm_load_idt @function
-asm_load_idt:
-    movl 4(%esp), %edx
-    lidt (%edx)
-    movl 8(%esp), %edx
-    cmpl $0, %edx
-    je asm_load_idt_skip
-    sti
-asm_load_idt_skip:
     ret
+
+.altmacro
+.macro build_isr name
+	.align 8
+	ISR\name:
+		call ISR_stub
+.endm
+
+.set i, 0
+.rept 0x80+1
+	build_isr %i
+	.set i, i+1
+.endr
+
+.section .rodata
+
+.align 8
+.globl ISR_START_ADDR
+.type  ISR_START_ADDR @object
+ISR_START_ADDR:
+	.quad ISR0

+ 0 - 56
src/asm/port_io.s

@@ -1,56 +0,0 @@
-.code32
-
-.text
-
-.globl asm_outb
-.type  asm_outb @function
-asm_outb:
-    pushl %eax
-    pushl %edx
-    movw 12(%esp), %dx
-    movb 16(%esp), %al
-    outb %al, %dx
-    popl %edx
-    popl %eax
-    ret
-
-.globl asm_inb
-.type  asm_inb @function
-asm_inb:
-    pushl %edx
-    movw 8(%esp), %dx
-    inb %dx, %al
-    popl %edx
-    ret
-
-.globl asm_hlt
-.type  asm_hlt @function
-asm_hlt:
-    hlt
-    ret
-
-.globl asm_cli
-.type  asm_cli @function
-asm_cli:
-    cli
-    ret
-
-.globl asm_sti
-.type  asm_sti @function
-asm_sti:
-    sti
-    ret
-
-.section .text.kinit
-.globl asm_enable_sse
-.type  asm_enable_sse @function
-asm_enable_sse:
-	movl %cr0, %eax
-    andl $0xfffffff3, %eax
-	orl $0b100010, %eax
-	movl %eax, %cr0
-	movl %cr4, %eax
-	orl $0b11000000000, %eax
-	movl %eax, %cr4
-    fninit
-	ret

+ 0 - 53
src/asm/sys.s

@@ -1,53 +0,0 @@
-.code32
-
-.text
-
-.global asm_switch_pd
-.type   asm_switch_pd @function
-asm_switch_pd:
-    movl 4(%esp), %eax
-    shll $12, %eax
-    movl %eax, %cr3
-    ret
-
-.global current_pd
-.type   current_pd @function
-current_pd:
-    movl %cr3, %eax
-    ret
-
-.section .text.kinit
-
-.global asm_enable_paging
-.type   asm_enable_paging @function
-asm_enable_paging:
-    cli
-    // page directory address
-    movl 4(%esp), %eax
-    movl %eax, %cr3
-
-    movl %cr0, %eax
-    // SET PE, WP, PG
-    orl $0x80010001, %eax
-    movl %eax, %cr0
-
-    ret
-
-.global asm_load_gdt
-.type   asm_load_gdt @function
-asm_load_gdt:
-    cli
-    leal 6(%esp), %eax
-    lgdt (%eax)
-    ljmp $0x08, $_asm_load_gdt_fin
-_asm_load_gdt_fin:
-    ret
-
-.global asm_load_tr
-.type   asm_load_tr @function
-asm_load_tr:
-    cli
-    movl 4(%esp), %eax
-    orl $0, %eax
-    ltr %ax
-    ret

+ 178 - 287
src/boot.s

@@ -1,294 +1,185 @@
 .section .stage1
-.code16
-loader_start:
-# set segment registers
-    movw %cs, %ax
-    movw %ax, %ds
-
-_clear_screen:
-    mov $0x00, %ah
-    mov $0x03, %al
-    int $0x10
-
-# get memory size info and storage it
-_get_memory_size:
-    xorw %cx, %cx
-    xorw %dx, %dx
-    movw $0xe801, %ax
-
-    int $0x15
-    jc _get_memory_size_error
-
-    cmpb $0x86, %ah # unsupported function
-    je _get_memory_size_error
-    cmpb $0x80, %ah # invalid command
-    je _get_memory_size_error
-
-    jcxz _get_memory_size_use_ax
-    movw %cx, %ax
-    movw %dx, %bx
-
-_get_memory_size_use_ax:
-    movl $asm_mem_size_info, %edx
-    movw %ax, (%edx)
-    addw $2, %dx
-    movw %bx, (%edx)
-    jmp _e820_mem_map_load
-
-_get_memory_size_error:
-    xchgw %bx, %bx
-    jmp __stage1_halt
-
-_e820_mem_map_load:
-    addl $4, %esp
-    movl $0, (%esp)
-
-    # save the destination address to es:di
-    movw %cs, %ax
-    movw %ax, %es
-
-    movl $asm_e820_mem_map, %edi
-
-    # clear ebx
-    xorl %ebx, %ebx
-
-    # set the magic number to edx
-    movl $0x534D4150, %edx
-
-_e820_mem_map_load_loop:
-    # set function number to eax
-    movl $0xe820, %eax
-
-    # set default entry size
-    movl $24, %ecx
-
-    int $0x15
-
-    incl (%esp)
-    addl %ecx, %edi
-
-    jc _e820_mem_map_load_fin
-    cmpl $0, %ebx
-    jz _e820_mem_map_load_fin
-    jmp _e820_mem_map_load_loop
-
-_e820_mem_map_load_fin:
-    movl (%esp), %eax
-    movl $asm_e820_mem_map_count, %edi
-    movl %eax, (%edi)
-
-    movl $asm_e820_mem_map_entry_size, %edi
-    movl %ecx, (%edi)
-
-    jmp _load_gdt
-
-_load_gdt:
-    cli
-    lgdt asm_gdt_descriptor
-
-# enable protection enable (PE) bit
-    movl %cr0, %eax
-    orl $1, %eax
-    movl %eax, %cr0
-
-    ljmp $0x08, $start_32bit
-
 .code32
-
+.globl start_32bit
 start_32bit:
-    movw $0x10, %ax
-    movw %ax, %ds
-    movw %ax, %es
-    movw %ax, %fs
-    movw %ax, %gs
-    movw %ax, %ss
-
-    movl $0, %esp
-    movl $0, %ebp
-
-setup_early_kernel_page_table:
-# memory map:
-# 0x0000-0x1000: empty page
-# 0x1000-0x2000: early kernel pd
-# 0x2000-0x6000: 4 pts
-# 0x6000-0x8000: early kernel stack
-# so we fill the first 8KiB with zero
-    movl $0x00000000, %eax
-    movl $0x8000, %ecx
-
-_fill_zero:
-    cmpl $0, %ecx
-    jz _fill_zero_end
-    subl $4, %ecx
-    movl $0, (%eax)
-    addl $4, %eax
-    jmp _fill_zero
-_fill_zero_end:
-
-# pt#0: 0x00000000 to 0x00400000
-    movl $0x00001000, %eax
-    movl $0x00002003, (%eax)
-# pt#1: 0xc0000000 to 0xc0400000
-    movl $0x00001c00, %eax
-    movl $0x00003003, (%eax)
-# pt#2: 0xff000000 to 0xff400000
-    movl $0x00001ff0, %eax
-    movl $0x00004003, (%eax)
-# pt#3: 0xffc00000 to 0xffffffff
-    movl $0x00001ffc, %eax
-    movl $0x00005003, (%eax)
-
-# map early kernel page directory to 0xff000000
-    movl $0x00004000, %eax
-    movl $0x00001003, (%eax)
-
-# map kernel pt#2 to 0xff001000
-    movl $0x00004004, %eax
-    movl $0x00004003, (%eax)
-
-# map __stage1_start ---- __kinit_end identically
-    movl $__stage1_start, %ebx
-    movl $__kinit_end, %ecx
-    movl %ebx, %edx
-    shrl $12, %edx
-    andl $0x3ff, %edx
-
-
-__map_stage1_kinit:
-    leal 3(%ebx), %eax
-    movl %eax, 0x00002000(, %edx, 4)
-    addl $0x1000, %ebx
-    incl %edx
-    cmpl %ebx, %ecx
-    jne __map_stage1_kinit
-
-# map __text_start ---- __data_end to 0xc0000000
-    movl %ecx, %ebx
-    movl $__text_start, %edx
-    shrl $12, %edx
-    andl $0x3ff, %edx
-
-    movl $__data_end, %ecx
-    subl $__text_start, %ecx
-    addl %ebx, %ecx
-
-__map_kernel_space:
-    leal 3(%ebx), %eax
-    movl %eax, 0x00003000(, %edx, 4)
-    addl $0x1000, %ebx
-    incl %edx
-    cmpl %ebx, %ecx
-    jne __map_kernel_space
-
-# map __data_end ---- __bss_end from 0x100000
-    movl $0x100000, %ebx
-    movl $__bss_end, %ecx
-    subl $__data_end, %ecx
-    addl %ebx, %ecx
-
-__map_kernel_bss:
-    leal 3(%ebx), %eax
-    movl %eax, 0x00003000(, %edx, 4)
-    addl $0x1000, %ebx
-    incl %edx
-    cmpl %ebx, %ecx
-    jne __map_kernel_bss
-
-# map kernel stack 0xffffe000-0xffffffff
-    movl $0x6000, %ebx
-    movl $0x8000, %ecx
-    movl $0x0ffffe, %edx
-    andl $0x3ff, %edx
-
-__map_kernel_stack:
-    leal 3(%ebx), %eax
-    movl %eax, 0x00005000(, %edx, 4)
-    addl $0x1000, %ebx
-    incl %edx
-    cmpl %ebx, %ecx
-    jne __map_kernel_stack
-
-load_early_kernel_page_table:
-    movl $0x00001000, %eax
-    movl %eax, %cr3
-
-    movl %cr0, %eax
+    mov $0x10, %ax
+    mov %ax, %ds
+    mov %ax, %es
+    mov %ax, %fs
+    mov %ax, %gs
+    mov %ax, %ss
+
+    cld
+    xor %eax, %eax
+
+    # clear paging structures
+    mov $0x100000, %edi
+    mov %edi, %ecx
+    shr $2, %ecx # %ecx /= 4
+    rep stosl
+
+    # set P, RW, G
+    mov $0x00000103, %ebx
+	xor %edx, %edx
+    mov $0x00101000, %esi
+
+    # PML4E 0x000
+    # we need the first 1GB identically mapped
+    # so that we won't trigger a triple fault after
+    # enabling paging
+	lea -0x1000(%esi), %edi # %edi = 0x100000
+    call fill_pxe
+
+    # PML4E 0xff0
+	mov $0x80000000, %edx
+	lea 0xff0(%edi), %edi
+	call fill_pxe
+	xor %edx, %edx
+
+    # setup PDPT for physical memory mapping
+    mov %esi, %edi
+
+    # set PS
+    or $0x00000080, %ebx
+    mov $256, %ecx
+    xor %esi, %esi
+_fill_loop1:
+    call fill_pxe
+    lea 8(%edi), %edi
+    add $0x40000000, %esi # 1GB
+    adc $0, %edx
+    loop _fill_loop1
+
+	mov $0x80000000, %edx
+
+    # set PCD, PWT
+    or $0x00000018, %ebx
+    mov $256, %ecx
+    xor %esi, %esi
+_fill_loop2:
+    call fill_pxe
+    lea 8(%edi), %edi
+    add $0x40000000, %esi # 1GB
+    adc $0, %edx
+    loop _fill_loop2
+
+	xor %edx, %edx
+
+    # PML4E 0xff8
+    mov %edi, %esi # 0x102000
+    mov $0x100ff8, %edi
+    # clear PCD, PWT, PS
+    and $(~0x00000098), %ebx
+    call fill_pxe
+
+    # PDPTE 0xff8
+    lea 0xff8(%esi), %edi  # 0x102ff8
+    lea 0x1000(%esi), %esi # 0x103000
+    call fill_pxe
+
+    # PDE 0xff0
+    lea 0xff0(%esi), %edi  # 0x103ff0
+    lea 0x1000(%esi), %esi # 0x104000
+    call fill_pxe
+
+    # fill PT (kernel image)
+    # one entry per 4KiB page, starting from physical address 0x2000
+    mov %esi, %edi # 0x104000
+    mov $0x2000, %esi
+
+    # %ecx is the iteration count consumed by `loop' below
+.extern KIMAGE_PAGES
+    mov $KIMAGE_PAGES, %ecx
+
+_fill_loop3:
+    call fill_pxe
+    lea 8(%edi), %edi
+	lea 0x1000(%esi), %esi
+    loop _fill_loop3
+
+    # set msr
+    mov $0xc0000080, %ecx
+    rdmsr
+    or $0x901, %eax # set LME, NXE, SCE
+    wrmsr
+
+    # set cr4
+    mov %cr4, %eax
+    or $0xa0, %eax # set PAE, PGE
+    mov %eax, %cr4
+
+    # load new page table
+	xor %eax, %eax
+	inc %eax
+	shl $20, %eax # %eax = 0x100000
+    mov %eax, %cr3
+
+    mov %cr0, %eax
     // SET PE, WP, PG
-    orl $0x80010001, %eax
-    movl %eax, %cr0
-
-# set stack pointer and clear stack bottom
-    movl $0xfffffff0, %esp
-    movl $0xfffffff0, %ebp
-
-    movl $0x00, (%esp)
-    movl $0x00, 4(%esp)
-    movl $0x00, 8(%esp)
-    movl $0x00, 12(%esp)
+    or $0x80010001, %eax
+    mov %eax, %cr0
+
+    # create gdt
+	xor %eax, %eax # at 0x0000
+	mov %eax, 0x00(%eax)
+	mov %eax, 0x04(%eax) # null descriptor
+	mov %eax, 0x08(%eax) # code segment lower
+	mov %eax, 0x10(%eax) # data segment lower
+	mov $0x00209a00, %ecx
+	mov %ecx, 0x0c(%eax) # code segment higher
+	mov $0x00009200, %ecx
+	mov %ecx, 0x14(%eax) # data segment higher
+
+    # gdt descriptor
+	push %eax
+	push %eax
+
+    # pad with a word
+	mov $0x00170000, %eax
+	push %eax
+
+	lgdt 2(%esp)
+	add $12, %esp
+
+    ljmp $0x08, $_64bit_entry
+
+# %ebx: attribute low
+# %edx: attribute high
+# %esi: page physical address
+# %edi: page x entry address
+fill_pxe:
+    lea (%ebx, %esi, 1), %eax
+    mov %eax, (%edi)
+    mov %edx, 4(%edi)
+
+    ret
+
+.code64
+_64bit_entry:
+	jmp start_64bit
+
+.section .text.kinit
+start_64bit:
+    # set stack pointer and clear stack bottom
+	movzw %sp, %rdi
+	xor %rsp, %rsp
+	inc %rsp
+	neg %rsp
+	shr $40, %rsp
+	shl $40, %rsp
+
+	add %rdi, %rsp
+	mov %rsp, %rdi
+
+    # make stack frame
+	lea -16(%rsp), %rsp
+	mov %rsp, %rbp
+
+	xor %rax, %rax
+	mov %rax, (%rsp)
+	mov %rax, 8(%rsp)
 
     call kernel_init
 
-__stage1_halt:
-    hlt
-    jmp __stage1_halt
-
-asm_gdt_descriptor:
-    .word (5 * 8) - 1 # size
-    .long asm_gdt_table  # address
-asm_gdt_table:
-    .8byte 0         # null descriptor
-
-    # kernel code segment
-    .word 0xffff     # limit 0 :15
-    .word 0x0000     # base  0 :15
-    .byte 0x00       # base  16:23
-    .byte 0x9a       # access
-    .byte 0b11001111 # flag and limit 16:20
-    .byte 0x00       # base 24:31
-
-    # kernel data segment
-    .word 0xffff     # limit 0 :15
-    .word 0x0000     # base  0 :15
-    .byte 0x00       # base  16:23
-    .byte 0x92       # access
-    .byte 0b11001111 # flag and limit 16:20
-    .byte 0x00       # base 24:31
-
-    # user code segment
-    .word 0xffff     # limit 0 :15
-    .word 0x0000     # base  0 :15
-    .byte 0x00       # base  16:23
-    .byte 0xfa       # access
-    .byte 0b11001111 # flag and limit 16:20
-    .byte 0x00       # base 24:31
-
-    # user data segment
-    .word 0xffff     # limit 0 :15
-    .word 0x0000     # base  0 :15
-    .byte 0x00       # base  16:23
-    .byte 0xf2       # access
-    .byte 0b11001111 # flag and limit 16:20
-    .byte 0x00       # base 24:31
-
-.globl asm_mem_size_info
-.type  asm_mem_size_info @object
-.size  asm_mem_size_info, (.-asm_mem_size_info)
-asm_mem_size_info:
-    .word 0x12
-    .word 0x34
-
-.globl asm_e820_mem_map
-.type  asm_e820_mem_map @object
-.size  asm_e820_mem_map, (.-asm_e820_mem_map)
-asm_e820_mem_map:
-    .space 1024
-
-.globl asm_e820_mem_map_count
-.type  asm_e820_mem_map_count @object
-asm_e820_mem_map_count:
-    .long 0
-
-.globl asm_e820_mem_map_entry_size
-.type  asm_e820_mem_map_entry_size @object
-asm_e820_mem_map_entry_size:
-    .long 0
+_64bit_hlt:
+	cli
+	hlt
+	jmp _64bit_hlt

+ 29 - 51
src/fs/fat.cpp

@@ -7,11 +7,10 @@
 #include <stdio.h>
 
 #include <types/allocator.hpp>
-#include <types/status.h>
 
 #include <fs/fat.hpp>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
 #include <kernel/module.hpp>
 #include <kernel/vfs.hpp>
 
@@ -70,14 +69,15 @@ char* fat32::read_cluster(cluster_t no)
         ++buf.ref;
         return buf.data;
     }
-    auto* data = new char[sectors_per_cluster * SECTOR_SIZE];
+    // TODO: page buffer class
+    using namespace kernel::mem;
+    using namespace paging;
+    assert(sectors_per_cluster * SECTOR_SIZE <= 0x1000);
+
+    char* data = physaddr<char>{page_to_pfn(alloc_page())};
     _raw_read_cluster(data, no);
-    buf.emplace(no,
-        buf_object {
-            data,
-            1,
-            // false,
-        });
+    buf.emplace(no, buf_object { data, 1 });
+
     return data;
 }
 
@@ -142,7 +142,7 @@ int fat32::readdir(fs::inode* dir, size_t offset, const fs::vfs::filldir_func& f
             }
             auto ret = filldir(fname.c_str(), 0, ind, ind->mode & S_IFMT);
 
-            if (ret != GB_OK) {
+            if (ret != 0) {
                 release_cluster(next);
                 return nread;
             }
@@ -210,51 +210,29 @@ fat32::fat32(dev_t _device)
 
 size_t fat32::read(inode* file, char* buf, size_t buf_size, size_t offset, size_t n)
 {
-    cluster_t next = cl(file);
     uint32_t cluster_size = SECTOR_SIZE * sectors_per_cluster;
     size_t orig_n = n;
 
-    do {
-        if (offset == 0) {
-            if (n > cluster_size) {
-                auto* data = read_cluster(next);
-                memcpy(buf, data, cluster_size);
-                release_cluster(next);
-
-                buf_size -= cluster_size;
-                buf += cluster_size;
-                n -= cluster_size;
-            } else {
-                auto* data = read_cluster(next);
-                auto read = _write_buf_n(buf, buf_size, data, n);
-                release_cluster(next);
+    for (cluster_t cno = cl(file); n && cno < EOC; cno = fat[cno]) {
+        if (offset >= cluster_size) {
+            offset -= cluster_size;
+            continue;
+        }
 
-                return orig_n - n + read;
-            }
-        } else {
-            if (offset > cluster_size) {
-                offset -= cluster_size;
-            } else {
-                auto* data = read_cluster(next);
+        auto* data = read_cluster(cno);
+        data += offset;
 
-                auto to_read = cluster_size - offset;
-                if (to_read > n)
-                    to_read = n;
+        auto to_copy = std::min(n, cluster_size - offset);
+        auto ncopied = _write_buf_n(buf, buf_size, data, to_copy);
 
-                auto read = _write_buf_n(buf, buf_size, data + offset, to_read);
-                buf += read;
-                n -= read;
+        buf += ncopied, n -= ncopied;
 
-                release_cluster(next);
-                if (read != to_read) {
-                    return orig_n - n;
-                }
+        release_cluster(cno);
+        if (ncopied != to_copy)
+            break;
 
-                offset = 0;
-            }
-        }
-        next = fat[next];
-    } while (n && next < EOC);
+        offset = 0;
+    }
 
     return orig_n - n;
 }
@@ -268,7 +246,7 @@ int fat32::inode_statx(dentry* ent, statx* st, unsigned int mask)
     }
 
     if (mask & STATX_BLOCKS) {
-        st->stx_blocks = align_up<12>(ent->ind->size) / 512;
+        st->stx_blocks = ((ent->ind->size + 0xfff) & ~0xfff) / 512;
         st->stx_blksize = 4096;
         st->stx_mask |= STATX_BLOCKS;
     }
@@ -304,7 +282,7 @@ int fat32::inode_statx(dentry* ent, statx* st, unsigned int mask)
         st->stx_mask |= STATX_GID;
     }
 
-    return GB_OK;
+    return 0;
 }
 
 int fat32::inode_stat(dentry* dent, struct stat* st)
@@ -319,7 +297,7 @@ int fat32::inode_stat(dentry* dent, struct stat* st)
     st->st_blksize = 4096;
     st->st_blocks = (ind->size + 511) / 512;
     st->st_ino = ind->ino;
-    return GB_OK;
+    return 0;
 }
 
 static fat32* create_fat32(const char* source, unsigned long, const void*)

+ 26 - 3
src/fs/procfs.cc

@@ -5,9 +5,9 @@
 #include <sys/mount.h>
 #include <unistd.h>
 
-#include <types/status.h>
-
+#include <kernel/hw/timer.hpp>
 #include <kernel/module.hpp>
+#include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/vfs.hpp>
 
@@ -64,6 +64,28 @@ static ssize_t mounts_read(char* page, size_t n)
     return orig_n - n;
 }
 
+// Render the "schedstat" procfs file into `page` (at most `n` bytes).
+//
+// Output: the current timer tick count on the first line, then one line per
+// thread of every process: proc.first, thd.tid() (hex) and thd.elected_times.
+// Returns the number of characters produced (terminating NUL not counted).
+// If the buffer is too small the output is truncated, never overrun.
+static ssize_t schedstat_read(char* page, size_t n)
+{
+    auto orig_n = n;
+
+    if (n == 0)
+        return n;
+
+    // snprintf reports the length the text *would* have had, which is >= n
+    // when the output was truncated. Clamp it so that the unsigned `n` never
+    // wraps around and `page` never moves past the end of the buffer.
+    // Returns false when formatting failed or the buffer is exhausted.
+    auto consume = [&](int nw) -> bool {
+        if (nw < 0)
+            return false;
+
+        size_t written = nw;
+        bool truncated = written >= n;
+        if (truncated)
+            written = n - 1; // only n-1 characters plus the NUL were stored
+
+        n -= written, page += written;
+        return !truncated;
+    };
+
+    // current_ticks() yields a size_t; convert explicitly so the argument
+    // really matches the "%d" conversion.
+    int ticks = static_cast<int>(kernel::hw::timer::current_ticks());
+    if (!consume(snprintf(page, n, "%d\n", ticks)))
+        return orig_n - n;
+
+    for (const auto& proc : *procs) {
+        for (const auto& thd : proc.second.thds) {
+            int nwrote = snprintf(page, n, "%d %x %d\n", proc.first, thd.tid(), thd.elected_times);
+
+            if (!consume(nwrote))
+                return orig_n - n;
+        }
+    }
+
+    return orig_n - n;
+}
+
 namespace fs::proc {
 
 struct proc_file {
@@ -107,6 +129,7 @@ public:
         auto* ind = cache_inode(0, 0, S_IFDIR | 0777, 0, 0);
 
         create_file("mounts", mounts_read, nullptr);
+        create_file("schedstat", schedstat_read, nullptr);
 
         register_root_node(ind);
     }
@@ -160,7 +183,7 @@ public:
         for (const auto& [ ino, pf ] : files) {
             auto* ind = get_inode(ino);
             int ret = callback(pf.name.c_str(), 0, ind, ind->mode);
-            if (ret != GB_OK)
+            if (ret != 0)
                 return -EIO;
             ++nread;
         }

+ 62 - 52
src/kernel.ld

@@ -1,36 +1,56 @@
-OUTPUT_FORMAT(elf32-i386)
-OUTPUT_ARCH(i386:i386)
+OUTPUT_FORMAT(elf64-x86-64)
 
 MEMORY
 {
-    MEM : org = 0x00000000, l = 4096M
+    MBR    (wx) : org = 0x0e00, l = 512
+    STAGE1 (wx) : org = 0x1000, l = 4K
+    PHYMEM (w)  : org = 0xffffff0000000000, len = 512 * 1024M
+    PARRAY (w)  : org = 0xffffff8000000000, len = 128 * 1024M
+    KBSS   (w)  : org = 0xffffffffc0200000, len = 2M
+    KIMAGE (wx) : org = 0xffffffffffc00000, len = 2M
 }
 
 SECTIONS
 {
-    .stage1 0x8000 : AT(0x00000000)
+    .mbr : AT(0)
+    {
+        *(.mbr)
+
+        . = 510;
+        BYTE(0x55);
+        BYTE(0xaa);
+    } > MBR
+
+    .stage1 : AT(LOADADDR(.mbr) + SIZEOF(.mbr))
     {
         __stage1_start = .;
         *(.stage1)
 
         . = ALIGN(0x1000);
         __stage1_end = .;
-    } > MEM
+    } > STAGE1
 
     .kinit :
         AT(LOADADDR(.stage1) + SIZEOF(.stage1))
     {
-        __kinit_start = .;
-        *(.text.kinit)
+        KIMAGE_START = .;
+        KINIT_START = .;
 
-        LONG(0x00000000)
-        LONG(0x19191919)
-        LONG(0x00000000)
+        *(.text.kinit)
 
+        . = ALIGN(16);
         *(.rodata.kinit)
 
-        . = ALIGN(16);
+        KINIT_START_ADDR = .;
+        QUAD(ABSOLUTE(KINIT_START));
+
+        KINIT_END_ADDR = .;
+        QUAD(ABSOLUTE(KINIT_END));
 
+        KINIT_PAGES = .;
+        QUAD((KINIT_END - KINIT_START) / 0x1000);
+
+        . = ALIGN(16);
         start_ctors = .;
         KEEP(*(.init_array));
         KEEP(*(SORT_BY_INIT_PRIORITY(.init_array*)));
@@ -38,89 +58,79 @@ SECTIONS
         KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
         end_ctors = .;
 
-        LONG(0x00000000)
-        LONG(0x19191919)
-        LONG(0x00000000)
-
+        . = ALIGN(16);
         *(.data.kinit)
 
-        LONG(0x00000000)
-        LONG(0x19191919)
-        LONG(0x00000000)
-
+        . = ALIGN(16);
         *(.bss.kinit)
 
-        LONG(0x00000000)
-        LONG(0x19191919)
-        LONG(0x00000000)
-
         . = ALIGN(0x1000);
-        __kinit_end = .;
-    } > MEM
+        KINIT_END = .;
+    } > KIMAGE
 
-    .text 0xc0000000 :
+    .text :
         AT(LOADADDR(.kinit) + SIZEOF(.kinit))
     {
-        __text_start = .;
+        TEXT_START = .;
         *(.text)
         *(.text*)
 
         . = ALIGN(0x1000);
-        __text_end = .;
-    } > MEM
+        TEXT_END = .;
+    } > KIMAGE
 
     .rodata :
         AT(LOADADDR(.text) + SIZEOF(.text))
     {
-        __rodata_start = .;
+        RODATA_START = .;
         *(.rodata)
         *(.rodata*)
 
         . = ALIGN(16);
-        kmod_loaders_start = .;
+        KMOD_LOADERS_START = .;
 
         *(.kmods)
-
-        __kmod_loaders_end = .;
-        LONG(0);
+        QUAD(0);
 
         . = ALIGN(16);
 
-        bss_addr = .;
-        LONG(ABSOLUTE(__bss_start));
-        bss_len = .;
-        LONG(__bss_end - __bss_start);
-        kernel_size = .;
-        LONG(__data_end - __kinit_start);
+        BSS_ADDR = .;
+        QUAD(ABSOLUTE(BSS_START));
+        BSS_LENGTH = .;
+        QUAD(BSS_END - BSS_START);
 
         . = ALIGN(0x1000);
-        __rodata_end = .;
-    } > MEM
+        RODATA_END = .;
+    } > KIMAGE
 
     .data :
         AT(LOADADDR(.rodata) + SIZEOF(.rodata))
     {
-        __data_start = .;
+        DATA_START = .;
         *(.data)
         *(.data*)
 
         . = ALIGN(0x1000);
-        __data_end = .;
-    } > MEM
+        DATA_END = .;
+        KIMAGE_END = .;
+    } > KIMAGE
+
+    .sentry :
+        AT(0x78000 - 0x4)
+    { LONG(0x01145140); } > KIMAGE
 
     .bss :
     {
-        __bss_start = .;
+        BSS_START = .;
         *(.bss)
         *(.bss*)
 
         . = ALIGN(0x1000);
-        __bss_end = .;
-    } > MEM
+        BSS_END = .;
+    } > KBSS
 
-    .sentry :
-        AT(0x60000)
-    { LONG(0x01145140); } > MEM
+    KIMAGE_PAGES = (KIMAGE_END - KIMAGE_START) / 0x1000;
+    BSS_PAGES = (BSS_END - BSS_START) / 0x1000;
 
     .eh_frame :
         AT(LOADADDR(.sentry) + SIZEOF(.sentry))
@@ -129,7 +139,7 @@ SECTIONS
         *(.eh_frame*)
         . = ALIGN(0x1000);
         __eh_frame_end = .;
-    } > MEM
+    } > KIMAGE
 
     /* Stabs debugging sections.  */
     .stab          0 : { *(.stab) }

+ 121 - 36
src/kernel/allocator.cc

@@ -7,14 +7,18 @@
 #include <stdint.h>
 
 #include <kernel/async/lock.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/slab.hpp>
+
+constexpr uintptr_t KERNEL_HEAP_START = 0xffff'ff81'8000'0000;
+constexpr uintptr_t KERNEL_HEAP_END   = 0xffff'ffbf'ffff'ffff;
+constexpr uintptr_t KERNEL_HEAP_SIZE  = KERNEL_HEAP_END - KERNEL_HEAP_START;
 
 namespace types::memory {
 
 struct mem_blk_flags {
-    uint8_t is_free;
-    uint8_t has_next;
-    uint8_t : 8; // unused1
-    uint8_t : 8; // unused2
+    unsigned long is_free  : 8;
+    unsigned long has_next : 8;
 };
 
 struct mem_blk {
@@ -84,7 +88,7 @@ constexpr void split_block(mem_blk* blk, std::size_t this_size)
     // block is too small to get split
     // that is, the block to be split should have enough room
     // for "this_size" bytes and also could contain a new block
-    if (blk->size < this_size + sizeof(mem_blk) + 8)
+    if (blk->size < this_size + sizeof(mem_blk) + 1024)
         return;
 
     mem_blk* blk_next = next(blk, this_size);
@@ -100,13 +104,52 @@ constexpr void split_block(mem_blk* blk, std::size_t this_size)
     blk->size = this_size;
 }
 
+// Move the heap break to `addr`, first mapping every 2 MiB-aligned region
+// between the currently mapped end (p_allocated) and `addr` into the kernel
+// page table.
+//
+// Returns the new break (== addr), or nullptr if `addr` is at or beyond
+// p_limit. NOTE(review): nothing is unmapped here when the break is lowered.
+std::byte* brk_memory_allocator::brk(byte* addr)
+{
+    if (addr >= p_limit)
+        return nullptr;
+
+    uintptr_t current_allocated = reinterpret_cast<uintptr_t>(p_allocated);
+    uintptr_t new_brk = reinterpret_cast<uintptr_t>(addr);
+
+    // round both addresses down to a 2 MiB (0x200000) boundary
+    current_allocated &= ~(0x200000-1);
+    new_brk &= ~(0x200000-1);
+
+    using namespace kernel::mem::paging;
+    while (current_allocated <= new_brk) {
+        // presumably idx_all() splits the address into per-level table
+        // indices (1: top-level entry, 2: PDPT entry, 3: PD entry) — TODO confirm
+        auto idx = idx_all(current_allocated);
+        auto pdpt = KERNEL_PAGE_TABLE[std::get<1>(idx)].parse();
+
+        // allocate the next-level table on first use of this PDPT entry
+        auto pdpte = pdpt[std::get<2>(idx)];
+        if (!pdpte.pfn())
+            pdpte.set(PA_KERNEL_PAGE_TABLE, alloc_page_table());
+
+        // the 2 MiB slot must not be present yet
+        auto pde = pdpte.parse()[std::get<3>(idx)];
+        assert(!(pde.attributes() & PA_P));
+        // assumes alloc_pages(9) returns an order-9 (2 MiB) block — TODO confirm
+        pde.set(PA_KERNEL_DATA_HUGE, page_to_pfn(alloc_pages(9)));
+
+        current_allocated += 0x200000;
+    }
+    // remember the first 2 MiB boundary that is not mapped yet
+    p_allocated = (std::byte*)current_allocated;
+
+    return p_break = addr;
+}
+
+// Shift the break by `increment` bytes relative to its current position.
+// Delegates to brk(), so the return value (including nullptr) is brk()'s.
+std::byte* brk_memory_allocator::sbrk(size_type increment)
+{
+    byte* const target = p_break + increment;
+    return brk(target);
+}
+
 brk_memory_allocator::brk_memory_allocator(byte* start, size_type size)
     : p_start(start)
     , p_limit(start + size)
+    , p_break(start)
+    , p_allocated(start)
 {
-    brk(p_start);
-    auto* p_blk = aspblk(sbrk(0));
-    p_blk->size = 8;
+    auto* p_blk = aspblk(brk(p_start));
+    sbrk(sizeof(mem_blk) + 1024); // 1024 bytes (minimum size for a block)
+
+    p_blk->size = 1024;
     p_blk->flags.has_next = 0;
     p_blk->flags.is_free = 1;
 }
@@ -114,8 +157,8 @@ brk_memory_allocator::brk_memory_allocator(byte* start, size_type size)
 void* brk_memory_allocator::allocate(size_type size)
 {
     kernel::async::lock_guard_irq lck(mtx);
-    // align to 8 bytes boundary
-    size = (size + 7) & ~7;
+    // align to 1024 bytes boundary
+    size = (size + 1024-1) & ~(1024-1);
 
     auto* block_allocated = find_blk(&p_start, size);
     if (!block_allocated->flags.has_next
@@ -156,59 +199,101 @@ void brk_memory_allocator::deallocate(void* ptr)
     unite_afterwards(blk);
 }
 
-static std::byte ki_heap[0x100000];
-static brk_memory_allocator ki_alloc(ki_heap, sizeof(ki_heap));
-static brk_memory_allocator* k_alloc;
-
-void* kimalloc(std::size_t size)
+bool brk_memory_allocator::allocated(void* ptr) const noexcept
 {
-    return ki_alloc.allocate(size);
+    return (void*)KERNEL_HEAP_START <= aspbyte(ptr) && aspbyte(ptr) < sbrk();
 }
 
-void kifree(void* ptr)
-{
-    ki_alloc.deallocate(ptr);
-}
+static brk_memory_allocator* k_alloc;
 
 } // namespace types::memory
 
-SECTION(".text.kinit")
-void kernel::kinit::init_kernel_heap(void *start, std::size_t size)
+static kernel::mem::slab_cache caches[7];
+
+static constexpr int __cache_index(std::size_t size)
 {
-    using namespace types::memory;
-    k_alloc = kinew<brk_memory_allocator>((std::byte*)start, size);
+    if (size <= 32)
+        return 0;
+    if (size <= 64)
+        return 1;
+    if (size <= 96)
+        return 2;
+    if (size <= 128)
+        return 3;
+    if (size <= 192)
+        return 4;
+    if (size <= 256)
+        return 5;
+    if (size <= 512)
+        return 6;
+    return -1;
 }
 
-void* operator new(size_t sz)
+SECTION(".text.kinit")
+void kernel::kinit::init_allocator()
 {
-    void* ptr = types::memory::k_alloc->allocate(sz);
-    assert(ptr);
-    return ptr;
+    mem::init_slab_cache(caches+0, 32);
+    mem::init_slab_cache(caches+1, 64);
+    mem::init_slab_cache(caches+2, 96);
+    mem::init_slab_cache(caches+3, 128);
+    mem::init_slab_cache(caches+4, 192);
+    mem::init_slab_cache(caches+5, 256);
+    mem::init_slab_cache(caches+6, 512);
+
+    types::memory::k_alloc = new types::memory::brk_memory_allocator(
+        (std::byte*)KERNEL_HEAP_START, KERNEL_HEAP_SIZE);
 }
 
-void* operator new[](size_t sz)
+void* operator new(size_t size)
 {
-    void* ptr = types::memory::k_alloc->allocate(sz);
+    int idx = __cache_index(size);
+    void* ptr = nullptr;
+    if (idx < 0)
+        ptr = types::memory::k_alloc->allocate(size);
+    else
+        ptr = kernel::mem::slab_alloc(&caches[idx]);
+
     assert(ptr);
     return ptr;
 }
 
 void operator delete(void* ptr)
 {
-    types::memory::k_alloc->deallocate(ptr);
+    if (!ptr)
+        return;
+
+    if (types::memory::k_alloc->allocated(ptr))
+        types::memory::k_alloc->deallocate(ptr);
+    else
+        kernel::mem::slab_free(ptr);
 }
 
-void operator delete(void* ptr, size_t)
+void operator delete(void* ptr, std::size_t size)
+{
+    if (!ptr)
+        return;
+
+    if (types::memory::k_alloc->allocated(ptr)) {
+        types::memory::k_alloc->deallocate(ptr);
+        return;
+    }
+    int idx = __cache_index(size);
+    assert(idx >= 0);
+
+    kernel::mem::slab_free(ptr);
+}
+
+void* operator new[](size_t sz)
 {
-    types::memory::k_alloc->deallocate(ptr);
+    return ::operator new(sz);
 }
 
 void operator delete[](void* ptr)
 {
-    types::memory::k_alloc->deallocate(ptr);
+    ::operator delete(ptr);
 }
 
-void operator delete[](void* ptr, size_t)
+void operator delete[](void* ptr, std::size_t size)
 {
-    types::memory::k_alloc->deallocate(ptr);
+    ::operator delete(ptr, size);
 }

+ 16 - 16
src/kernel/async/lock.cc

@@ -9,8 +9,8 @@ static inline void _raw_spin_lock(spinlock_t* lock_addr)
 {
     asm volatile(
         "%=:\n\t\
-         movl $1, %%eax\n\t\
-         xchgl %%eax, (%0)\n\t\
+         mov $1, %%eax\n\t\
+         xchg %%eax, (%0)\n\t\
          cmp $0, %%eax\n\t\
          jne %=b\n\t\
         "
@@ -22,19 +22,19 @@ static inline void _raw_spin_lock(spinlock_t* lock_addr)
 static inline void _raw_spin_unlock(spinlock_t* lock_addr)
 {
     asm volatile(
-        "movl $0, %%eax\n\
-         xchgl %%eax, (%0)"
+        "mov $0, %%eax\n\
+         xchg %%eax, (%0)"
         :
         : "r"(lock_addr)
         : "eax", "memory");
 }
 
-static inline uint32_t _save_interrupt_state()
+static inline lock_context_t _save_interrupt_state()
 {
-    uint32_t retval;
+    lock_context_t retval;
     asm volatile(
-        "pushfl\n\t"
-        "popl %0\n\t"
+        "pushf\n\t"
+        "pop %0\n\t"
         "cli"
         : "=g"(retval)
         :
@@ -44,13 +44,13 @@ static inline uint32_t _save_interrupt_state()
     return retval;
 }
 
-static inline void _restore_interrupt_state(uint32_t flags)
+static inline void _restore_interrupt_state(lock_context_t context)
 {
     asm volatile(
-        "pushl %0\n\t"
-        "popfl"
+        "push %0\n\t"
+        "popf"
         :
-        : "g"(flags)
+        : "g"(context)
         :
         );
 }
@@ -90,7 +90,7 @@ void spin_unlock(spinlock_t& lock)
     preempt_enable();
 }
 
-uint32_t spin_lock_irqsave(spinlock_t& lock)
+lock_context_t spin_lock_irqsave(spinlock_t& lock)
 {
     auto state = _save_interrupt_state();
     preempt_disable();
@@ -100,7 +100,7 @@ uint32_t spin_lock_irqsave(spinlock_t& lock)
     return state;
 }
 
-void spin_unlock_irqrestore(spinlock_t& lock, uint32_t state)
+void spin_unlock_irqrestore(spinlock_t& lock, lock_context_t state)
 {
     _raw_spin_unlock(&lock);
     preempt_enable();
@@ -122,12 +122,12 @@ void mutex::unlock()
     spin_unlock(m_lock);
 }
 
-uint32_t mutex::lock_irq()
+lock_context_t mutex::lock_irq()
 {
     return spin_lock_irqsave(m_lock);
 }
 
-void mutex::unlock_irq(uint32_t state)
+void mutex::unlock_irq(lock_context_t state)
 {
     spin_unlock_irqrestore(m_lock, state);
 }

+ 43 - 52
src/kernel/hw/ahci.cc

@@ -2,14 +2,13 @@
 #include <cstddef>
 #include <algorithm>
 
-#include <kernel/vfs.hpp>
-#include <kernel/log.hpp>
-#include <kernel/mm.hpp>
-#include <kernel/module.hpp>
 #include <kernel/hw/pci.hpp>
 #include <kernel/irq.hpp>
-
-#include <types/size.h>
+#include <kernel/log.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
+#include <kernel/module.hpp>
+#include <kernel/vfs.hpp>
 
 #include <stdint.h>
 #include <errno.h>
@@ -21,6 +20,9 @@
 
 using namespace kernel::module;
 using namespace kernel::hw::pci;
+using namespace kernel::mem::paging;
+
+using kernel::mem::physaddr;
 
 constexpr uint32_t MAX_SPINS = 100000;
 
@@ -40,11 +42,8 @@ constexpr uint32_t PORT_CMD_CR = 0x00008000;
 namespace ahci {
 
 typedef volatile struct hba_port_t {
-    uint32_t command_list_base;
-    uint32_t command_list_base_upper;
-
-    uint32_t fis_base;
-    uint32_t fis_base_upper;
+    uint64_t command_list_base;
+    uint64_t fis_base;
 
     uint32_t interrupt_status;
     uint32_t interrupt_enable;
@@ -102,8 +101,7 @@ struct command_header {
 
     uint32_t volatile bytes_transferred;
 
-    uint32_t command_table_base;
-    uint32_t command_table_base_upper;
+    uint64_t command_table_base;
 
     uint32_t reserved1[4];
 };
@@ -220,8 +218,7 @@ struct received_fis {
 };
 
 struct prdt_entry {
-    uint32_t data_base;
-    uint32_t data_base_upper;
+    uint64_t data_base;
 
     uint32_t reserved0;
 
@@ -291,13 +288,12 @@ struct quick_queue {
 struct ahci_port {
 private:
     // quick_queue<32> qu;
-    page_t page;
+    physaddr<command_header, false> cmd_header;
     hba_port* port;
-    command_header* cmd_header { };
     received_fis* fis { };
     std::size_t sectors { -1U };
 
-    int send_command(char* buf, uint64_t lba, uint32_t count, uint8_t cmd, bool write)
+    int send_command(physaddr<void> buf, uint64_t lba, uint32_t count, uint8_t cmd, bool write)
     {
         // count must be a multiple of 512
         if (count & (512 - 1))
@@ -307,9 +303,10 @@ private:
         int n = 0;
         // auto n = qu.pop();
 
-        // for now, we read 3.5KB at most at a time
         // command fis and prdt will take up the lower 128+Bytes
-        auto cmdtable_page = __alloc_raw_page();
+        // TODO: buffer array
+        pfn_t command_table_pfn = page_to_pfn(alloc_page());
+        physaddr<command_table, false> cmdtable{command_table_pfn};
 
         // construct command header
         memset(cmd_header + n, 0x00, sizeof(command_header));
@@ -318,9 +315,8 @@ private:
 
         cmd_header[n].write = write;
         cmd_header[n].prdt_length = 1;
-        cmd_header[n].command_table_base = cmdtable_page << 12;
+        cmd_header[n].command_table_base = cmdtable.phys();
 
-        auto* cmdtable = (command_table*)kernel::pmap(cmdtable_page);
         memset(cmdtable, 0x00, sizeof(command_table) + sizeof(prdt_entry));
 
         // first, set up command fis
@@ -340,7 +336,7 @@ private:
 
         // fill in prdt
         auto* pprdt = cmdtable->prdt;
-        pprdt->data_base = (cmdtable_page << 12) + 512;
+        pprdt->data_base = buf.phys();
         pprdt->byte_count = count;
         pprdt->interrupt = 1;
 
@@ -359,17 +355,17 @@ private:
         SPIN(port->command_issue & (1 << n), spins)
             return -1;
 
-        memcpy(buf, (char*)cmdtable + 512, count);
-
-        kernel::pfree(cmdtable_page);
-        __free_raw_page(cmdtable_page);
+        free_page(command_table_pfn);
         return 0;
     }
 
     int identify()
     {
-        char buf[512];
-        int ret = send_command(buf, 0, 512, 0xEC, false);
+        pfn_t buffer_page = page_to_pfn(alloc_page());
+        int ret = send_command(physaddr<void>{buffer_page},
+                0, 512, 0xEC, false);
+
+        free_page(buffer_page);
         if (ret != 0)
             return -1;
         return 0;
@@ -377,40 +373,43 @@ private:
 
 public:
     explicit ahci_port(hba_port* port)
-        : page(__alloc_raw_page()), port(port) { }
+        : cmd_header{page_to_pfn(alloc_page())}, port(port) { }
 
     ~ahci_port()
     {
         if (!cmd_header)
             return;
-        kernel::pfree(page);
-        __free_raw_page(page);
+        free_page(cmd_header.phys());
     }
 
     ssize_t read(char* buf, std::size_t buf_size, std::size_t offset, std::size_t cnt)
     {
         cnt = std::min(buf_size, cnt);
 
-        constexpr size_t READ_BUF_SECTORS = 6;
+        pfn_t buffer_page = page_to_pfn(alloc_page());
+        physaddr<void> buffer_ptr{buffer_page};
 
-        char b[READ_BUF_SECTORS * 512] {};
         char* orig_buf = buf;
         size_t start = offset / 512;
         size_t end = std::min((offset + cnt + 511) / 512, sectors);
 
         offset -= start * 512;
-        for (size_t i = start; i < end; i += READ_BUF_SECTORS) {
-            size_t n_read = std::min(end - i, READ_BUF_SECTORS) * 512;
-            int status = send_command(b, i, n_read, 0xC8, false);
-            if (status != 0)
+        for (size_t i = start; i < end; i += 4096UL / 512) {
+            size_t n_read = std::min(end - i, 4096UL / 512) * 512;
+            int status = send_command(buffer_ptr, i, n_read, 0xC8, false);
+            if (status != 0) {
+                free_page(buffer_page);
                 return -EIO;
+            }
 
             size_t to_copy = std::min(cnt, n_read - offset);
-            memcpy(buf, b + offset, to_copy);
+            memcpy(buf, (std::byte*)(void*)buffer_ptr + offset, to_copy);
             offset = 0;
             buf += to_copy;
             cnt -= to_copy;
         }
+
+        free_page(buffer_page);
         return buf - orig_buf;
     }
 
@@ -425,13 +424,9 @@ public:
         //
         // port->interrupt_enable = 1;
 
-        port->command_list_base = page << 12;
-        port->command_list_base_upper = 0;
-
-        port->fis_base = (page << 12) + 0x400;
-        port->fis_base_upper = 0;
+        port->command_list_base = cmd_header.phys();
+        port->fis_base = cmd_header.phys() + 0x400;
 
-        cmd_header = (command_header*)kernel::pmap(page, false);
         fis = (received_fis*)(cmd_header + 1);
 
         if (start_command(port) != 0)
@@ -455,9 +450,6 @@ public:
     ~ahci_module()
     {
         // TODO: release PCI device
-        if (ghc)
-            kernel::pfree(dev->reg[PCI_REG_ABAR] >> 12);
-
         for (auto& item : ports) {
             if (!item)
                 continue;
@@ -481,7 +473,7 @@ public:
             auto* port = new ahci_port(ghc_port);
             if (port->init() != 0) {
                 delete port;
-                kmsg("An error occurred while configuring an ahci port\n");
+                kmsg("An error occurred while configuring an ahci port");
                 continue;
             }
 
@@ -506,10 +498,9 @@ public:
         auto ret = kernel::hw::pci::register_driver(VENDOR_INTEL, DEVICE_AHCI,
             [this](pci_device* dev) -> int {
                 this->dev = dev;
-                uint32_t abar_address = dev->reg[PCI_REG_ABAR];
 
-                void* base = kernel::pmap(abar_address >> 12, false);
-                this->ghc = (hba_ghc*)base;
+                physaddr<hba_ghc, false> pp_base{dev->reg[PCI_REG_ABAR]};
+                this->ghc = pp_base;
 
                 this->ghc->global_host_control =
                     this->ghc->global_host_control | 2; // set interrupt enable

+ 0 - 31
src/kernel/hw/keyboard.cpp

@@ -1,31 +0,0 @@
-#include <asm/port_io.h>
-#include <kernel/hw/keyboard.h>
-#include <kernel/input/input_event.h>
-
-extern "C" void
-handle_keyboard_interrupt(void)
-{
-    input_event evt {
-        .type = input_event::input_event_type::KEYBOARD,
-        .code = KEY_DOWN,
-        .data = 0
-    };
-
-    uint8_t keycode = asm_inb(PORT_KEYDATA);
-    if (keycode >= 0xd8) {
-        // TODO: report not_supported event
-        return;
-    }
-
-    // key release
-    if (keycode >= 0x80) {
-        evt.code = KEY_UP;
-        keycode -= 0x80;
-    }
-
-    evt.data = keycode;
-
-    // TODO: fix it
-    // commit_input_event(&evt);
-    (void)evt;
-}

+ 124 - 0
src/kernel/hw/serial.cc

@@ -0,0 +1,124 @@
+#include <errno.h>
+#include <stdio.h>
+
+#include <kernel/hw/port.hpp>
+#include <kernel/irq.hpp>
+#include <kernel/log.hpp>
+#include <kernel/module.hpp>
+#include <kernel/tty.hpp>
+
+using namespace kernel::tty;
+using namespace kernel::hw;
+
+// I/O base addresses of the two serial ports handled by this file.
+constexpr int PORT0 = 0x3f8;
+constexpr int PORT1 = 0x2f8;
+
+// The six consecutive registers (base+0 .. base+5) of one serial port.
+using port_group = const p8[6];
+
+// Registers of the first serial port, indexed by offset from PORT0.
+// In this file, index 0 is used for data and index 5 for status bits.
+constexpr p8 port0[] = {
+    p8{PORT0+0},
+    p8{PORT0+1},
+    p8{PORT0+2},
+    p8{PORT0+3},
+    p8{PORT0+4},
+    p8{PORT0+5},
+};
+
+// Registers of the second serial port, indexed by offset from PORT1.
+constexpr p8 port1[] = {
+    p8{PORT1+0},
+    p8{PORT1+1},
+    p8{PORT1+2},
+    p8{PORT1+3},
+    p8{PORT1+4},
+    p8{PORT1+5},
+};
+
+// Interrupt handler for the first serial port: forward every pending byte
+// to the console tty.
+static void _serial0_receive_data_interrupt()
+{
+    for (;;) {
+        // keep reading while bit 0 of register 5 is set
+        if (!(*port0[5] & 1))
+            break;
+
+        console->commit_char(*port0[0]);
+    }
+}
+
+// Interrupt handler for the second serial port: forward every pending byte
+// to the console tty.
+static void _serial1_receive_data_interrupt()
+{
+    for (;;) {
+        // keep reading while bit 0 of register 5 is set
+        if (!(*port1[5] & 1))
+            break;
+
+        console->commit_char(*port1[0]);
+    }
+}
+
+// Configure one serial port and verify it with a loopback test.
+//
+// `ports` is the register group of the port (base+0 .. base+5).
+// Returns 0 on success, or -EIO if the byte written in loopback mode is not
+// read back unchanged.
+static inline int _init_port(port_group ports)
+{
+    // taken from osdev.org
+
+    ports[1] = 0x00; // Disable all interrupts
+    ports[3] = 0x80; // Enable DLAB (set baud rate divisor)
+    // TODO: set baud rate
+    ports[0] = 0x00; // Set divisor to 0 -3- (lo byte) 115200 -38400- baud
+    ports[1] = 0x00; //                  (hi byte)
+    ports[3] = 0x03; // 8 bits, no parity, one stop bit
+    ports[2] = 0xC7; // Enable FIFO, clear them, with 14-byte threshold
+    // TODO: IRQ disabled
+    ports[4] = 0x0B; // IRQs enabled, RTS/DSR set
+    ports[4] = 0x1E; // Set in loopback mode, test the serial chip
+    ports[0] = 0xAE; // Test serial chip (send byte 0xAE and check if serial returns same byte)
+
+    // Check if serial is faulty (i.e: not same byte as sent)
+    if (*ports[0] != 0xAE)
+        return -EIO;
+
+    // If serial is not faulty set it in normal operation mode
+    // (not-loopback with IRQs enabled and OUT#1 and OUT#2 bits enabled)
+    ports[4] = 0x0F;
+
+    ports[1] = 0x01; // Enable interrupts #0: Received Data Available
+
+    return 0;
+}
+
+// tty backed by one serial port; output is written by polling the port.
+class serial_tty : public virtual tty {
+    const p8* ports;
+
+public:
+    // The tty name is "ttyS" followed by the single character '0' + id.
+    serial_tty(port_group ports, int id)
+        : tty{"ttyS"}
+        , ports(ports)
+    {
+        name += '0' + id;
+    }
+
+    // Busy-wait until bit 0x20 of register 5 is set, then write `c` to
+    // register 0.
+    virtual void putchar(char c) override
+    {
+        for (;;) {
+            if (*ports[5] & 0x20)
+                break;
+        }
+
+        ports[0] = c;
+    }
+};
+
+// Kernel module that probes both serial ports and exposes every working one
+// as a tty ("ttyS0" for port0, "ttyS1" for port1).
+class serial_module : public virtual kernel::module::module {
+public:
+    serial_module() : module("serial-tty") { }
+
+    // Probe both ports. A port that fails _init_port() is skipped, and a
+    // failed tty registration is only logged; the module always reports
+    // MODULE_SUCCESS.
+    virtual int init() override
+    {
+        if (_init_port(port0) == 0) {
+            auto* dev = new serial_tty(port0, 0);
+            kernel::irq::register_handler(4, _serial0_receive_data_interrupt);
+
+            if (register_tty(dev) != 0)
+                kmsg("[serial] cannot register ttyS0");
+        }
+
+        if (_init_port(port1) == 0) {
+            // id 1: the second port must be named "ttyS1", not "ttyS0" again
+            auto* dev = new serial_tty(port1, 1);
+            kernel::irq::register_handler(3, _serial1_receive_data_interrupt);
+
+            if (register_tty(dev) != 0)
+                kmsg("[serial] cannot register ttyS1");
+        }
+
+        return kernel::module::MODULE_SUCCESS;
+    }
+};
+
+// Factory passed to INTERNAL_MODULE below: creates the serial tty module.
+kernel::module::module* serial_module_init()
+{
+    return new serial_module();
+}
+INTERNAL_MODULE(serial_module_loader, serial_module_init);

+ 0 - 71
src/kernel/hw/serial.cpp

@@ -1,71 +0,0 @@
-#include <asm/port_io.h>
-#include <kernel/hw/serial.h>
-#include <kernel/irq.hpp>
-#include <kernel/tty.hpp>
-#include <stdio.h>
-#include <types/status.h>
-
-static void serial_receive_data_interrupt(void)
-{
-    while (is_serial_has_data(PORT_SERIAL0)) {
-        uint8_t data = serial_read_data(PORT_SERIAL0);
-        console->commit_char(data);
-    }
-}
-
-SECTION(".text.kinit")
-int32_t init_serial_port(port_id_t port)
-{
-    // taken from osdev.org
-
-    asm_outb(port + 1, 0x00); // Disable all interrupts
-    asm_outb(port + 3, 0x80); // Enable DLAB (set baud rate divisor)
-    // TODO: set baud rate
-    asm_outb(port + 0, 0x00); // Set divisor to 0 -3- (lo byte) 115200 -38400- baud
-    asm_outb(port + 1, 0x00); //                  (hi byte)
-    asm_outb(port + 3, 0x03); // 8 bits, no parity, one stop bit
-    asm_outb(port + 2, 0xC7); // Enable FIFO, clear them, with 14-byte threshold
-    // TODO: IRQ disabled
-    asm_outb(port + 4, 0x0B); // IRQs enabled, RTS/DSR set
-    asm_outb(port + 4, 0x1E); // Set in loopback mode, test the serial chip
-    asm_outb(port + 0, 0xAE); // Test serial chip (send byte 0xAE and check if serial returns same byte)
-
-    // Check if serial is faulty (i.e: not same byte as sent)
-    if (asm_inb(port + 0) != 0xAE) {
-        return GB_FAILED;
-    }
-
-    // If serial is not faulty set it in normal operation mode
-    // (not-loopback with IRQs enabled and OUT#1 and OUT#2 bits enabled)
-    asm_outb(port + 4, 0x0F);
-
-    asm_outb(port + 1, 0x01); // Enable interrupts #0: Received Data Available
-
-    kernel::irq::register_handler(4, serial_receive_data_interrupt);
-
-    return GB_OK;
-}
-
-int32_t is_serial_has_data(port_id_t port)
-{
-    return asm_inb(port + 5) & 1;
-}
-
-uint8_t serial_read_data(port_id_t port)
-{
-    while (is_serial_has_data(port) == 0)
-        ;
-    return asm_inb(port);
-}
-
-int32_t is_serial_ready_for_transmition(port_id_t port)
-{
-    return asm_inb(port + 5) & 0x20;
-}
-
-void serial_send_data(port_id_t port, uint8_t data)
-{
-    while (is_serial_ready_for_transmition(port) == 0)
-        ;
-    return asm_outb(port, data);
-}

+ 0 - 26
src/kernel/hw/timer.c

@@ -1,26 +0,0 @@
-#include <asm/port_io.h>
-#include <kernel/hw/timer.h>
-
-static size_t _current_ticks = 0;
-
-SECTION(".text.kinit")
-void init_pit(void)
-{
-    // set interval
-    asm_outb(PORT_PIT_CONTROL, 0x34);
-
-    // send interval number
-    // 0x2e9a = 11930 = 100Hz
-    asm_outb(PORT_PIT_COUNT, 0x9a);
-    asm_outb(PORT_PIT_COUNT, 0x2e);
-}
-
-void inc_tick(void)
-{
-    ++_current_ticks;
-}
-
-size_t current_ticks(void)
-{
-    return _current_ticks;
-}

+ 31 - 0
src/kernel/hw/timer.cc

@@ -0,0 +1,31 @@
+#include <types/types.h>
+
+#include <kernel/hw/port.hpp>
+#include <kernel/hw/timer.hpp>
+
+constexpr kernel::hw::p8 port_control(0x43);
+constexpr kernel::hw::p8 port_count(0x40);
+
+static std::size_t _current_ticks = 0;
+
+// Program the PIT through port_control (0x43) and port_count (0x40) with a
+// reload value of 11930, which the note below equates to 100Hz.
+// The function is placed in the .text.kinit section.
+SECTION(".text.kinit")
+void kernel::hw::timer::init_pit(void)
+{
+    // set interval
+    // 0x34 = channel 0, access lobyte then hibyte, mode 2 (rate generator), binary
+    port_control = 0x34;
+
+    // send interval number, low byte first to match the access mode above
+    // 0x2e9a = 11930 = 100Hz
+    port_count = 0x9a;
+    port_count = 0x2e;
+}
+
+// Advance the file-local tick counter by one.
+void kernel::hw::timer::inc_tick(void)
+{
+    _current_ticks += 1;
+}
+
+// Return the number of ticks counted by inc_tick() so far.
+std::size_t kernel::hw::timer::current_ticks(void)
+{
+    const std::size_t ticks = _current_ticks;
+    return ticks;
+}

+ 100 - 268
src/kernel/interrupt.cpp

@@ -5,325 +5,157 @@
 #include <stdint.h>
 #include <stdio.h>
 
-#include <types/size.h>
 #include <types/types.h>
 
-#include <asm/port_io.h>
-#include <kernel/hw/keyboard.h>
-#include <kernel/hw/serial.h>
-#include <kernel/hw/timer.h>
-#include <kernel/interrupt.h>
+#include <kernel/hw/port.hpp>
+#include <kernel/hw/timer.hpp>
+#include <kernel/interrupt.hpp>
 #include <kernel/irq.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
 #include <kernel/process.hpp>
+#include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
-#include <kernel/vga.hpp>
 
-struct IDT_entry {
-    uint16_t offset_low;
-    uint16_t selector;
-    uint8_t zero;
-    uint8_t type_attr;
-    uint16_t offset_high;
-};
+#define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
+#define USER_INTERRUPT_GATE_TYPE (0xee)
 
-// interrupt stubs
-extern "C" void irq0(); extern "C" void irq1(); extern "C" void irq2();
-extern "C" void irq3(); extern "C" void irq4(); extern "C" void irq5();
-extern "C" void irq6(); extern "C" void irq7(); extern "C" void irq8();
-extern "C" void irq9(); extern "C" void irq10(); extern "C" void irq11();
-extern "C" void irq12(); extern "C" void irq13(); extern "C" void irq14();
-extern "C" void irq15(); extern "C" void int6(); extern "C" void int8();
-extern "C" void int13(); extern "C" void int14();
-extern "C" void syscall_stub();
+constexpr kernel::hw::p8 port_pic1_command{0x20};
+constexpr kernel::hw::p8 port_pic1_data{0x21};
+constexpr kernel::hw::p8 port_pic2_command{0xa0};
+constexpr kernel::hw::p8 port_pic2_data{0xa1};
 
-#define SET_UP_IRQ(N, SELECTOR)                   \
-    ptr_t addr_irq##N = (ptr_t)irq##N;            \
-    set_idt_entry(IDT, 0x20 + (N), (addr_irq##N), \
-        (SELECTOR), KERNEL_INTERRUPT_GATE_TYPE);
+struct IDT_entry {
+    uint16_t offset_low;
+    uint16_t segment;
 
-#define SET_IDT_ENTRY_FN(N, FUNC_NAME, SELECTOR, TYPE) \
-    ptr_t addr_##FUNC_NAME = (ptr_t)FUNC_NAME;         \
-    set_idt_entry(IDT, (N), (addr_##FUNC_NAME), (SELECTOR), (TYPE));
+    uint8_t IST;
+    uint8_t attributes;
 
-SECTION(".text.kinit")
-static void set_idt_entry(IDT_entry (&idt)[256], int n,
-    uintptr_t offset, uint16_t selector, uint8_t type)
-{
-    idt[n].offset_low = offset & 0xffff;
-    idt[n].selector = selector;
-    idt[n].zero = 0;
-    idt[n].type_attr = type;
-    idt[n].offset_high = (offset >> 16) & 0xffff;
-}
-
-// idt_descriptor: uint16_t[3]
-// [0] bit 0 :15 => limit
-// [1] bit 16:47 => address
-extern "C" void asm_load_idt(uint16_t idt_descriptor[3], int sti);
+    uint16_t offset_mid;
+    uint32_t offset_high;
+    uint32_t reserved;
+};
 
 static struct IDT_entry IDT[256];
 
-static inline void NORETURN die(regs_32& regs, ptr_t eip)
-{
-    char buf[512] = {};
-    snprintf(
-        buf, sizeof(buf),
-        "***** KERNEL PANIC *****\n"
-        "eax: %x, ebx: %x, ecx: %x, edx: %x\n"
-        "esp: %x, ebp: %x, esi: %x, edi: %x\n"
-        "eip: %x\n",
-        regs.eax, regs.ebx, regs.ecx,
-        regs.edx, regs.esp, regs.ebp,
-        regs.esi, regs.edi, eip);
-    kmsg(buf);
-    freeze();
-}
+extern "C" uintptr_t ISR_START_ADDR;
 
 SECTION(".text.kinit")
-void init_idt()
+static inline void set_idt_entry(IDT_entry (&idt)[256], int n,
+    uintptr_t offset, uint16_t selector, uint8_t type)
 {
-    asm_cli();
-
-    memset(IDT, 0x00, sizeof(IDT));
-
-    // invalid opcode
-    SET_IDT_ENTRY_FN(6, int6, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // double fault
-    SET_IDT_ENTRY_FN(8, int8, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // general protection
-    SET_IDT_ENTRY_FN(13, int13, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // page fault
-    SET_IDT_ENTRY_FN(14, int14, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // system call
-    SET_IDT_ENTRY_FN(0x80, syscall_stub, 0x08, USER_INTERRUPT_GATE_TYPE);
-
-    uint16_t idt_descriptor[3];
-    idt_descriptor[0] = sizeof(struct IDT_entry) * 256;
-    *((uint32_t*)(idt_descriptor + 1)) = (ptr_t)IDT;
-
-    asm_load_idt(idt_descriptor, 0);
+    idt[n].offset_low = offset & 0xffff;
+    idt[n].segment = selector;
+    idt[n].IST = 0;
+    idt[n].attributes = type;
+    idt[n].offset_mid = (offset >> 16) & 0xffff;
+    idt[n].offset_high = (offset >> 32) & 0xffffffff;
+    idt[n].reserved = 0;
 }
 
 using kernel::irq::irq_handler_t;
 static std::vector<std::list<irq_handler_t>> s_irq_handlers;
 
-void kernel::irq::register_handler(int irqno, irq_handler_t handler)
-{
-    s_irq_handlers[irqno].emplace_back(std::move(handler));
-}
-
 SECTION(".text.kinit")
-void init_pic(void)
+void kernel::kinit::init_interrupt()
 {
-    asm_cli();
+    for (int i = 0; i < 0x30; ++i)
+        set_idt_entry(IDT, i, ISR_START_ADDR+8*i, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
+    set_idt_entry(IDT, 0x80, ISR_START_ADDR+8*0x80, 0x08, USER_INTERRUPT_GATE_TYPE);
 
+    uint64_t idt_descriptor[2];
+    idt_descriptor[0] = (sizeof(IDT_entry) * 256) << 48;
+    idt_descriptor[1] = (uintptr_t)IDT;
+
+    // initialize PIC
+    asm volatile("lidt (%0)": :"r"((uintptr_t)idt_descriptor + 6): );
     s_irq_handlers.resize(16);
 
     // TODO: move this to timer driver
     kernel::irq::register_handler(0, []() {
-        inc_tick();
+        kernel::hw::timer::inc_tick();
         schedule();
     });
 
-    asm_outb(PORT_PIC1_COMMAND, 0x11); // edge trigger mode
-    asm_outb(PORT_PIC1_DATA, 0x20); // start from int 0x20
-    asm_outb(PORT_PIC1_DATA, 0x04); // PIC1 is connected to IRQ2 (1 << 2)
-    asm_outb(PORT_PIC1_DATA, 0x01); // no buffer mode
+    port_pic1_command = 0x11; // edge trigger mode
+    port_pic1_data = 0x20;    // start from int 0x20
+    port_pic1_data = 0x04;    // PIC1 is connected to IRQ2 (1 << 2)
+    port_pic1_data = 0x01;    // no buffer mode
 
-    asm_outb(PORT_PIC2_COMMAND, 0x11); // edge trigger mode
-    asm_outb(PORT_PIC2_DATA, 0x28); // start from int 0x28
-    asm_outb(PORT_PIC2_DATA, 0x02); // connected to IRQ2
-    asm_outb(PORT_PIC2_DATA, 0x01); // no buffer mode
+    port_pic2_command = 0x11; // edge trigger mode
+    port_pic2_data = 0x28;    // start from int 0x28
+    port_pic2_data = 0x02;    // connected to IRQ2
+    port_pic2_data = 0x01;    // no buffer mode
 
     // allow all the interrupts
-    asm_outb(PORT_PIC1_DATA, 0x00);
-    asm_outb(PORT_PIC2_DATA, 0x00);
-
-    // 0x08 stands for kernel code segment
-    SET_UP_IRQ(0, 0x08);
-    SET_UP_IRQ(1, 0x08);
-    SET_UP_IRQ(2, 0x08);
-    SET_UP_IRQ(3, 0x08);
-    SET_UP_IRQ(4, 0x08);
-    SET_UP_IRQ(5, 0x08);
-    SET_UP_IRQ(6, 0x08);
-    SET_UP_IRQ(7, 0x08);
-    SET_UP_IRQ(8, 0x08);
-    SET_UP_IRQ(9, 0x08);
-    SET_UP_IRQ(10, 0x08);
-    SET_UP_IRQ(11, 0x08);
-    SET_UP_IRQ(12, 0x08);
-    SET_UP_IRQ(13, 0x08);
-    SET_UP_IRQ(14, 0x08);
-    SET_UP_IRQ(15, 0x08);
+    port_pic1_data = 0x00;
+    port_pic2_data = 0x00;
 }
 
-extern "C" void int6_handler(
-    regs_32 s_regs,
-    ptr_t eip,
-    uint16_t cs,
-    uint32_t eflags)
+void kernel::irq::register_handler(int irqno, irq_handler_t handler)
 {
-    if (!current_process->attr.system)
-        kill_current(SIGSEGV);
-
-    char buf[128];
-    snprintf(buf, sizeof(buf),
-        "[kernel] int6 data: cs: %x, eflags: %x\n", cs, eflags);
-    kmsg(buf);
-
-    die(s_regs, eip);
+    s_irq_handlers[irqno].emplace_back(std::move(handler));
 }
 
-// general protection
-extern "C" void int13_handler(
-    struct regs_32 s_regs,
-    uint32_t error_code,
-    ptr_t eip,
-    uint16_t cs,
-    uint32_t eflags)
+// Handle a CPU exception frame (called by interrupt_handler for vectors
+// below 0x20).
+//
+// context: interrupt frame including the error code
+// (unnamed) mmx_registers*: unused here
+//
+// Vector 6 (invalid opcode) kills a non-system process with SIGILL; vectors
+// 8 and 13 (double fault, general protection) kill it with SIGSEGV.
+// Vector 14 is passed to the page fault handler and then returns.
+// Anything that is not resolved ends in freeze().
+static inline void fault_handler(
+        interrupt_stack_with_code* context,
+        mmx_registers*)
 {
-    if (!current_process->attr.system)
-        kill_current(SIGILL);
-
-    char buf[128] = {};
-    snprintf(buf, sizeof(buf),
-        "[kernel] int13 data: error_code: %x, cs: %x, eflags: %x\n",
-        error_code, cs, eflags);
-    kmsg(buf);
-
-    die(s_regs, eip);
-}
-
-struct PACKED int14_data {
-    void* l_addr;
-    struct regs_32 s_regs;
-    struct page_fault_error_code error_code;
-    void* v_eip;
-    uint32_t cs;
-    uint32_t eflags;
-};
+    switch (context->head.int_no) {
+    case 6: {
+        // invalid opcode: an illegal instruction, so SIGILL
+        if (!current_process->attr.system)
+            kill_current(SIGILL); // noreturn
+    } break;
+    case 8:
+    case 13: {
+        // double fault / general protection: reported as SIGSEGV
+        if (!current_process->attr.system)
+            kill_current(SIGSEGV); // noreturn
+    } break;
+    case 14: {
+        kernel::mem::paging::handle_page_fault(context->error_code);
+        // NOTE(review): int_no is overwritten with an address inside the
+        // frame; 0x88 here vs 0x80 in interrupt_handler presumably skips
+        // the extra error code slot - confirm against the ISR stub
+        context->head.int_no = (unsigned long)context + 0x88;
+        return;
+    } break;
+    }
 
-static inline void _int14_panic(void* eip, void* cr2, struct page_fault_error_code error_code)
-{
-    char buf[128] = {};
-    snprintf(buf, sizeof(buf),
-        "[kernel] int14 data: eip: %p, cr2: %p, error_code: %x\n"
-        "[kernel] freezing...\n",
-        eip, cr2, error_code);
-    kmsg(buf);
+    // fault can not be resolved
     freeze();
 }
 
-static inline void NORETURN _int14_kill_user(void)
-{
-    kill_current(SIGSEGV);
-}
-
-// page fault
-extern "C" void int14_handler(int14_data* d)
+static inline void irq_handler(
+        interrupt_stack_normal* context,
+        mmx_registers*)
 {
-    kernel::memory::mm_list* mms = nullptr;
-    if (current_process) [[likely]]
-        mms = &current_process->mms;
-    else
-        mms = kernel::memory::mm_list::s_kernel_mms;
-
-    auto* mm_area = mms->find(d->l_addr);
-    if (!mm_area) [[unlikely]] {
-        if (d->error_code.user) {
-            // user access of address that does not exist
-            _int14_kill_user();
-        } else {
-            _int14_panic(d->v_eip, d->l_addr, d->error_code);
-        }
-    }
-    if (d->error_code.user && mm_area->attr.system)
-        _int14_kill_user();
-
-    page* page = &(*mm_area->pgs)[vptrdiff(d->l_addr, mm_area->start) / PAGE_SIZE];
-    kernel::paccess pa(page->pg_pteidx >> 12);
-    auto pt = (pt_t)pa.ptr();
-    assert(pt);
-    pte_t* pte = *pt + (page->pg_pteidx & 0xfff);
-
-    if (unlikely(d->error_code.present == 0 && !mm_area->mapped_file))
-        _int14_panic(d->v_eip, d->l_addr, d->error_code);
-
-    if (page->attr & PAGE_COW) {
-        // if it is a dying page
-        if (*page->ref_count == 1) {
-            page->attr &= ~PAGE_COW;
-            pte->in.p = 1;
-            pte->in.a = 0;
-            pte->in.rw = mm_area->attr.write;
-            return;
-        }
-        // duplicate the page
-        page_t new_page = __alloc_raw_page();
-
-        {
-            kernel::paccess pdst(new_page), psrc(page->phys_page_id);
-            auto* new_page_data = (char*)pdst.ptr();
-            auto* src = psrc.ptr();
-            assert(new_page_data && src);
-            memcpy(new_page_data, src, PAGE_SIZE);
-        }
-
-        pte->in.page = new_page;
-        pte->in.rw = mm_area->attr.write;
-        pte->in.a = 0;
-
-        --*page->ref_count;
+    int irqno = context->head.int_no - 0x20;
 
-        page->ref_count = types::memory::kinew<size_t>(1);
-        page->attr &= ~PAGE_COW;
-        page->phys_page_id = new_page;
-    }
-
-    if (page->attr & PAGE_MMAP) {
-        pte->in.p = 1;
-
-        size_t offset = align_down<12>((uint32_t)d->l_addr);
-        offset -= (uint32_t)mm_area->start;
-
-        kernel::paccess pa(page->phys_page_id);
-        auto* data = (char*)pa.ptr();
-        assert(data);
-
-        int n = vfs_read(
-            mm_area->mapped_file,
-            data,
-            PAGE_SIZE,
-            mm_area->file_offset + offset,
-            PAGE_SIZE);
-
-        // TODO: send SIGBUS if offset is greater than real size
-        if (n != PAGE_SIZE)
-            memset(data + n, 0x00, PAGE_SIZE - n);
-
-        page->attr &= ~PAGE_MMAP;
-    }
-}
+    constexpr uint8_t PIC_EOI = 0x20;
 
-extern "C" void irq_handler(
-    int irqno,
-    interrupt_stack* context,
-    mmx_registers* mmxregs)
-{
-    asm_outb(PORT_PIC1_COMMAND, PIC_EOI);
+    port_pic1_command = PIC_EOI;
     if (irqno >= 8)
-        asm_outb(PORT_PIC2_COMMAND, PIC_EOI);
+        port_pic2_command = PIC_EOI;
 
     for (const auto& handler : s_irq_handlers[irqno])
         handler();
+}
 
-    if (context->cs != USER_CODE_SEGMENT)
-        return;
-
-    if (current_thread->signals.pending_signal())
-        current_thread->signals.handle(context, mmxregs);
+// C++ entry point shared by all interrupt vectors.
+//
+// context: head of the saved interrupt frame; int_no selects the route
+// mmxregs: saved MMX register area, forwarded to the chosen handler
+//
+// Vectors below 0x20 go to fault_handler, 0x80 is the int 0x80 syscall
+// path, everything else is treated as an IRQ. For the latter two int_no is
+// overwritten with the frame address plus 0x80 before returning.
+extern "C" void interrupt_handler(
+        interrupt_stack_head* context,
+        mmx_registers* mmxregs)
+{
+    const auto vector = context->int_no;
+
+    // interrupt is a fault
+    if (vector < 0x20) {
+        fault_handler((interrupt_stack_with_code*)context, mmxregs);
+        return;
+    }
+
+    auto* frame = (interrupt_stack_normal*)context;
+    if (vector == 0x80) // syscall by int 0x80
+        kernel::handle_syscall32(context->s_regs.rax, frame, mmxregs);
+    else
+        irq_handler(frame, mmxregs);
+
+    context->int_no = (unsigned long)context + 0x80;
 }

+ 0 - 586
src/kernel/mem.cpp

@@ -1,586 +0,0 @@
-#include <cstddef>
-
-#include <asm/port_io.h>
-#include <asm/sys.h>
-#include <assert.h>
-#include <errno.h>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
-#include <kernel/process.hpp>
-#include <kernel/task.h>
-#include <kernel/vga.hpp>
-#include <stdint.h>
-#include <stdio.h>
-#include <types/allocator.hpp>
-#include <types/bitmap.hpp>
-#include <types/size.h>
-#include <types/status.h>
-
-// constant values
-
-#define EMPTY_PAGE ((page_t)0)
-
-// ---------------------
-
-static size_t mem_size;
-static uint8_t _mem_bitmap[1024 * 1024 / 8];
-static types::bitmap mem_bitmap(
-    [](unsigned char*, std::size_t){}, _mem_bitmap,
-    1024 * 1024);
-
-// global
-segment_descriptor gdt[7];
-
-uint8_t e820_mem_map[1024];
-uint32_t e820_mem_map_count;
-uint32_t e820_mem_map_entry_size;
-struct mem_size_info mem_size_info;
-
-constexpr void mark_addr_len(pptr_t start, size_t n)
-{
-    if (n == 0)
-        return;
-    page_t start_page = align_down<12>(start) >> 12;
-    page_t end_page = align_up<12>(start + n) >> 12;
-    for (page_t i = start_page; i < end_page; ++i)
-        mem_bitmap.set(i);
-}
-
-constexpr void free_addr_len(pptr_t start, size_t n)
-{
-    if (n == 0)
-        return;
-    page_t start_page = align_down<12>(start) >> 12;
-    page_t end_page = align_up<12>(start + n) >> 12;
-    for (page_t i = start_page; i < end_page; ++i)
-        mem_bitmap.clear(i);
-}
-
-constexpr void mark_addr_range(pptr_t start, pptr_t end)
-{
-    mark_addr_len(start, end - start);
-}
-
-constexpr void free_addr_range(pptr_t start, pptr_t end)
-{
-    free_addr_len(start, end - start);
-}
-
-page_t __alloc_raw_page(void)
-{
-    const auto size = mem_bitmap.size();
-    for (size_t i = 0; i < size; ++i) {
-        if (mem_bitmap.test(i) == 0) {
-            mem_bitmap.set(i);
-            return i;
-        }
-    }
-    return -1;
-}
-
-void __free_raw_page(page_t pg)
-{
-    mem_bitmap.clear(pg);
-}
-
-page allocate_page(void)
-{
-    return page {
-        .phys_page_id = __alloc_raw_page(),
-        .ref_count = types::memory::kinew<size_t>(0),
-        .pg_pteidx = 0,
-        .attr = 0,
-    };
-}
-
-void free_page(page* pg)
-{
-    if (*pg->ref_count == 1) {
-        types::memory::kidelete<size_t>(pg->ref_count);
-        __free_raw_page(pg->phys_page_id);
-    } else {
-        --*pg->ref_count;
-    }
-}
-
-void dealloc_pd(page_t pd)
-{
-    {
-        kernel::paccess pa(pd);
-        auto p_pd = (pd_t)pa.ptr();
-        assert(p_pd);
-        for (pde_t* ent = (*p_pd); ent < (*p_pd) + 768; ++ent) {
-            if (!ent->in.p)
-                continue;
-            __free_raw_page(ent->in.pt_page);
-        }
-    }
-    __free_raw_page(pd);
-}
-
-SECTION(".text.kinit")
-static inline void init_mem_layout(void)
-{
-    mem_size = 1024 * mem_size_info.n_1k_blks;
-    mem_size += 64 * 1024 * mem_size_info.n_64k_blks;
-
-    // mark empty page
-    mark_addr_range(0x00000000, 0x00001000);
-    // mark kernel page directory
-    mark_addr_range(0x00001000, 0x00002000);
-    // mark kernel page table
-    mark_addr_range(0x00002000, 0x00006000);
-    // mark kernel early stack
-    mark_addr_range(0x00006000, 0x00008000);
-    // mark EBDA and upper memory as allocated
-    mark_addr_range(0x80000, 0x100000);
-    extern char __stage1_start[];
-    extern char __kinit_end[];
-    extern char __text_start[];
-    extern char __data_end[];
-
-    constexpr pptr_t PHYS_BSS_START = 0x100000;
-    // mark .stage1 and .kinit
-    mark_addr_range((pptr_t)__stage1_start, (pptr_t)__kinit_end);
-    // mark kernel .text to .data
-    mark_addr_len((pptr_t)__kinit_end, __data_end - __text_start);
-    // mark kernel .bss
-    mark_addr_len(PHYS_BSS_START, bss_len);
-
-    if (e820_mem_map_entry_size == 20) {
-        struct e820_mem_map_entry_20* entry = (struct e820_mem_map_entry_20*)e820_mem_map;
-        for (uint32_t i = 0; i < e820_mem_map_count; ++i, ++entry) {
-            if (entry->type != 1) {
-                mark_addr_len(entry->base, entry->len);
-            }
-        }
-    } else {
-        struct e820_mem_map_entry_24* entry = (struct e820_mem_map_entry_24*)e820_mem_map;
-        for (uint32_t i = 0; i < e820_mem_map_count; ++i, ++entry) {
-            if (entry->in.type != 1) {
-                mark_addr_len(entry->in.base, entry->in.len);
-            }
-        }
-    }
-}
-
-using kernel::memory::mm_list;
-using kernel::memory::mm;
-
-mm_list::mm_list()
-    : m_areas(s_kernel_mms->m_areas)
-{
-    m_pd = __alloc_raw_page();
-    kernel::paccess pdst(m_pd), psrc(s_kernel_mms->m_pd);
-    auto* dst = pdst.ptr();
-    auto* src = psrc.ptr();
-    assert(dst && src);
-    memcpy(dst, src, PAGE_SIZE);
-}
-
-mm_list::mm_list(const mm_list& other)
-    : mm_list()
-{
-    m_brk = other.m_brk;
-    for (auto& src : other.m_areas) {
-        if (src.is_kernel_space() || src.attr.system)
-            continue;
-
-        auto& area = this->addarea(
-            src.start, src.attr.write, src.attr.system);
-
-        if (src.attr.mapped) {
-            area.attr.mapped = 1;
-            area.mapped_file = src.mapped_file;
-            area.file_offset = src.file_offset;
-        }
-
-        paccess pa(m_pd);
-        pd_t pd = (pd_t)pa.ptr();
-
-        for (const auto& pg : *src.pgs) {
-            area.append_page(pd, pg,
-                    PAGE_COW | (pg.attr & PAGE_MMAP),
-                    src.attr.system);
-        }
-    }
-}
-
-mm_list::~mm_list()
-{
-    if (!m_pd)
-        return;
-
-    clear_user();
-    dealloc_pd(m_pd);
-}
-
-void mm_list::switch_pd() const
-{
-    asm_switch_pd(m_pd);
-}
-
-int mm_list::register_brk(void* addr)
-{
-    if (!is_avail(addr))
-        return GB_FAILED;
-    m_brk = &addarea(addr, true, false);
-    return GB_OK;
-}
-
-void* mm_list::set_brk(void* addr)
-{
-    assert(m_brk);
-    void* curbrk = m_brk->end();
-
-    if (addr <= curbrk || !is_avail(curbrk, vptrdiff(addr, curbrk)))
-        return curbrk;
-
-    kernel::paccess pa(m_pd);
-    pd_t pd = (pd_t)pa.ptr();
-
-    while (curbrk < addr) {
-        m_brk->append_page(pd, empty_page, PAGE_COW, false);
-        curbrk = (char*)curbrk + PAGE_SIZE;
-    }
-
-    return curbrk;
-}
-
-void* mm_list::find_avail(void* hint, size_t len, bool priv) const
-{
-    void* addr = hint;
-    if (!addr) {
-        // default value of mmapp'ed area
-        if (!priv)
-            addr = (void*)0x40000000;
-        else
-            addr = (void*)0xe0000000;
-    }
-
-    while (!is_avail(addr, len)) {
-        auto iter = m_areas.lower_bound(addr);
-        if (iter == m_areas.end())
-            return nullptr;
-
-        addr = iter->end();
-    }
-
-    if (!priv && addr >= (void*)0xc0000000)
-        return nullptr;
-
-    return addr;
-}
-
-// TODO: write dirty pages to file
-int mm_list::unmap(void* start, size_t len, bool system)
-{
-    ptr_t addr = (ptr_t)start;
-    void* end = vptradd(start, align_up<12>(len));
-
-    // standard says that addr and len MUST be
-    // page-aligned or the call is invalid
-    if (addr % PAGE_SIZE != 0)
-        return -EINVAL;
-
-    // if doing user mode unmapping, check area privilege
-    if (!system) {
-        if (addr >= 0xc0000000 || end > (void*)0xc0000000)
-            return -EINVAL;
-    }
-
-    auto iter = m_areas.lower_bound(start);
-
-    for ( ; iter != m_areas.end() && *iter < end; ) {
-        if (!(start < *iter) && start != iter->start) {
-            mm newmm = iter->split(start);
-            unmap(newmm);
-            ++iter;
-            continue;
-        }
-        else if (!(*iter < end)) {
-            mm newmm = iter->split(end);
-            unmap(*iter);
-            m_areas.erase(iter);
-
-            bool inserted;
-            std::tie(std::ignore, inserted) = m_areas.emplace(std::move(newmm));
-            assert(inserted);
-            break;
-        }
-        else {
-            unmap(*iter);
-            iter = m_areas.erase(iter);
-        }
-    }
-
-    return GB_OK;
-}
-
-mm& mm_list::add_empty_area(void *start, std::size_t page_count,
-    uint32_t page_attr, bool w, bool system)
-{
-    auto& area = addarea(start, w, system);
-    kernel::paccess pa(m_pd);
-    pd_t pd = (pd_t)pa.ptr();
-
-    while (page_count--)
-        area.append_page(pd, empty_page, page_attr, system);
-
-    return area;
-}
-
-constexpr void map_raw_page_to_pte(
-    pte_t* pte, page_t page,
-    bool present, bool write, bool priv)
-{
-    // set P bit
-    pte->v = 0;
-    pte->in.p = present;
-    pte->in.rw = write;
-    pte->in.us = !priv;
-    pte->in.page = page;
-}
-
-void mm::append_page(pd_t pd, const page& pg, uint32_t attr, bool priv)
-{
-    assert(pd);
-
-    void* addr = this->end();
-    pde_t* pde = *pd + v_to_pdi(addr);
-
-    page_t pt_pg = 0;
-    pte_t* pte = nullptr;
-    // page table not exist
-    if (!pde->in.p) [[unlikely]] {
-        // allocate a page for the page table
-        pt_pg = __alloc_raw_page();
-        pde->in.p = 1;
-        pde->in.rw = 1;
-        pde->in.us = 1;
-        pde->in.pt_page = pt_pg;
-
-        auto pt = (pt_t)kernel::pmap(pt_pg);
-        assert(pt);
-        pte = *pt;
-
-        memset(pt, 0x00, PAGE_SIZE);
-    } else {
-        pt_pg = pde->in.pt_page;
-        auto pt = (pt_t)kernel::pmap(pt_pg);
-        assert(pt);
-        pte = *pt;
-    }
-
-    // map the page in the page table
-    int pti = v_to_pti(addr);
-    pte += pti;
-
-    map_raw_page_to_pte(
-        pte,
-        pg.phys_page_id,
-        !(attr & PAGE_MMAP),
-        false,
-        priv);
-
-    kernel::pfree(pt_pg);
-
-    if (unlikely((attr & PAGE_COW) && !(pg.attr & PAGE_COW))) {
-        kernel::paccess pa(pg.pg_pteidx >> 12);
-        auto* pg_pte = (pte_t*)pa.ptr();
-        assert(pg_pte);
-        pg_pte += (pg.pg_pteidx & 0xfff);
-        pg.attr |= PAGE_COW;
-        pg_pte->in.rw = 0;
-        pg_pte->in.a = 0;
-        invalidate_tlb(addr);
-    }
-
-    ++*pg.ref_count;
-
-    this->pgs->emplace_back(pg);
-    auto& emplaced = this->pgs->back();
-    emplaced.pg_pteidx = (pt_pg << 12) + pti;
-    emplaced.attr = attr;
-}
-
-mm mm::split(void *addr)
-{
-    assert(addr > start && addr < end());
-    assert((ptr_t)addr % PAGE_SIZE == 0);
-
-    size_t this_count = vptrdiff(addr, start) / PAGE_SIZE;
-    size_t new_count = pgs->size() - this_count;
-
-    mm newmm {
-        .start = addr,
-        .attr { attr },
-        .pgs = types::memory::kinew<mm::pages_vector>(),
-        .mapped_file = mapped_file,
-        .file_offset = attr.mapped ? file_offset + this_count * PAGE_SIZE : 0,
-    };
-
-    for (size_t i = 0; i < new_count; ++i) {
-        newmm.pgs->emplace_back(pgs->back());
-        pgs->pop_back();
-    }
-
-    return newmm;
-}
-
-int mmap(
-    void* hint,
-    size_t len,
-    fs::inode* file,
-    size_t offset,
-    int write,
-    int priv)
-{
-    auto& mms = current_process->mms;
-
-    if (file && !S_ISREG(file->mode) && !S_ISBLK(file->mode)) [[unlikely]] {
-        errno = EINVAL;
-        return GB_FAILED;
-    }
-
-    // TODO: find another address
-    assert(((uint32_t)hint & 0xfff) == 0);
-    // TODO: return failed
-    assert((offset & 0xfff) == 0);
-
-    size_t n_pgs = align_up<12>(len) >> 12;
-
-    if (!mms.is_avail(hint, len)) {
-        errno = EEXIST;
-        return GB_FAILED;
-    }
-
-    if (file) {
-        auto& mm = mms.add_empty_area(hint, n_pgs, PAGE_MMAP | PAGE_COW, write, priv);
-
-        mm.attr.mapped = 1;
-        mm.mapped_file = file;
-        mm.file_offset = offset;
-    }
-    else {
-        // private mapping of zero-filled pages
-        auto& mm = mms.add_empty_area(hint, n_pgs, PAGE_COW, write, priv);
-
-        mm.attr.mapped = 0;
-    }
-
-    return GB_OK;
-}
-
-SECTION(".text.kinit")
-void init_mem(void)
-{
-    init_mem_layout();
-
-    // TODO: replace early kernel pd
-    auto* __kernel_mms = types::memory::kinew<kernel::memory::mm_list>(EARLY_KERNEL_PD_PAGE);
-    kernel::memory::mm_list::s_kernel_mms = __kernel_mms;
-
-    // create empty_page struct
-    empty_page.attr = 0;
-    empty_page.phys_page_id = EMPTY_PAGE;
-    empty_page.ref_count = types::memory::kinew<size_t>(2);
-    empty_page.pg_pteidx = 0x00002000;
-
-    // 0xd0000000 to 0xd4000000 or 3.5GiB, size 64MiB
-    __kernel_mms->add_empty_area(KERNEL_HEAP_START,
-        64 * 1024 * 1024 / PAGE_SIZE, PAGE_COW, true, true);
-
-    kernel::kinit::init_kernel_heap(KERNEL_HEAP_START,
-        vptrdiff(KERNEL_HEAP_LIMIT, KERNEL_HEAP_START));
-}
-
-SECTION(".text.kinit")
-void create_segment_descriptor(
-    segment_descriptor* sd,
-    uint32_t base,
-    uint32_t limit,
-    uint32_t flags,
-    uint32_t access)
-{
-    sd->base_low = base & 0x0000ffff;
-    sd->base_mid = ((base & 0x00ff0000) >> 16);
-    sd->base_high = ((base & 0xff000000) >> 24);
-    sd->limit_low = limit & 0x0000ffff;
-    sd->limit_high = ((limit & 0x000f0000) >> 16);
-    sd->access = access;
-    sd->flags = flags;
-}
-
-namespace __physmapper {
-struct mapped_area {
-    size_t ref;
-    void* ptr;
-};
-
-static types::hash_map<page_t, mapped_area,
-    types::memory::ident_allocator<std::pair<page_t, mapped_area>>>
-    mapped;
-static uint8_t _freebm[0x400 / 8];
-static types::bitmap freebm(
-    [](unsigned char*, std::size_t){}, _freebm, 0x400);
-} // namespace __physmapper
-
-void* kernel::pmap(page_t pg, bool cached)
-{
-    auto* const pmap_pt = std::bit_cast<pte_t*>(0xff001000);
-    auto* const mapped_start = std::bit_cast<void*>(0xff000000);
-
-    auto iter = __physmapper::mapped.find(pg);
-    if (iter) {
-        auto [ idx, area ] = *iter;
-        ++area.ref;
-        return area.ptr;
-    }
-
-    for (int i = 2; i < 0x400; ++i) {
-        if (__physmapper::freebm.test(i) == 0) {
-            auto* pte = pmap_pt + i;
-            if (cached)
-                pte->v = 0x3;
-            else
-                pte->v = 0x13;
-            pte->in.page = pg;
-
-            void* ptr = vptradd(mapped_start, 0x1000 * i);
-            invalidate_tlb(ptr);
-
-            __physmapper::freebm.set(i);
-            __physmapper::mapped.emplace(pg,
-                __physmapper::mapped_area { 1, ptr });
-            return ptr;
-        }
-    }
-
-    return nullptr;
-}
-void kernel::pfree(page_t pg)
-{
-    auto* const pmap_pt = std::bit_cast<pte_t*>(0xff001000);
-    auto* const mapped_start = std::bit_cast<void*>(0xff000000);
-
-    auto iter = __physmapper::mapped.find(pg);
-    if (!iter)
-        return;
-    auto& [ ref, ptr ] = iter->second;
-
-    if (ref > 1) {
-        --ref;
-        return;
-    }
-
-    int i = vptrdiff(ptr, mapped_start);
-    i /= 0x1000;
-
-    auto* pte = pmap_pt + i;
-    pte->v = 0;
-    invalidate_tlb(ptr);
-
-    __physmapper::freebm.clear(i);
-    __physmapper::mapped.remove(iter);
-}

+ 348 - 0
src/kernel/mem/mm_list.cc

@@ -0,0 +1,348 @@
+#include <assert.h>
+#include <stdint.h>
+
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/vm_area.hpp>
+
+
+using namespace kernel::mem;
+
+// Reload CR3 with its current value, which makes the CPU drop its
+// (non-global) TLB entries. %rax is used as scratch and listed as clobbered;
+// the "memory" clobber keeps the compiler from moving memory accesses
+// across the reload.
+static inline void __invalidate_all_tlb()
+{
+    asm volatile(
+            "mov %%cr3, %%rax\n\t"
+            "mov %%rax, %%cr3\n\t"
+            : : : "rax", "memory"
+            );
+}
+
+// Recursively free the paging structure `pt` together with every lower-level
+// table reachable from its entries [from, to).
+//
+// `depth` is the level of `pt` (the caller starts at 4 for the top-level
+// table). Tables at depth 1 are freed without looking at their entries, so
+// the pages they map are not touched here.
+static inline void __dealloc_page_table_all(
+        paging::pfn_t pt, int depth, int from, int to)
+{
+    using namespace paging;
+
+    if (depth > 1) {
+        for (int i = from; i < to; ++i) {
+            auto pse = PSE{pt}[i];
+            if (!(pse.attributes() & PA_P))
+                continue;
+
+            // keep the full width of the frame value: storing it in an `int`
+            // truncates every table located above 2GiB
+            auto child = pse.pfn();
+            __dealloc_page_table_all(child, depth-1, 0, 512);
+        }
+    }
+
+    free_page(pt);
+}
+
+// Free the user part of the paging structure rooted at `pt`: every top-level
+// slot from the one covering address 0 up to (excluding) the one covering
+// KERNEL_SPACE_START, and finally `pt` itself.
+static inline void __dealloc_page_table(paging::pfn_t pt)
+{
+    using namespace paging;
+
+    __dealloc_page_table_all(pt, 4, idx_p4(0), idx_p4(KERNEL_SPACE_START));
+}
+
+// Create an empty address space: allocate a fresh top-level page table and
+// fill it with a copy of the 4KiB kernel page table. No break area exists
+// yet, which is represented by m_brk == m_areas.end().
+mm_list::mm_list()
+    : m_pt{paging::alloc_page_table()}
+    , m_brk{m_areas.end()}
+{
+    memcpy(physaddr<void>{m_pt},
+           paging::KERNEL_PAGE_TABLE_PHYS_ADDR, 0x1000);
+}
+
+// Build an address space that shares every page of `other` copy-on-write.
+//
+// The area list is copied first. Then, page by page, the entry of `other` is
+// mirrored into this page table without write access and with PA_COW set,
+// the page's reference count is increased, and `other`'s own entry is made
+// read-only + PA_COW as well. The TLB is flushed once at the end.
+mm_list::mm_list(const mm_list& other): mm_list{}
+{
+    using namespace paging;
+
+    m_areas = other.m_areas;
+
+    for (auto area = m_areas.begin(); area != m_areas.end(); ++area) {
+        // remember which of the copied areas is the program break
+        if (area->flags & MM_BREAK)
+            m_brk = area;
+
+        auto dst = vaddr_range{m_pt, area->start, area->end};
+        auto src = vaddr_range{other.m_pt, area->start, area->end};
+
+        for (; dst; ++dst, ++src) {
+            auto dst_pte = *dst;
+            auto src_pte = *src;
+            auto pfn = src_pte.pfn();
+
+            // our entry: read-only COW with clean accessed/dirty bits
+            auto dst_attr = src_pte.attributes();
+            dst_attr &= ~(PA_RW | PA_A | PA_D);
+            dst_attr |= PA_COW;
+            dst_pte.set(dst_attr, pfn);
+
+            increase_refcount(pfn_to_page(pfn));
+
+            // TODO: create a function to set COW mappings
+            auto src_attr = src_pte.attributes();
+            src_attr &= ~PA_RW;
+            src_attr |= PA_COW;
+            src_pte.set(src_attr, pfn);
+        }
+    }
+
+    __invalidate_all_tlb();
+}
+
+// Unmap every area and release the page tables; an object without a page
+// table has nothing to release.
+mm_list::~mm_list()
+{
+    if (m_pt) {
+        clear();
+        __dealloc_page_table(m_pt);
+    }
+}
+
+// Tell whether the byte range [start, start + len), widened to whole pages,
+// lies below USER_SPACE_MEMORY_TOP and is accepted by every registered area
+// (vm_area::is_avail).
+//
+// Ranges whose end wraps around the address space are never available.
+bool mm_list::is_avail(uintptr_t start, std::size_t len) const noexcept
+{
+    // compute the end from the caller's start: rounding `start` down first
+    // would drop the last page of an unaligned range
+    const uintptr_t raw_end = start + len;
+    if (raw_end < start)
+        return false;
+
+    const uintptr_t end = (raw_end + 0xfff) & ~0xfff;
+    if (end < raw_end)
+        return false;
+
+    start &= ~0xfff;
+
+    if (end > USER_SPACE_MEMORY_TOP)
+        return false;
+
+    for (const auto& area : m_areas) {
+        if (!area.is_avail(start, end))
+            return false;
+    }
+    return true;
+}
+
+// A single address is available when it is below USER_SPACE_MEMORY_TOP and
+// no registered area is found for it.
+bool mm_list::is_avail(uintptr_t addr) const
+{
+    return addr < USER_SPACE_MEMORY_TOP
+        && m_areas.find(addr) == m_areas.end();
+}
+
+// Find an address at or above max(hint, MMAP_MIN_ADDR) where `len` bytes are
+// available. Candidates are the hint itself and then the end of each
+// following area. Returns 0 when no area is left to skip past.
+uintptr_t mm_list::find_avail(uintptr_t hint, size_t len) const
+{
+    auto candidate = std::max(hint, MMAP_MIN_ADDR);
+
+    for (;;) {
+        if (is_avail(candidate, len))
+            return candidate;
+
+        auto blocker = m_areas.lower_bound(candidate);
+        if (blocker == m_areas.end())
+            return 0;
+
+        // retry right behind the area that is in the way
+        candidate = blocker->end;
+    }
+}
+
+// Make this address space the active one by loading m_pt into CR3.
+void mm_list::switch_pd() const noexcept
+{
+    asm volatile("mov %0, %%cr3": : "r"(m_pt): "memory");
+}
+
+// Register the program break area at `addr` as an anonymous, writable
+// MM_BREAK area. May only be called while no break area exists.
+// Returns 0 on success or -ENOMEM when `addr` is not available.
+int mm_list::register_brk(uintptr_t addr)
+{
+    assert(m_brk == m_areas.end());
+    if (!is_avail(addr))
+        return -ENOMEM;
+
+    auto [ pos, inserted ] = m_areas.emplace(
+            addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
+    m_brk = pos;
+
+    assert(inserted);
+    return 0;
+}
+
+// Grow the program break to `addr` rounded up to a page boundary and return
+// the resulting break. The break is left unchanged (and returned as is) when
+// the request does not grow it or the additional range is not available.
+uintptr_t mm_list::set_brk(uintptr_t addr)
+{
+    using namespace paging;
+    assert(m_brk != m_areas.end());
+
+    const uintptr_t old_end = m_brk->end;
+    const uintptr_t new_end = (addr + (4096-1)) & ~0xfff;
+
+    const bool grows = new_end > old_end;
+    if (!grows || !is_avail(old_end, new_end - old_end))
+        return old_end;
+
+    // new pages start out as anonymous, non-executable empty-page mappings
+    for (auto pte : vaddr_range{m_pt, old_end, new_end})
+        pte.set(PA_ANONYMOUS_PAGE | PA_NXE, EMPTY_PAGE_PFN);
+
+    m_brk->end = new_end;
+    return new_end;
+}
+
+// Unmap every area, flush the TLB once for all of them, then drop the area
+// list and forget the break area.
+void mm_list::clear()
+{
+    auto area = m_areas.begin();
+    while (area != m_areas.end()) {
+        // per-area TLB invalidation is skipped; one flush follows below
+        unmap(area, false);
+        ++area;
+    }
+
+    __invalidate_all_tlb();
+
+    m_areas.clear();
+    m_brk = m_areas.end();
+}
+
+// Split `area` at the page-aligned address `addr`, which must lie strictly
+// inside it. `area` is shrunk to [start, addr) and a new area [addr, end)
+// with the same flags and file is inserted; for file mappings the new area's
+// file offset is advanced by the size of the part that stays in `area`.
+// Returns an iterator to the new area.
+mm_list::iterator mm_list::split(iterator area, uintptr_t addr)
+{
+    assert(!(addr & 0xfff));
+    assert(addr > area->start && addr < area->end);
+
+    std::size_t head_len = addr - area->start;
+    std::size_t tail_file_offset = 0;
+    if (area->mapped_file)
+        tail_file_offset = area->file_offset + head_len;
+
+    auto tail_end = area->end;
+    area->end = addr;
+
+    auto [ tail, inserted ] = m_areas.emplace(
+            addr, area->flags, tail_end,
+            area->mapped_file, tail_file_offset);
+
+    assert(inserted);
+    return tail;
+}
+
+// Release every page of `area` and clear its page table entries. The area
+// itself stays in the list. When `should_invalidate_tlb` is set, areas of at
+// most 0x4000 bytes are invalidated page by page with invlpg, larger ones
+// with one full TLB flush. Always returns 0.
+int mm_list::unmap(iterator area, bool should_invalidate_tlb)
+{
+    using namespace paging;
+
+    const bool is_small = area->end - area->start <= 0x4000;
+    const bool per_page_flush = should_invalidate_tlb && is_small;
+
+    uintptr_t vaddr = area->start;
+
+    // TODO: write back dirty pages
+    for (auto pte : vaddr_range{m_pt, area->start, area->end}) {
+        free_page(pte.pfn());
+        pte.clear();
+
+        if (!per_page_flush)
+            continue;
+
+        asm volatile("invlpg (%0)": : "r"(vaddr): "memory");
+        vaddr += 0x1000;
+    }
+
+    if (should_invalidate_tlb && !is_small)
+        __invalidate_all_tlb();
+
+    return 0;
+}
+
+// Unmap the range [start, start + length), with the end rounded up to a page
+// boundary. Areas that are only partly covered are split so that just the
+// covered part is unmapped and erased.
+//
+// Returns 0 on success, -EINVAL when `start` is not page aligned or the range
+// ends above KERNEL_SPACE_START, -ENOMEM when it ends above
+// USER_SPACE_MEMORY_TOP, or the first non-zero result of unmapping an area.
+int mm_list::unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb)
+{
+    // standard says that addr and len MUST be
+    // page-aligned or the call is invalid
+    if (start & 0xfff)
+        return -EINVAL;
+
+    uintptr_t end = (start + length + 0xfff) & ~0xfff;
+
+    // check address validity
+    if (end > KERNEL_SPACE_START)
+        return -EINVAL;
+    if (end > USER_SPACE_MEMORY_TOP)
+        return -ENOMEM;
+
+    auto iter = m_areas.lower_bound(start);
+    auto iter_end = m_areas.upper_bound(end);
+
+    // start <= iter <= end a.k.a. !(start > *iter) && !(*iter > end)
+    while (iter != iter_end) {
+        // start == iter:
+        // start is between (iter->start, iter->end)
+        //
+        // strip out the area before start
+        if (!(start < *iter) && start != iter->start)
+            iter = split(iter, start);
+
+        // iter.end <= end
+        // it is safe to unmap the area directly
+        if (*iter < end) {
+            if (int ret = unmap(iter, should_invalidate_tlb); ret != 0)
+                return ret;
+
+            iter = m_areas.erase(iter);
+            continue;
+        }
+
+        // end == iter:
+        // end is between [iter->start, iter->end)
+        //
+        // if end == iter->start, no need to strip the area
+        if (end == iter->start) {
+            ++iter;
+            continue;
+        }
+
+        // keep the part behind `end` as its own area, drop the part before it
+        (void)split(iter, end);
+        if (int ret = unmap(iter, should_invalidate_tlb); ret != 0)
+            return ret;
+
+        iter = m_areas.erase(iter);
+
+        // no need to check areas after this
+        break;
+    }
+
+    return 0;
+}
+
+// Register a new area described by `args` and point all of its page table
+// entries at EMPTY_PAGE_PFN with the attributes of a file-backed
+// (MM_MAPPED) or anonymous (MM_ANONYMOUS) mapping.
+//
+// vaddr, length and file_offset must be page aligned and length non-zero.
+// Returns 0 on success, -EEXIST when the range is not available and -EINVAL
+// when neither mapping kind is requested.
+int mm_list::mmap(const map_args& args)
+{
+    using namespace kernel::mem::paging;
+
+    auto& vaddr = args.vaddr;
+    auto& length = args.length;
+    auto& finode = args.file_inode;
+    auto& foff = args.file_offset;
+    auto& flags = args.flags;
+
+    assert((vaddr & 0xfff) == 0 && (foff & 0xfff) == 0);
+    assert((length & 0xfff) == 0 && length != 0);
+
+    if (!is_avail(vaddr, length))
+        return -EEXIST;
+
+    const bool file_backed = flags & MM_MAPPED;
+    if (!file_backed && !(flags & MM_ANONYMOUS))
+        return -EINVAL;
+
+    // PA_RW is set during page fault while PA_NXE is preserved
+    // so we set PA_NXE now
+    psattr_t attributes = PA_US;
+    if (!(flags & MM_EXECUTE))
+        attributes |= PA_NXE;
+
+    if (file_backed) {
+        assert(finode);
+        assert(S_ISREG(finode->mode) || S_ISBLK(finode->mode));
+
+        auto [ pos, inserted ] = m_areas.emplace(
+                vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, finode, foff);
+        assert(inserted);
+
+        attributes |= PA_MMAPPED_PAGE;
+    }
+    else {
+        // private mapping of zero-filled pages
+        // TODO: shared mapping
+        auto [ pos, inserted ] =
+            m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
+        assert(inserted);
+
+        attributes |= PA_ANONYMOUS_PAGE;
+    }
+
+    // both kinds start out mapped to the shared empty page
+    for (auto pte : vaddr_range{m_pt, vaddr, vaddr + length})
+        pte.set(attributes, EMPTY_PAGE_PFN);
+
+    return 0;
+}

+ 448 - 0
src/kernel/mem/paging.cc

@@ -0,0 +1,448 @@
+#include <assert.h>
+#include <string.h>
+
+#include <types/list.hpp>
+
+#include <kernel/async/lock.hpp>
+#include <kernel/log.hpp>
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/slab.hpp>
+#include <kernel/mem/vm_area.hpp>
+#include <kernel/process.hpp>
+
+using namespace types::list;
+
+using namespace kernel::async;
+using namespace kernel::mem::paging;
+
+// Report an unrecoverable page fault at `vaddr` and stop via freeze().
+static inline void __page_fault_die(uintptr_t vaddr)
+{
+    // %p expects a pointer argument, not an integer
+    kmsgf("[kernel] kernel panic: invalid memory access to %p", (void*)vaddr);
+    freeze();
+}
+
+// Return the table that `pse` refers to. When the entry is not present, a
+// zeroed page table is allocated and installed first, using the kernel or
+// user page table attributes depending on `priv`.
+static inline PSE __parse_pse(PSE pse, bool priv)
+{
+    const bool present = pse.attributes() & PA_P;
+    if (!present) {
+        auto table_attr = priv ? PA_KERNEL_PAGE_TABLE : PA_USER_PAGE_TABLE;
+        pse.set(table_attr, alloc_page_table());
+    }
+
+    return pse.parse();
+}
+
+// Free lists of the page allocator, indexed by order: zones[order].next is
+// the head of the list of free zones of that order and `count` the number of
+// zones on it. The array size 52 is also the order limit used by the
+// allocation and merge loops in this file.
+static struct zone_info {
+    page* next;
+    std::size_t count;
+} zones[52];
+
+// Held by every function in this file that modifies `zones`.
+static mutex zone_lock;
+
+// Index of the most significant set bit of `x` (floor(log2(x))); 0 for x == 0.
+constexpr unsigned _msb(std::size_t x)
+{
+    unsigned bit = 0;
+    for (x >>= 1; x != 0; x >>= 1)
+        ++bit;
+    return bit;
+}
+
+// Address of the buddy of the `order` zone at `pfn`: the same value with bit
+// (order + 12) flipped. The shift is done in pfn_t; shifting the `int`
+// literal 1 gives wrong or undefined results from order 19 upwards.
+constexpr pfn_t buddy(pfn_t pfn, unsigned order)
+{
+    return pfn ^ (static_cast<pfn_t>(1) << (order + 12));
+}
+
+// Address of the zone of the next order that contains the `order` zone at
+// `pfn`: the same value with bit (order + 12) cleared. The mask is built in
+// pfn_t so that it is correct for every order, not only while the shift fits
+// into an `int`.
+constexpr pfn_t parent(pfn_t pfn, unsigned order)
+{
+    return pfn & ~(static_cast<pfn_t>(1) << (order + 12));
+}
+
+// Put `zone` on the free list of `order` and record the order in the low
+// byte of its flags (which must be clear on entry).
+// call with zone_lock held
+static inline void _zone_list_insert(unsigned order, page* zone)
+{
+    assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
+    assert((zone->flags & 0xff) == 0);
+    zone->flags |= order;
+
+    auto& bucket = zones[order];
+    ++bucket.count;
+    list_insert(&bucket.next, zone);
+}
+
+// Take `zone` off the free list of `order` and clear the order stored in the
+// low byte of its flags.
+// call with zone_lock held
+static inline void _zone_list_remove(unsigned order, page* zone)
+{
+    auto& bucket = zones[order];
+
+    assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
+    assert(bucket.count > 0 && (zone->flags & 0xff) == order);
+    zone->flags &= ~0xff;
+
+    --bucket.count;
+    list_remove(&bucket.next, zone);
+}
+
+// Pop one zone from the free list of `order`; nullptr when that list is
+// empty. The order stays recorded in the returned page's flags.
+// call with zone_lock held
+static inline page* _zone_list_get(unsigned order)
+{
+    auto& bucket = zones[order];
+    if (!bucket.count)
+        return nullptr;
+
+    --bucket.count;
+    auto* const head = list_get(&bucket.next);
+
+    assert((head->flags & 0xff) == order);
+    return head;
+}
+
+// Turn the present page at `pfn` into the head of a free zone of 2^order
+// pages: mark it PAGE_BUDDY and put it on the matching free list.
+// call with zone_lock held
+static inline page* _create_zone(pfn_t pfn, unsigned order)
+{
+    page* const head = pfn_to_page(pfn);
+    assert(head->flags & PAGE_PRESENT);
+
+    head->flags |= PAGE_BUDDY;
+    _zone_list_insert(order, head);
+
+    return head;
+}
+
+// Shrink `zone` from `order` down to `target_order`: at every step the upper
+// half (the buddy of the remaining part) becomes a free zone of its own.
+// Finally the low byte of `zone`'s flags is set to `target_order`.
+// call with zone_lock held
+static inline void _split_zone(page* zone, unsigned order, unsigned target_order)
+{
+    for (unsigned cur = order; cur > target_order; --cur) {
+        const pfn_t base = page_to_pfn(zone);
+        _create_zone(buddy(base, cur - 1), cur - 1);
+    }
+
+    zone->flags &= ~0xff;
+    zone->flags |= target_order;
+}
+
+// Allocate a zone of `order`: take the first free zone of that or any larger
+// order, take a reference on it and split off the unused part.
+// Returns nullptr when no zone of sufficient order is free.
+// call with zone_lock held
+static inline page* _alloc_zone(unsigned order)
+{
+    for (unsigned cur = order; cur < 52; ++cur) {
+        auto zone = _zone_list_get(cur);
+        if (zone) {
+            increase_refcount(zone);
+
+            if (cur > order)
+                _split_zone(zone, cur, order);
+
+            assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
+            return zone;
+        }
+    }
+
+    return nullptr;
+}
+
+// Hand the physical range [start, end) to the page allocator as free zones.
+// `start` is rounded up and `end` rounded down to a page boundary; an empty
+// range is ignored.
+//
+// The range is covered front to back. At every position the largest order is
+// chosen whose zone is naturally aligned there and still ends at or before
+// `end`, so zones never reach past `end` and no page is registered twice,
+// wherever the range lies.
+void kernel::mem::paging::create_zone(uintptr_t start, uintptr_t end)
+{
+    start += (4096 - 1);
+    start >>= 12;
+    end >>= 12;
+
+    if (start >= end)
+        return;
+
+    lock_guard_irq lock{zone_lock};
+
+    // `low` and `end` are page numbers from here on
+    unsigned long low = start;
+    while (low < end) {
+        unsigned order = 0;
+
+        // grow the zone while twice its size stays aligned and fits;
+        // 51 is the largest order the free lists can hold
+        while (order < 51) {
+            const unsigned long doubled = 2UL << order;
+            if ((low & (doubled - 1)) || doubled > end - low)
+                break;
+            ++order;
+        }
+
+        _create_zone(low << 12, order);
+        low += 1UL << order;
+    }
+}
+
+// Set PAGE_PRESENT on every page that overlaps [start, end): `start` is
+// rounded down and `end` rounded up to a page boundary.
+void kernel::mem::paging::mark_present(uintptr_t start, uintptr_t end)
+{
+    const uintptr_t first = start >> 12;
+    const uintptr_t last = (end + (4096 - 1)) >> 12;
+
+    for (uintptr_t idx = first; idx < last; ++idx)
+        PAGE_ARRAY[idx].flags |= PAGE_PRESENT;
+}
+
+// Allocate a zone of 2^order pages under zone_lock. freeze() is called when
+// the allocator has no zone of sufficient order.
+page* kernel::mem::paging::alloc_pages(unsigned order)
+{
+    lock_guard_irq lock{zone_lock};
+
+    if (auto* zone = _alloc_zone(order))
+        return zone;
+
+    freeze();
+    return nullptr;
+}
+
+// Allocate a single page (a zone of order 0).
+page* kernel::mem::paging::alloc_page()
+{
+    return alloc_pages(0);
+}
+
+// Allocate one page, fill it with zeroes and return its pfn, ready to be
+// used as an empty page table.
+pfn_t kernel::mem::paging::alloc_page_table()
+{
+    pfn_t table = page_to_pfn(alloc_page());
+    memset(physaddr<void>{table}, 0x00, 0x1000);
+
+    return table;
+}
+
+// Drop one reference to the zone `pg` of `order`; the order must match the
+// one recorded in the low byte of the page's flags. Pages without PAGE_BUDDY
+// are ignored. Once the last reference is gone the zone is merged with its
+// buddy for as long as the buddy is a free zone of the same order, and the
+// result is put on the free list.
+void kernel::mem::paging::free_pages(page* pg, unsigned order)
+{
+    assert((pg->flags & 0xff) == order);
+
+    // TODO: atomic
+    if (!(pg->flags & PAGE_BUDDY) || --pg->refcount)
+        return;
+
+    lock_guard_irq lock{zone_lock};
+    while (order < 52) {
+        pfn_t pfn = page_to_pfn(pg);
+        pfn_t buddy_pfn = buddy(pfn, order);
+        page* buddy_page = pfn_to_page(buddy_pfn);
+
+        // the buddy is not the head of a zone
+        if (!(buddy_page->flags & PAGE_BUDDY))
+            break;
+
+        // the buddy is a zone of another order
+        if ((buddy_page->flags & 0xff) != order)
+            break;
+
+        // the buddy is in use
+        if (buddy_page->refcount)
+            break;
+
+        _zone_list_remove(order, buddy_page);
+
+        // `pg` becomes the lower of the two halves, which heads the merged
+        // zone; the upper half stops being a zone head
+        if (buddy_page < pg)
+            std::swap(buddy_page, pg);
+
+        buddy_page->flags &= ~PAGE_BUDDY;
+        order++;
+    }
+
+    // _zone_list_insert expects a clear order byte
+    pg->flags &= ~0xff;
+    _zone_list_insert(order, pg);
+}
+
+// Release a single page (a zone of order 0).
+void kernel::mem::paging::free_page(page* page)
+{
+    free_pages(page, 0);
+}
+
+// Release the zone of `order` identified by its pfn.
+void kernel::mem::paging::free_pages(pfn_t pfn, unsigned order)
+{
+    free_pages(pfn_to_page(pfn), order);
+}
+
+// Release the single page identified by its pfn.
+void kernel::mem::paging::free_page(pfn_t pfn)
+{
+    free_page(pfn_to_page(pfn));
+}
+
+// Convert a PAGE_ARRAY element into its pfn: the element's index times
+// 0x1000.
+pfn_t kernel::mem::paging::page_to_pfn(page* _page)
+{
+    const auto index = _page - PAGE_ARRAY;
+    return static_cast<pfn_t>(index) * 0x1000;
+}
+
+// Convert a pfn into its PAGE_ARRAY element: the one at index pfn / 0x1000.
+page* kernel::mem::paging::pfn_to_page(pfn_t pfn)
+{
+    const auto index = pfn / 0x1000;
+    return PAGE_ARRAY + index;
+}
+
+// Take one more reference on `pg`.
+void kernel::mem::paging::increase_refcount(page* pg)
+{
+    ++pg->refcount;
+}
+
+// Resolve the page fault described by the error code `err`; the faulting
+// address is read from CR2.
+//
+// Faults outside of any area of the current process, and non-present faults
+// in areas that are not file mappings, are fatal: user mode gets SIGSEGV,
+// otherwise the kernel stops. User accesses that violate the area's
+// write/execute permissions get SIGSEGV as well. Otherwise the page table
+// entry is fixed up:
+//  - PA_COW: the page is made writable again if the area allows it, after
+//    being duplicated (or zero-filled for PA_ANON) unless this is the last
+//    reference to it;
+//  - PA_MMAP: the page is filled from the mapped file.
+void kernel::mem::paging::handle_page_fault(unsigned long err)
+{
+    using namespace kernel::mem;
+    using namespace paging;
+
+    // control registers can only be moved to a general purpose register,
+    // so the output must be "=r" (with "=g" the compiler may pick memory)
+    uintptr_t vaddr;
+    asm volatile("mov %%cr2, %0": "=r"(vaddr): : );
+    auto& mms = current_process->mms;
+
+    auto* mm_area = mms.find(vaddr);
+    if (!mm_area) [[unlikely]] {
+        // user access of address that does not exist
+        if (err & PAGE_FAULT_U)
+            kill_current(SIGSEGV);
+
+        __page_fault_die(vaddr);
+    }
+
+    // user access to a present page caused the fault
+    // check access rights
+    if (err & PAGE_FAULT_U && err & PAGE_FAULT_P) {
+        // write to read only pages
+        if (err & PAGE_FAULT_W && !(mm_area->flags & MM_WRITE))
+            kill_current(SIGSEGV);
+
+        // execute from non-executable pages
+        if (err & PAGE_FAULT_I && !(mm_area->flags & MM_EXECUTE))
+            kill_current(SIGSEGV);
+    }
+
+    auto idx = idx_all(vaddr);
+
+    // walk down to the entry that maps vaddr; all intermediate tables must
+    // already exist
+    auto pe = mms.get_page_table()[std::get<1>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<2>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<3>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<4>(idx)];
+
+    bool mmapped = mm_area->flags & MM_MAPPED;
+    assert(!mmapped || mm_area->mapped_file);
+
+    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]]
+        __page_fault_die(vaddr);
+
+    pfn_t pfn = pe.pfn();
+    auto attr = pe.attributes();
+
+    page* pg = pfn_to_page(pfn);
+
+    if (attr & PA_COW) {
+        attr &= ~PA_COW;
+        if (mm_area->flags & MM_WRITE)
+            attr |= PA_RW;
+        else
+            attr &= ~PA_RW;
+
+        // if it is a dying page
+        // TODO: use atomic
+        if (pg->refcount == 1) {
+            pe.set(attr, pfn);
+            return;
+        }
+
+        // duplicate the page
+        page* new_page = alloc_page();
+        pfn_t new_pfn = page_to_pfn(new_page);
+        physaddr<void> new_page_addr{new_pfn};
+
+        if (attr & PA_ANON)
+            memset(new_page_addr, 0x00, 0x1000);
+        else
+            memcpy(new_page_addr, physaddr<void>{pfn}, 0x1000);
+
+        attr &= ~(PA_A | PA_ANON);
+        --pg->refcount;
+
+        pe.set(attr, new_pfn);
+        pfn = new_pfn;
+    }
+
+    if (attr & PA_MMAP) {
+        attr |= PA_P;
+
+        size_t offset = (vaddr & ~0xfff) - mm_area->start;
+        char* data = physaddr<char>{pfn};
+
+        int n = vfs_read(
+            mm_area->mapped_file,
+            data,
+            4096,
+            mm_area->file_offset + offset,
+            4096);
+
+        // a failed read provides no data at all; without this, a negative
+        // result would make the memset below start before the page and run
+        // past its end
+        if (n < 0)
+            n = 0;
+
+        // TODO: send SIGBUS if offset is greater than real size
+        if (n != 4096)
+            memset(data + n, 0x00, 4096 - n);
+
+        // TODO: shared mapping
+        attr &= ~PA_MMAP;
+
+        pe.set(attr, pfn);
+    }
+}
+
+// Iterate over the page table entries that map [start, end) in the paging
+// structure rooted at `pt`. `n` is the number of pages left. For a non-empty
+// range the tables on the path to the first page are looked up right away
+// and created if missing (see __parse_pse), with kernel attributes when
+// `priv` is set. An empty range (start >= end) leaves every member zeroed.
+vaddr_range::vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool priv)
+    : n {start >= end ? 0 : ((end - start) >> 12)}
+    , idx4{!n ? 0 : idx_p4(start)}
+    , idx3{!n ? 0 : idx_p3(start)}
+    , idx2{!n ? 0 : idx_p2(start)}
+    , idx1{!n ? 0 : idx_p1(start)}
+    , pml4{!n ? PSE{0} : PSE{pt}}
+    , pdpt{!n ? PSE{0} : __parse_pse(pml4[idx4], priv)}
+    , pd{!n ? PSE{0} : __parse_pse(pdpt[idx3], priv)}
+    , pt{!n ? PSE{0} : __parse_pse(pd[idx2], priv)}
+    , m_start{!n ? 0 : start}, m_end{!n ? 0 : end}
+    , is_privilege{!n ? false : priv} { }
+
+// The past-the-end value: no pages left (n == 0) and no tables referenced.
+vaddr_range::vaddr_range(std::nullptr_t)
+    : n{}
+    , idx4{}, idx3{}, idx2{}, idx1{}
+    , pml4{0}, pdpt{0}
+    , pd{0}, pt{0}
+    , m_start{}, m_end{}, is_privilege{} { }
+
+// The range is its own iterator: begin() is a copy of the current state.
+vaddr_range vaddr_range::begin() const noexcept
+{
+    return *this;
+}
+
+// The past-the-end iterator, i.e. a range with no pages left.
+vaddr_range vaddr_range::end() const noexcept
+{
+    return vaddr_range {nullptr};
+}
+
+// The entry for the current page: slot idx1 of the current lowest-level
+// table.
+PSE vaddr_range::operator*() const noexcept
+{
+    return pt[idx1];
+}
+
+// Advance to the next page of the range.
+//
+// The four table indices are incremented like digits of a base-512 number,
+// and the table pointers below the most significant digit that changed are
+// looked up again (and created if missing, see __parse_pse).
+//
+// Once the last page has been passed nothing is looked up any more:
+// otherwise a range ending on a table boundary would allocate tables for the
+// page after it, and a range ending at the top of the address space would
+// hit the overflow assertion.
+vaddr_range& vaddr_range::operator++()
+{
+    --n;
+    if (!n)
+        return *this;
+
+    idx1 = (idx1+1)%512;
+    if (idx1 != 0)
+        return *this;
+
+    idx2 = (idx2+1)%512;
+    if (idx2 == 0) {
+        idx3 = (idx3+1)%512;
+        if (idx3 == 0) {
+            idx4 = (idx4+1) % 512;
+
+            // if idx4 is 0 after update, we have an overflow
+            assert(idx4 != 0);
+
+            pdpt = __parse_pse(pml4[idx4], is_privilege);
+        }
+
+        pd = __parse_pse(pdpt[idx3], is_privilege);
+    }
+
+    pt = __parse_pse(pd[idx2], is_privilege);
+    return *this;
+}
+
+// True while there are pages left to visit.
+vaddr_range::operator bool() const noexcept
+{
+    return n;
+}
+
+// Two iterators compare equal when they have the same number of pages left;
+// this is what makes a finished range equal to end().
+bool vaddr_range::operator==(const vaddr_range& other) const noexcept
+{
+    return n == other.n;
+}

+ 125 - 0
src/kernel/mem/slab.cc

@@ -0,0 +1,125 @@
+#include <cstddef>
+
+#include <assert.h>
+
+#include <types/list.hpp>
+
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/slab.hpp>
+
+using namespace kernel::mem;
+using namespace types::list;
+
+constexpr std::size_t SLAB_PAGE_SIZE = 0x1000; // 4K
+
+// Offset of the first object inside a slab page: sizeof(slab_head) rounded
+// up to a multiple of `size` (the mask arithmetic assumes `size` is a power
+// of two).
+std::ptrdiff_t _slab_data_start_offset(std::size_t size)
+{
+    const std::size_t mask = size - 1;
+    return (sizeof(slab_head) + mask) & ~mask;
+}
+
+// Number of objects of `size` bytes that fit into one slab page behind the
+// slab header.
+std::size_t _slab_max_count(std::size_t size)
+{
+    const std::size_t usable = SLAB_PAGE_SIZE - _slab_data_start_offset(size);
+    return usable / size;
+}
+
+// Pop one object from the slab's free list; nullptr when the slab has no
+// free object. Each free object stores the pointer to the next one in its
+// first bytes.
+void* _slab_head_alloc(slab_head* slab)
+{
+    if (!slab->free_count)
+        return nullptr;
+
+    void* const obj = slab->free;
+    slab->free = *(void**)obj;
+    --slab->free_count;
+
+    return obj;
+}
+
+// Initialise the page at physical address `start` as a slab for objects of
+// `size` bytes: fill in the header and chain all objects into the free list,
+// the last one terminating it with nullptr.
+slab_head* _make_slab(uintptr_t start, std::size_t size)
+{
+    slab_head* slab = physaddr<slab_head>{start};
+
+    slab->obj_size = size;
+    slab->free_count = _slab_max_count(size);
+    slab->next = nullptr;
+    slab->prev = nullptr;
+
+    slab->free = physaddr<void>{start + _slab_data_start_offset(size)};
+
+    std::byte* cur = (std::byte*)slab->free;
+    for (auto left = slab->free_count; left > 0; --left) {
+        std::byte* following = cur + size;
+
+        // every object points at the one behind it, except the last
+        *(void**)cur = (left == 1) ? nullptr : (void*)following;
+        cur = following;
+    }
+
+    return slab;
+}
+
+// Allocate one page, mark it PAGE_SLAB, set it up as a slab for the cache's
+// object size and put it on the cache's list of empty slabs.
+void _slab_add_page(slab_cache* cache) {
+    auto* pg = paging::alloc_page();
+    pg->flags |= paging::PAGE_SLAB;
+
+    auto* slab = _make_slab(paging::page_to_pfn(pg), cache->obj_size);
+    slab->cache = cache;
+
+    list_insert(&cache->slabs_empty, slab);
+}
+
+// Allocate one object from `cache`. The first partial slab is used; when
+// there is none, an empty slab (created on demand) becomes partial. A slab
+// that hands out its last object moves to the list of full slabs.
+void* kernel::mem::slab_alloc(slab_cache* cache) {
+    slab_head* slab = cache->slabs_partial;
+
+    if (slab == nullptr) {
+        // no partial slabs: take an empty one, creating it if necessary
+        if (cache->slabs_empty == nullptr)
+            _slab_add_page(cache);
+
+        slab = list_get(&cache->slabs_empty);
+        list_insert(&cache->slabs_partial, slab);
+    }
+
+    void* obj = _slab_head_alloc(slab);
+
+    if (slab->free_count != 0)
+        return obj;
+
+    // slab is full
+    list_remove(&cache->slabs_partial, slab);
+    list_insert(&cache->slabs_full, slab);
+
+    return obj;
+}
+
+// Give the object at `ptr` back to its slab; the slab header sits at the
+// start of the page that contains `ptr`.
+//
+// The slab is kept on the list that matches its fill state: a full slab
+// that gets an object back becomes partial again (so slab_alloc can use
+// it), and a slab whose objects are all free moves to the empty list. The
+// list a slab is on is derived from its free count before the call, which
+// also works for slabs that are not the head of their list.
+void kernel::mem::slab_free(void* ptr) {
+    slab_head* slab = (slab_head*)((uintptr_t)ptr & ~(SLAB_PAGE_SIZE-1));
+
+    const bool was_full = slab->free_count == 0;
+
+    // push the object onto the slab's free list
+    *(void**)ptr = slab->free;
+    slab->free = ptr;
+    slab->free_count++;
+
+    const bool now_empty =
+        slab->free_count == _slab_max_count(slab->obj_size);
+
+    // a partial slab that stays partial does not move
+    if (!was_full && !now_empty)
+        return;
+
+    auto* cache = slab->cache;
+
+    list_remove(was_full ? &cache->slabs_full : &cache->slabs_partial, slab);
+    list_insert(now_empty ? &cache->slabs_empty : &cache->slabs_partial, slab);
+}
+
+// Initialise `cache` for objects of `obj_size` bytes: start with empty
+// lists and add one empty slab page.
+void kernel::mem::init_slab_cache(slab_cache* cache, std::size_t obj_size)
+{
+    cache->slabs_full = nullptr;
+    cache->slabs_partial = nullptr;
+    cache->slabs_empty = nullptr;
+    cache->obj_size = obj_size;
+
+    _slab_add_page(cache);
+}

+ 122 - 148
src/kernel/process.cpp

@@ -10,20 +10,13 @@
 #include <sys/wait.h>
 
 #include <types/allocator.hpp>
-#include <types/bitmap.hpp>
 #include <types/cplusplus.hpp>
 #include <types/elf.hpp>
-#include <types/size.h>
-#include <types/status.h>
 #include <types/types.h>
 
-#include <asm/port_io.h>
-#include <asm/sys.h>
 #include <kernel/async/lock.hpp>
-#include <kernel/interrupt.h>
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
 #include <kernel/module.hpp>
 #include <kernel/process.hpp>
 #include <kernel/signal.hpp>
@@ -45,7 +38,7 @@ namespace kernel {
 struct no_irq_guard {
     explicit no_irq_guard()
     {
-        asm_cli();
+        asm volatile("cli");
     }
 
     no_irq_guard(const no_irq_guard&) = delete;
@@ -53,7 +46,7 @@ struct no_irq_guard {
 
     ~no_irq_guard()
     {
-        asm_sti();
+        asm volatile("sti");
     }
 };
 
@@ -162,7 +155,7 @@ int filearr::open(const process &current,
             if (!parent)
                 return -EINVAL;
             int ret = fs::vfs_mkfile(parent, filename.c_str(), mode);
-            if (ret != GB_OK)
+            if (ret != 0)
                 return ret;
             dentry = fs::vfs_open(*current.root, filepath);
             assert(dentry);
@@ -236,7 +229,7 @@ void process::send_signal(signo_type signal)
 
 void kernel_threadd_main(void)
 {
-    kmsg("kernel thread daemon started\n");
+    kmsg("[kernel] kthread daemon started");
 
     for (;;) {
         if (kthreadd_new_thd_func) {
@@ -255,22 +248,29 @@ void kernel_threadd_main(void)
             // TODO
             (void)func, (void)data;
             assert(false);
-
-            // syscall_fork
-            // int ret = syscall(0x00);
-
-            // if (ret == 0) {
-            //     // child process
-            //     func(data);
-            //     // the function shouldn't return here
-            //     assert(false);
-            // }
         }
         // TODO: sleep here to wait for new_kernel_thread event
-        asm_hlt();
+        asm volatile("hlt");
     }
 }
 
+// Prepare the kernel stack of `thd` so that the thread starts at `entry`:
+// the entry address, a flags value, zeroes for the registers named below,
+// an alignment slot and the stack pointer from before these pushes are
+// pushed in that order.
+// NOTE(review): the layout presumably has to match what asm_ctx_switch pops,
+// which is not part of this file - confirm before reordering.
+static inline void __spawn(kernel::task::thread& thd, uintptr_t entry)
+{
+    auto prev_sp = thd.kstack.sp;
+
+    // return(start) address
+    thd.kstack.pushq(entry);
+    thd.kstack.pushq(0x200);       // flags
+    thd.kstack.pushq(0);           // r15
+    thd.kstack.pushq(0);           // r14
+    thd.kstack.pushq(0);           // r13
+    thd.kstack.pushq(0);           // r12
+    thd.kstack.pushq(0);           // rbp
+    thd.kstack.pushq(0);           // rbx
+    thd.kstack.pushq(0);           // 0 for alignment
+    thd.kstack.pushq(prev_sp);     // previous sp
+}
+
 SECTION(".text.kinit")
 proclist::proclist()
 {
@@ -278,17 +278,15 @@ proclist::proclist()
     auto& init = real_emplace(1, 0);
     assert(init.pid == 1 && init.ppid == 0);
 
-    auto& thd = *init.thds.begin();
-    thd.name.assign("[kernel init]");
+    auto thd = init.thds.begin();
+    thd->name.assign("[kernel init]");
 
     current_process = &init;
     current_thread = &thd;
 
     kernel::task::dispatcher::enqueue(current_thread);
 
-    tss.ss0 = KERNEL_DATA_SEGMENT;
-    tss.esp0 = (uint32_t)current_thread->kstack.esp;
-
+    current_thread->kstack.load_interrupt_stack();
     current_process->mms.switch_pd();
 
     if (1) {
@@ -297,26 +295,10 @@ proclist::proclist()
         assert(proc.pid == 0 && proc.ppid == 0);
 
         // create thread
-        auto& thd = *proc.thds.begin();
-        thd.name.assign("[kernel thread daemon]");
-
-        auto* esp = &thd.kstack.esp;
-        auto old_esp = (uint32_t)thd.kstack.esp;
-
-        // return(start) address
-        push_stack(esp, (uint32_t)kernel_threadd_main);
-        // ebx
-        push_stack(esp, 0);
-        // edi
-        push_stack(esp, 0);
-        // esi
-        push_stack(esp, 0);
-        // ebp
-        push_stack(esp, 0);
-        // eflags
-        push_stack(esp, 0x200);
-        // original esp
-        push_stack(esp, old_esp);
+        auto thd = proc.thds.begin();
+        thd->name.assign("[kernel thread daemon]");
+
+        __spawn(*thd, (uintptr_t)kernel_threadd_main);
 
         kernel::task::dispatcher::enqueue(&thd);
     }
@@ -334,6 +316,12 @@ void proclist::kill(pid_t pid, int exit_code)
 {
     auto& proc = this->find(pid);
 
+    // init should never exit
+    if (proc.ppid == 0) {
+        kmsg("kernel panic: init exited!");
+        freeze();
+    }
+
     // put all threads into sleep
     for (auto& thd : proc.thds)
         thd.set_attr(kernel::task::thread::ZOMBIE);
@@ -342,13 +330,7 @@ void proclist::kill(pid_t pid, int exit_code)
     proc.files.close_all();
 
     // unmap all user memory areas
-    proc.mms.clear_user();
-
-    // init should never exit
-    if (proc.ppid == 0) {
-        console->print("kernel panic: init exited!\n");
-        freeze();
-    }
+    proc.mms.clear();
 
     // make child processes orphans (children of init)
     this->make_children_orphans(pid);
@@ -391,48 +373,40 @@ void proclist::kill(pid_t pid, int exit_code)
 
 static void release_kinit()
 {
-    extern char __stage1_start[];
-    extern char __kinit_end[];
-
-    kernel::paccess pa(EARLY_KERNEL_PD_PAGE);
-    auto pd = (pd_t)pa.ptr();
-    assert(pd);
-    (*pd)[0].v = 0;
+    // free .kinit
+    using namespace kernel::mem::paging;
+    extern uintptr_t volatile KINIT_START_ADDR, KINIT_END_ADDR, KINIT_PAGES;
 
-    // free pt#0
-    __free_raw_page(0x00002);
+    std::size_t pages = KINIT_PAGES;
+    auto range = vaddr_range{KERNEL_PAGE_TABLE_ADDR,
+        KINIT_START_ADDR, KINIT_END_ADDR, true};
+    for (auto pte : range)
+        pte.clear();
 
-    // free .stage1 and .kinit
-    for (uint32_t i = ((uint32_t)__stage1_start >> 12);
-            i < ((uint32_t)__kinit_end >> 12); ++i) {
-        __free_raw_page(i);
-    }
+    create_zone(0x2000, 0x2000 + 0x1000 * pages);
 }
 
-void NORETURN _kernel_init(void)
+void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn)
 {
+    kernel::mem::paging::free_pages(kernel_stack_pfn, 9);
     release_kinit();
 
-    asm_sti();
+    asm volatile("sti");
 
     // ------------------------------------------
     // interrupt enabled
     // ------------------------------------------
 
     // load kmods
-    for (auto loader = kernel::module::kmod_loaders_start; *loader; ++loader) {
+    for (auto loader = kernel::module::KMOD_LOADERS_START; *loader; ++loader) {
         auto* mod = (*loader)();
         if (!mod)
             continue;
 
-        auto ret = insmod(mod);
-        if (ret == kernel::module::MODULE_SUCCESS)
+        if (auto ret = insmod(mod); ret == kernel::module::MODULE_SUCCESS)
             continue;
 
-        char buf[256];
-        snprintf(buf, sizeof(buf),
-            "[kernel] An error occured while loading \"%s\"\n", mod->name);
-        kmsg(buf);
+        kmsgf("[kernel] An error occured while loading \"%s\"", mod->name);
     }
 
     // mount fat32 /mnt directory
@@ -454,42 +428,42 @@ void NORETURN _kernel_init(void)
     }
 
     current_process->attr.system = 0;
-    current_thread->attr |= kernel::task::thread::SYSTEM;
-
-    const char* argv[] = { "/mnt/busybox", "sh", "/mnt/initsh" };
-    const char* envp[] = { "LANG=C", "HOME=/root", "PATH=/mnt", "PWD=/", nullptr };
+    current_thread->attr &= ~kernel::task::thread::SYSTEM;
 
-    types::elf::elf32_load_data d;
-    d.argv = argv;
-    d.envp = envp;
-    d.system = false;
+    types::elf::elf32_load_data d{
+        .exec_dent{},
+        .argv{ "/mnt/busybox", "sh", "/mnt/initsh" },
+        .envp{ "LANG=C", "HOME=/root", "PATH=/mnt", "PWD=/" },
+        .ip{}, .sp{}
+    };
 
-    d.exec_dent = fs::vfs_open(*fs::fs_root, types::path{argv[0]});
+    d.exec_dent = fs::vfs_open(*fs::fs_root, types::path{d.argv[0].c_str()});
     if (!d.exec_dent) {
-        console->print("kernel panic: init not found!\n");
+        kmsg("kernel panic: init not found!");
         freeze();
     }
 
-    int ret = types::elf::elf32_load(&d);
-    assert(ret == GB_OK);
+    int ret = types::elf::elf32_load(d);
+    assert(ret == 0);
+
+    int ds = 0x33, cs = 0x2b;
 
     asm volatile(
-        "movw $0x23, %%ax\n"
-        "movw %%ax, %%ds\n"
-        "movw %%ax, %%es\n"
-        "movw %%ax, %%fs\n"
-        "movw %%ax, %%gs\n"
-
-        "pushl $0x23\n"
-        "pushl %0\n"
-        "pushl $0x200\n"
-        "pushl $0x1b\n"
-        "pushl %1\n"
-
-        "iret\n"
-        :
-        : "c"(d.sp), "d"(d.eip)
-        : "eax", "memory");
+        "mov %0, %%rax\n"
+        "mov %%ax, %%ds\n"
+        "mov %%ax, %%es\n"
+        "mov %%ax, %%fs\n"
+        "mov %%ax, %%gs\n"
+
+        "push %%rax\n"
+        "push %2\n"
+        "push $0x200\n"
+        "push %1\n"
+        "push %3\n"
+
+        "iretq\n"
+        : : "g"(ds), "g"(cs), "g"(d.sp),
+            "g"(d.ip) : "eax", "memory");
 
     freeze();
 }
@@ -502,69 +476,71 @@ void k_new_thread(void (*func)(void*), void* data)
 }
 
 SECTION(".text.kinit")
-void NORETURN init_scheduler(void)
+void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn)
 {
     procs = new proclist;
 
     asm volatile(
-        "movl %0, %%esp\n"
-        "pushl %=f\n"
-        "pushl %1\n"
-
-        "movw $0x10, %%ax\n"
-        "movw %%ax, %%ss\n"
-        "movw %%ax, %%ds\n"
-        "movw %%ax, %%es\n"
-        "movw %%ax, %%fs\n"
-        "movw %%ax, %%gs\n"
-
-        "xorl %%ebp, %%ebp\n"
-        "xorl %%edx, %%edx\n"
-
-        "pushl $0x0\n"
-        "popfl\n"
+        "mov %2, %%rdi\n"
+        "mov %0, %%rsp\n"
+        "sub $24, %%rsp\n"
+        "mov %=f, %%rbx\n"
+        "mov %%rbx, (%%rsp)\n"   // return address
+        "mov %%rbx, 16(%%rsp)\n" // previous frame return address
+        "xor %%rbx, %%rbx\n"
+        "mov %%rbx, 8(%%rsp)\n"  // previous frame rbp
+        "mov %%rsp, %%rbp\n"     // current frame rbp
+
+        "push %1\n"
+
+        "mov $0x10, %%ax\n"
+        "mov %%ax, %%ss\n"
+        "mov %%ax, %%ds\n"
+        "mov %%ax, %%es\n"
+        "mov %%ax, %%fs\n"
+        "mov %%ax, %%gs\n"
+
+        "push $0x0\n"
+        "popf\n"
 
         "ret\n"
 
         "%=:\n"
         "ud2"
         :
-        : "a"(current_thread->kstack.esp), "c"(_kernel_init)
+        : "a"(current_thread->kstack.sp), "c"(_kernel_init), "g"(kernel_stack_pfn)
         : "memory");
 
     freeze();
 }
 
-extern "C" void asm_ctx_switch(uint32_t** curr_esp, uint32_t** next_esp);
+extern "C" void asm_ctx_switch(uintptr_t* curr_sp, uintptr_t* next_sp);
+
+extern "C" void after_ctx_switch()
+{
+    current_thread->kstack.load_interrupt_stack();
+    current_thread->load_thread_area32();
+}
+
 bool schedule()
 {
     if (kernel::async::preempt_count() != 0)
         return true;
 
     auto* next_thd = kernel::task::dispatcher::next();
-    process* proc = nullptr;
-    kernel::task::thread* curr_thd = nullptr;
-
-    if (current_thread == next_thd)
-        goto _end;
-
-    proc = &procs->find(next_thd->owner);
-    if (current_process != proc) {
-        proc->mms.switch_pd();
-        current_process = proc;
-    }
 
-    curr_thd = current_thread;
-
-    current_thread = next_thd;
-    tss.esp0 = (uint32_t)next_thd->kstack.esp;
-
-    next_thd->load_thread_area();
+    if (current_thread != next_thd) {
+        auto* proc = &procs->find(next_thd->owner);
+        if (current_process != proc) {
+            proc->mms.switch_pd();
+            current_process = proc;
+        }
 
-    asm_ctx_switch(&curr_thd->kstack.esp, &next_thd->kstack.esp);
-    tss.esp0 = (uint32_t)curr_thd->kstack.esp;
+        auto* curr_thd = current_thread;
+        current_thread = next_thd;
 
-_end:
+        asm_ctx_switch(&curr_thd->kstack.sp, &next_thd->kstack.sp);
+    }
 
     return current_thread->signals.pending_signal() == 0;
 }
@@ -577,10 +553,8 @@ void NORETURN schedule_noreturn(void)
 
 void NORETURN freeze(void)
 {
-    asm_cli();
-    asm_hlt();
     for (;;)
-        ;
+        asm volatile("cli\n\thlt");
 }
 
 void NORETURN kill_current(int signo)

+ 20 - 21
src/kernel/signal.cpp

@@ -1,7 +1,7 @@
 #include <kernel/task/thread.hpp>
 #include <kernel/process.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/interrupt.h>
+#include <kernel/interrupt.hpp>
 
 #include <signal.h>
 
@@ -142,11 +142,11 @@ signo_type signal_list::pending_signal()
 
         return *iter;
     }
-    
+
     return 0;
 }
 
-void signal_list::handle(interrupt_stack* context, mmx_registers* mmxregs)
+void signal_list::handle(interrupt_stack_normal* context, mmx_registers* mmxregs)
 {
     // assume that the pending signal is at the front of the list
     auto signal = m_list.front();
@@ -178,29 +178,28 @@ void signal_list::handle(interrupt_stack* context, mmx_registers* mmxregs)
     if (!(handler.sa_flags & SA_RESTORER))
         raise(SIGSYS);
 
-    uint32_t esp = (uint32_t)context->esp;
-    esp -= (sizeof(mmx_registers) + sizeof(interrupt_stack) + 16);
-    esp &= 0xfffffff0;
-
-    auto tmpesp = esp;
-    *(uint32_t*)tmpesp = signal; // signal handler argument: int signo
-    tmpesp += 4;
-    *(uint32_t*)tmpesp = context->esp; // original esp
-    tmpesp += 4;
+    // save current interrupt context to 128 bytes above current user stack
+    uintptr_t sp = (uintptr_t)context->rsp;
+    sp -= (128 + sizeof(mmx_registers) + sizeof(interrupt_stack_normal) + 16);
+    sp &= ~0xf;
 
-    tmpesp += 8; // padding to align to 16 bytes
+    auto tmpsp = sp;
+    *(uint64_t*)tmpsp = signal; // signal handler argument: int signo
+    tmpsp += 8;
+    *(uintptr_t*)tmpsp = context->rsp; // original rsp
+    tmpsp += 8;
 
-    memcpy((void*)tmpesp, mmxregs, sizeof(mmx_registers));
-    tmpesp += sizeof(mmx_registers); // mmx registers
-    memcpy((void*)tmpesp, context, sizeof(interrupt_stack));
-    tmpesp += sizeof(interrupt_stack); // context
+    memcpy((void*)tmpsp, mmxregs, sizeof(mmx_registers));
+    tmpsp += sizeof(mmx_registers); // mmx registers
+    memcpy((void*)tmpsp, context, sizeof(interrupt_stack_normal));
+    tmpsp += sizeof(interrupt_stack_normal); // context
 
-    esp -= sizeof(void*);
+    sp -= sizeof(void*);
     // signal handler return address: restorer
-    *(uint32_t*)esp = (uint32_t)handler.sa_restorer;
+    *(uintptr_t*)sp = (uintptr_t)handler.sa_restorer;
 
-    context->esp = esp;
-    context->v_eip = (void*)handler.sa_handler;
+    context->rsp = sp;
+    context->v_rip = (uintptr_t)handler.sa_handler;
 }
 
 void signal_list::after_signal(signo_type signal)

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 265 - 1079
src/kernel/syscall.cpp


+ 538 - 8
src/kernel/syscall/fileops.cc

@@ -1,16 +1,93 @@
+#include <bits/ioctl.h>
 #include <errno.h>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 #include <types/path.hpp>
 
+#include <kernel/log.hpp>
+#include <kernel/mem/vm_area.hpp>
 #include <kernel/process.hpp>
 #include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
 
-int _syscall_symlink(interrupt_stack* data)
+#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
+
+static inline void not_implemented(const char* pos, int line)
+{
+    kmsgf("[kernel] the function at %s:%d is not implemented, killing the pid%d...",
+            pos, line, current_process->pid);
+    current_thread->send_signal(SIGSYS);
+}
+
+ssize_t kernel::syscall::do_write(int fd, const char __user* buf, size_t n)
+{
+    auto* file = current_process->files[fd];
+    if (!file)
+        return -EBADF;
+
+    return file->write(buf, n);
+}
+
+ssize_t kernel::syscall::do_read(int fd, char __user* buf, size_t n)
 {
-    SYSCALL_ARG1(const char __user*, target);
-    SYSCALL_ARG2(const char __user*, linkpath);
+    auto* file = current_process->files[fd];
+    if (!file)
+        return -EBADF;
+
+    return file->read(buf, n);
+}
 
+int kernel::syscall::do_close(int fd)
+{
+    current_process->files.close(fd);
+    return 0;
+}
+
+int kernel::syscall::do_dup(int old_fd)
+{
+    return current_process->files.dup(old_fd);
+}
+
+int kernel::syscall::do_dup2(int old_fd, int new_fd)
+{
+    return current_process->files.dup2(old_fd, new_fd);
+}
+
+int kernel::syscall::do_pipe(int __user* pipefd)
+{
+    return current_process->files.pipe(pipefd);
+}
+
+ssize_t kernel::syscall::do_getdents(int fd, char __user* buf, size_t cnt)
+{
+    auto* dir = current_process->files[fd];
+    if (!dir)
+        return -EBADF;
+
+    return dir->getdents(buf, cnt);
+}
+
+ssize_t kernel::syscall::do_getdents64(int fd, char __user* buf, size_t cnt)
+{
+    auto* dir = current_process->files[fd];
+    if (!dir)
+        return -EBADF;
+
+    return dir->getdents64(buf, cnt);
+}
+
+int kernel::syscall::do_open(const char __user* path, int flags, mode_t mode)
+{
+    mode &= ~current_process->umask;
+
+    return current_process->files.open(*current_process,
+        current_process->pwd + path, flags, mode);
+}
+
+int kernel::syscall::do_symlink(const char __user* target, const char __user* linkpath)
+{
     // TODO: use copy_from_user
     auto path = current_process->pwd + linkpath;
     auto* dent = fs::vfs_open(*current_process->root, path);
@@ -28,12 +105,8 @@ int _syscall_symlink(interrupt_stack* data)
     return dent->ind->fs->symlink(dent, linkname.c_str(), target);
 }
 
-int _syscall_readlink(interrupt_stack* data)
+int kernel::syscall::do_readlink(const char __user* pathname, char __user* buf, size_t buf_size)
 {
-    SYSCALL_ARG1(const char __user*, pathname);
-    SYSCALL_ARG2(char __user*, buf);
-    SYSCALL_ARG3(size_t, buf_size);
-
     // TODO: use copy_from_user
     auto path = current_process->pwd + pathname;
     auto* dent = fs::vfs_open(*current_process->root, path, false);
@@ -47,3 +120,460 @@ int _syscall_readlink(interrupt_stack* data)
     // TODO: use copy_to_user
     return dent->ind->fs->readlink(dent->ind, buf, buf_size);
 }
+
+int kernel::syscall::do_ioctl(int fd, unsigned long request, uintptr_t arg3)
+{
+    // TODO: check fd type and get tty* from fd
+    //
+    //       we use a trick for now, check whether
+    //       the file that fd points to is a pipe or
+    //       not. and we suppose that stdin will be
+    //       either a tty or a pipe.
+    auto* file = current_process->files[fd];
+    if (!file || !S_ISCHR(file->mode))
+        return -ENOTTY;
+
+    switch (request) {
+    case TIOCGPGRP: {
+        auto* pgid = (pid_t __user*)arg3;
+        auto* ctrl_tty = current_process->control_tty;
+
+        if (!ctrl_tty)
+            return -ENOTTY;
+
+        // TODO: copy_to_user
+        *pgid = ctrl_tty->get_pgrp();
+        break;
+    }
+    case TIOCSPGRP: {
+        // TODO: copy_from_user
+        auto pgid = *(const pid_t __user*)arg3;
+        auto* ctrl_tty = current_process->control_tty;
+
+        if (!ctrl_tty)
+            return -ENOTTY;
+
+        ctrl_tty->set_pgrp(pgid);
+        break;
+    }
+    case TIOCGWINSZ: {
+        auto* ws = (winsize __user*)arg3;
+        // TODO: copy_to_user
+        ws->ws_col = 80;
+        ws->ws_row = 10;
+        break;
+    }
+    case TCGETS: {
+        auto* argp = (struct termios __user*)arg3;
+
+        auto* ctrl_tty = current_process->control_tty;
+        if (!ctrl_tty)
+            return -EINVAL;
+
+        // TODO: use copy_to_user
+        memcpy(argp, &ctrl_tty->termio, sizeof(ctrl_tty->termio));
+
+        break;
+    }
+    case TCSETS: {
+        auto* argp = (const struct termios __user*)arg3;
+
+        auto* ctrl_tty = current_process->control_tty;
+        if (!ctrl_tty)
+            return -EINVAL;
+
+        // TODO: use copy_from_user
+        memcpy(&ctrl_tty->termio, argp, sizeof(ctrl_tty->termio));
+
+        break;
+    }
+    default:
+        kmsgf("[error] the ioctl() function %x is not implemented", request);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+ssize_t kernel::syscall::do_readv(int fd, const iovec* iov, int iovcnt)
+{
+    auto* file = current_process->files[fd];
+
+    if (!file)
+        return -EBADF;
+
+    // TODO: fix fake EOF
+    ssize_t totn = 0;
+    for (int i = 0; i < iovcnt; ++i) {
+        ssize_t ret = file->read(
+            (char*)iov[i].iov_base, iov[i].iov_len);
+
+        if (ret < 0)
+            return ret;
+
+        if (ret == 0)
+            break;
+
+        totn += ret;
+
+        if ((size_t)ret != iov[i].iov_len)
+            break;
+    }
+
+    return totn;
+}
+
+// TODO: this operation SHOULD be atomic
+ssize_t kernel::syscall::do_writev(int fd, const iovec* iov, int iovcnt)
+{
+    auto* file = current_process->files[fd];
+
+    if (!file)
+        return -EBADF;
+
+    ssize_t totn = 0;
+    for (int i = 0; i < iovcnt; ++i) {
+        ssize_t ret = file->write(
+            (const char*)iov[i].iov_base, iov[i].iov_len);
+
+        if (ret < 0)
+            return ret;
+        totn += ret;
+    }
+
+    return totn;
+}
+
+off_t kernel::syscall::do_lseek(int fd, off_t offset, int whence)
+{
+    auto* file = current_process->files[fd];
+    if (!file)
+        return -EBADF;
+
+    return file->seek(offset, whence);
+}
+
+// Create (or, with PROT_NONE, remove) a private anonymous mapping in the
+// current process' address space.
+//
+// addr must be page aligned and len non-zero; len is rounded up to a whole
+// number of pages. Shared and file-backed mappings are not implemented and are
+// rejected instead of being reported as successful.
+//
+// Returns the mapped address on success or a negated errno value converted to
+// uintptr_t on failure.
+uintptr_t kernel::syscall::do_mmap_pgoff(uintptr_t addr, size_t len,
+        int prot, int flags, int fd, off_t pgoffset)
+{
+    // upper bound (exclusive) of the range a mapping may occupy
+    constexpr uintptr_t map_limit = 0x100000000ULL;
+
+    if (addr & 0xfff)
+        return -EINVAL;
+    if (len == 0)
+        return -EINVAL;
+
+    // rounding up to a page multiple must not wrap around to a tiny length
+    if (len + 0xfff < len)
+        return -ENOMEM;
+
+    len = (len + 0xfff) & ~0xfff;
+
+    // TODO: shared mappings
+    if (flags & MAP_SHARED)
+        return -ENOMEM;
+
+    // TODO: file-backed mappings
+    //       nothing would be mapped for them, so do not hand back addr as if
+    //       the request had succeeded
+    if (!(flags & MAP_ANONYMOUS))
+        return -ENOMEM;
+
+    if (fd != -1)
+        return -EINVAL;
+    if (pgoffset != 0)
+        return -EINVAL;
+
+    // TODO: shared mappings
+    if (!(flags & MAP_PRIVATE))
+        return -EINVAL;
+
+    auto& mms = current_process->mms;
+
+    // do unmapping, equal to munmap, MAP_FIXED set
+    if (prot == PROT_NONE) {
+        if (int ret = mms.unmap(addr, len, true); ret != 0)
+            return ret;
+
+        return addr;
+    }
+
+    // TODO: add NULL check in mm_list
+    if (!addr || !mms.is_avail(addr, len)) {
+        if (flags & MAP_FIXED)
+            return -ENOMEM;
+        addr = mms.find_avail(addr, len);
+    }
+
+    // TODO: check current cs
+    // written without computing addr + len so that the check cannot overflow
+    if (len > map_limit || addr > map_limit - len)
+        return -ENOMEM;
+
+    mem::mm_list::map_args args{};
+    args.vaddr = addr;
+    args.length = len;
+    args.flags = mem::MM_ANONYMOUS;
+
+    if (prot & PROT_WRITE)
+        args.flags |= mem::MM_WRITE;
+
+    if (prot & PROT_EXEC)
+        args.flags |= mem::MM_EXECUTE;
+
+    if (int ret = mms.mmap(args); ret != 0)
+        return ret;
+
+    return addr;
+}
+
+int kernel::syscall::do_munmap(uintptr_t addr, size_t len)
+{
+    if (addr & 0xfff)
+        return -EINVAL;
+
+    return current_process->mms.unmap(addr, len, true);
+}
+
+ssize_t kernel::syscall::do_sendfile(int out_fd, int in_fd,
+        off_t __user* offset, size_t count)
+{
+    auto* out_file = current_process->files[out_fd];
+    auto* in_file = current_process->files[in_fd];
+
+    if (!out_file || !in_file)
+        return -EBADF;
+
+    // TODO: check whether in_fd supports mmapping
+    if (!S_ISREG(in_file->mode) && !S_ISBLK(in_file->mode))
+        return -EINVAL;
+
+    if (offset) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    constexpr size_t bufsize = 4096;
+    std::vector<char> buf(bufsize);
+    size_t totn = 0;
+    while (totn < count) {
+        if (current_thread->signals.pending_signal() != 0)
+            return (totn == 0) ? -EINTR : totn;
+
+        size_t n = std::min(count - totn, bufsize);
+        ssize_t ret = in_file->read(buf.data(), n);
+        if (ret < 0)
+            return ret;
+        if (ret == 0)
+            break;
+        ret = out_file->write(buf.data(), ret);
+        if (ret < 0)
+            return ret;
+        totn += ret;
+
+        // TODO: this won't work, since when we are in the syscall handler,
+        //       interrupts are blocked.
+        //       one solution is to put the sendfile action into a kernel
+        //       worker and pause the calling thread so that the worker
+        //       thread could be interrupted normally.
+    }
+
+    return totn;
+}
+
+int kernel::syscall::do_statx(int dirfd, const char __user* path,
+        int flags, unsigned int mask, statx __user* statxbuf)
+{
+    // AT_STATX_SYNC_AS_STAT is the default value
+    if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_SYNC_AS_STAT) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    if (dirfd != AT_FDCWD) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    auto* dent = fs::vfs_open(*current_process->root,
+            current_process->pwd + path,
+            !(flags & AT_SYMLINK_NOFOLLOW));
+
+    if (!dent)
+        return -ENOENT;
+
+    // TODO: copy to user
+    auto ret = fs::vfs_stat(dent, statxbuf, mask);
+
+    return ret;
+}
+
+// Perform the fcntl() command cmd on descriptor fd.
+//
+// Supported commands: F_SETFD, F_DUPFD and F_DUPFD_CLOEXEC. Any other command
+// is reported through NOT_IMPLEMENTED and fails with -EINVAL.
+// Returns -EBADF when fd does not refer to an open file.
+int kernel::syscall::do_fcntl(int fd, int cmd, unsigned long arg)
+{
+    auto* file = current_process->files[fd];
+    if (!file)
+        return -EBADF;
+
+    switch (cmd) {
+    case F_SETFD:
+        return current_process->files.set_flags(fd, arg);
+    case F_DUPFD:
+        // a plain duplicate must not be marked close-on-exec
+        // NOTE(review): assumes the third argument of dupfd() is the
+        //               descriptor flag set of the new fd - confirm
+        return current_process->files.dupfd(fd, arg, 0);
+    case F_DUPFD_CLOEXEC:
+        return current_process->files.dupfd(fd, arg, FD_CLOEXEC);
+    default:
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+}
+
+int kernel::syscall::do_mkdir(const char __user* pathname, mode_t mode)
+{
+    mode &= (~current_process->umask & 0777);
+
+    auto path = current_process->pwd + pathname;
+
+    auto* dent = fs::vfs_open(*current_process->root, path);
+    if (dent)
+        return -EEXIST;
+
+    // get parent path
+    auto dirname = path.last_name();
+    path.remove_last();
+
+    dent = fs::vfs_open(*current_process->root, path);
+    if (!dent)
+        return -ENOENT;
+
+    if (!S_ISDIR(dent->ind->mode))
+        return -ENOTDIR;
+
+    auto ret = fs::vfs_mkdir(dent, dirname.c_str(), mode);
+
+    if (ret != 0)
+        return ret;
+
+    return 0;
+}
+
+int kernel::syscall::do_truncate(const char __user* pathname, long length)
+{
+    auto path = current_process->pwd + pathname;
+
+    auto* dent = fs::vfs_open(*current_process->root, path);
+    if (!dent)
+        return -ENOENT;
+
+    if (S_ISDIR(dent->ind->mode))
+        return -EISDIR;
+
+    auto ret = fs::vfs_truncate(dent->ind, length);
+
+    if (ret != 0)
+        return ret;
+
+    return 0;
+}
+
+int kernel::syscall::do_unlink(const char __user* pathname)
+{
+    auto path = current_process->pwd + pathname;
+    auto* dent = fs::vfs_open(*current_process->root, path, false);
+
+    if (!dent)
+        return -ENOENT;
+
+    if (S_ISDIR(dent->ind->mode))
+        return -EISDIR;
+
+    return fs::vfs_rmfile(dent->parent, dent->name.c_str());
+}
+
+// Check whether pathname (resolved against the current working directory)
+// exists and may be accessed as described by mode.
+//
+// mode is either F_OK or a bitwise OR of R_OK, W_OK and X_OK. Permissions are
+// not checked yet, so every existing file is reported as accessible.
+// Returns 0 on success, -ENOENT if the path cannot be opened and -EINVAL if
+// mode contains unknown bits.
+int kernel::syscall::do_access(const char __user* pathname, int mode)
+{
+    auto path = current_process->pwd + pathname;
+    auto* dent = fs::vfs_open(*current_process->root, path);
+
+    if (!dent)
+        return -ENOENT;
+
+    // mode is a bit mask: combinations such as R_OK | W_OK are valid, only
+    // bits outside of the known set are an error
+    if (mode & ~(R_OK | W_OK | X_OK))
+        return -EINVAL;
+
+    // TODO: check privilege
+    return 0;
+}
+
+int kernel::syscall::do_mknod(const char __user* pathname, mode_t mode, dev_t dev)
+{
+    auto path = current_process->pwd + pathname;
+    auto* dent = fs::vfs_open(*current_process->root, path);
+
+    if (dent)
+        return -EEXIST;
+
+    auto filename = path.last_name();
+    path.remove_last();
+
+    dent = fs::vfs_open(*current_process->root, path);
+    if (!dent)
+        return -ENOENT;
+
+    return fs::vfs_mknode(dent, filename.c_str(), mode, dev);
+}
+
+int kernel::syscall::do_poll(pollfd __user* fds, nfds_t nfds, int timeout)
+{
+    if (nfds == 0)
+        return 0;
+
+    if (nfds > 1) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    // TODO: handle timeout
+    // if (timeout != -1) {
+    // }
+    (void)timeout;
+
+    // for now, we will poll from console only
+    int ret = tty::console->poll();
+    if (ret < 0)
+        return ret;
+
+    fds[0].revents = POLLIN;
+    return ret;
+
+    // TODO: check address validity
+    // TODO: poll multiple fds and other type of files
+    // for (nfds_t i = 0; i < nfds; ++i) {
+    //     auto& pfd = fds[i];
+
+    //     auto* file = current_process->files[pfd.fd];
+    //     if (!file || !S_ISCHR(file->mode))
+    //         return -EINVAL;
+
+    //     // poll the fds
+    // }
+    //
+    // return 0;
+}
+
+/* TODO: implement vfs_stat(stat*)
+int do_stat(const char __user* pathname, stat __user* buf)
+{
+    auto* dent = fs::vfs_open(*current_process->root,
+        types::make_path(pathname, current_process->pwd));
+
+    if (!dent)
+        return -ENOENT;
+
+    return fs::vfs_stat(dent, buf);
+}
+*/
+
+/* TODO: implement vfs_stat(stat*)
+int do_fstat(int fd, stat __user* buf)
+{
+    auto* file = current_process->files[fd];
+    if (!file)
+        return -EBADF;
+
+    return fs::vfs_stat(file, buf);
+}
+*/

+ 51 - 0
src/kernel/syscall/infoops.cc

@@ -0,0 +1,51 @@
+#include <bits/alltypes.h>
+#include <time.h>
+
+#include <kernel/hw/timer.hpp>
+#include <kernel/log.hpp>
+#include <kernel/process.hpp>
+#include <kernel/syscall.hpp>
+
+#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
+
+static inline void not_implemented(const char* pos, int line)
+{
+    kmsgf("[kernel] the function at %s:%d is not implemented, killing the pid%d...",
+            pos, line, current_process->pid);
+    current_thread->send_signal(SIGSYS);
+}
+
+int kernel::syscall::do_clock_gettime(clockid_t clk_id, timespec __user* tp)
+{
+    if (clk_id != CLOCK_REALTIME && clk_id != CLOCK_MONOTONIC) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    if (!tp)
+        return -EFAULT;
+
+    auto time = hw::timer::current_ticks();
+
+    // TODO: copy_to_user
+    tp->tv_sec = time / 100;
+    tp->tv_nsec = 10000000 * (time % 100);
+
+    return 0;
+}
+
+// Store the time elapsed since boot into *tv (when tv is not null).
+//
+// The tick counter is converted with 100 ticks per second, the same conversion
+// do_clock_gettime uses. A non-null tz is not supported and yields -EINVAL.
+// Returns 0 on success.
+int kernel::syscall::do_gettimeofday(timeval __user* tv, void __user* tz)
+{
+    // TODO: return time of the day, not time from this boot
+    if (tz) [[unlikely]]
+        return -EINVAL;
+
+    if (tv) {
+        // TODO: use copy_to_user
+        auto ticks = kernel::hw::timer::current_ticks();
+        tv->tv_sec = ticks / 100;
+        // only the sub-second remainder belongs in tv_usec (10ms per tick),
+        // otherwise the field grows past 999999
+        tv->tv_usec = (ticks % 100) * 10 * 1000;
+    }
+
+    return 0;
+}

+ 6 - 7
src/kernel/syscall/mount.cc

@@ -6,14 +6,13 @@
 #include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
 
-int _syscall_mount(interrupt_stack* data)
+int kernel::syscall::do_mount(
+        const char __user* source,
+        const char __user* target,
+        const char __user* fstype,
+        unsigned long flags,
+        const void __user* _fsdata)
 {
-    SYSCALL_ARG1(const char __user*, source);
-    SYSCALL_ARG2(const char __user*, target);
-    SYSCALL_ARG3(const char __user*, fstype);
-    SYSCALL_ARG4(unsigned long, flags);
-    SYSCALL_ARG5(const void __user*, _fsdata);
-
     if (!fstype)
         return -EINVAL;
 

+ 391 - 0
src/kernel/syscall/procops.cc

@@ -0,0 +1,391 @@
+#include <string>
+#include <vector>
+
+#include <sys/prctl.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+
+#include <types/elf.hpp>
+
+#include <kernel/log.hpp>
+#include <kernel/process.hpp>
+#include <kernel/signal.hpp>
+#include <kernel/syscall.hpp>
+#include <kernel/utsname.hpp>
+
+using namespace kernel::syscall;
+
+#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
+
+static inline void not_implemented(const char* pos, int line)
+{
+    kmsgf("[kernel] the function at %s:%d is not implemented, killing the pid%d...",
+            pos, line, current_process->pid);
+    current_thread->send_signal(SIGSYS);
+}
+
+int kernel::syscall::do_chdir(const char __user* path)
+{
+    auto* dir = fs::vfs_open(*current_process->root,
+            current_process->pwd + path);
+    if (!dir)
+        return -ENOENT;
+
+    if (!S_ISDIR(dir->ind->mode))
+        return -ENOTDIR;
+
+    current_process->pwd.clear();
+    dir->path(*current_process->root, current_process->pwd);
+
+    return 0;
+}
+
+execve_retval kernel::syscall::do_execve(
+        const std::string& exec,
+        const std::vector<std::string>& args,
+        const std::vector<std::string>& envs)
+{
+    types::elf::elf32_load_data d{
+        .exec_dent{},
+        .argv{args},
+        .envp{envs},
+        .ip{}, .sp{},
+    };
+
+    d.exec_dent = fs::vfs_open(*current_process->root,
+            current_process->pwd + exec.c_str());
+
+    if (!d.exec_dent)
+        return { 0, 0, -ENOENT };
+
+    current_process->files.onexec();
+
+    // TODO: set cs and ss to compatibility mode
+    if (int ret = types::elf::elf32_load(d); ret != 0)
+        return { 0, 0, ret };
+
+    current_thread->signals.on_exec();
+
+    return { d.ip, d.sp, 0 };
+}
+
+
+int kernel::syscall::do_exit(int status)
+{
+    // TODO: terminating a thread only
+    assert(current_process->thds.size() == 1);
+
+    // terminating a whole process:
+    procs->kill(current_process->pid, (status & 0xff) << 8);
+
+    // switch to new process and continue
+    schedule_noreturn();
+}
+
+// Wait for a state change of any child of the current process.
+//
+// Only waitpid == -1 is supported. options may contain WNOHANG and WUNTRACED;
+// any other bit is reported through NOT_IMPLEMENTED and fails with -EINVAL.
+// arg1 receives the status code of the reported child and may be null when
+// the caller is not interested in it.
+//
+// Returns the pid of the reported child, 0 if WNOHANG is set and nothing can
+// be reported, -ECHILD if there is nothing to wait for and -EINTR if the wait
+// was interrupted.
+int kernel::syscall::do_waitpid(pid_t waitpid, int __user* arg1, int options)
+{
+    if (waitpid != -1)
+        return -EINVAL;
+
+    auto& cv = current_process->waitlist;
+    kernel::async::lock_guard lck(current_process->mtx_waitprocs);
+
+    auto& waitlist = current_process->waitprocs;
+
+    // TODO: check if it is waiting for stopped process
+    if (options & ~(WNOHANG | WUNTRACED)) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    for (;;) {
+        for (auto iter = waitlist.begin(); iter != waitlist.end(); ++iter) {
+            if (WIFSTOPPED(iter->code) && !(options & WUNTRACED))
+                continue;
+
+            pid_t pid = iter->pid;
+
+            // TODO: copy_to_user
+            // the status pointer is optional
+            if (arg1)
+                *arg1 = iter->code;
+
+            procs->remove(pid);
+            waitlist.erase(iter);
+
+            return pid;
+        }
+
+        // nothing reportable: either the list is empty or it only holds
+        // stopped children that were not asked for, so keep waiting instead
+        // of freezing the kernel
+        if (waitlist.empty() && current_process->children.empty())
+            return -ECHILD;
+
+        if (options & WNOHANG)
+            return 0;
+
+        bool interrupted = cv.wait(current_process->mtx_waitprocs);
+        if (interrupted)
+            return -EINTR;
+    }
+}
+
+// Copy the current working directory of the calling process into buf.
+//
+// At most buf_size bytes are written and the result is always NUL terminated,
+// so a path that does not fit is truncated.
+// Returns buf, or nullptr when there is no buffer to write to.
+char __user* kernel::syscall::do_getcwd(char __user* buf, size_t buf_size)
+{
+    // with buf_size == 0 the terminator below would be written to buf[-1]
+    if (!buf || buf_size == 0)
+        return nullptr;
+
+    // TODO: use copy_to_user
+    auto path = current_process->pwd.full_path();
+    strncpy(buf, path.c_str(), buf_size);
+    buf[buf_size - 1] = 0;
+
+    return buf;
+}
+
+pid_t kernel::syscall::do_setsid()
+{
+    if (current_process->pid == current_process->pgid)
+        return -EPERM;
+
+    current_process->sid = current_process->pid;
+    current_process->pgid = current_process->pid;
+
+    // TODO: get tty* from fd or block device id
+    tty::console->set_pgrp(current_process->pid);
+    current_process->control_tty = tty::console;
+
+    return current_process->pid;
+}
+
+pid_t kernel::syscall::do_getsid(pid_t pid)
+{
+    auto [ pproc, found ] = procs->try_find(pid);
+    if (!found)
+        return -ESRCH;
+    if (pproc->sid != current_process->sid)
+        return -EPERM;
+
+    return pproc->sid;
+}
+
+int kernel::syscall::do_setpgid(pid_t pid, pid_t pgid)
+{
+    if (pgid < 0)
+        return -EINVAL;
+
+    if (pid == 0)
+        pid = current_process->pid;
+
+    if (pgid == 0)
+        pgid = pid;
+
+    auto [ pproc, found ] = procs->try_find(pid);
+    if (!found)
+        return -ESRCH;
+
+    // TODO: check whether pgid and the original
+    //       pgid is in the same session
+
+    pproc->pgid = pgid;
+
+    return 0;
+}
+
+int kernel::syscall::do_set_thread_area(kernel::user::user_desc __user* ptr)
+{
+    auto ret = current_thread->set_thread_area(ptr);
+    if (ret != 0)
+        return ret;
+
+    current_thread->load_thread_area32();
+    return 0;
+}
+
+pid_t kernel::syscall::do_set_tid_address(int __user* tidptr)
+{
+    // TODO: copy_from_user
+    current_thread->set_child_tid = tidptr;
+    return current_thread->tid();
+}
+
+int kernel::syscall::do_prctl(int option, uintptr_t arg2)
+{
+    switch (option) {
+    case PR_SET_NAME: {
+        // TODO: copy_from_user
+        auto* name = (const char __user*)arg2;
+        current_thread->name.assign(name, 15);
+        break;
+    }
+    case PR_GET_NAME: {
+        auto* name = (char __user*)arg2;
+        // TODO: copy_to_user
+        strncpy(name, current_thread->name.c_str(), 16);
+        name[15] = 0;
+        break;
+    }
+    default:
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+int kernel::syscall::do_arch_prctl(int option, uintptr_t arg2)
+{
+    switch (option) {
+    case PR_SET_NAME: {
+        // TODO: copy_from_user
+        auto* name = (const char __user*)arg2;
+        current_thread->name.assign(name, 15);
+        break;
+    }
+    case PR_GET_NAME: {
+        auto* name = (char __user*)arg2;
+        // TODO: copy_to_user
+        strncpy(name, current_thread->name.c_str(), 16);
+        name[15] = 0;
+        break;
+    }
+    default:
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+int kernel::syscall::do_umask(mode_t mask)
+{
+    mode_t old = current_process->umask;
+    current_process->umask = mask;
+
+    return old;
+}
+
+int kernel::syscall::do_kill(pid_t pid, int sig)
+{
+    auto [ pproc, found ] = procs->try_find(pid);
+    if (!found)
+        return -ESRCH;
+
+    if (!kernel::signal_list::check_valid(sig))
+        return -EINVAL;
+
+    if (pproc->is_system())
+        return 0;
+
+    // TODO: check permission
+    procs->send_signal(pid, sig);
+
+    return 0;
+}
+
+// Examine and/or change the signal mask of the current thread.
+//
+// how is one of SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK and is only used when
+// set is not null. oldset, when not null, receives the mask that was in effect
+// before the call. sigsetsize must equal sizeof(sigmask_type).
+// Returns 0 on success and -EINVAL on a bad sigsetsize or an unknown how.
+int kernel::syscall::do_rt_sigprocmask(int how, const sigmask_type __user* set,
+        sigmask_type __user* oldset, size_t sigsetsize)
+{
+    if (sigsetsize != sizeof(sigmask_type))
+        return -EINVAL;
+
+    // reject an unknown operation before anything is modified
+    if (set && how != SIG_BLOCK && how != SIG_UNBLOCK && how != SIG_SETMASK)
+        return -EINVAL;
+
+    sigmask_type sigs = current_thread->signals.get_mask();
+
+    // TODO: use copy_from_user
+    // read the new mask first: set and oldset may point to the same object,
+    // in which case writing oldset would overwrite the requested mask
+    sigmask_type newmask{};
+    if (set)
+        newmask = *set;
+
+    // TODO: use copy_to_user
+    if (oldset)
+        memcpy(oldset, &sigs, sizeof(sigmask_type));
+
+    if (!set)
+        return 0;
+
+    switch (how) {
+    case SIG_BLOCK:
+        current_thread->signals.mask(newmask);
+        break;
+    case SIG_UNBLOCK:
+        current_thread->signals.unmask(newmask);
+        break;
+    case SIG_SETMASK:
+        current_thread->signals.set_mask(newmask);
+        break;
+    }
+
+    return 0;
+}
+
+int kernel::syscall::do_rt_sigaction(int signum, const sigaction __user* act,
+        sigaction __user* oldact, size_t sigsetsize)
+{
+    if (sigsetsize != sizeof(sigmask_type))
+        return -EINVAL;
+
+    if (!kernel::signal_list::check_valid(signum)
+        || signum == SIGKILL || signum == SIGSTOP)
+        return -EINVAL;
+
+    // TODO: use copy_to_user
+    if (oldact)
+        current_thread->signals.get_handler(signum, *oldact);
+
+    if (!act)
+        return 0;
+
+    // TODO: use copy_from_user
+    current_thread->signals.set_handler(signum, *act);
+
+    return 0;
+}
+
+int kernel::syscall::do_newuname(new_utsname __user* buf)
+{
+    if (!buf)
+        return -EFAULT;
+
+    // TODO: use copy_to_user
+    memcpy(buf, sys_utsname, sizeof(new_utsname));
+
+    return 0;
+}
+
+pid_t kernel::syscall::do_getpgid(pid_t pid)
+{
+    if (pid == 0)
+        return current_process->pgid;
+
+    auto [ pproc, found ] = procs->try_find(pid);
+    if (!found)
+        return -ESRCH;
+
+    return pproc->pgid;
+}
+
+pid_t kernel::syscall::do_getpid()
+{
+    return current_process->pid;
+}
+
+pid_t kernel::syscall::do_getppid()
+{
+    return current_process->ppid;
+}
+
+uid_t kernel::syscall::do_getuid()
+{
+    return 0; // all users are root for now
+}
+
+uid_t kernel::syscall::do_geteuid()
+{
+    return 0; // all users are root for now
+}
+
+gid_t kernel::syscall::do_getgid()
+{
+    return 0; // all users are root for now
+}
+
+pid_t kernel::syscall::do_gettid()
+{
+    return current_thread->tid();
+}
+
+uintptr_t kernel::syscall::do_brk(uintptr_t addr)
+{
+    return current_process->mms.set_brk(addr);
+}

+ 17 - 0
src/kernel/task/readyqueue.cc

@@ -28,11 +28,28 @@ void dispatcher::dequeue(thread* thd)
 thread* dispatcher::next()
 {
     lock_guard_irq lck(dispatcher_mtx);
+    auto back = dispatcher_thds.back();
+
+    if (dispatcher_thds.size() == 1) {
+        back->elected_times++;
+        return back;
+    }
+
+    if (dispatcher_thds.size() == 2) {
+        if (back->owner == 0) {
+            auto front = dispatcher_thds.front();
+            front->elected_times++;
+            return front;
+        }
+        back->elected_times++;
+        return back;
+    }
 
     auto* retval = dispatcher_thds.front();
 
     dispatcher_thds.pop_front();
     dispatcher_thds.push_back(retval);
 
+    retval->elected_times++;
     return retval;
 }

+ 78 - 60
src/kernel/task/thread.cc

@@ -1,28 +1,42 @@
-#include <kernel/task/thread.hpp>
-
 #include <queue>
 
+#include <stdint.h>
+
+#include <types/types.h>
+
+#include <kernel/async/lock.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/async/lock.hpp>
 #include <kernel/task/readyqueue.hpp>
+#include <kernel/task/thread.hpp>
+
+constexpr std::size_t KERNEL_STACK_ORDER = 3; // 2^3 * 4096 = 32KB
 
 using namespace kernel::task;
+using namespace kernel::mem;
+using namespace kernel::mem::paging;
+
+// Layout of the 64-bit task state segment as used by this file.
+// The fields add up to 104 bytes (4 + 24 + 8 + 56 + 8 + 4) provided PACKED
+// suppresses padding (presumably a packed attribute - confirm).
+struct PACKED tss64_t {
+    uint32_t _reserved1;
+    uint64_t rsp[3]; // rsp[0] is written by kernel_stack::load_interrupt_stack()
+    uint64_t _reserved2;
+    uint64_t ist[7];
+    uint64_t _reserved3;
+    uint32_t _reserved4;
+};
+// The TSS instance is reached through physical address 0x70.
+constexpr physaddr<tss64_t> tss{0x00000070};
 
 thread::thread(std::string name, pid_t owner)
-    : owner { owner }, attr { READY | SYSTEM }, name { name }
-{
-}
+    : owner { owner }, attr { READY | SYSTEM }, name { name } { }
 
 thread::thread(const thread& val, pid_t owner)
-    : owner { owner }, attr { val.attr }, name { val.name }
-{
-}
+    : owner { owner }, attr { val.attr }, name { val.name }, tls_desc32{val.tls_desc32} { }
 
 tid_t thread::tid() const
 {
-    return (tid_t)kstack.stack_base;
+    return (tid_t)kstack.pfn;
 }
 
 bool thread::operator<(const thread& rhs) const
@@ -35,56 +49,55 @@ bool thread::operator==(const thread& rhs) const
     return tid() == rhs.tid();
 }
 
-static std::priority_queue<std::byte*> s_kstacks;
-static kernel::async::mutex s_mtx_kstacks;
+// Address just past the end of the kernel stack that starts at pfn, i.e. the
+// value sp has while the stack is empty. The stack spans
+// 2^KERNEL_STACK_ORDER pages of 0x1000 bytes each.
+static inline uintptr_t __stack_bottom(pfn_t pfn)
+{
+    constexpr auto stack_bytes = (1 << KERNEL_STACK_ORDER) * 0x1000;
+
+    void* bottom = kernel::mem::physaddr<void>{pfn + stack_bytes};
+    return (uintptr_t)bottom;
+}
 
 thread::kernel_stack::kernel_stack()
 {
-    static int allocated;
-    kernel::async::lock_guard_irq lck(s_mtx_kstacks);
-
-    if (!s_kstacks.empty()) {
-        stack_base = s_kstacks.top();
-        esp = (uint32_t*)stack_base;
-        s_kstacks.pop();
-        return;
-    }
-
-    // kernel stack pt is at page#0x00005
-    kernel::paccess pa(0x00005);
-    auto pt = (pt_t)pa.ptr();
-    assert(pt);
-
-    int cnt = THREAD_KERNEL_STACK_SIZE / PAGE_SIZE;
-    pte_t* pte = *pt + allocated * cnt;
-
-    for (int i = 0; i < cnt; ++i) {
-        pte[i].v = 0x3;
-        pte[i].in.page = __alloc_raw_page();
-    }
-
-    stack_base = (std::byte*)(0xffc00000 + THREAD_KERNEL_STACK_SIZE * (allocated + 1));
-    esp = (uint32_t*)stack_base;
-
-    ++allocated;
+    // allocate 2^KERNEL_STACK_ORDER pages for the stack and start with an
+    // empty stack: sp points just past the end of the allocation
+    pfn = page_to_pfn(alloc_pages(KERNEL_STACK_ORDER));
+    sp = __stack_bottom(pfn);
 }
 
 thread::kernel_stack::kernel_stack(const kernel_stack& other)
     : kernel_stack()
 {
-    auto offset = vptrdiff(other.stack_base, other.esp);
-    esp = (uint32_t*)(stack_base - offset);
-    memcpy(esp, other.esp, offset);
+    // number of bytes currently in use on the source stack
+    auto offset = __stack_bottom(other.pfn) - other.sp;
+
+    // the delegated constructor left sp at the bottom of the fresh stack;
+    // reserve the same number of bytes and copy the used part over
+    sp -= offset;
+    memcpy((void*)sp, (void*)other.sp, offset);
 }
 
 thread::kernel_stack::kernel_stack(kernel_stack&& other)
-    : stack_base(std::exchange(other.stack_base, nullptr))
-    , esp(std::exchange(other.esp, nullptr)) { }
+    // take over the pages and stack pointer; the source is left with
+    // pfn == 0 and sp == 0
+    : pfn(std::exchange(other.pfn, 0))
+    , sp(std::exchange(other.sp, 0)) { }
 
 thread::kernel_stack::~kernel_stack()
 {
-    kernel::async::lock_guard_irq lck(s_mtx_kstacks);
-    s_kstacks.push(stack_base);
+    // pfn == 0 means there is nothing to release (the move constructor
+    // leaves its source in that state)
+    if (!pfn)
+        return;
+    free_pages(pfn, KERNEL_STACK_ORDER);
+}
+
+// Push a 64-bit value: move sp down by eight bytes and store val at the new
+// top of the stack. Returns val unchanged.
+uint64_t thread::kernel_stack::pushq(uint64_t val)
+{
+    sp -= sizeof(uint64_t);
+
+    auto* slot = reinterpret_cast<uint64_t*>(sp);
+    *slot = val;
+
+    return val;
+}
+
+// Push a 32-bit value: move sp down by four bytes and store val at the new
+// top of the stack. Returns val unchanged.
+uint32_t thread::kernel_stack::pushl(uint32_t val)
+{
+    sp -= sizeof(uint32_t);
+
+    auto* slot = reinterpret_cast<uint32_t*>(sp);
+    *slot = val;
+
+    return val;
+}
+
+// Publish this stack's current sp in the rsp[0] slot of the TSS.
+void thread::kernel_stack::load_interrupt_stack() const
+{
+    tss->rsp[0] = sp;
 }
 
 void thread::set_attr(thd_attr_t new_attr)
@@ -142,7 +155,8 @@ void thread::send_signal(signal_list::signo_type signal)
 int thread::set_thread_area(kernel::user::user_desc* ptr)
 {
     if (ptr->read_exec_only && ptr->seg_not_present) {
-        void* dst = (void*)ptr->base_addr;
+        // TODO: use copy_to_user
+        auto* dst = (void*)(uintptr_t)ptr->base_addr;
         std::size_t len = ptr->limit;
         if (len > 0 && dst)
             memset(dst, 0x00, len);
@@ -150,25 +164,29 @@ int thread::set_thread_area(kernel::user::user_desc* ptr)
     }
 
     if (ptr->entry_number == -1U)
-        ptr->entry_number = 6;
+        ptr->entry_number = 7;
     else
         return -1;
 
-    tls_desc.limit_low = ptr->limit & 0xFFFF;
-    tls_desc.base_low = ptr->base_addr & 0xFFFF;
-    tls_desc.base_mid = (ptr->base_addr >> 16) & 0xFF;
-    tls_desc.access = SD_TYPE_DATA_USER;
-    tls_desc.limit_high = (ptr->limit >> 16) & 0xF;
-    tls_desc.flags = (ptr->limit_in_pages << 3) | (ptr->seg_32bit << 2);
-    tls_desc.base_high = (ptr->base_addr >> 24) & 0xFF;
+    if (!ptr->seg_32bit)
+        return -1;
+
+    if ((ptr->limit & 0xffff) != 0xffff) {
+        asm volatile("nop": : : "memory");
+    }
+
+    tls_desc32  = ptr->limit & 0x0'ffff;
+    tls_desc32 |= (ptr->base_addr & 0x00'ffffffULL) << 16;
+    tls_desc32 |= 0x4'0'f2'000000'0000;
+    tls_desc32 |= (ptr->limit & 0xf'0000ULL) << (48-16);
+    tls_desc32 |= ((ptr->limit_in_pages + 0ULL) << 55);
+    tls_desc32 |= (ptr->base_addr & 0xff'000000ULL) << (56-24);
 
     return 0;
 }
 
-int thread::load_thread_area() const
+// Hand this thread's 32-bit TLS descriptor to kernel::user::load_thread_area32.
+// Always returns 0.
+int thread::load_thread_area32() const
 {
-    if (tls_desc.flags == 0)
-        return -1;
-    kernel::user::load_thread_area(tls_desc);
+    kernel::user::load_thread_area32(tls_desc32);
     return 0;
 }

+ 14 - 17
src/kernel/tty.cpp

@@ -5,7 +5,6 @@
 #include <termios.h>
 
 #include <kernel/async/lock.hpp>
-#include <kernel/hw/serial.h>
 #include <kernel/process.hpp>
 #include <kernel/tty.hpp>
 #include <kernel/vga.hpp>
@@ -20,7 +19,9 @@
 
 #define TERMIOS_TESTCC(c, termios, cc) ((c != 0xff) && (c == ((termios).c_cc[cc])))
 
-tty::tty()
+using namespace kernel::tty;
+
+tty::tty(std::string name)
     : termio {
         .c_iflag = ICRNL | IXOFF,
         .c_oflag = OPOST | ONLCR,
@@ -32,6 +33,7 @@ tty::tty()
         .c_ispeed = 38400,
         .c_ospeed = 38400,
     }
+    , name{name}
     , buf(BUFFER_SIZE)
     , fg_pgroup { 0 }
 {
@@ -280,21 +282,7 @@ void tty::show_char(int c)
     this->putchar(c);
 }
 
-vga_tty::vga_tty()
-{
-    snprintf(this->name, sizeof(this->name), "ttyVGA");
-}
-
-serial_tty::serial_tty(int id)
-    : id(id)
-{
-    snprintf(this->name, sizeof(this->name), "ttyS%x", (int)id);
-}
-
-void serial_tty::putchar(char c)
-{
-    serial_send_data(id, c);
-}
+vga_tty::vga_tty(): tty{"ttyVGA"} { }
 
 void vga_tty::putchar(char c)
 {
@@ -307,3 +295,12 @@ void tty::clear_read_buf(void)
 {
     this->buf.clear();
 }
+
+// Register a tty device. For now only the first registered device is kept:
+// it becomes the console, later registrations are ignored. Always returns 0.
+int kernel::tty::register_tty(tty* tty_dev)
+{
+    // TODO: manage all ttys
+    if (console)
+        return 0;
+
+    console = tty_dev;
+    return 0;
+}

+ 14 - 13
src/kernel/user/thread_local.cc

@@ -1,22 +1,23 @@
-#include <kernel/process.hpp>
-#include <kernel/mem.h>
-#include <kernel/user/thread_local.hpp>
-
-#include <string.h>
 #include <cstddef>
 
-namespace kernel::user {
+#include <stdint.h>
+
+#include <kernel/mem/phys.hpp>
+#include <kernel/mem/types.hpp>
+#include <kernel/user/thread_local.hpp>
+
+using namespace kernel::user;
 
-void load_thread_area(const segment_descriptor& desc)
+// Install a raw 64-bit segment descriptor into GDT slot 7 and reload %gs.
+// A zero descriptor is treated as "nothing to load" and leaves both untouched.
+void kernel::user::load_thread_area32(uint64_t desc)
 {
-    gdt[6] = desc;
+    if (!desc)
+        return;
+
+    kernel::mem::gdt[7] = desc;
+
+    // write %gs back to itself through %ax; presumably this makes the CPU
+    // re-read the descriptor that was just replaced - confirm
     asm volatile(
         "mov %%gs, %%ax\n\t"
         "mov %%ax, %%gs\n\t"
-        :
-        :
-        : "ax"
+        : : : "ax"
     );
 }
-
-} // namespace kernel::user

+ 11 - 11
src/kernel/vfs.cpp

@@ -15,10 +15,8 @@
 
 #include <types/allocator.hpp>
 #include <types/path.hpp>
-#include <types/status.h>
 
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
 #include <kernel/process.hpp>
 #include <kernel/tty.hpp>
 #include <kernel/vfs.hpp>
@@ -55,7 +53,7 @@ int dentry::load()
                 else
                     append(ind, dentry::name_type(name, len));
 
-                return GB_OK;
+                return 0;
             });
 
         if (ret == 0)
@@ -157,7 +155,8 @@ fs::inode* vfs::cache_inode(size_t size, ino_t ino,
 
 void vfs::free_inode(ino_t ino)
 {
-    assert(_inodes.erase(ino) == 1);
+    int n = _inodes.erase(ino);
+    assert(n == 1);
 }
 
 fs::inode* vfs::get_inode(ino_t ino)
@@ -296,7 +295,7 @@ ssize_t fs::regular_file::do_write(const char* __user buf, size_t n)
     return n_wrote;
 }
 
-ssize_t fs::regular_file::seek(off_t n, int whence)
+off_t fs::regular_file::seek(off_t n, int whence)
 {
     if (!S_ISREG(mode))
         return -ESPIPE;
@@ -337,7 +336,7 @@ int fs::regular_file::getdents(char* __user buf, size_t cnt)
 
             size_t reclen = sizeof(fs::user_dirent) + 1 + len;
             if (cnt < reclen)
-                return GB_FAILED;
+                return -EFAULT;
 
             auto* dirp = (fs::user_dirent*)buf;
             dirp->d_ino = ind->ino;
@@ -351,7 +350,7 @@ int fs::regular_file::getdents(char* __user buf, size_t cnt)
 
             buf += reclen;
             cnt -= reclen;
-            return GB_OK;
+            return 0;
         });
 
     if (nread > 0)
@@ -373,7 +372,7 @@ int fs::regular_file::getdents64(char* __user buf, size_t cnt)
 
             size_t reclen = sizeof(fs::user_dirent64) + len;
             if (cnt < reclen)
-                return GB_FAILED;
+                return -EFAULT;
 
             auto* dirp = (fs::user_dirent64*)buf;
             dirp->d_ino = ind->ino;
@@ -386,7 +385,7 @@ int fs::regular_file::getdents64(char* __user buf, size_t cnt)
 
             buf += reclen;
             cnt -= reclen;
-            return GB_OK;
+            return 0;
         });
 
     if (nread > 0)
@@ -784,13 +783,14 @@ ssize_t b_null_write(const char*, size_t n)
 
 static ssize_t console_read(char* buf, size_t buf_size, size_t n)
 {
-    return console->read(buf, buf_size, n);
+    // forwarded unchanged to the registered console tty
+    return kernel::tty::console->read(buf, buf_size, n);
 }
+
 static ssize_t console_write(const char* buf, size_t n)
 {
+    // emit the buffer one character at a time; the full length is always
+    // reported as written
     size_t orig_n = n;
     while (n--)
-        console->putchar(*(buf++));
+        kernel::tty::console->putchar(*(buf++));
 
     return orig_n;
 }

+ 17 - 16
src/kernel/vfs/tmpfs.cc

@@ -1,10 +1,11 @@
-#include <kernel/vfs.hpp>
-#include <kernel/mm.hpp>
-#include <kernel/log.hpp>
-
 #include <algorithm>
-#include <vector>
 #include <map>
+#include <vector>
+
+#include <stdint.h>
+
+#include <kernel/log.hpp>
+#include <kernel/vfs.hpp>
 
 using fs::vfs, fs::inode, fs::dentry;
 
@@ -37,9 +38,9 @@ private:
     {
         return static_cast<fdata_t*>(data);
     }
-    static constexpr ptr_t as_val(void* data)
+    static constexpr uintptr_t as_val(void* data)
     {
-        return std::bit_cast<ptr_t>(data);
+        return std::bit_cast<uintptr_t>(data);
     }
     inline void* _getdata(ino_t ino) const
     {
@@ -51,7 +52,7 @@ private:
         inode_data.insert(std::make_pair(ino, data));
         return ino;
     }
-    inline ino_t _savedata(ptr_t data)
+    inline ino_t _savedata(uintptr_t data)
     {
         return _savedata((void*)data);
     }
@@ -93,7 +94,7 @@ protected:
 
             // inode mode filetype is compatible with user dentry filetype
             auto ret = filldir(entry.filename, 0, ind, ind->mode & S_IFMT);
-            if (ret != GB_OK)
+            if (ret != 0)
                 break;
         }
 
@@ -158,7 +159,7 @@ public:
         if (dir->flags.present)
             dir->append(get_inode(file.ino), filename);
 
-        return GB_OK;
+        return 0;
     }
 
     virtual int inode_mknode(dentry* dir, const char* filename, mode_t mode, dev_t dev) override
@@ -175,7 +176,7 @@ public:
         if (dir->flags.present)
             dir->append(get_inode(node.ino), filename);
 
-        return GB_OK;
+        return 0;
     }
 
     virtual int inode_mkdir(dentry* dir, const char* dirname, mode_t mode) override
@@ -192,7 +193,7 @@ public:
         if (dir->flags.present)
             dir->append(new_dir, dirname);
 
-        return GB_OK;
+        return 0;
     }
 
     virtual int symlink(dentry* dir, const char* linkname, const char* target) override
@@ -273,7 +274,7 @@ public:
         }
 
         if (mask & STATX_BLOCKS) {
-            st->stx_blocks = align_up<9>(ind->size) / 512;
+            st->stx_blocks = ((ind->size + 0x1ff) & ~0x1ff) / 512;
             st->stx_blksize = 4096;
             st->stx_mask |= STATX_BLOCKS;
         }
@@ -288,7 +289,7 @@ public:
             st->stx_mask |= STATX_GID;
         }
 
-        return GB_OK;
+        return 0;
     }
 
     virtual int inode_rmfile(dentry* dir, const char* filename) override
@@ -326,7 +327,7 @@ public:
             return 0;
         }
 
-        kmsg("[tmpfs] warning: file entry not found in vfe\n");
+        kmsg("[tmpfs] warning: file entry not found in vfe");
         return -EIO;
     }
 
@@ -344,7 +345,7 @@ public:
         auto* data = as_fdata(_getdata(file->ino));
         data->resize(size);
         file->size = size;
-        return GB_OK;
+        return 0;
     }
 };
 

+ 208 - 96
src/kinit.cpp

@@ -1,144 +1,256 @@
-#include <asm/port_io.h>
-#include <asm/sys.h>
-
 #include <assert.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <sys/utsname.h>
 
-#include <types/status.h>
+#include <types/allocator.hpp>
 #include <types/types.h>
 
-#include <kernel/hw/keyboard.h>
 #include <kernel/hw/pci.hpp>
-#include <kernel/hw/serial.h>
-#include <kernel/hw/timer.h>
-#include <kernel/interrupt.h>
+#include <kernel/hw/timer.hpp>
+#include <kernel/interrupt.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
+#include <kernel/mem/types.hpp>
 #include <kernel/process.hpp>
 #include <kernel/syscall.hpp>
-#include <kernel/task.h>
-#include <kernel/tty.hpp>
 #include <kernel/utsname.hpp>
-#include <kernel/vga.hpp>
 
-typedef void (*constructor)(void);
-extern constructor const SECTION(".rodata.kinit") start_ctors;
-extern constructor const SECTION(".rodata.kinit") end_ctors;
+using constructor = void (*)();
+extern "C" constructor const start_ctors, end_ctors;
+extern "C" uint64_t BSS_ADDR, BSS_LENGTH;
+
+// Block of information handed to kernel_init() by the boot code: four
+// 32-bit header fields followed by the memory map entries.
+struct PACKED bootloader_data {
+    // number of memory map entries that were stored
+    uint32_t meminfo_entry_count;
+    // size in bytes of one entry as recorded by the boot code
+    uint32_t meminfo_entry_length;
+
+    // don't forget to add the initial 1m to the total
+    uint32_t meminfo_1k_blocks;
+    uint32_t meminfo_64k_blocks;
 
-extern struct mem_size_info SECTION(".stage1") asm_mem_size_info;
-extern uint8_t SECTION(".stage1") asm_e820_mem_map[1024];
-extern uint32_t SECTION(".stage1") asm_e820_mem_map_count;
-extern uint32_t SECTION(".stage1") asm_e820_mem_map_entry_size;
+    // meminfo entries
+    // capacity: a 1024 byte block minus the four 4-byte header fields,
+    // at 24 bytes per slot
+    kernel::mem::e820_mem_map_entry
+        meminfo_entries[(1024-4*4)/24];
+};
+
+extern void init_vfs();
+
+namespace kernel::kinit {
 
 SECTION(".text.kinit")
-static inline void save_loader_data(void)
+// Configure CR0/CR4 for x87/SSE use and reset the x87 unit.
+static inline void enable_sse()
 {
-    memcpy(e820_mem_map, asm_e820_mem_map, sizeof(e820_mem_map));
-    e820_mem_map_count = asm_e820_mem_map_count;
-    e820_mem_map_entry_size = asm_e820_mem_map_entry_size;
-    memcpy(&mem_size_info, &asm_mem_size_info, sizeof(struct mem_size_info));
+    // CR0: clear bits 2 and 3 (EM, TS), set bits 1 and 5 (MP, NE)
+    // CR4: set bits 9 and 10 (OSFXSR, OSXMMEXCPT)
+    // fninit: initialize the x87 FPU state
+    asm volatile(
+            "mov %%cr0, %%rax\n\t"
+            "and $(~0xc), %%rax\n\t"
+            "or $0x22, %%rax\n\t"
+            "mov %%rax, %%cr0\n\t"
+            "\n\t"
+            "mov %%cr4, %%rax\n\t"
+            "or $0x600, %%rax\n\t"
+            "mov %%rax, %%cr4\n\t"
+            "fninit\n\t"
+            ::: "rax"
+            );
 }
 
 SECTION(".text.kinit")
-static inline void load_new_gdt(void)
+// Allocate the global utsname record and fill in its fixed identity strings.
+static inline void set_uname()
 {
-    create_segment_descriptor(gdt + 0, 0, 0, 0, 0);
-    create_segment_descriptor(gdt + 1, 0, ~0, 0b1100, SD_TYPE_CODE_SYSTEM);
-    create_segment_descriptor(gdt + 2, 0, ~0, 0b1100, SD_TYPE_DATA_SYSTEM);
-    create_segment_descriptor(gdt + 3, 0, ~0, 0b1100, SD_TYPE_CODE_USER);
-    create_segment_descriptor(gdt + 4, 0, ~0, 0b1100, SD_TYPE_DATA_USER);
-    create_segment_descriptor(gdt + 5, (uint32_t)&tss, sizeof(tss), 0b0000, SD_TYPE_TSS);
-    create_segment_descriptor(gdt + 6, 0, 0, 0b1100, SD_TYPE_DATA_USER);
-
-    asm_load_gdt((7 * 8 - 1) << 16, (pptr_t)gdt);
-    asm_load_tr((6 - 1) * 8);
-
-    asm_cli();
+    // uses operator new, so the kernel allocator must already be usable
+    kernel::sys_utsname = new new_utsname;
+    strcpy(kernel::sys_utsname->sysname, "Linux"); // linux compatible
+    strcpy(kernel::sys_utsname->nodename, "(none)");
+    strcpy(kernel::sys_utsname->release, "1.0.0");
+    strcpy(kernel::sys_utsname->version, "1.0.0");
+    strcpy(kernel::sys_utsname->machine, "x86");
+    strcpy(kernel::sys_utsname->domainname, "(none)");
 }
 
 SECTION(".text.kinit")
-static inline void init_bss_section(void)
+// Second stage of kernel initialization, entered from kernel_init() after it
+// has switched to the freshly allocated kernel stack. kernel_stack_pfn
+// identifies that stack and is passed on to init_scheduler().
+void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn)
 {
-    memset(bss_addr, 0x00, bss_len);
+    // call global constructors
+    // NOTE: the initializers of global objects MUST NOT perform
+    // any kind of memory allocation
+    for (auto* ctor = &start_ctors; ctor != &end_ctors; ++ctor)
+        (*ctor)();
+
+    set_uname();
+
+    init_interrupt();
+    hw::timer::init_pit();
+
+    init_pci();
+
+    // TODO: remove this
+    init_vfs();
+    init_syscall_table();
+
+    // last step; this function is declared NORETURN, so init_scheduler()
+    // is expected not to come back
+    init_scheduler(kernel_stack_pfn);
 }
 
 SECTION(".text.kinit")
-static inline int init_console(const char* name)
+// Early page table fix-ups: drop the first top-level entry, map the 2M
+// region used for kernel bss, then zero the bss and the shared empty page.
+static inline void setup_early_kernel_page_table()
 {
-    if (name[0] == 't' && name[1] == 't' && name[2] == 'y') {
-        if (name[3] == 'S' || name[3] == 's') {
-            if (name[4] == '0') {
-                console = types::memory::kinew<serial_tty>(PORT_SERIAL0);
-                return GB_OK;
-            }
-            if (name[4] == '1') {
-                console = types::memory::kinew<serial_tty>(PORT_SERIAL1);
-                return GB_OK;
-            }
-        }
-        if (name[3] == 'V' && name[3] == 'G' && name[3] == 'A') {
-            console = types::memory::kinew<vga_tty>();
-            return GB_OK;
-        }
-    }
-    return GB_FAILED;
-}
+    using namespace kernel::mem::paging;
 
-extern void init_vfs();
+    // remove temporary mapping
+    KERNEL_PAGE_TABLE[0x000].clear();
 
-namespace kernel::kinit {
+    // table indices of the virtual address the bss mapping is installed at
+    constexpr auto idx = idx_all(0xffffffffc0200000ULL);
+
+    // walk down to the page directory that covers that address
+    auto pdpt = KERNEL_PAGE_TABLE[std::get<1>(idx)].parse();
+    auto pd = pdpt[std::get<2>(idx)].parse();
+
+    // kernel bss, size 2M
+    pd[std::get<3>(idx)].set(PA_KERNEL_DATA_HUGE, 0x200000);
+
+    // clear kernel bss
+    // BSS_ADDR / BSS_LENGTH are read as 64-bit values, not taken by address
+    memset((void*)BSS_ADDR, 0x00, BSS_LENGTH);
+
+    // clear empty page
+    memset(mem::physaddr<void>{EMPTY_PAGE_PFN}, 0x00, 0x1000);
+}
 
 SECTION(".text.kinit")
-static void init_uname()
+// Map and zero the page descriptor array, then create allocator zones from
+// the usable e820 ranges. addr_max is the highest usable address in bytes.
+static inline void setup_buddy(uintptr_t addr_max)
 {
-    kernel::sys_utsname = new new_utsname;
-    strcpy(kernel::sys_utsname->sysname, "Linux"); // linux compatible
-    strcpy(kernel::sys_utsname->nodename, "(none)");
-    strcpy(kernel::sys_utsname->release, "1.0.0");
-    strcpy(kernel::sys_utsname->version, "1.0.0");
-    strcpy(kernel::sys_utsname->machine, "x86");
-    strcpy(kernel::sys_utsname->domainname, "(none)");
+    using namespace kernel::mem;
+    using namespace kernel::mem::paging;
+    // table indices of the virtual address PAGE_ARRAY is placed at below
+    constexpr auto idx = idx_all(0xffffff8040000000ULL);
+
+    // from here on addr_max is a page count (rounded up)
+    addr_max += 0xfff;
+    addr_max >>= 12;
+    // number of 2M mappings needed to hold one `page` per physical page
+    int count = (addr_max * sizeof(page) + 0x200000 - 1) / 0x200000;
+
+    // NOTE(review): despite the name, start_pfn is used as a byte address
+    // here (0x400000, advanced in 0x200000 steps) - confirm pfn_t semantics
+    pfn_t start_pfn = 0x400000;
+
+    // zero the page at 0x105000 before it is installed as a page table
+    memset(physaddr<void>{0x105000}, 0x00, 4096);
+
+    auto pdpte = KERNEL_PAGE_TABLE[std::get<1>(idx)].parse()[std::get<2>(idx)];
+    pdpte.set(PA_KERNEL_PAGE_TABLE, 0x105000);
+
+    // back the array with `count` consecutive 2M huge pages; afterwards
+    // start_pfn is the first address past the memory consumed by the array
+    auto pd = pdpte.parse();
+    for (int i = 0; i < count; ++i, start_pfn += 0x200000)
+        pd[std::get<3>(idx)+i].set(PA_KERNEL_DATA_HUGE, start_pfn);
+
+    PAGE_ARRAY = (page*)0xffffff8040000000ULL;
+    memset(PAGE_ARRAY, 0x00, addr_max * sizeof(page));
+
+    for (int i = 0; i < (int)info::e820_entry_count; ++i) {
+        auto& ent = info::e820_entries[i];
+
+        if (ent.type != 1) // type == 1: free area
+            continue;
+        mark_present(ent.base, ent.base + ent.len);
+
+        // only the part of the range at or above start_pfn becomes a zone;
+        // everything below it is skipped here
+        auto start = ent.base;
+        auto end = start + ent.len;
+        if (end <= start_pfn)
+            continue;
+
+        if (start < start_pfn)
+            start = start_pfn;
+
+        if (start > end)
+            continue;
+
+        mem::paging::create_zone(start, end);
+    }
+
+    // free .stage1
+    create_zone(0x1000, 0x2000);
+    // unused space
+    create_zone(0x106000, 0x200000);
 }
 
-} // namespace kernel::kinit
+SECTION(".text.kinit")
+static inline void save_memory_info(bootloader_data* data)
+{
+    // Copy the memory information gathered by the boot code into
+    // kernel::mem::info before the boot data block is reused.
+    using namespace kernel::mem;
+
+    // sizes reported in 1K and 64K units, converted to bytes
+    const auto bytes_from_1k = 1024ULL * data->meminfo_1k_blocks;
+    const auto bytes_from_64k = 64ULL * 1024ULL * data->meminfo_64k_blocks;
+
+    info::memory_size = 1ULL * 1024ULL * 1024ULL + // initial 1M
+        bytes_from_1k + bytes_from_64k;
+
+    info::e820_entry_count = data->meminfo_entry_count;
+    info::e820_entry_length = data->meminfo_entry_length;
+
+    memcpy(info::e820_entries, data->meminfo_entries,
+        sizeof(info::e820_entries));
+}
 
-extern "C" SECTION(".text.kinit") void NORETURN kernel_init(void)
+SECTION(".text.kinit")
+void setup_gdt()
 {
-    asm_enable_sse();
+    // Fill GDT slots 3..13 with raw descriptors, then load GDTR, LDTR and TR.
+    // Slots 0..2 are not touched here.
+
+    // user code
+    mem::gdt[3]  = 0x0020'fa00'0000'0000;
+    // user data
+    mem::gdt[4]  = 0x0000'f200'0000'0000;
+    // user code32
+    mem::gdt[5]  = 0x00cf'fa00'0000'ffff;
+    // user data32
+    mem::gdt[6]  = 0x00cf'f200'0000'ffff;
+    // thread local 32bit (empty until a descriptor is written into slot 7)
+    mem::gdt[7]  = 0x0000'0000'0000'0000;
 
-    init_bss_section();
+    // TSS descriptor
+    // 16-byte system descriptor spanning two slots: limit 0x67, low base 0x0070
+    mem::gdt[8]  = 0x0000'8900'0070'0067;
+    mem::gdt[9]  = 0x0000'0000'ffff'ff00;
 
-    save_loader_data();
+    // LDT descriptor
+    // 16-byte system descriptor spanning two slots: limit 0x1f, low base 0x0060
+    mem::gdt[10] = 0x0000'8200'0060'001f;
+    mem::gdt[11] = 0x0000'0000'ffff'ff00;
 
-    load_new_gdt();
+    // null segment
+    mem::gdt[12] = 0x0000'0000'0000'0000;
+    // thread local 64bit
+    mem::gdt[13] = 0x0000'0000'0000'0000;
 
-    // call global ctors
-    // NOTE:
-    // the initializer of global objects MUST NOT contain
-    // all kinds of memory allocations
-    for (const constructor* ctor = &start_ctors; ctor != &end_ctors; ++ctor) {
-        (*ctor)();
-    }
+    // lgdt operand built in place: the top 16 bits of the first word hold
+    // the limit (0x5f), immediately followed by the 64-bit base address.
+    // NOTE(review): limit 0x5f covers slots 0..11 only, while slots 12 and
+    // 13 are written above - confirm whether the limit should be 0x6f.
+    uint64_t descriptor[] = {
+        0x005f'0000'0000'0000, (uintptr_t)(uint64_t*)mem::gdt
+    };
 
-    init_idt();
-    init_mem();
-    init_pic();
-    init_pit();
+    // +6 skips the unused low bytes so lgdt sees limit then base;
+    // selector 0x50 is slot 10 (LDT), selector 0x40 is slot 8 (TSS)
+    asm volatile(
+            "lgdt (%0)\n\t"
+            "mov $0x50, %%ax\n\t"
+            "lldt %%ax\n\t"
+            "mov $0x40, %%ax\n\t"
+            "ltr %%ax\n\t"
+            : : "r"((uintptr_t)descriptor+6): "ax", "memory"
+    );
+}
 
-    kernel::kinit::init_uname();
+// Kernel entry point called by the boot code with the bootloader data block.
+// Sets up CPU features, early paging, the GDT and the physical memory
+// allocator, then switches to a freshly allocated stack and continues in
+// real_kernel_init(). Never returns.
+extern "C" SECTION(".text.kinit")
+void NORETURN kernel_init(bootloader_data* data)
+{
+    enable_sse();
 
-    int ret = init_serial_port(PORT_SERIAL0);
-    assert(ret == GB_OK);
+    setup_early_kernel_page_table();
+    setup_gdt();
+    save_memory_info(data);
 
-    ret = init_console("ttyS0");
-    assert(ret == GB_OK);
+    // highest end address over all usable (type 1) e820 ranges
+    uintptr_t addr_max = 0;
+    for (int i = 0; i < (int)kernel::mem::info::e820_entry_count; ++i) {
+        auto& ent = kernel::mem::info::e820_entries[i];
+        if (ent.type != 1)
+            continue;
+        addr_max = std::max(addr_max, ent.base + ent.len);
+    }
 
-    kernel::kinit::init_pci();
-    init_vfs();
-    init_syscall();
+    setup_buddy(addr_max);
+    init_allocator();
 
-    kmsg("switching execution to the scheduler...\n");
-    init_scheduler();
+    using namespace mem::paging;
+    // 2^9 pages for the initial kernel stack; sp starts just past its end
+    constexpr int init_stack_order = 9;
+    auto kernel_stack_pfn = page_to_pfn(alloc_pages(init_stack_order));
+    auto kernel_stack_ptr =
+        mem::physaddr<std::byte>{kernel_stack_pfn} + (1 << init_stack_order) * 0x1000;
+
+    // Switch stacks and call real_kernel_init(kernel_stack_pfn).
+    // The operands are pinned to fixed registers: the template overwrites
+    // %rsp and %rbp, so the call target must not live in a register the
+    // compiler is free to pick, and the first argument must already be in
+    // %rdi. %1 is not named in the template; the "D" constraint alone places
+    // kernel_stack_pfn in %rdi. The stack operand is read by the very first
+    // instruction, before %rsp and %rbp change.
+    asm volatile(
+            "mov %2, %%rsp\n\t"
+            "xor %%rbp, %%rbp\n\t"
+            "call *%0\n\t"
+            : : "a"(real_kernel_init), "D"(kernel_stack_pfn), "g"(kernel_stack_ptr)
+            : "memory"
+    );
+
+    freeze();
 }
+
+} // namespace kernel::kinit

+ 152 - 59
src/mbr.S

@@ -1,79 +1,172 @@
-.section .text.bootsect
+.section .mbr
 .code16
 
-.globl mbr_start
-mbr_start:
-    movw %cs, %ax
-    movw %ax, %ds
-    movw %ax, %es
-    movw %ax, %ss
-
-# perform a temporary stack
-    movw $stack_base, %ax
-    movw %ax, %bp
-    movw %ax, %sp
-
-# read the first 64k
-    call read_data
-
-# read the following 128k
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-# read the 128k more
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-# read 64k more
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-# loader start
-    jmp 0x8000
-
-read_data:
-    movw $read_data_pack, %si
+# Relocate the boot sector: copy its 512 bytes from 0x7c00 to 0x0e00 and
+# continue at mbr_start through a far jump with segment 0.
+move_mbr:
+    xor %ax, %ax
+    mov %ax, %ds
+    mov %ax, %es
+    mov %ax, %ss
+
+    # build a temporary stack
+    # (it grows down from 0x0e00, the address the sector is copied to)
+    mov $0x0e00, %esp
+    mov %esp, %ebp
+
+    mov $128, %cx # 512 / 4
+    mov $0x7c00, %si
+    mov $0x0e00, %di
+    rep movsl
+
+    ljmp $0x00, $mbr_start
+
+# Read one chunk from disk with BIOS int 0x13, function 0x42, using the
+# packet at read_data_pack (sector count and offset are fixed there).
+# %eax: lba lower 4bytes
+# %edx: destination address
+# %eax, %edx and %ecx are preserved; a BIOS error (CF set) jumps to halt.
+read_disk:
+	push %eax
+	push %edx
+	push %ecx
+
+	# patch the packet: LBA, and destination as a real-mode segment (addr >> 4)
+	mov %eax, read_data_lba
+	shr $4, %edx
+	mov %dx, read_data_segment
+
+    mov $read_data_pack, %si
     mov $0x42, %ah
     mov $0x80, %dl
     int $0x13
-    jc read_data_error
-    ret
+    jc halt
+
+	pop %ecx
+	pop %edx
+	pop %eax
+	ret
+
+# Main body of the boot sector, entered after relocation.
+mbr_start:
+    # clear screen
+    # (int 0x10 with %ah = 0x00 sets the video mode given in %al, here 0x03)
+    mov $0x00, %ah
+    mov $0x03, %al
+    int $0x10
+
+    # read kernel image: 32K * 15 = 480K
+    # starting at LBA 1, loaded to address 0x1000 upwards
+	xor %eax, %eax
+	inc %eax # %eax = 1
+	mov %eax, %edx
+	shl $12, %edx # %edx = 0x1000
+
+	mov $15, %ecx
+_loop_read_kernel:
+	call read_disk
+	add $64, %eax # %eax += 64
+
+	shr $12, %edx
+	add $8, %edx
+	shl $12, %edx # %edx += 32K
+
+	loop _loop_read_kernel
+
+    # get memory size info and store it
+    # (int 0x15, %ax = 0xe801; any failure halts)
+    xor %ecx, %ecx
+    xor %edx, %edx
+	xor %eax, %eax
+    mov $0xe801, %ax
+
+    int $0x15
+    jc halt
+
+    cmp $0x86, %ah # unsupported function
+    je halt
+    cmp $0x80, %ah # invalid command
+    je halt
+
+    # use the %cx/%dx pair unless %cx is zero, in which case %ax/%bx is kept
+    jcxz _get_memory_size_use_ax
+    mov %cx, %ax
+    mov %dx, %bx
+
+_get_memory_size_use_ax:
+    # reserve a 1024 byte block on the stack:
+    #   0: entry count, 4: entry length, 8: 1k blocks, 12: 64k blocks,
+    #   16..: memory map entries
+    sub $1024, %esp
+    movzw %ax, %eax
+    mov %eax, 8(%esp)  # 1k blocks
+    movzw %bx, %ebx
+    mov %ebx, 12(%esp) # 64k blocks
+
+    # save the destination address to es:di
+    lea 16(%esp), %di # buffer is 1024 - 16 bytes
+
+    # clear %ebx, len
+    xor %ebx, %ebx
+    mov %ebx, (%esp)
+
+    # set default entry size
+    movl $20, 4(%esp)
+
+# Query the BIOS memory map (int 0x15, %eax = 0xe820) one entry per
+# iteration. Each entry is stored at es:di in a 24 byte slot; the entry
+# count lives at (%esp) and the recorded entry length at 4(%esp).
+# %ebx carries the BIOS continuation value between calls.
+_e820_mem_map_load_loop:
+    # set the magic number to edx
+    mov $0x534D4150, %edx
+
+    # set function number to eax
+    mov $0xe820, %eax
+
+    # set default entry size
+    mov $24, %ecx
+
+    int $0x15
+
+    # CF set: the call failed or the list has already ended, so nothing
+    # valid was stored. This must be tested right after the interrupt,
+    # because the `add` below overwrites the carry flag.
+    jc _e820_mem_map_load_fin
+
+    # one more valid entry; advance to the next slot
+    incl (%esp)
+    add $24, %edi
+
+    # %ebx == 0: that was the last entry
+    cmp $0, %ebx
+    jz _e820_mem_map_load_fin
+
+    # record 24 as the entry length once the BIOS returns 24 byte entries;
+    # otherwise keep the value already stored
+    cmp $24, %ecx
+    cmovnz 4(%esp), %ecx
+    mov %ecx, 4(%esp)
+
+    jmp _e820_mem_map_load_loop
+
+_e820_mem_map_load_fin:
+    # load GDT and IDT
+    cli
+    lidt null_idt_descriptor
+    lgdt _32bit_gdt_descriptor
+
+    # enable protection enable (PE) bit
+    mov %cr0, %eax
+    or $1, %eax
+    mov %eax, %cr0
+
+    ljmp $0x08, $start_32bit
+
+halt:
     hlt
-    jmp read_data_error
+    jmp halt
 
-.align 4
+.align 16
 read_data_pack:
     .byte 0x10, 0
 read_data_count:
-    .word 128    # sector count (read 64k)
+    .word 64     # sector count (read 32k)
 read_data_offset:
     .word 0x0000 # offset address
 read_data_segment:
-    .word 0x0800 # segment address
+    .word 0x0100 # segment address
 read_data_lba:
     .long 1      # lower 4 bytes of the LBA to read
     .long 0      # higher 2 bytes of the LBA to read
 
-__mbr_code_border__:
-    .long 0xffffffff
+# null IDT descriptor
+# so that exceptions will cause the system to reset
+.align 4
+null_idt_descriptor:
+    .word 0 # size
+    .long 0 # base
 
-.align 16
-stack_edge:
-.space 128
-stack_base:
+.align 4
+_32bit_gdt_descriptor:
+    .word (3 * 8) - 1 # size
+    .long _32bit_gdt  # address
 
-. = 510
-.byte 0x55, 0xaa
+.align 16
+_32bit_gdt:
+    .8byte 0x0                # null selector
+    .8byte 0x00cf9a000000ffff # code selector
+    .8byte 0x00cf92000000ffff # data selector

+ 0 - 15
src/mbr.ld

@@ -1,15 +0,0 @@
-OUTPUT_FORMAT(binary)
-OUTPUT_ARCH(i386:i386)
-
-SECTIONS
-{
-    .text 0x7c00 :
-    {
-        *(.text.bootsect)
-    }
-
-    /DISCARD/ :
-    {
-        *(.note*)
-    }
-}

+ 98 - 87
src/types/elf.cpp

@@ -9,116 +9,121 @@
 
 #include <types/elf.hpp>
 
-#include <kernel/mem.h>
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/vm_area.hpp>
 #include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
 
-#define align16_down(sp) (sp = ((char*)((uint32_t)(sp)&0xfffffff0)))
-
-template <typename T>
-inline void _user_push(char** sp, T d)
+// Push one 32-bit value onto the stack whose current top address is stored
+// in *sp: the address is moved down by four bytes and d is written there.
+static inline void __user_push32(uintptr_t* sp, uint32_t d)
 {
-    *sp -= sizeof(T);
-    *(T*)*sp = d;
+    // TODO: use copy_to_user
+    // *sp is reinterpreted as a uint32_t*, pre-decremented, then stored to
+    *(--*(uint32_t**)sp) = d;
 }
-template <>
-inline void _user_push(char** sp, const char* str)
+
+// Copy a NUL-terminated string onto the stack whose current top address is
+// stored in *sp. Afterwards *sp is the 16-byte aligned address of the copy.
+static inline void __user_push_string32(uintptr_t* sp, const char* str)
 {
     size_t len = strlen(str);
+
+    // make room for the string including its terminator, then round down
     *sp -= (len + 1);
-    align16_down(*sp);
-    memcpy(*sp, str, len + 1);
+    *sp &= ~0xf; // align to 16 bytes
+
+    memcpy((void*)*sp, str, len + 1);
 }
 
-int types::elf::elf32_load(types::elf::elf32_load_data* d)
+int types::elf::elf32_load(types::elf::elf32_load_data& d)
 {
-    auto* ent_exec = d->exec_dent;
-    if (!ent_exec) {
-        d->errcode = ENOENT;
-        return GB_FAILED;
-    }
+    auto& exec = d.exec_dent;
+    if (!exec)
+        return -ENOENT;
 
-    // TODO: detect file format
     types::elf::elf32_header hdr {};
     auto n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)&hdr,
         sizeof(types::elf::elf32_header),
         0, sizeof(types::elf::elf32_header));
 
-    if (n_read != sizeof(types::elf::elf32_header)) {
-        d->errcode = EINVAL;
-        return GB_FAILED;
-    }
+    if (n_read != sizeof(types::elf::elf32_header))
+        return -EINVAL;
+
+    if (hdr.magic[0] != 0x7f || hdr.magic[1] != 'E'
+            || hdr.magic[2] != 'L' || hdr.magic[3] != 'F')
+        return -EINVAL;
 
     size_t phents_size = hdr.phentsize * hdr.phnum;
     size_t shents_size = hdr.shentsize * hdr.shnum;
     std::vector<types::elf::elf32_program_header_entry> phents(hdr.phnum);
     n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)phents.data(),
         phents_size,
         hdr.phoff, phents_size);
 
     // broken file or I/O error
-    if (n_read != phents_size) {
-        d->errcode = EINVAL;
-        return GB_FAILED;
-    }
+    if (n_read != phents_size)
+        return -EINVAL;
 
     std::vector<types::elf::elf32_section_header_entry> shents(hdr.shnum);
     n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)shents.data(),
         shents_size,
         hdr.shoff, shents_size);
 
     // broken file or I/O error
-    if (n_read != shents_size) {
-        d->errcode = EINVAL;
-        return GB_FAILED;
-    }
-
-    // copy argv and envp
-    std::vector<std::string> argv, envp;
-    for (const char* const* p = d->argv; *p; ++p)
-        argv.emplace_back(*p);
-    for (const char* const* p = d->envp; *p; ++p)
-        envp.emplace_back(*p);
+    if (n_read != shents_size)
+        return -EINVAL;
 
-    // from now on, caller process is recycled.
+    // from now on, caller process is gone.
     // so we can't just simply return to it on error.
-    current_process->mms.clear_user();
+    auto& mms = current_process->mms;
+    mms.clear();
 
-    uint32_t data_segment_end = 0;
+    uintptr_t data_segment_end = 0;
 
     for (const auto& phent : phents) {
         if (phent.type != types::elf::elf32_program_header_entry::PT_LOAD)
             continue;
 
-        auto vaddr = align_down<12>(phent.vaddr);
-        auto vlen = align_up<12>(phent.vaddr + phent.memsz) - vaddr;
-        auto flen = align_up<12>(phent.vaddr + phent.filesz) - vaddr;
-        auto fileoff = align_down<12>(phent.offset);
+        auto vaddr = phent.vaddr & ~0xfff;
+        auto vlen = ((phent.vaddr + phent.memsz + 0xfff) & ~0xfff) - vaddr;
+        auto flen = ((phent.vaddr + phent.filesz + 0xfff) & ~0xfff) - vaddr;
+        auto fileoff = phent.offset & ~0xfff;
 
+        using namespace kernel::mem;
         if (flen) {
-            auto ret = mmap(
-                (char*)vaddr,
-                phent.filesz + (phent.vaddr & 0xfff),
-                ent_exec->ind,
-                fileoff,
-                1,
-                d->system);
-
-            if (ret != GB_OK)
+            mm_list::map_args args{};
+
+            args.vaddr = vaddr;
+            args.length = flen;
+            args.file_inode = exec->ind;
+            args.file_offset = fileoff;
+
+            args.flags = MM_MAPPED;
+            if (phent.flags & elf32_program_header_entry::PF_W)
+                args.flags |= MM_WRITE;
+
+            if (phent.flags & elf32_program_header_entry::PF_X)
+                args.flags |= MM_EXECUTE;
+
+            if (auto ret = mms.mmap(args); ret != 0)
                 kill_current(SIGSEGV);
         }
 
         if (vlen > flen) {
-            auto ret = mmap((char*)vaddr + flen, vlen - flen,
-                nullptr, 0, true, d->system);
+            mm_list::map_args args{};
+
+            args.vaddr = vaddr + flen;
+            args.length = vlen - flen;
 
-            if (ret != GB_OK)
+            args.flags = MM_ANONYMOUS;
+            if (phent.flags & elf32_program_header_entry::PF_W)
+                args.flags |= MM_WRITE;
+
+            if (phent.flags & elf32_program_header_entry::PF_X)
+                args.flags |= MM_EXECUTE;
+
+            if (auto ret = mms.mmap(args); ret != 0)
                 kill_current(SIGSEGV);
         }
 
@@ -126,60 +131,66 @@ int types::elf::elf32_load(types::elf::elf32_load_data* d)
             data_segment_end = vaddr + vlen;
     }
 
-    current_process->mms.register_brk((char*)data_segment_end + 0x10000);
+    current_process->mms.register_brk(data_segment_end + 0x10000);
 
     for (const auto& shent : shents) {
         if (shent.sh_type == elf32_section_header_entry::SHT_NOBITS)
-            memset((char*)shent.sh_addr, 0x00, shent.sh_size);
+            memset((char*)(uintptr_t)shent.sh_addr, 0x00, shent.sh_size);
     }
 
     // map stack area
-    auto ret = mmap((void*)types::elf::ELF_STACK_TOP,
-        types::elf::ELF_STACK_SIZE, nullptr, 0, true, false);
+    if (1) {
+        using namespace kernel::mem;
+        mm_list::map_args args{};
 
-    // TODO: destruct local variables before calling kill_current
-    if (ret != GB_OK)
-        kill_current(SIGSEGV);
+        args.vaddr = ELF32_STACK_TOP;
+        args.length = ELF32_STACK_SIZE;
+        args.flags = MM_ANONYMOUS | MM_WRITE;
+
+        if (auto ret = mms.mmap(args); ret != 0)
+            kill_current(SIGSEGV);
+        // TODO: deconstruct local variables before calling kill_current
+    }
 
-    d->eip = (void*)hdr.entry;
-    d->sp = reinterpret_cast<uint32_t*>(types::elf::ELF_STACK_BOTTOM);
+    d.ip = hdr.entry;
+    d.sp = ELF32_STACK_BOTTOM;
 
-    auto* sp = (char**)&d->sp;
+    auto* sp = &d.sp;
 
     // fill information block area
-    std::vector<char*> args, envs;
-    for (const auto& env : envp) {
-        _user_push(sp, env.c_str());
-        envs.push_back(*sp);
+    std::vector<elf32_addr_t> args, envs;
+    for (const auto& env : d.envp) {
+        __user_push_string32(sp, env.c_str());
+        envs.push_back((uintptr_t)*sp);
     }
-    for (const auto& arg : argv) {
-        _user_push(sp, arg.c_str());
-        args.push_back(*sp);
+    for (const auto& arg : d.argv) {
+        __user_push_string32(sp, arg.c_str());
+        args.push_back((uintptr_t)*sp);
     }
 
     // push null auxiliary vector entry
-    _user_push(sp, 0);
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
+    __user_push32(sp, 0);
 
     // push 0 for envp
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
 
     // push envp
-    *sp -= sizeof(void*) * envs.size();
-    memcpy(*sp, envs.data(), sizeof(void*) * envs.size());
+    for (auto ent : envs)
+        __user_push32(sp, ent);
 
     // push 0 for argv
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
 
     // push argv
-    *sp -= sizeof(void*) * args.size();
-    memcpy(*sp, args.data(), sizeof(void*) * args.size());
+    for (int i = args.size()-1; i >= 0; --i)
+        __user_push32(sp, args[i]);
 
     // push argc
-    _user_push(sp, args.size());
+    __user_push32(sp, args.size());
 
     // rename current thread
-    current_thread->name = ent_exec->name;
+    current_thread->name = exec->name;
 
-    return GB_OK;
+    return 0;
 }

+ 1 - 6
src/types/libstdcpp.cpp

@@ -1,8 +1,6 @@
-#include <asm/port_io.h>
 #include <assert.h>
 #include <kernel/log.hpp>
 #include <kernel/process.hpp>
-#include <stdio.h>
 #include <types/types.h>
 
 extern "C" void NORETURN __stack_chk_fail(void)
@@ -20,9 +18,6 @@ extern "C" void NORETURN __cxa_pure_virtual(void)
 void NORETURN
 __assert_fail(const char* statement, const char* file, int line, const char* func)
 {
-    char buf[256];
-    snprintf(buf, sizeof(buf), "Kernel assertion failed: (%s), %s:%d, %s\n",
-        statement, file, line, func);
-    kmsg(buf);
+    kmsgf("Kernel assertion failed: (%s), %s:%d, %s", statement, file, line, func);
     freeze();
 }

+ 3 - 3
user-space-program/CMakeLists.txt

@@ -1,10 +1,10 @@
 cmake_minimum_required(VERSION 3.15)
 project(user_space_program C ASM)
 
-set(CMAKE_C_FLAGS "-nostdlib -nostdinc -static -m32 -W -Wall -Wextra -Werror -mstack-protector-guard=global")
-set(CMAKE_ASM_FLAGS "-nostdlib -m32 -static -mstack-protector-guard=global -g0")
+set(CMAKE_C_FLAGS "-nostdlib -nostdinc -m32 -static -W -Wall -mstack-protector-guard=global")
+set(CMAKE_ASM_FLAGS "-nostdlib -static -m32 -mstack-protector-guard=global")
 
-link_libraries(gblibc crt0)
+link_libraries(gblibc_32 crt0_32)
 add_link_options("LINKER:-melf_i386")
 
 set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "")

Một số tệp không được hiển thị vì có quá nhiều tệp thay đổi trong bản so sánh (diff) này