Browse Source

Merge branch 'long-mode' into vfs-refactor

kernel is now in long mode

TODO: we still have no real 64bit elf executable support
greatbridf 9 months ago
parent
commit
e526dc0708
89 changed files with 4861 additions and 4450 deletions
  1. 21 28
      CMakeLists.txt
  2. 3 2
      Makefile.src
  3. 3 8
      configure
  4. 29 8
      doc/mem_layout.txt
  5. 34 1
      gblibc/CMakeLists.txt
  6. 0 1
      gblibc/include/bits/alltypes.h
  7. 1 1
      gblibc/include/stdint.h
  8. 6 6
      gblibc/include/sys/types.h
  9. 1 1
      gblibc/src/fcntl.c
  10. 4 0
      gblibstdc++/include/bits/rbtree
  11. 1 1
      gblibstdc++/include/functional
  12. 2 2
      gblibstdc++/include/string
  13. 0 36
      include/asm/port_io.h
  14. 0 27
      include/asm/sys.h
  15. 1 4
      include/fs/fat.hpp
  16. 10 7
      include/kernel/async/lock.hpp
  17. 0 15
      include/kernel/hw/keyboard.h
  18. 0 1
      include/kernel/hw/serial.hpp
  19. 0 17
      include/kernel/hw/timer.h
  20. 11 0
      include/kernel/hw/timer.hpp
  21. 0 61
      include/kernel/interrupt.h
  22. 74 0
      include/kernel/interrupt.hpp
  23. 2 2
      include/kernel/log.hpp
  24. 0 141
      include/kernel/mem.h
  25. 107 0
      include/kernel/mem/mm_list.hpp
  26. 192 0
      include/kernel/mem/paging.hpp
  27. 65 0
      include/kernel/mem/phys.hpp
  28. 40 0
      include/kernel/mem/slab.hpp
  29. 36 0
      include/kernel/mem/types.hpp
  30. 46 0
      include/kernel/mem/vm_area.hpp
  31. 0 400
      include/kernel/mm.hpp
  32. 1 1
      include/kernel/module.hpp
  33. 12 20
      include/kernel/process.hpp
  34. 2 2
      include/kernel/signal.hpp
  35. 111 10
      include/kernel/syscall.hpp
  36. 0 18
      include/kernel/task.h
  37. 12 6
      include/kernel/task/thread.hpp
  38. 11 13
      include/kernel/tty.hpp
  39. 1 3
      include/kernel/user/thread_local.hpp
  40. 1 1
      include/kernel/vfs.hpp
  41. 10 47
      include/types/allocator.hpp
  42. 157 19
      include/types/elf.hpp
  43. 19 8
      include/types/hash_map.hpp
  44. 43 0
      include/types/list.hpp
  45. 0 22
      include/types/size.h
  46. 0 4
      include/types/status.h
  47. 6 2
      include/types/types.h
  48. 16 0
      include/types/user_types.hpp
  49. 1 1
      init_script.sh
  50. 7 22
      pretty-print.py
  51. 105 247
      src/asm/interrupt.s
  52. 0 56
      src/asm/port_io.s
  53. 0 53
      src/asm/sys.s
  54. 178 287
      src/boot.s
  55. 29 51
      src/fs/fat.cpp
  56. 26 3
      src/fs/procfs.cc
  57. 62 52
      src/kernel.ld
  58. 121 36
      src/kernel/allocator.cc
  59. 16 16
      src/kernel/async/lock.cc
  60. 43 52
      src/kernel/hw/ahci.cc
  61. 0 31
      src/kernel/hw/keyboard.cpp
  62. 124 0
      src/kernel/hw/serial.cc
  63. 0 71
      src/kernel/hw/serial.cpp
  64. 0 26
      src/kernel/hw/timer.c
  65. 31 0
      src/kernel/hw/timer.cc
  66. 100 268
      src/kernel/interrupt.cpp
  67. 0 586
      src/kernel/mem.cpp
  68. 348 0
      src/kernel/mem/mm_list.cc
  69. 448 0
      src/kernel/mem/paging.cc
  70. 125 0
      src/kernel/mem/slab.cc
  71. 122 148
      src/kernel/process.cpp
  72. 20 21
      src/kernel/signal.cpp
  73. 265 1079
      src/kernel/syscall.cpp
  74. 538 8
      src/kernel/syscall/fileops.cc
  75. 51 0
      src/kernel/syscall/infoops.cc
  76. 6 7
      src/kernel/syscall/mount.cc
  77. 391 0
      src/kernel/syscall/procops.cc
  78. 17 0
      src/kernel/task/readyqueue.cc
  79. 78 60
      src/kernel/task/thread.cc
  80. 14 17
      src/kernel/tty.cpp
  81. 14 13
      src/kernel/user/thread_local.cc
  82. 11 11
      src/kernel/vfs.cpp
  83. 17 16
      src/kernel/vfs/tmpfs.cc
  84. 208 96
      src/kinit.cpp
  85. 152 59
      src/mbr.S
  86. 0 15
      src/mbr.ld
  87. 98 87
      src/types/elf.cpp
  88. 1 6
      src/types/libstdcpp.cpp
  89. 3 3
      user-space-program/CMakeLists.txt

+ 21 - 28
CMakeLists.txt

@@ -6,11 +6,11 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_CXX_LINK_EXECUTABLE
     "<CMAKE_LINKER> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
 
-set(CMAKE_ASM_FLAGS "-m32")
-set(C_CXX_FLAGS "-nostdinc -m32 -nostdlib -W -Wall -Wextra -Wno-stringop-overflow -Wno-builtin-declaration-mismatch -Wno-format -fverbose-asm -fno-exceptions -ffreestanding -fno-pic -mstack-protector-guard=global")
+set(C_CXX_FLAGS "-nostdinc -nostdlib -W -Wall -Wextra -Wno-stringop-overflow -Wno-builtin-declaration-mismatch -Wno-format -fverbose-asm -fno-exceptions -ffreestanding -fno-pic -mno-red-zone -mstack-protector-guard=global -mcmodel=kernel")
 set(CMAKE_C_FLAGS "${C_CXX_FLAGS} -Werror=implicit-int -Werror=implicit-function-declaration -Werror=strict-aliasing")
 set(CMAKE_CXX_FLAGS "${C_CXX_FLAGS} -fno-use-cxa-atexit -fno-rtti")
 set(CMAKE_CXX_LINK_FLAGS "")
+SET(CMAKE_ASM_FLAGS "${CFLAGS} -x assembler-with-cpp")
 set(CMAKE_CXX_STANDARD 20)
 
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -30,9 +30,8 @@ add_subdirectory(gblibstdc++)
 add_subdirectory(user-space-program)
 
 set(BOOTLOADER_SOURCES src/boot.s
+                       src/mbr.S
                        src/asm/interrupt.s
-                       src/asm/port_io.s
-                       src/asm/sys.s
                        )
 
 set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
@@ -46,16 +45,19 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         src/kernel/tty.cpp
                         src/kernel/syscall.cpp
                         src/kernel/syscall/fileops.cc
+                        src/kernel/syscall/infoops.cc
                         src/kernel/syscall/mount.cc
-                        src/kernel/mem.cpp
+                        src/kernel/syscall/procops.cc
+                        src/kernel/mem/mm_list.cc
+                        src/kernel/mem/paging.cc
+                        src/kernel/mem/slab.cc
                         src/kernel/module.cc
                         src/kernel/vfs.cpp
                         src/kernel/vga.cpp
                         src/kernel/hw/ahci.cc
-                        src/kernel/hw/keyboard.cpp
                         src/kernel/hw/pci.cc
-                        src/kernel/hw/serial.cpp
-                        src/kernel/hw/timer.c
+                        src/kernel/hw/serial.cc
+                        src/kernel/hw/timer.cc
                         src/kernel/task/thread.cc
                         src/kernel/task/readyqueue.cc
                         src/kernel/user/thread_local.cc
@@ -63,18 +65,19 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         src/kernel/signal.cpp
                         src/types/elf.cpp
                         src/types/libstdcpp.cpp
-                        include/asm/port_io.h
-                        include/asm/sys.h
                         include/fs/fat.hpp
                         include/kernel/async/waitlist.hpp
                         include/kernel/async/lock.hpp
                         include/kernel/tty.hpp
-                        include/kernel/interrupt.h
+                        include/kernel/interrupt.hpp
                         include/kernel/irq.hpp
                         include/kernel/process.hpp
                         include/kernel/syscall.hpp
-                        include/kernel/mem.h
-                        include/kernel/mm.hpp
+                        include/kernel/mem/mm_list.hpp
+                        include/kernel/mem/paging.hpp
+                        include/kernel/mem/slab.hpp
+                        include/kernel/mem/types.hpp
+                        include/kernel/mem/vm_area.hpp
                         include/kernel/module.hpp
                         include/kernel/utsname.hpp
                         include/kernel/vfs.hpp
@@ -87,20 +90,18 @@ set(KERNEL_MAIN_SOURCES src/fs/fat.cpp
                         include/kernel/task/forward.hpp
                         include/kernel/task/thread.hpp
                         include/kernel/task/readyqueue.hpp
-                        include/kernel/hw/keyboard.h
                         include/kernel/hw/pci.hpp
                         include/kernel/hw/port.hpp
-                        include/kernel/hw/serial.h
-                        include/kernel/hw/timer.h
+                        include/kernel/hw/serial.hpp
+                        include/kernel/hw/timer.hpp
                         include/kernel/input/keycodes.h
                         include/kernel/user/thread_local.hpp
                         include/types/bitmap.hpp
                         include/types/buffer.hpp
                         include/types/elf.hpp
                         include/types/hash_map.hpp
+                        include/types/list.hpp
                         include/types/types.h
-                        include/types/size.h
-                        include/types/status.h
                         include/types/allocator.hpp
                         include/types/cplusplus.hpp
                         include/kernel/log.hpp
@@ -110,26 +111,18 @@ add_executable(kernel.out ${KERNEL_MAIN_SOURCES} ${BOOTLOADER_SOURCES})
 target_link_libraries(kernel.out gblibc gblibstdc++)
 target_include_directories(kernel.out PRIVATE ${PROJECT_SOURCE_DIR}/include)
 target_link_options(kernel.out PRIVATE
-    -T ${CMAKE_SOURCE_DIR}/src/kernel.ld -melf_i386 -lgblibc -L${CMAKE_BINARY_DIR}/gblibc)
+    -T ${CMAKE_SOURCE_DIR}/src/kernel.ld -lgblibc -L${CMAKE_BINARY_DIR}/gblibc)
 set_target_properties(kernel.out PROPERTIES LINK_DEPENDS ${CMAKE_SOURCE_DIR}/src/kernel.ld)
 
-add_custom_command(OUTPUT mbr.bin
-    DEPENDS ${PROJECT_SOURCE_DIR}/src/mbr.S ${PROJECT_SOURCE_DIR}/src/mbr.ld
-    COMMAND ${CMAKE_ASM_COMPILER} -m32 -c ${PROJECT_SOURCE_DIR}/src/mbr.S -o mbr.o
-    COMMAND ${CMAKE_LINKER} -T ${PROJECT_SOURCE_DIR}/src/mbr.ld mbr.o -o mbr.bin
-)
-
 add_custom_command(OUTPUT mbr_hole.bin
     DEPENDS kernel.out
     COMMAND ${CMAKE_OBJCOPY} --strip-debug -O binary ${CMAKE_BINARY_DIR}/kernel.out mbr_hole.bin
 )
 
 add_custom_target(boot.img
-    DEPENDS mbr.bin
     DEPENDS mbr_hole.bin
     DEPENDS user_space_programs
-    COMMAND dd if=mbr.bin of=boot.img
-    COMMAND cat mbr_hole.bin >> boot.img
+    COMMAND dd if=mbr_hole.bin of=boot.img
     COMMAND dd if=/dev/zero of=boot.img bs=`expr 512 \\* 1024 \\* 1024` count=0 seek=1
     COMMAND sh -c \"echo n\; echo\; echo\; echo\; echo\; echo a\; echo w\" | ${FDISK_BIN} boot.img
     COMMAND mkfs.fat --offset=2048 -v -n SYSTEM boot.img

+ 3 - 2
Makefile.src

@@ -6,7 +6,7 @@ QEMU_DEBUG_FLAG=#-d cpu_reset,int
 QEMU_ARGS=-machine q35 -drive id=disk,file=build/boot.img,format=raw,if=none \
 	-device ahci,id=ahci -device ide-hd,drive=disk,bus=ahci.0 \
 	-no-reboot -no-shutdown $(QEMU_ACCELERATION_FLAG) $(QEMU_DEBUG_FLAG)
-	
+
 CROSS_COMPILE=##PLACEHOLDER_4##
 .PHONY: run
 run: build
@@ -42,7 +42,8 @@ clean-all: clean
 
 .PHONY: debug
 debug:
-	$(GDB_BIN) --symbols=build/kernel.out --init-eval-command 'source pretty-print.py' --init-eval-command 'set pagination off' --init-eval-command 'target remote:1234' --eval-command 'hbr _kernel_init' --eval-command 'c'
+	-$(GDB_BIN) --symbols=build/kernel.out --init-eval-command 'source pretty-print.py' --init-eval-command 'set pagination off' --init-eval-command 'target remote:1234' --init-eval-command 'layout regs' --eval-command 'hbr _kernel_init' --eval-command 'c'
+	-killall $(QEMU_BIN)
 
 build/boot.vdi: build/boot.img
 	-rm build/boot.vdi

+ 3 - 8
configure

@@ -1,5 +1,5 @@
 #!/bin/sh
-QEMU_EXECUTABLES="qemu-system-i386 qemu-system-x86_64"
+QEMU_EXECUTABLES="qemu-system-x86_64"
 GDB_EXECUTABLES="gdb x86_64-elf-gdb"
 
 event() {
@@ -77,13 +77,8 @@ case "$OS" in
         QEMU_ACCEL='-enable-kvm'
         ;;
     "Darwin")
-        if [ "$QEMU" = "qemu-system-x86_64" ]; then
-            echo "hvf"
-            QEMU_ACCEL='-accel hvf'
-        else
-            echo "tcg"
-            QEMU_ACCEL='-accel tcg'
-        fi
+        echo "tcg"
+        QEMU_ACCEL='-accel tcg'
         ;;
 esac
 

+ 29 - 8
doc/mem_layout.txt

@@ -1,12 +1,33 @@
-0x00000000 - 0x00001000 kernel pd
-0x00001000 - 0x00005000 kernel pt
-0x00005000 - 0x00006000 empty page
+physical memory
 
-....
+0x0000 - 0x1000: GDT, TSS, LDT and some early kernel data
+0x1000 - 0x2000: kernel stage1
+0x2000 - ?     : kernel image
 
-0x00100000 - 0x???????? kernel code, data, bss
-0x???????? - 0x01000000 kernel early stack
+0x100000 - 0x101000 : kernel PML4
+0x101000 - 0x102000 : kernel PDPT for physical memory mappings
+0x102000 - 0x103000 : kernel PDPT for kernel space
+0x103000 - 0x104000 : kernel PD for kernel image
+0x104000 - 0x105000 : kernel PT for kernel image
+0x105000 - 0x106000 : kernel PD for struct page array#1
 
-....
+0x106000 - 0x200000 : unused empty pages
+0x200000 - 0x400000 : first kernel bss page (2MB)
 
-0x30000000 - 0x40000000 kernel heap
+
+virtual address space
+
+0xffff ff0 000 000 000 - 0xffff ff3 fff fff fff  256GB physical memory (cached)
+0xffff ff4 000 000 000 - 0xffff ff7 fff fff fff  256GB physical memory (not cached)
+0xffff ff8 000 000 000 - 0xffff ff8 03f fff fff    1GB unused
+0xffff ff8 040 000 000 - 0xffff ff8 13f fff fff    4GB struct page array
+0xffff ff8 140 000 000 - 0xffff ff8 17f fff fff    1GB unused
+0xffff ff8 180 000 000 - 0xffff ffb fff fff fff  250GB kernel heap
+
+0xffff ffc 000 000 000 - 0xffff fff fbf fff fff  255GB unused
+
+0xffff fff fc0 000 000 - 0xffff fff fc0 1ff fff    2MB unused
+0xffff fff fc0 200 000 - 0xffff fff fff 9ff fff 1016MB kernel bss
+0xffff fff fff a00 000 - 0xffff fff fff bff fff    2MB unused
+0xffff fff fff c00 000 - 0xffff fff fff dff fff    2MB kernel image
+0xffff fff fff e00 000 - 0xffff fff fff fff fff    2MB unused

+ 34 - 1
gblibc/CMakeLists.txt

@@ -21,10 +21,36 @@ add_library(gblibc STATIC
     src/platform-independent.s
 )
 
-add_library(crt0 OBJECT
+add_library(gblibc_32 STATIC
+    src/stdio.c
+    src/arithmetic.c
+    src/string.c
+    src/fcntl.c
+    src/unistd.c
+    src/wait.c
+    src/assert.c
+    src/dirent.c
+    src/ctype.c
+    src/stdlib.c
+    src/errno.c
+    src/init.c
+    src/internal.c
+    src/stat.c
+    src/time.c
+    src/signal.c
+    src/platform-independent.s
+)
+
+add_library(crt0_32 OBJECT
     src/crt0.s
 )
 
+target_compile_options(gblibc_32 PRIVATE "-m32")
+target_compile_options(gblibc_32 PRIVATE "-mcmodel=32")
+target_compile_options(crt0_32 PRIVATE "-m32")
+target_link_options(gblibc_32 PRIVATE "LINKER:-melf_i386")
+target_link_options(crt0_32 PRIVATE "LINKER:-melf_i386")
+
 file(GLOB_RECURSE GBLIBC_PUBLIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
 target_include_directories(gblibc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
@@ -33,3 +59,10 @@ target_include_directories(gblibc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
 set_target_properties(gblibc PROPERTIES PRIVATE_HEADER
     "private-include/devutil.h,private-include/syscall.h")
 set_target_properties(gblibc PROPERTIES PUBLIC_HEADER "${GBLIBC_PUBLIC_HEADERS}")
+
+target_include_directories(gblibc_32 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
+                                  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/private-include)
+
+set_target_properties(gblibc_32 PROPERTIES PRIVATE_HEADER
+    "private-include/devutil.h,private-include/syscall.h")
+set_target_properties(gblibc_32 PROPERTIES PUBLIC_HEADER "${GBLIBC_PUBLIC_HEADERS}")

+ 0 - 1
gblibc/include/bits/alltypes.h

@@ -13,7 +13,6 @@ typedef size_t blkcnt_t;
 struct timespec {
     time_t tv_sec;
     long tv_nsec;
-    int : 32; // padding
 };
 
 struct timeval {

+ 1 - 1
gblibc/include/stdint.h

@@ -22,7 +22,7 @@ typedef __UINTPTR_TYPE__ uintptr_t;
 typedef __INTPTR_TYPE__ intptr_t;
 
 typedef __SIZE_TYPE__ size_t;
-typedef int32_t ssize_t;
+typedef int64_t ssize_t;
 
 typedef uint64_t time_t;
 typedef int64_t time_diff_t;

+ 6 - 6
gblibc/include/sys/types.h

@@ -8,16 +8,16 @@ extern "C" {
 #endif
 
 typedef int pid_t;
-typedef uint32_t ino_t;
-typedef int32_t off_t;
-typedef uint32_t dev_t;
+typedef unsigned long ino_t;
+typedef long off_t;
+typedef unsigned dev_t;
 typedef unsigned uid_t;
 typedef unsigned gid_t;
-typedef unsigned mode_t;
+typedef unsigned short mode_t;
 typedef unsigned long nlink_t;
 
-typedef uint64_t ino64_t;
-typedef int64_t off64_t;
+typedef unsigned long long ino64_t;
+typedef long long off64_t;
 
 typedef off64_t loff_t;
 

+ 1 - 1
gblibc/src/fcntl.c

@@ -12,7 +12,7 @@ int open(const char* filename, int flags, ...)
         va_list vl;
         va_start(vl, flags);
 
-        ret = syscall3(SYS_open, (uint32_t)filename, flags, va_arg(vl, mode_t));
+        ret = syscall3(SYS_open, (uint32_t)filename, flags, va_arg(vl, int));
 
         va_end(vl);
     }

+ 4 - 0
gblibstdc++/include/bits/rbtree

@@ -369,6 +369,8 @@ public:
         root = copy(other.root);
         if (root)
             root->parent = nullptr;
+
+        return *this;
     }
     
     constexpr rbtree& operator=(rbtree&& other) noexcept
@@ -380,6 +382,8 @@ public:
         if constexpr (node_alloc_traits::
             propagate_on_container_move_assignment::value)
             alloc = std::move(other.alloc);
+
+        return *this;
     }
 
     constexpr void rotateleft(node* rt)

+ 1 - 1
gblibstdc++/include/functional

@@ -147,7 +147,7 @@ public:
     using result_type = Ret;
 
 private:
-    static constexpr std::size_t STACK_ALLOCATED_SIZE = 12;
+    static constexpr std::size_t STACK_ALLOCATED_SIZE = 24;
 
     char _data[STACK_ALLOCATED_SIZE];
     using fb_t = __inner::_function_base<Ret, Args...>;

+ 2 - 2
gblibstdc++/include/string

@@ -493,12 +493,12 @@ public:
 
     constexpr int compare(const basic_string& str) const noexcept
     {
-        return traits_type::compare(c_str(), str.c_str(), size());
+        return traits_type::compare(c_str(), str.c_str(), size()+1);
     }
 
     constexpr int compare(const Char* str) const
     {
-        return traits_type::compare(c_str(), str, size());
+        return traits_type::compare(c_str(), str, size()+1);
     }
 };
 

+ 0 - 36
include/asm/port_io.h

@@ -1,36 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-typedef uint16_t port_id_t;
-
-#define PORT_PIC1 (0x20)
-#define PORT_PIC2 (0xa0)
-#define PORT_PIC1_COMMAND (PORT_PIC1)
-#define PORT_PIC1_DATA ((PORT_PIC1) + 1)
-#define PORT_PIC2_COMMAND (PORT_PIC2)
-#define PORT_PIC2_DATA ((PORT_PIC2) + 1)
-
-#define PORT_KEYBOARD_COMMAND (0x64)
-#define PORT_KEYBOARD_DATA (0x60)
-
-#define PORT_PIT_CONTROL (0x43)
-#define PORT_PIT_COUNT (0x40)
-
-#define PORT_KEYDATA 0x0060u
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern void asm_outb(port_id_t port_number, uint8_t data);
-extern uint8_t asm_inb(port_id_t port_number);
-
-extern void asm_hlt(void);
-extern void asm_cli(void);
-extern void asm_sti(void);
-extern void asm_enable_sse(void);
-
-#ifdef __cplusplus
-}
-#endif

+ 0 - 27
include/asm/sys.h

@@ -1,27 +0,0 @@
-#pragma once
-
-#include <kernel/mem.h>
-#include <types/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void asm_switch_pd(page_t pd_addr);
-void asm_enable_paging(pd_t pd_addr);
-
-pptr_t current_pd(void);
-
-// the limit should be set on the higher 16bit
-// e.g. (n * sizeof(segment_descriptor) - 1) << 16
-void asm_load_gdt(uint32_t limit, pptr_t addr);
-
-void asm_load_tr(uint16_t index);
-
-extern const uint32_t kernel_size;
-extern char* const bss_addr;
-extern const uint32_t bss_len;
-
-#ifdef __cplusplus
-}
-#endif

+ 1 - 4
include/fs/fat.hpp

@@ -7,9 +7,6 @@
 #include <string.h>
 #include <sys/types.h>
 
-#include <types/size.h>
-
-#include <kernel/mem.h>
 #include <kernel/vfs.hpp>
 
 namespace fs::fat {
@@ -129,10 +126,10 @@ private:
     char label[12];
     std::vector<cluster_t> fat;
 
+    // TODO: dirty flag
     struct buf_object {
         char* data;
         int ref;
-        // bool dirty;
     };
     std::map<cluster_t, buf_object> buf;
 

+ 10 - 7
include/kernel/async/lock.hpp

@@ -1,11 +1,14 @@
 #pragma once
 
+#include <cstddef>
+
 #include <stdint.h>
 
 namespace kernel::async {
 
-using spinlock_t = uint32_t volatile;
-using preempt_count_t = size_t;
+using spinlock_t = unsigned long volatile;
+using lock_context_t = unsigned long;
+using preempt_count_t = std::size_t;
 
 void preempt_disable();
 void preempt_enable();
@@ -16,8 +19,8 @@ void init_spinlock(spinlock_t& lock);
 void spin_lock(spinlock_t& lock);
 void spin_unlock(spinlock_t& lock);
 
-uint32_t spin_lock_irqsave(spinlock_t& lock);
-void spin_unlock_irqrestore(spinlock_t& lock, uint32_t state);
+lock_context_t spin_lock_irqsave(spinlock_t& lock);
+void spin_unlock_irqrestore(spinlock_t& lock, lock_context_t context);
 
 class mutex {
 private:
@@ -31,8 +34,8 @@ public:
     void lock();
     void unlock();
 
-    uint32_t lock_irq();
-    void unlock_irq(uint32_t state);
+    lock_context_t lock_irq();
+    void unlock_irq(lock_context_t state);
 };
 
 class lock_guard {
@@ -50,7 +53,7 @@ public:
 class lock_guard_irq {
 private:
     mutex& m_mtx;
-    uint32_t state;
+    lock_context_t state;
 
 public:
     explicit inline lock_guard_irq(mutex& mtx)

+ 0 - 15
include/kernel/hw/keyboard.h

@@ -1,15 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-// TODO: this whole thing needs rewriting
-
-int32_t keyboard_has_data(void);
-
-void process_keyboard_data(void);
-
-#ifdef __cplusplus
-extern "C" void handle_keyboard_interrupt(void);
-#else
-void handle_keyboard_interrupt(void);
-#endif

+ 0 - 1
include/kernel/hw/serial.h → include/kernel/hw/serial.hpp

@@ -1,5 +1,4 @@
 #pragma once
-#include <asm/port_io.h>
 
 #ifdef __cplusplus
 extern "C" {

+ 0 - 17
include/kernel/hw/timer.h

@@ -1,17 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void init_pit(void);
-
-void inc_tick(void);
-
-size_t current_ticks(void);
-
-#ifdef __cplusplus
-}
-#endif

+ 11 - 0
include/kernel/hw/timer.hpp

@@ -0,0 +1,11 @@
+#pragma once
+
+#include <cstddef>
+
+namespace kernel::hw::timer {
+void init_pit(void);
+void inc_tick(void);
+
+std::size_t current_ticks(void);
+
+}

+ 0 - 61
include/kernel/interrupt.h

@@ -1,61 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
-#define USER_INTERRUPT_GATE_TYPE (0xee)
-
-#define PIC_EOI (0x20)
-
-struct regs_32 {
-    uint32_t edi;
-    uint32_t esi;
-    uint32_t ebp;
-    uint32_t esp;
-    uint32_t ebx;
-    uint32_t edx;
-    uint32_t ecx;
-    uint32_t eax;
-};
-
-struct interrupt_stack {
-    struct regs_32 s_regs;
-    void* v_eip;
-    uint32_t cs;
-    uint32_t eflags;
-    uint32_t esp;
-    uint32_t ss;
-};
-
-struct mmx_registers {
-    uint8_t data[512]; // TODO: list of content
-};
-
-// present: When set, the page fault was caused by a page-protection violation.
-//          When not set, it was caused by a non-present page.
-// write:   When set, the page fault was caused by a write access.
-//          When not set, it was caused by a read access.
-// user:    When set, the page fault was caused while CPL = 3.
-//          This does not necessarily mean that the page fault was a privilege violation.
-// from https://wiki.osdev.org/Exceptions#Page_Fault
-struct page_fault_error_code {
-    uint32_t present : 1;
-    uint32_t write : 1;
-    uint32_t user : 1;
-    uint32_t reserved_write : 1;
-    uint32_t instruction_fetch : 1;
-    uint32_t protection_key : 1;
-    uint32_t shadow_stack : 1;
-    uint32_t software_guard_extensions : 1;
-};
-
-void init_idt(void);
-void init_pic(void);
-
-#ifdef __cplusplus
-}
-#endif

+ 74 - 0
include/kernel/interrupt.hpp

@@ -0,0 +1,74 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <types/types.h>
+
+struct saved_regs {
+    unsigned long rax;
+    unsigned long rbx;
+    unsigned long rcx;
+    unsigned long rdx;
+    unsigned long rdi;
+    unsigned long rsi;
+    unsigned long r8;
+    unsigned long r9;
+    unsigned long r10;
+    unsigned long r11;
+    unsigned long r12;
+    unsigned long r13;
+    unsigned long r14;
+    unsigned long r15;
+    unsigned long rbp;
+};
+
+struct PACKED interrupt_stack_head {
+    saved_regs s_regs;
+    unsigned long int_no;
+};
+
+struct PACKED interrupt_stack_normal {
+    interrupt_stack_head head;
+    uintptr_t v_rip;
+    unsigned long cs;
+    unsigned long flags;
+    uintptr_t rsp;
+    unsigned long ss;
+};
+
+struct PACKED interrupt_stack_with_code {
+    interrupt_stack_head head;
+    unsigned long error_code;
+    uintptr_t v_rip;
+    unsigned long cs;
+    unsigned long flags;
+    uintptr_t rsp;
+    unsigned long ss;
+};
+
+struct mmx_registers {
+    uint8_t data[512]; // TODO: list of content
+};
+
+// present: When set, the page fault was caused by a page-protection violation.
+//          When not set, it was caused by a non-present page.
+// write:   When set, the page fault was caused by a write access.
+//          When not set, it was caused by a read access.
+// user:    When set, the page fault was caused while CPL = 3.
+//          This does not necessarily mean that the page fault was a privilege violation.
+// from https://wiki.osdev.org/Exceptions#Page_Fault
+struct page_fault_error_code {
+    unsigned long present : 1;
+    unsigned long write : 1;
+    unsigned long user : 1;
+    unsigned long reserved_write : 1;
+    unsigned long instruction_fetch : 1;
+    unsigned long protection_key : 1;
+    unsigned long shadow_stack : 1;
+    unsigned long software_guard_extensions : 1;
+};
+
+namespace kernel::kinit {
+void init_interrupt();
+
+} // namespace kernel::kinit

+ 2 - 2
include/kernel/log.hpp

@@ -8,7 +8,7 @@
     if (1) {\
         char buf[512]; \
         snprintf(buf, sizeof(buf), fmt "\n" __VA_OPT__(,) __VA_ARGS__); \
-        console->print(buf); \
+        if (kernel::tty::console) kernel::tty::console->print(buf); \
     }
 
-#define kmsg(msg) console->print(msg)
+#define kmsg(msg) if (kernel::tty::console) kernel::tty::console->print(msg "\n")

+ 0 - 141
include/kernel/mem.h

@@ -1,141 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-#include <types/size.h>
-
-#define PAGE_SIZE (0x1000)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// don't forget to add the initial 1m to the total
-struct mem_size_info {
-    uint16_t n_1k_blks; // memory between 1m and 16m in 1k blocks
-    uint16_t n_64k_blks; // memory above 16m in 64k blocks
-};
-
-struct e820_mem_map_entry_20 {
-    uint64_t base;
-    uint64_t len;
-    uint32_t type;
-};
-
-struct e820_mem_map_entry_24 {
-    struct e820_mem_map_entry_20 in;
-    uint32_t acpi_extension_attr;
-};
-
-/*
- * page directory entry
- *
- * p   : present (1)
- * rw  : allow write (1)
- * us  : allow user access (1)
- * pwt : todo
- * pcd : todo
- * a   : accessed for linear address translation (1)
- * d   : dirty (1) (ignored)
- * ps  : use 4MiB pages (ignored)
- * addr: page table address
- */
-typedef union pde_t {
-    uint32_t v;
-    struct {
-        uint32_t p : 1;
-        uint32_t rw : 1;
-        uint32_t us : 1;
-        uint32_t pwt : 1;
-        uint32_t pcd : 1;
-        uint32_t a : 1;
-        uint32_t d : 1;
-        uint32_t ps : 1;
-        uint32_t ignored : 4;
-        page_t pt_page : 20;
-    } in;
-} pde_t;
-typedef pde_t (*pd_t)[1024];
-
-/*
- * page table entry
- *
- * p   : present (1)
- * rw  : allow write (1)
- * us  : allow user access (1)
- * pwt : todo
- * pcd : todo
- * a   : accessed for linear address translation (1)
- * d   : dirty (1)
- * pat : todo (ignored)
- * g   : used in cr4 mode (ignored)
- * addr: physical memory address
- */
-typedef union pte_t {
-    uint32_t v;
-    struct {
-        uint32_t p : 1;
-        uint32_t rw : 1;
-        uint32_t us : 1;
-        uint32_t pwt : 1;
-        uint32_t pcd : 1;
-        uint32_t a : 1;
-        uint32_t d : 1;
-        uint32_t pat : 1;
-        uint32_t g : 1;
-        uint32_t ignored : 3;
-        page_t page : 20;
-    } in;
-} pte_t;
-typedef pte_t (*pt_t)[1024];
-
-// in mem.cpp
-extern uint8_t e820_mem_map[1024];
-extern uint32_t e820_mem_map_count;
-extern uint32_t e820_mem_map_entry_size;
-extern struct mem_size_info mem_size_info;
-
-#define KERNEL_HEAP_START ((void*)0xd0000000)
-#define KERNEL_HEAP_LIMIT ((void*)0xd4000000)
-
-#define EARLY_KERNEL_PD_PAGE ((page_t)0x000001)
-
-void init_mem(void);
-
-#define KERNEL_CODE_SEGMENT (0x08)
-#define KERNEL_DATA_SEGMENT (0x10)
-#define USER_CODE_SEGMENT (0x18)
-#define USER_DATA_SEGMENT (0x20)
-#define USER_CODE_SELECTOR (USER_CODE_SEGMENT | 3)
-#define USER_DATA_SELECTOR (USER_DATA_SEGMENT | 3)
-
-#define SD_TYPE_CODE_SYSTEM (0x9a)
-#define SD_TYPE_DATA_SYSTEM (0x92)
-
-#define SD_TYPE_CODE_USER (0xfa)
-#define SD_TYPE_DATA_USER (0xf2)
-
-#define SD_TYPE_TSS (0x89)
-
-typedef struct segment_descriptor_struct {
-    uint64_t limit_low : 16;
-    uint64_t base_low : 16;
-    uint64_t base_mid : 8;
-    uint64_t access : 8;
-    uint64_t limit_high : 4;
-    uint64_t flags : 4;
-    uint64_t base_high : 8;
-} segment_descriptor;
-
-// in mem.cpp
-extern segment_descriptor gdt[7];
-
-void create_segment_descriptor(
-    segment_descriptor* sd,
-    uint32_t base,
-    uint32_t limit,
-    uint32_t flags,
-    uint32_t access);
-
-#ifdef __cplusplus
-}
-#endif

+ 107 - 0
include/kernel/mem/mm_list.hpp

@@ -0,0 +1,107 @@
+#pragma once
+
+#include <set>
+
+#include <stdint.h>
+
+#include "vm_area.hpp"
+#include "paging.hpp"
+
+namespace kernel::mem {
+
+constexpr uintptr_t KERNEL_SPACE_START    = 0x8000000000000000ULL;
+constexpr uintptr_t USER_SPACE_MEMORY_TOP = 0x0000800000000000ULL;
+constexpr uintptr_t MMAP_MIN_ADDR         = 0x0000000000001000ULL;
+constexpr uintptr_t STACK_MIN_ADDR        = 0x0000700000000000ULL;
+
+class mm_list {
+private:
+    struct comparator {
+        constexpr bool operator()(const vm_area& lhs, const vm_area& rhs) const noexcept
+        { return lhs < rhs; }
+        constexpr bool operator()(const vm_area& lhs, uintptr_t rhs) const noexcept
+        { return lhs < rhs; }
+        constexpr bool operator()(uintptr_t lhs, const vm_area& rhs) const noexcept
+        { return lhs < rhs; }
+    };
+
+public:
+    using list_type = std::set<vm_area, comparator>;
+    using iterator = list_type::iterator;
+    using const_iterator = list_type::const_iterator;
+
+    struct map_args {
+        // MUST BE aligned to 4kb boundary
+        uintptr_t vaddr;
+        // MUST BE aligned to 4kb boundary
+        std::size_t length;
+
+        unsigned long flags;
+
+        fs::inode* file_inode;
+        // MUST BE aligned to 4kb boundary
+        std::size_t file_offset;
+    };
+
+private:
+    list_type m_areas;
+    paging::pfn_t m_pt;
+    iterator m_brk {};
+
+public:
+    // default constructor copies kernel_mms
+    explicit mm_list();
+    // copies kernel_mms and mirrors user space
+    explicit mm_list(const mm_list& other);
+
+    constexpr mm_list(mm_list&& v)
+        : m_areas(std::move(v.m_areas))
+        , m_pt(std::exchange(v.m_pt, 0))
+        , m_brk{std::move(v.m_brk)} { }
+
+    ~mm_list();
+
+    void switch_pd() const noexcept;
+
+    int register_brk(uintptr_t addr);
+    uintptr_t set_brk(uintptr_t addr);
+
+    void clear();
+
+    // split the memory block at the specified address
+    // return: iterator to the new block
+    iterator split(iterator area, uintptr_t at);
+
+    bool is_avail(uintptr_t addr) const;
+    bool is_avail(uintptr_t start, std::size_t length) const noexcept;
+
+    uintptr_t find_avail(uintptr_t hint, size_t length) const;
+
+    int unmap(iterator area, bool should_invalidate_tlb);
+    int unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb);
+
+    int mmap(const map_args& args);
+
+    constexpr vm_area* find(uintptr_t lp)
+    {
+        auto iter = m_areas.find(lp);
+        if (iter == m_areas.end())
+            return nullptr;
+        return &iter;
+    }
+
+    constexpr const vm_area* find(uintptr_t lp) const
+    {
+        auto iter = m_areas.find(lp);
+        if (iter == m_areas.end())
+            return nullptr;
+        return &iter;
+    }
+
+    constexpr paging::PSE get_page_table() const noexcept
+    {
+        return paging::PSE {m_pt};
+    }
+};
+
+} // namespace kernel::mem

+ 192 - 0
include/kernel/mem/paging.hpp

@@ -0,0 +1,192 @@
+#pragma once
+
+#include <bit>
+#include <tuple>
+#include <cstddef>
+
+#include <stdint.h>
+
+#include <kernel/mem/phys.hpp>
+
+namespace kernel::mem::paging {
+
+// Each paging level is indexed by a 9-bit field of the virtual address.
+// The field for level N starts at bit 12 + 9 * (N - 1).
+constexpr int idx_p5(uintptr_t vaddr) noexcept
+{
+    return static_cast<int>((vaddr >> 48) & 0x1ff);
+}
+constexpr int idx_p4(uintptr_t vaddr) noexcept
+{
+    return static_cast<int>((vaddr >> 39) & 0x1ff);
+}
+constexpr int idx_p3(uintptr_t vaddr) noexcept
+{
+    return static_cast<int>((vaddr >> 30) & 0x1ff);
+}
+constexpr int idx_p2(uintptr_t vaddr) noexcept
+{
+    return static_cast<int>((vaddr >> 21) & 0x1ff);
+}
+constexpr int idx_p1(uintptr_t vaddr) noexcept
+{
+    return static_cast<int>((vaddr >> 12) & 0x1ff);
+}
+
+// Split a virtual address into all five table indices at once.
+// The tuple is ordered from the topmost level (p5) down to p1.
+constexpr std::tuple<int, int, int, int, int> idx_all(uintptr_t vaddr) noexcept
+{
+    const int p5 = idx_p5(vaddr);
+    const int p4 = idx_p4(vaddr);
+    const int p3 = idx_p3(vaddr);
+    const int p2 = idx_p2(vaddr);
+    const int p1 = idx_p1(vaddr);
+
+    return std::tuple<int, int, int, int, int>{p5, p4, p3, p2, p1};
+}
+
+// page frame number
+// since we have large pages now, pfns are not shifted right
+using pfn_t = uintptr_t;
+
+// paging structure attributes
+// the bit positions follow the x86-64 paging structure entry format
+using psattr_t = uintptr_t;
+
+constexpr psattr_t PA_P    = 0x0000000000000001ULL; // present
+constexpr psattr_t PA_RW   = 0x0000000000000002ULL; // writable
+constexpr psattr_t PA_US   = 0x0000000000000004ULL; // user accessible
+constexpr psattr_t PA_PWT  = 0x0000000000000008ULL; // page-level write-through
+constexpr psattr_t PA_PCD  = 0x0000000000000010ULL; // page-level cache disable
+constexpr psattr_t PA_A    = 0x0000000000000020ULL; // accessed
+constexpr psattr_t PA_D    = 0x0000000000000040ULL; // dirty
+constexpr psattr_t PA_PS   = 0x0000000000000080ULL; // page size (large page)
+constexpr psattr_t PA_G    = 0x0000000000000100ULL; // global
+// bits 9-11 are ignored by the hardware and used as software flags here
+constexpr psattr_t PA_COW  = 0x0000000000000200ULL; // copy on write
+constexpr psattr_t PA_MMAP = 0x0000000000000400ULL; // memory mapped
+constexpr psattr_t PA_ANON = 0x0000000000000800ULL; // anonymous map
+constexpr psattr_t PA_NXE  = 0x8000000000000000ULL; // bit 63: execute disable
+// every bit that is an attribute rather than part of the address:
+// bits 0-11 and bits 52-63
+constexpr psattr_t PA_MASK = 0xfff0000000000fffULL;
+
+constexpr psattr_t PA_DATA = PA_P | PA_RW | PA_NXE;
+constexpr psattr_t PA_KERNEL_DATA = PA_DATA | PA_G;
+// user mappings are not marked global, matching PA_USER_PAGE_TABLE and
+// PA_USER_DATA_HUGE: a global translation is kept across page table
+// switches, so it could be observed from a different address space
+constexpr psattr_t PA_USER_DATA = PA_DATA | PA_US;
+
+constexpr psattr_t PA_PAGE_TABLE = PA_P | PA_RW;
+constexpr psattr_t PA_KERNEL_PAGE_TABLE = PA_PAGE_TABLE | PA_G;
+constexpr psattr_t PA_USER_PAGE_TABLE = PA_PAGE_TABLE | PA_US;
+
+constexpr psattr_t PA_DATA_HUGE = PA_DATA | PA_PS;
+constexpr psattr_t PA_KERNEL_DATA_HUGE = PA_DATA_HUGE | PA_G;
+constexpr psattr_t PA_USER_DATA_HUGE = PA_DATA_HUGE | PA_US;
+
+// PA_RW is not set in either preset below
+constexpr psattr_t PA_ANONYMOUS_PAGE = PA_P | PA_US | PA_COW | PA_ANON;
+// PA_P is not set either: an entry with these attributes is not present
+constexpr psattr_t PA_MMAPPED_PAGE = PA_US | PA_COW | PA_ANON | PA_MMAP;
+
+namespace __inner {
+    using pse_t = uint64_t;
+
+} // namespace __inner
+
+// Handle to one paging structure entry, identified by the physical address
+// of that entry. All reads and writes go through the physaddr wrapper.
+class PSE {
+    physaddr<__inner::pse_t> m_ptrbase;
+
+public:
+    // pptr: physical address of the entry this handle refers to
+    explicit constexpr PSE(uintptr_t pptr) noexcept
+        : m_ptrbase{pptr}
+    {
+    }
+
+    // overwrite the whole entry with zero
+    constexpr void clear() noexcept
+    {
+        *m_ptrbase = 0;
+    }
+
+    // store a new entry: attribute bits are taken from `attributes`,
+    // all remaining bits are taken from `pfn`
+    constexpr void set(psattr_t attributes, pfn_t pfn)
+    {
+        const __inner::pse_t attr_bits = attributes & PA_MASK;
+        const __inner::pse_t addr_bits = pfn & ~PA_MASK;
+        *m_ptrbase = attr_bits | addr_bits;
+    }
+
+    // the entry with all attribute bits masked off
+    constexpr pfn_t pfn() const noexcept
+    {
+        const __inner::pse_t entry = *m_ptrbase;
+        return entry & ~PA_MASK;
+    }
+
+    // only the attribute bits of the entry
+    constexpr psattr_t attributes() const noexcept
+    {
+        const __inner::pse_t entry = *m_ptrbase;
+        return entry & PA_MASK;
+    }
+
+    // handle to the nth entry counted from this one
+    constexpr PSE operator[](std::size_t nth) const noexcept
+    {
+        const uintptr_t entry_addr = m_ptrbase.phys() + nth * sizeof(__inner::pse_t);
+        return PSE{entry_addr};
+    }
+
+    // handle to the entry located at the address stored in this entry
+    constexpr PSE parse() const noexcept
+    {
+        return PSE{pfn()};
+    }
+};
+
+constexpr pfn_t EMPTY_PAGE_PFN = 0x7f000;
+
+// physical address of the kernel page table, in three representations:
+// plain integer, untyped physaddr and PSE handle
+constexpr uintptr_t KERNEL_PAGE_TABLE_ADDR = 0x100000;
+constexpr physaddr<void> KERNEL_PAGE_TABLE_PHYS_ADDR{KERNEL_PAGE_TABLE_ADDR};
+constexpr PSE KERNEL_PAGE_TABLE{KERNEL_PAGE_TABLE_ADDR};
+
+// NOTE(review): presumably bit values for page::flags — confirm against the
+// implementation
+constexpr unsigned long PAGE_PRESENT = 0x00010000;
+constexpr unsigned long PAGE_BUDDY   = 0x00020000;
+constexpr unsigned long PAGE_SLAB    = 0x00040000;
+
+// Bookkeeping record for a page; instances are reached through PAGE_ARRAY
+// and the page_to_pfn() / pfn_to_page() conversions.
+struct page {
+    // TODO: use atomic
+    unsigned long refcount;
+    unsigned long flags;
+
+    // links to neighbouring records
+    // NOTE(review): which list these belong to is not visible here — confirm
+    page* next;
+    page* prev;
+};
+
+inline page* PAGE_ARRAY;
+
+void create_zone(uintptr_t start, uintptr_t end);
+void mark_present(uintptr_t start, uintptr_t end);
+
+[[nodiscard]] page* alloc_page();
+// order represents power of 2
+[[nodiscard]] page* alloc_pages(unsigned order);
+
+// order represents power of 2
+void free_pages(page* page, unsigned order);
+void free_page(page* page);
+
+// order represents power of 2
+void free_pages(pfn_t pfn, unsigned order);
+void free_page(pfn_t pfn);
+
+// allocate a page table page; the page is cleared to all zero
+[[nodiscard]] pfn_t alloc_page_table();
+
+pfn_t page_to_pfn(page* page);
+page* pfn_to_page(pfn_t pfn);
+
+void increase_refcount(page* page);
+
+// Bits of a page fault error code; the values match the x86 page-fault
+// error code layout.
+// NOTE(review): presumably tested against the `err` argument of
+// handle_page_fault() — confirm
+constexpr unsigned long PAGE_FAULT_P   = 0x00000001; // bit 0: fault on a present page
+constexpr unsigned long PAGE_FAULT_W   = 0x00000002; // bit 1: access was a write
+constexpr unsigned long PAGE_FAULT_U   = 0x00000004; // bit 2: access came from user mode
+constexpr unsigned long PAGE_FAULT_R   = 0x00000008; // bit 3: reserved bit set in an entry
+constexpr unsigned long PAGE_FAULT_I   = 0x00000010; // bit 4: instruction fetch
+constexpr unsigned long PAGE_FAULT_PK  = 0x00000020; // bit 5: protection key violation
+constexpr unsigned long PAGE_FAULT_SS  = 0x00000040; // bit 6: shadow stack access
+constexpr unsigned long PAGE_FAULT_SGX = 0x00008000; // bit 15: SGX related
+
+void handle_page_fault(unsigned long err);
+
+// Range object with an iterator-like interface (begin(), end(), operator*,
+// operator++) whose dereference yields a PSE.
+// NOTE(review): presumably walks the lowest-level entries covering
+// [start, end) in the page table `pt` — confirm against the implementation.
+class vaddr_range {
+    // NOTE(review): presumably the number of pages left to iterate, see
+    // operator== below — confirm
+    std::size_t n;
+
+    // one table index per level
+    int idx4;
+    int idx3;
+    int idx2;
+    int idx1;
+
+    // one paging structure handle per level
+    PSE pml4;
+    PSE pdpt;
+    PSE pd;
+    PSE pt;
+
+    uintptr_t m_start;
+    uintptr_t m_end;
+
+    bool is_privilege;
+
+public:
+    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool is_privilege = false);
+    explicit vaddr_range(std::nullptr_t);
+
+    vaddr_range begin() const noexcept;
+    vaddr_range end() const noexcept;
+
+    PSE operator*() const noexcept;
+
+    vaddr_range& operator++();
+    operator bool() const noexcept;
+
+    // compares remaining pages to iterate
+    bool operator==(const vaddr_range& other) const noexcept;
+};
+
+} // namespace kernel::mem::paging

+ 65 - 0
include/kernel/mem/phys.hpp

@@ -0,0 +1,65 @@
+#pragma once
+
+#include <bit>
+#include <cstddef>
+
+#include <stdint.h>
+
+#include <types/types.h>
+
+#include <kernel/mem/types.hpp>
+
+namespace kernel::mem {
+
+// Typed wrapper around a physical address.
+// Pointers are produced by adding a fixed offset to the stored address;
+// the offset depends on the Cached template argument.
+template <typename T, bool Cached = true>
+class physaddr {
+    static constexpr uintptr_t CACHED_BASE = 0xffffff0000000000ULL;
+    static constexpr uintptr_t UNCACHED_BASE = 0xffffff4000000000ULL;
+    static constexpr uintptr_t PHYS_OFFSET = Cached ? CACHED_BASE : UNCACHED_BASE;
+
+    uintptr_t m_ptr;
+
+public:
+    explicit constexpr physaddr(uintptr_t ptr)
+        : m_ptr{ptr}
+    {
+    }
+    // a null physaddr stores address 0
+    explicit constexpr physaddr(std::nullptr_t)
+        : m_ptr{0}
+    {
+    }
+
+    // cast to non-pointer types is prohibited
+    template <typename U, typename = std::enable_if_t<std::is_pointer_v<U>>>
+    constexpr U cast_to() const noexcept
+    {
+        const uintptr_t mapped = m_ptr + PHYS_OFFSET;
+        return std::bit_cast<U>(mapped);
+    }
+
+    // implicit conversion to a pointer to T
+    constexpr operator T*() const noexcept
+    {
+        return cast_to<T*>();
+    }
+
+    constexpr T* operator->() const noexcept
+    {
+        return cast_to<T*>();
+    }
+
+    // the raw physical address, without the offset applied
+    constexpr uintptr_t phys() const noexcept
+    {
+        return m_ptr;
+    }
+};
+
+//  gdt[0]:  null
+//  gdt[1]:  kernel code
+//  gdt[2]:  kernel data
+//  gdt[3]:  user code
+//  gdt[4]:  user data
+//  gdt[5]:  user code compatibility mode
+//  gdt[6]:  user data compatibility mode
+//  gdt[7]:  thread local 32bit
+//  gdt[8]:  tss descriptor low
+//  gdt[9]:  tss descriptor high
+//  gdt[10]: ldt descriptor low
+//  gdt[11]: ldt descriptor high
+//  gdt[12]: null segment(in ldt)
+//  gdt[13]: thread local 64bit(in ldt)
+// &gdt[14]: tss of 0x68 bytes from here
+//
+// the gdt sits at physical address 0; the address is spelled as an explicit
+// uintptr_t because a bare literal 0 is also a null pointer constant and
+// would be ambiguous between the uintptr_t and std::nullptr_t constructors
+constexpr physaddr<uint64_t> gdt{static_cast<uintptr_t>(0)};
+
+} // namespace kernel::mem

+ 40 - 0
include/kernel/mem/slab.hpp

@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+#include <stdint.h>
+
+#include "paging.hpp"
+#include "phys.hpp"
+
+namespace kernel::mem {
+
+struct slab_cache;
+
+// Header describing one slab.
+// NOTE(review): field meanings below are inferred from names only — confirm
+// against the slab implementation.
+struct slab_head {
+    // cache this slab belongs to
+    slab_cache* cache;
+
+    // neighbouring slabs
+    slab_head* next;
+    slab_head* prev;
+
+    // presumably the first free object in this slab
+    void* free;
+
+    unsigned int free_count;
+    unsigned int obj_size;
+};
+
+// A cache of objects of one size, holding three slab list heads.
+// NOTE(review): presumably slabs move between the three lists as objects
+// are allocated and freed — confirm against the slab implementation.
+struct slab_cache {
+    slab_head* slabs_empty;
+    slab_head* slabs_partial;
+    slab_head* slabs_full;
+
+    // object size this cache was set up for (see init_slab_cache())
+    std::size_t obj_size;
+};
+
+void init_slab_cache(slab_cache* cache, std::size_t obj_size);
+
+void* slab_alloc(slab_cache* cache);
+void slab_free(void* ptr);
+
+} // namespace kernel::mem

+ 36 - 0
include/kernel/mem/types.hpp

@@ -0,0 +1,36 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <cstddef>
+
+namespace kernel::mem {
+
+// One descriptor table entry expressed as bit-fields; the field widths add
+// up to 64 bits. The base and the limit are each split over several fields.
+struct gdt_entry {
+    uint64_t limit_low : 16;
+    uint64_t base_low : 16;
+    uint64_t base_mid : 8;
+    uint64_t access : 8;
+    uint64_t limit_high : 4;
+    uint64_t flags : 4;
+    uint64_t base_high : 8;
+};
+
+// One entry of the e820 memory map: a region given by base and length,
+// plus its type. The four fields occupy 24 bytes in total.
+struct e820_mem_map_entry {
+    uint64_t base;
+    uint64_t len;
+    uint32_t type;
+
+    // might not be valid
+    uint32_t acpi_extension_attr;
+};
+
+// Memory information kept as inline globals.
+namespace info {
+    inline std::size_t memory_size;
+    inline std::size_t e820_entry_count;
+    inline std::size_t e820_entry_length;
+    // 24 is the size of one e820_mem_map_entry (8 + 8 + 4 + 4 bytes)
+    // NOTE(review): 1024 - 16 presumably reflects a 1 KiB area minus a
+    // 16-byte header — confirm against the code that fills this array
+    inline e820_mem_map_entry e820_entries[(1024-16)/24];
+
+} // namespace info
+
+} // namespace kernel::mem

+ 46 - 0
include/kernel/mem/vm_area.hpp

@@ -0,0 +1,46 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <kernel/vfs.hpp>
+
+namespace kernel::mem {
+
+// Flag bits for vm_area::flags. The low 32 bits hold the individual
+// attributes; MM_INTERNAL_MASK covers the high 32 bits, which include
+// MM_BREAK.
+// NOTE(review): per-flag meanings are inferred from the names — confirm
+constexpr unsigned long MM_WRITE         = 0x00000000'00000001;
+constexpr unsigned long MM_EXECUTE       = 0x00000000'00000002;
+constexpr unsigned long MM_MAPPED        = 0x00000000'00000004;
+constexpr unsigned long MM_ANONYMOUS     = 0x00000000'00000008;
+constexpr unsigned long MM_INTERNAL_MASK = 0xffffffff'00000000;
+constexpr unsigned long MM_BREAK         = 0x80000000'00000000;
+
+// One area of an address space, covering the half-open range [start, end).
+// Areas are ordered by address; the comparisons against a plain address
+// treat any address inside [start, end) as equivalent to the area itself.
+struct vm_area {
+    uintptr_t start;
+    uintptr_t end;
+
+    unsigned long flags;
+
+    fs::inode* mapped_file;
+    std::size_t file_offset;
+
+    // true when [ostart, oend) does not overlap this area
+    constexpr bool is_avail(uintptr_t ostart, uintptr_t oend) const noexcept
+    {
+        const bool overlaps = ostart < end && oend > start;
+        return !overlaps;
+    }
+
+    // this area lies entirely below rhs
+    constexpr bool operator<(const vm_area& rhs) const noexcept
+    {
+        return rhs.start >= end;
+    }
+    // this area lies entirely below the address rhs
+    constexpr bool operator<(uintptr_t rhs) const noexcept
+    {
+        return rhs >= end;
+    }
+    // the address lhs lies below the start of rhs
+    friend constexpr bool operator<(uintptr_t lhs, const vm_area& rhs) noexcept
+    {
+        return rhs.start > lhs;
+    }
+
+    // area covering [start, end)
+    constexpr vm_area(uintptr_t start, unsigned long flags, uintptr_t end,
+            fs::inode* mapped_file = nullptr, std::size_t offset = 0)
+        : start{start}
+        , end{end}
+        , flags{flags}
+        , mapped_file{mapped_file}
+        , file_offset{offset}
+    {
+    }
+
+    // empty area: end is set equal to start
+    constexpr vm_area(uintptr_t start, unsigned long flags,
+            fs::inode* mapped_file = nullptr, std::size_t offset = 0)
+        : vm_area{start, flags, start, mapped_file, offset}
+    {
+    }
+};
+
+} // namespace kernel::mem

+ 0 - 400
include/kernel/mm.hpp

@@ -1,400 +0,0 @@
-#pragma once
-
-#include <set>
-#include <vector>
-#include <bit>
-#include <cstddef>
-#include <utility>
-
-#include <kernel/mem.h>
-#include <kernel/vfs.hpp>
-#include <stdint.h>
-#include <types/allocator.hpp>
-#include <types/cplusplus.hpp>
-#include <types/size.h>
-#include <types/status.h>
-#include <types/types.h>
-
-#define invalidate_tlb(addr) asm volatile("invlpg (%0)" \
-                                 :             \
-                                 : "r"(addr)   \
-                                 : "memory")
-
-constexpr size_t THREAD_KERNEL_STACK_SIZE = 8 * PAGE_SIZE;
-
-constexpr uint32_t PAGE_COW = (1 << 0);
-constexpr uint32_t PAGE_MMAP = (1 << 1);
-#define PAGE_COW PAGE_COW
-#define PAGE_MMAP PAGE_MMAP
-
-struct page {
-    page_t phys_page_id;
-    size_t* ref_count;
-    // 0 :11 : pte_index
-    // 12:31 : pt_page
-    uint32_t pg_pteidx;
-    mutable uint32_t attr;
-};
-
-// private memory mapping
-// changes won't be neither written back to file nor shared between processes
-// TODO: shared mapping
-// @param len is aligned to 4kb boundary automatically, exceeding part will
-// be filled with '0's and not written back to the file
-// @param offset MUST be aligned to 4kb
-int mmap(
-    void* hint,
-    size_t len,
-    fs::inode* file,
-    size_t offset,
-    int write,
-    int priv);
-
-template <uint32_t base, uint32_t expo>
-constexpr uint32_t pow()
-{
-    if constexpr (expo == 0)
-        return 1;
-    if constexpr (expo == 1)
-        return base;
-    if constexpr (expo % 2 == 0)
-        return pow<base, expo / 2>() * pow<base, expo / 2>();
-    else
-        return pow<base, expo / 2>() * pow<base, expo / 2 + 1>();
-}
-
-template <int N>
-constexpr uint32_t align_down(uint32_t v)
-{
-    return v & ~(pow<2, N>() - 1);
-}
-template <int N>
-constexpr void* align_down(void* v)
-{
-    return std::bit_cast<void*>(align_down<N>(std::bit_cast<uint32_t>(v)));
-}
-template <int N>
-constexpr uint32_t align_up(uint32_t v)
-{
-    return align_down<N>(v + pow<2, N>() - 1);
-}
-template <int N>
-constexpr void* align_up(void* v)
-{
-    return std::bit_cast<void*>(align_up<N>(std::bit_cast<uint32_t>(v)));
-}
-
-constexpr size_t vptrdiff(void* p1, void* p2)
-{
-    auto* _p1 = static_cast<std::byte*>(p1);
-    auto* _p2 = static_cast<std::byte*>(p2);
-    return _p1 - _p2;
-}
-
-constexpr void* vptradd(void* p, std::size_t off)
-{
-    auto* _p = static_cast<std::byte*>(p);
-    return _p + off;
-}
-
-void dealloc_pd(page_t pd);
-
-// allocate a struct page together with the raw page
-page allocate_page(void);
-void free_page(page* pg);
-
-// TODO: this is for alloc_kstack()
-// CHANGE THIS
-page_t __alloc_raw_page(void);
-void __free_raw_page(page_t pg);
-
-namespace kernel {
-
-void* pmap(page_t pg, bool cached = true);
-void pfree(page_t pg);
-
-class paccess : public types::non_copyable {
-private:
-    page_t m_pg;
-    void* m_ptr;
-
-public:
-    paccess(void) = delete;
-    paccess(paccess&&) = delete;
-    paccess& operator=(paccess&&) = delete;
-
-    inline explicit paccess(page_t pg, bool cached = true)
-        : m_pg(pg)
-    {
-        m_ptr = pmap(pg, cached);
-    }
-
-    constexpr void* ptr(void) const { return m_ptr; }
-
-    ~paccess()
-    {
-        pfree(m_pg);
-    }
-};
-
-namespace memory {
-
-struct mm {
-public:
-    using pages_vector = std::vector<page, types::memory::ident_allocator<page>>;
-
-public:
-    void* start {};
-    struct mm_attr {
-        uint32_t write : 1;
-        uint32_t system : 1;
-        uint32_t mapped : 1;
-    } attr {};
-    pages_vector* pgs {};
-    fs::inode* mapped_file {};
-    size_t file_offset {};
-
-public:
-    constexpr void* end() const noexcept
-    { return vptradd(start, pgs->size() * PAGE_SIZE); }
-    constexpr bool is_kernel_space() const noexcept
-    { return attr.system; }
-    constexpr bool is_avail(void* ostart, void* oend) const noexcept
-    {
-        void* m_start = start;
-        void* m_end = end();
-
-        return (ostart >= m_end || oend <= m_start);
-    }
-
-    void append_page(pd_t pd, const page& pg, uint32_t attr, bool priv);
-
-    /**
-     * @brief Splits the memory block at the specified address.
-     * 
-     * @param addr The address at which the memory block will be split.
-     * @return The new memory block created after splitting.
-     */
-    mm split(void* addr);
-
-    constexpr bool operator<(const mm& rhs) const noexcept
-    { return end() <= rhs.start; }
-    constexpr bool operator<(void* rhs) const noexcept
-    { return end() <= rhs; }
-    friend constexpr bool operator<(void* lhs, const mm& rhs) noexcept
-    { return lhs < rhs.start; }
-};
-
-class mm_list {
-private:
-    struct comparator {
-        constexpr bool operator()(const mm& lhs, const mm& rhs) const noexcept
-        { return lhs < rhs; }
-        constexpr bool operator()(const mm& lhs, void* rhs) const noexcept
-        { return lhs < rhs; }
-        constexpr bool operator()(void* lhs, const mm& rhs) const noexcept
-        { return lhs < rhs; }
-    };
-
-public:
-    using list_type = std::set<mm, comparator, types::memory::ident_allocator<mm>>;
-    using iterator = list_type::iterator;
-    using const_iterator = list_type::const_iterator;
-
-public:
-    static inline mm_list* s_kernel_mms;
-
-private:
-    list_type m_areas;
-    page_t m_pd;
-    mm* m_brk {};
-
-public:
-    // for system initialization only
-    explicit constexpr mm_list(page_t pd)
-        : m_pd(pd) { }
-
-    // default constructor copies kernel_mms
-    explicit mm_list();
-    // copies kernel_mms and mirrors user space
-    explicit mm_list(const mm_list& other);
-
-    constexpr mm_list(mm_list&& v)
-        : m_areas(std::move(v.m_areas))
-        , m_pd(std::exchange(v.m_pd, 0)) { }
-
-    ~mm_list();
-    void switch_pd() const;
-
-    int register_brk(void* addr);
-    void* set_brk(void* addr);
-
-    void* find_avail(void* hint, size_t len, bool priv) const;
-
-    int unmap(void* start, size_t len, bool priv);
-
-    constexpr mm& addarea(void* start, bool w, bool system)
-    {
-        auto [ iter, inserted ] = m_areas.emplace(mm {
-            .start = start,
-            .attr {
-                .write = w,
-                .system = system,
-                .mapped = 0,
-            },
-            .pgs = types::memory::kinew<mm::pages_vector>(),
-        });
-        assert(inserted);
-        return *iter;
-    }
-
-    mm& add_empty_area(void* start, std::size_t page_count,
-        uint32_t page_attr, bool w, bool system);
-
-    constexpr void clear_user()
-    {
-        for (auto iter = m_areas.begin(); iter != m_areas.end(); ) {
-            if (iter->is_kernel_space()) {
-                ++iter;
-                continue;
-            }
-
-            this->unmap(*iter);
-            iter = m_areas.erase(iter);
-        }
-        m_brk = nullptr;
-    }
-
-    inline void unmap(mm& area)
-    {
-        int i = 0;
-
-        // TODO:
-        // if there are more than 4 pages, calling invlpg
-        // should be faster. otherwise, we use movl cr3
-        // bool should_invlpg = (area->pgs->size() > 4);
-
-        for (auto& pg : *area.pgs) {
-            kernel::paccess pa(pg.pg_pteidx >> 12);
-            auto pt = (pt_t)pa.ptr();
-            assert(pt);
-            auto* pte = *pt + (pg.pg_pteidx & 0xfff);
-            pte->v = 0;
-
-            free_page(&pg);
-
-            invalidate_tlb((uint32_t)area.start + (i++) * PAGE_SIZE);
-        }
-        types::memory::kidelete<mm::pages_vector>(area.pgs);
-    }
-
-    constexpr mm* find(void* lp)
-    {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &*iter;
-    }
-    constexpr const mm* find(void* lp) const
-    {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &*iter;
-    }
-
-    constexpr bool is_avail(void* start, size_t len) const noexcept
-    {
-        start = align_down<12>(start);
-        len = vptrdiff(align_up<12>(vptradd(start, len)), start);
-        for (const auto& area : m_areas) {
-            if (!area.is_avail(start, vptradd(start, len)))
-                return false;
-        }
-        return true;
-    }
-
-    constexpr bool is_avail(void* addr) const
-    {
-        auto iter = m_areas.find(addr);
-        return iter == m_areas.end();
-    }
-};
-
-} // namespace memory
-
-} // namespace kernel
-
-// global variables
-inline page empty_page;
-// --------------------------------
-
-// inline constexpr page* lto_page(mm* mm_area, void* l_ptr)
-// {
-//     size_t offset = vptrdiff(l_ptr, mm_area->start);
-//     return &mm_area->pgs->at(offset / PAGE_SIZE);
-// }
-// inline constexpr page_t to_page(pptr_t ptr)
-// {
-//     return ptr >> 12;
-// }
-// inline constexpr size_t to_pdi(page_t pg)
-// {
-//     return pg >> 10;
-// }
-// inline constexpr size_t to_pti(page_t pg)
-// {
-//     return pg & (1024 - 1);
-// }
-// inline constexpr pptr_t to_pp(page_t p)
-// {
-//     return p << 12;
-// }
-constexpr size_t v_to_pdi(void* addr)
-{
-    return std::bit_cast<uint32_t>(addr) >> 22;
-}
-constexpr size_t v_to_pti(void* addr)
-{
-    return (std::bit_cast<uint32_t>(addr) >> 12) & 0x3ff;
-}
-// inline constexpr pte_t* to_pte(pt_t pt, page_t pg)
-// {
-//     return *pt + to_pti(pg);
-// }
-// inline void* to_vp(page_t pg)
-// {
-//     return ptovp(to_pp(pg));
-// }
-// inline pd_t to_pd(page_t pg)
-// {
-//     return reinterpret_cast<pd_t>(to_vp(pg));
-// }
-// inline pt_t to_pt(page_t pg)
-// {
-//     return reinterpret_cast<pt_t>(to_vp(pg));
-// }
-// inline pt_t to_pt(pde_t* pde)
-// {
-//     return to_pt(pde->in.pt_page);
-// }
-// inline pde_t* to_pde(pd_t pd, void* addr)
-// {
-//     return *pd + lto_pdi((pptr_t)addr);
-// }
-// inline pte_t* to_pte(pt_t pt, void* addr)
-// {
-//     return *pt + lto_pti((pptr_t)addr);
-// }
-// inline pte_t* to_pte(pde_t* pde, void* addr)
-// {
-//     return to_pte(to_pt(pde), addr);
-// }
-// inline pte_t* to_pte(pd_t pd, void* addr)
-// {
-//     return to_pte(to_pde(pd, addr), addr);
-// }
-// inline pte_t* to_pte(pde_t* pde, page_t pg)
-// {
-//     return to_pte(to_pt(pde), pg);
-// }

+ 1 - 1
include/kernel/module.hpp

@@ -29,6 +29,6 @@ constexpr int MODULE_DELAYED = 2;
 // TODO: unique_ptr and Deleter
 int insmod(module* mod);
 
-extern "C" module_loader kmod_loaders_start[];
+extern "C" module_loader KMOD_LOADERS_START[];
 
 } // namespace kernel::module

+ 12 - 20
include/kernel/process.hpp

@@ -1,7 +1,7 @@
 #pragma once
 
-#include <map>
 #include <list>
+#include <map>
 #include <memory>
 #include <queue>
 #include <set>
@@ -13,23 +13,21 @@
 #include <stdint.h>
 #include <sys/types.h>
 
-#include <kernel/task/thread.hpp>
 #include <kernel/task/current.hpp>
+#include <kernel/task/thread.hpp>
 
 #include <types/allocator.hpp>
 #include <types/cplusplus.hpp>
 #include <types/path.hpp>
-#include <types/status.h>
 #include <types/types.h>
 
 #include <kernel/async/waitlist.hpp>
-#include <kernel/interrupt.h>
-#include <kernel/mm.hpp>
-#include <kernel/mem.h>
-#include <kernel/user/thread_local.hpp>
+#include <kernel/interrupt.hpp>
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/paging.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/task.h>
 #include <kernel/tty.hpp>
+#include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
 
 class process;
@@ -39,8 +37,6 @@ class proclist;
 inline process* volatile current_process;
 inline proclist* procs;
 
-inline tss32_t tss;
-
 struct process_attr {
     uint16_t system : 1;
     uint16_t zombie : 1 = 0;
@@ -175,7 +171,7 @@ public:
     };
 
 public:
-    kernel::memory::mm_list mms {};
+    kernel::mem::mm_list mms {};
     std::set<kernel::task::thread> thds;
     kernel::async::wait_list waitlist;
 
@@ -192,7 +188,7 @@ public:
     pid_t pgid {};
     pid_t sid {};
 
-    tty* control_tty {};
+    kernel::tty::tty* control_tty {};
     fs::dentry* root { fs::fs_root };
     std::set<pid_t> children;
 
@@ -292,20 +288,16 @@ public:
     }
 
     void kill(pid_t pid, int exit_code);
+
+    constexpr auto begin() const { return m_procs.begin(); }
+    constexpr auto end() const { return m_procs.end(); }
 };
 
-void NORETURN init_scheduler(void);
+void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn);
 /// @return true if returned normally, false if being interrupted
 bool schedule(void);
 void NORETURN schedule_noreturn(void);
 
-constexpr uint32_t push_stack(uint32_t** stack, uint32_t val)
-{
-    --*stack;
-    **stack = val;
-    return val;
-}
-
 void k_new_thread(void (*func)(void*), void* data);
 
 void NORETURN freeze(void);

+ 2 - 2
include/kernel/signal.hpp

@@ -9,7 +9,7 @@
 
 #include <types/cplusplus.hpp>
 
-#include <kernel/interrupt.h>
+#include <kernel/interrupt.hpp>
 
 namespace kernel {
 
@@ -57,7 +57,7 @@ public:
 
     // return value: whether the thread should wake up
     bool raise(signo_type signal);
-    void handle(interrupt_stack* context, mmx_registers* mmxregs);
+    void handle(interrupt_stack_normal* context, mmx_registers* mmxregs);
     void after_signal(signo_type signal);
 };
 

+ 111 - 10
include/kernel/syscall.hpp

@@ -1,16 +1,117 @@
 #pragma once
 
-#include <kernel/interrupt.h>
+#include <string>
+#include <vector>
+
+#include <bits/alltypes.h>
+#include <poll.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <time.h>
+
 #include <types/types.h>
 
-#define SYSCALL_ARG1(type, name) type name = (type)((data)->s_regs.ebx)
-#define SYSCALL_ARG2(type, name) type name = (type)((data)->s_regs.ecx)
-#define SYSCALL_ARG3(type, name) type name = (type)((data)->s_regs.edx)
-#define SYSCALL_ARG4(type, name) type name = (type)((data)->s_regs.esi)
-#define SYSCALL_ARG5(type, name) type name = (type)((data)->s_regs.edi)
-#define SYSCALL_ARG6(type, name) type name = (type)((data)->s_regs.ebp)
+#include <kernel/interrupt.hpp>
+#include <kernel/signal.hpp>
+#include <kernel/user/thread_local.hpp>
+
+// Declare a local variable `name` of type `type`, initialised from the
+// saved register that carries the Nth syscall argument. A pointer named
+// `data` must be in scope at the point of use; the registers are read from
+// data->head.s_regs. The register order (rdi, rsi, rdx, r10, r8, r9) is the
+// Linux x86-64 syscall argument order.
+#define SYSCALL64_ARG1(type, name) type name = (type)((data)->head.s_regs.rdi)
+#define SYSCALL64_ARG2(type, name) type name = (type)((data)->head.s_regs.rsi)
+#define SYSCALL64_ARG3(type, name) type name = (type)((data)->head.s_regs.rdx)
+#define SYSCALL64_ARG4(type, name) type name = (type)((data)->head.s_regs.r10)
+#define SYSCALL64_ARG5(type, name) type name = (type)((data)->head.s_regs.r8)
+#define SYSCALL64_ARG6(type, name) type name = (type)((data)->head.s_regs.r9)
+
+namespace kernel {
+void init_syscall_table();
+
+void handle_syscall32(int no, interrupt_stack_normal* data, mmx_registers* mmxregs);
+void handle_syscall64(int no, interrupt_stack_normal* data, mmx_registers* mmxregs);
+
+namespace syscall {
+// in fileops.cc
+ssize_t do_write(int fd, const char __user* buf, size_t n);
+ssize_t do_read(int fd, char __user* buf, size_t n);
+int do_close(int fd);
+int do_dup(int old_fd);
+int do_dup2(int old_fd, int new_fd);
+int do_pipe(int __user* pipefd);
+ssize_t do_getdents(int fd, char __user* buf, size_t cnt);
+ssize_t do_getdents64(int fd, char __user* buf, size_t cnt);
+int do_open(const char __user* path, int flags, mode_t mode);
+int do_symlink(const char __user* target, const char __user* linkpath);
+int do_readlink(const char __user* pathname, char __user* buf, size_t buf_size);
+int do_ioctl(int fd, unsigned long request, uintptr_t arg3);
+ssize_t do_readv(int fd, const iovec* iov, int iovcnt);
+ssize_t do_writev(int fd, const iovec* iov, int iovcnt);
+off_t do_lseek(int fd, off_t offset, int whence);
+uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len,
+        int prot, int flags, int fd, off_t pgoffset);
+int do_munmap(uintptr_t addr, size_t len);
+ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset, size_t count);
+int do_statx(int dirfd, const char __user* path,
+        int flags, unsigned int mask, statx __user* statxbuf);
+int do_fcntl(int fd, int cmd, unsigned long arg);
+int do_poll(pollfd __user* fds, nfds_t nfds, int timeout);
+int do_mknod(const char __user* pathname, mode_t mode, dev_t dev);
+int do_access(const char __user* pathname, int mode);
+int do_unlink(const char __user* pathname);
+int do_truncate(const char __user* pathname, long length);
+int do_mkdir(const char __user* pathname, mode_t mode);
+
+// in procops.cc
+int do_chdir(const char __user* path);
+[[noreturn]] int do_exit(int status);
+int do_waitpid(pid_t waitpid, int __user* arg1, int options);
+pid_t do_getsid(pid_t pid);
+pid_t do_setsid();
+pid_t do_getpgid(pid_t pid);
+int do_setpgid(pid_t pid, pid_t pgid);
+int do_set_thread_area(user::user_desc __user* ptr);
+pid_t do_set_tid_address(int __user* tidptr);
+int do_prctl(int option, uintptr_t arg2);
+int do_arch_prctl(int option, uintptr_t arg2);
+pid_t do_getpid();
+pid_t do_getppid();
+uid_t do_getuid();
+uid_t do_geteuid();
+gid_t do_getgid();
+pid_t do_gettid();
+char __user* do_getcwd(char __user* buf, size_t buf_size);
+uintptr_t do_brk(uintptr_t addr);
+int do_umask(mode_t mask);
+int do_kill(pid_t pid, int sig);
+int do_rt_sigprocmask(int how, const kernel::sigmask_type __user* set,
+        kernel::sigmask_type __user* oldset, size_t sigsetsize);
+int do_rt_sigaction(int signum, const sigaction __user* act,
+        sigaction __user* oldact, size_t sigsetsize);
+int do_newuname(new_utsname __user* buf);
+
+// Result of do_execve().
+// NOTE(review): ip / sp are presumably the instruction pointer and stack
+// pointer to start the new program with, and status an error code —
+// confirm against the implementation.
+struct execve_retval {
+    uintptr_t ip;
+    uintptr_t sp;
+    int status;
+};
+
+execve_retval do_execve(
+        const std::string& exec,
+        const std::vector<std::string>& args,
+        const std::vector<std::string>& envs);
+
+// in mount.cc
+int do_mount(
+        const char __user* source,
+        const char __user* target,
+        const char __user* fstype,
+        unsigned long flags,
+        const void __user* _fsdata);
+
+// in infoops.cc
+int do_clock_gettime(clockid_t clk_id, timespec __user* tp);
+int do_gettimeofday(timeval __user* tv, void __user* tz);
 
-// return value is stored in %eax and %edx
-typedef int (*syscall_handler)(interrupt_stack* data);
+} // namespace kernel::syscall
 
-void init_syscall(void);
+} // namespace kernel

+ 0 - 18
include/kernel/task.h

@@ -1,18 +0,0 @@
-#pragma once
-
-#include <types/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct tss32_t {
-    uint32_t backlink, esp0, ss0, esp1, ss1, esp2, ss2, cr3;
-    uint32_t eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi;
-    uint32_t es, cs, ss, ds, fs, gs;
-    uint32_t ldtr, iomap;
-};
-
-#ifdef __cplusplus
-}
-#endif

+ 12 - 6
include/kernel/task/thread.hpp

@@ -8,12 +8,13 @@
 
 #include <types/types.h>
 
+#include <kernel/mem/paging.hpp>
 #include <kernel/signal.hpp>
 #include <kernel/user/thread_local.hpp>
 
 namespace kernel::task {
 
-using tid_t = uint32_t;
+using tid_t = std::size_t;
 
 struct thread {
 public:
@@ -27,13 +28,18 @@ public:
 
 private:
     struct kernel_stack {
-        std::byte* stack_base;
-        uint32_t* esp;
+        mem::paging::pfn_t pfn;
+        uintptr_t sp;
 
         kernel_stack();
         kernel_stack(const kernel_stack& other);
         kernel_stack(kernel_stack&& other);
         ~kernel_stack();
+
+        uint64_t pushq(uint64_t val);
+        uint32_t pushl(uint32_t val);
+
+        void load_interrupt_stack() const;
     };
 
 public:
@@ -46,14 +52,14 @@ public:
     int* __user clear_child_tid {};
 
     std::string name {};
-
-    segment_descriptor tls_desc {};
+    uint64_t tls_desc32 {};
+    std::size_t elected_times {};
 
     explicit thread(std::string name, pid_t owner);
     thread(const thread& val, pid_t owner);
 
     int set_thread_area(user::user_desc* ptr);
-    int load_thread_area() const;
+    int load_thread_area32() const;
 
     void set_attr(thd_attr_t new_attr);
 

+ 11 - 13
include/kernel/tty.hpp

@@ -1,5 +1,7 @@
 #pragma once
 
+#include <string>
+
 #include <stdint.h>
 #include <sys/types.h>
 #include <termios.h>
@@ -11,10 +13,11 @@
 #include <kernel/async/waitlist.hpp>
 #include <kernel/async/lock.hpp>
 
+namespace kernel::tty {
+
 class tty : public types::non_copyable {
 public:
     static constexpr size_t BUFFER_SIZE = 4096;
-    static constexpr size_t NAME_SIZE = 32;
 
 private:
     void _real_commit_char(int c);
@@ -23,7 +26,7 @@ private:
     int _do_erase(bool should_echo);
 
 public:
-    tty();
+    explicit tty(std::string name);
     virtual void putchar(char c) = 0;
     void print(const char* str);
     ssize_t read(char* buf, size_t buf_size, size_t n);
@@ -52,13 +55,13 @@ public:
         return fg_pgroup;
     }
 
-    char name[NAME_SIZE];
     termios termio;
+    std::string name;
 
 protected:
-    kernel::async::mutex mtx_buf;
+    async::mutex mtx_buf;
     types::buffer buf;
-    kernel::async::wait_list waitlist;
+    async::wait_list waitlist;
 
     pid_t fg_pgroup;
 };
@@ -69,13 +72,8 @@ public:
     virtual void putchar(char c) override;
 };
 
-class serial_tty : public virtual tty {
-public:
-    serial_tty(int id);
-    virtual void putchar(char c) override;
+inline tty* console;
 
-public:
-    uint16_t id;
-};
+int register_tty(tty* tty_dev);
 
-inline tty* console;
+} // namespace kernel::tty

+ 1 - 3
include/kernel/user/thread_local.hpp

@@ -1,7 +1,5 @@
 #pragma once
 
-#include <kernel/mem.h>
-
 #include <stdint.h>
 
 namespace kernel::user {
@@ -18,6 +16,6 @@ struct user_desc {
     uint32_t useable : 1;
 };
 
-void load_thread_area(const segment_descriptor& desc);
+void load_thread_area32(uint64_t desc);
 
 } // namespace kernel::user

+ 1 - 1
include/kernel/vfs.hpp

@@ -52,7 +52,7 @@ struct PACKED user_dirent {
     // uint8_t d_type; // file type, with offset of (d_reclen - 1)
 };
 
-struct user_dirent64 {
+struct PACKED user_dirent64 {
     ino64_t d_ino; // inode number
     uint64_t d_off; // implementation-defined field, ignored
     uint16_t d_reclen; // length of this struct user_dirent

+ 10 - 47
include/types/allocator.hpp

@@ -11,12 +11,6 @@
 
 #include <kernel/async/lock.hpp>
 
-namespace kernel::kinit {
-
-void init_kernel_heap(void* start, std::size_t size);
-
-} // namespace kernel::kinit
-
 namespace types::memory {
 
 class brk_memory_allocator {
@@ -28,17 +22,14 @@ private:
     byte* p_start;
     byte* p_limit;
     byte* p_break;
+    byte* p_allocated;
     kernel::async::mutex mtx;
 
-    constexpr byte* brk(byte* addr)
-    {
-        if (addr >= p_limit) [[unlikely]]
-            return nullptr;
-        return p_break = addr;
-    }
+    byte* brk(byte* addr);
+    byte* sbrk(size_type increment);
 
-    constexpr byte* sbrk(size_type increment)
-    { return brk(p_break + increment); }
+    constexpr byte* sbrk() const noexcept
+    { return p_break; }
 
 public:
     explicit brk_memory_allocator(byte* start, size_type size);
@@ -46,41 +37,13 @@ public:
 
     void* allocate(size_type size);
     void deallocate(void* ptr);
-};
-
-void* kimalloc(std::size_t size);
-void kifree(void* ptr);
 
-template <typename T>
-struct ident_allocator {
-    using value_type = T;
-    using propagate_on_container_move_assignment = std::true_type;
-
-    constexpr ident_allocator() = default;
-
-    template <typename U>
-    constexpr ident_allocator(const ident_allocator<U>&) noexcept {}
-    
-    inline T* allocate(std::size_t n)
-    { return (T*)kimalloc(n * sizeof(T)); }
-    inline void deallocate(T* ptr, std::size_t) { return kifree(ptr); }
+    bool allocated(void* ptr) const noexcept;
 };
 
-template <typename T, typename... Args>
-constexpr T* kinew(Args&&... args)
-{
-    ident_allocator<T> alloc { };
-    T* ptr = std::allocator_traits<ident_allocator<T>>::allocate(alloc, 1);
-    std::allocator_traits<ident_allocator<T>>::construct(alloc, ptr, std::forward<Args>(args)...);
-    return ptr;
-}
+} // namespace types::memory
 
-template <typename T>
-constexpr void kidelete(T* ptr)
-{
-    ident_allocator<T> alloc { };
-    std::allocator_traits<ident_allocator<T>>::destroy(alloc, ptr);
-    std::allocator_traits<ident_allocator<T>>::deallocate(alloc, ptr, 1);
-}
+namespace kernel::kinit {
+void init_allocator();
 
-} // namespace types::memory
+} // namespace kernel::kinit

+ 157 - 19
include/types/elf.hpp

@@ -1,22 +1,23 @@
 #pragma once
+
 #include <errno.h>
-#include <kernel/interrupt.h>
+#include <stdint.h>
+
+#include <kernel/interrupt.hpp>
 #include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
-#include <stdint.h>
-#include <types/size.h>
-#include <types/status.h>
 
 namespace types::elf {
+
 using elf32_addr_t = uint32_t;
 using elf32_off_t = uint32_t;
 
-using elf_addr_t = elf32_addr_t;
-using elf_off_t = elf32_off_t;
+using elf64_addr_t = uint64_t;
+using elf64_off_t = uint64_t;
 
-constexpr elf32_addr_t ELF_STACK_BOTTOM = 0xbffff000;
-constexpr elf32_off_t ELF_STACK_SIZE = 8 * 1024 * 1024;
-constexpr elf32_addr_t ELF_STACK_TOP = ELF_STACK_BOTTOM - ELF_STACK_SIZE;
+constexpr elf32_addr_t ELF32_STACK_BOTTOM = 0xbffff000;
+constexpr elf32_off_t ELF32_STACK_SIZE = 8 * 1024 * 1024;
+constexpr elf32_addr_t ELF32_STACK_TOP = ELF32_STACK_BOTTOM - ELF32_STACK_SIZE;
 
 struct PACKED elf32_header {
     // 0x7f, "ELF"
@@ -104,7 +105,11 @@ struct PACKED elf32_program_header_entry {
     elf32_off_t filesz;
     elf32_off_t memsz;
     // segment dependent
-    uint32_t flags;
+    enum : uint32_t {
+        PF_X = 0x1,
+        PF_W = 0x2,
+        PF_R = 0x4,
+    } flags;
     // 0 and 1 for no alignment, otherwise power of 2
     uint32_t align;
 };
@@ -131,21 +136,154 @@ struct PACKED elf32_section_header_entry {
     } sh_flags;
     elf32_addr_t sh_addr;
     elf32_off_t sh_offset;
-    uint32_t sh_size;
-    char _[16];
+    elf32_off_t sh_size;
+    uint32_t sh_link;
+    uint32_t sh_info;
+    elf32_off_t sh_addralign;
+    elf32_off_t sh_entsize;
 };
 
 struct elf32_load_data {
     const fs::dentry* exec_dent;
-    const char* const* argv;
-    const char* const* envp;
-    int errcode;
-    void* eip;
-    uint32_t* sp;
-    bool system;
+    const std::vector<std::string>& argv;
+    const std::vector<std::string>& envp;
+    uintptr_t ip;
+    uintptr_t sp;
 };
 
 // TODO: environment variables
-int elf32_load(elf32_load_data* data);
+int elf32_load(elf32_load_data& data);
+
+struct PACKED elf64_header {
+    // 0x7f, "ELF"
+    char magic[4];
+
+    enum : uint8_t {
+        FORMAT_32 = 1,
+        FORMAT_64 = 2,
+    } format;
+    enum : uint8_t {
+        ENDIAN_LITTLE = 1,
+        ENDIAN_BIG = 2,
+    } endian;
+    // should be 1
+    uint8_t _version1;
+    enum : uint8_t {
+        ABI_SYSTEM_V = 0x00,
+        // TODO:
+        ABI_LINUX = 0x03,
+    } abi;
+    uint8_t abi_version;
+    uint8_t _reserved[7];
+    enum : uint16_t {
+        ET_NONE = 0x00,
+        ET_REL = 0x01,
+        ET_EXEC = 0x02,
+        ET_DYN = 0x03,
+        ET_CORE = 0x04,
+        ET_LOOS = 0xfe00,
+        ET_HIOS = 0xfeff,
+        ET_LOPROC = 0xff00,
+        ET_HIPROC = 0xffff,
+    } type;
+    enum : uint16_t {
+        ARCH_NONE = 0x00,
+        ARCH_X86 = 0x03,
+        ARCH_ARM = 0x28,
+        ARCH_IA64 = 0x32,
+        ARCH_X86_64 = 0x3e,
+        ARCH_ARM64 = 0xb7,
+        ARCH_RISCV = 0xf3,
+    } arch;
+    // should be 1
+    uint32_t _version2;
+    // entry address
+    elf64_addr_t entry;
+    // program header table offset
+    elf64_off_t phoff;
+    // section header table offset
+    elf64_off_t shoff;
+    // architecture dependent flags
+    uint32_t flags;
+    // elf header size
+    uint16_t ehsize;
+    // program header table entry size
+    uint16_t phentsize;
+    // program header table entries number
+    uint16_t phnum;
+    // section header table entry size
+    uint16_t shentsize;
+    // section header table entries number
+    uint16_t shnum;
+    // section header table entry index that contains section names
+    uint16_t shstrndx;
+};
+
+// ELF64 program header table entry.
+// Note the field order: unlike elf32_program_header_entry, `flags` comes
+// directly after `type` instead of after `memsz`.
+struct PACKED elf64_program_header_entry {
+    enum : uint32_t {
+        PT_NULL = 0x00,
+        PT_LOAD = 0x01,
+        PT_DYNAMIC = 0x02,
+        PT_INTERP = 0x03,
+        PT_NOTE = 0x04,
+        PT_SHLIB = 0x05,
+        PT_PHDR = 0x06,
+        PT_TLS = 0x07,
+        PT_LOOS = 0x60000000,
+        PT_HIOS = 0x6fffffff,
+        PT_LOPROC = 0x70000000,
+        // misspelled alias of PT_LOPROC, kept so existing users still compile
+        PT_LIPROC = PT_LOPROC,
+        PT_HIPROC = 0x7fffffff,
+    } type;
+    // segment dependent
+    enum : uint32_t {
+        PF_X = 0x1,
+        PF_W = 0x2,
+        PF_R = 0x4,
+    } flags;
+    elf64_off_t offset;
+    elf64_addr_t vaddr;
+    elf64_addr_t paddr;
+    elf64_off_t filesz;
+    elf64_off_t memsz;
+    // 0 and 1 for no alignment, otherwise power of 2
+    uint64_t align;
+};
+
+// ELF64 section header table entry.
+struct PACKED elf64_section_header_entry {
+    uint32_t sh_name;
+    enum : uint32_t {
+        SHT_NULL = 0x00,
+        SHT_PROGBITS = 0x01,
+        SHT_RELA = 0x04,
+        SHT_DYNAMIC = 0x06,
+        SHT_NOTE = 0x07,
+        SHT_NOBITS = 0x08,
+        SHT_REL = 0x09,
+        SHT_DYNSYM = 0x0b,
+        SHT_INIT_ARRAY = 0x0e,
+        SHT_FINI_ARRAY = 0x0f,
+        // 0x10 in the ELF specification; must not collide with SHT_FINI_ARRAY
+        SHT_PREINIT_ARRAY = 0x10,
+    } sh_type;
+    enum : uint64_t {
+        SHF_WRITE = 0x01,
+        SHF_ALLOC = 0x02,
+        SHF_EXECINSTR = 0x04,
+    } sh_flags;
+    elf64_addr_t sh_addr;
+    elf64_off_t sh_offset;
+    elf64_off_t sh_size;
+    uint32_t sh_link;
+    uint32_t sh_info;
+    elf64_off_t sh_addralign;
+    elf64_off_t sh_entsize;
+};
+
+struct elf64_load_data {
+    const fs::dentry* exec_dent;
+    std::vector<std::string> argv;
+    std::vector<std::string> envp;
+    unsigned long ip;
+    unsigned long sp;
+};
 
 } // namespace types::elf

+ 19 - 8
include/types/hash_map.hpp

@@ -16,19 +16,30 @@ namespace types {
 
 // taken from linux
 constexpr uint32_t GOLDEN_RATIO_32 = 0x61C88647;
-// constexpr uint64_t GOLDEN_RATIO_64 = 0x61C8864680B583EBull;
+constexpr uint64_t GOLDEN_RATIO_64 = 0x61C8864680B583EBull;
 
-using hash_t = size_t;
+using hash_t = std::size_t;
 
 static inline constexpr hash_t _hash32(uint32_t val)
 {
     return val * GOLDEN_RATIO_32;
 }
 
-static inline constexpr hash_t hash32(uint32_t val, uint32_t bits)
+static inline constexpr hash_t hash32(uint32_t val, std::size_t bits)
 {
-    // higher bits are more random
+    // Returns the top `bits` bits of the 32-bit multiplicative hash of `val`
+    // (higher bits are more random).
+    //
+    // _hash32() multiplies two uint32_t operands, so the product only ever
+    // occupies the low 32 bits of hash_t, whatever sizeof(hash_t) is. The
+    // shift therefore has to be relative to 32; shifting by the full width
+    // of a 64-bit hash_t would yield 0 for every bits <= 32.
     return _hash32(val) >> (32 - bits);
+}
+
+static inline constexpr hash_t _hash64(uint64_t val)
+{
+    return val * GOLDEN_RATIO_64;
+}
+
+static inline constexpr hash_t hash64(uint64_t val, std::size_t bits)
+{
+    // higher bits are more random
+    return _hash64(val) >> (8 * sizeof(hash_t) - bits);
 }
 
 template <typename T>
@@ -36,17 +47,17 @@ constexpr bool is_c_string_v = std::is_same_v<std::decay_t<T>, char*>
     || std::is_same_v<std::decay_t<T>, const char*>;
 
 template <typename T,
-    std::enable_if_t<std::is_convertible_v<T, uint32_t>, bool> = true>
+    std::enable_if_t<std::is_convertible_v<T, uint64_t>, bool> = true>
 inline hash_t hash(T val, std::size_t bits)
 {
-    return hash32(static_cast<uint32_t>(val), bits);
+    return hash64(static_cast<uint64_t>(val), bits);
 }
 
 template <typename T,
     std::enable_if_t<std::is_pointer_v<T> && !is_c_string_v<T>, bool> = true>
 inline hash_t hash(T val, std::size_t bits)
 {
-    return hash32(std::bit_cast<uint32_t>(val), bits);
+    return hash(std::bit_cast<uintptr_t>(val), bits);
 }
 
 inline hash_t hash(const char* str, std::size_t bits)
@@ -57,7 +68,7 @@ inline hash_t hash(const char* str, std::size_t bits)
         while (*str)
             hash = hash * seed + (*str++);
 
-        return hash32(hash, bits);
+        return hash64(hash, bits);
 };
 
 template <template <typename, typename, typename> typename String,

+ 43 - 0
include/types/list.hpp

@@ -0,0 +1,43 @@
+#pragma once
+
+namespace types::list {
+
+// Helpers for an intrusive doubly linked list.
+//
+// `ListNode` must provide `prev` and `next` members of type `ListNode*`.
+// `head` is the address of the pointer to the first node; an empty list is
+// represented by `*head == nullptr`.
+
+// Pushes `node` onto the front of the list.
+template <typename ListNode>
+void list_insert(ListNode** head, ListNode* node)
+{
+    node->prev = nullptr;
+    node->next = *head;
+    if (*head)
+        (*head)->prev = node;
+    *head = node;
+}
+
+// Detaches the first node of the list and returns it with both of its links
+// cleared. Returns nullptr when the list is empty.
+template <typename ListNode>
+ListNode* list_get(ListNode** head)
+{
+    ListNode* node = *head;
+    if (!node)
+        return nullptr;
+
+    *head = node->next;
+
+    // The new first node must not keep pointing back at the detached node:
+    // list_remove() decides whether to update `*head` by testing `prev`.
+    if (*head)
+        (*head)->prev = nullptr;
+
+    node->next = nullptr;
+    node->prev = nullptr;
+    return node;
+}
+
+// Unlinks `node` from the list and clears both of its links.
+// `node` is expected to be a member of the list that `head` refers to.
+template <typename ListNode>
+void list_remove(ListNode** head, ListNode* node)
+{
+    if (node->prev)
+        node->prev->next = node->next;
+    else
+        *head = node->next;
+
+    if (node->next)
+        node->next->prev = node->prev;
+
+    node->next = nullptr;
+    node->prev = nullptr;
+}
+
+} // namespace types::list

+ 0 - 22
include/types/size.h

@@ -1,22 +0,0 @@
-#pragma once
-
-#include "stdint.h"
-
-#ifdef __GNUC__
-#define PACKED __attribute__((__packed__))
-#else
-#error "no definition for ((PACKED))"
-#endif
-
-#define __32bit_system
-
-#ifdef __32bit_system
-typedef uint32_t ptr_t;
-typedef int32_t diff_t;
-#elif
-typedef uint64_t ptr_t;
-typedef int64_t diff_t;
-#endif
-
-typedef ptr_t pptr_t;
-typedef ssize_t page_t;

+ 0 - 4
include/types/status.h

@@ -1,4 +0,0 @@
-#pragma once
-
-#define GB_OK (0)
-#define GB_FAILED (1)

+ 6 - 2
include/types/types.h

@@ -1,7 +1,5 @@
 #pragma once
 
-#include "size.h"
-#include "status.h"
 #include "stdint.h"
 
 #define __user
@@ -18,6 +16,12 @@
 #error "no definition for ((SECTION))"
 #endif
 
+#ifdef __GNUC__
+#define PACKED __attribute__((__packed__))
+#else
+#error "no definition for ((PACKED))"
+#endif
+
 #ifdef __GNUC__
 #define likely(expr) (__builtin_expect(!!(expr), 1))
 #define unlikely(expr) (__builtin_expect(!!(expr), 0))

+ 16 - 0
include/types/user_types.hpp

@@ -0,0 +1,16 @@
+#pragma once
+
+#include <stdint.h>
+
+#include <types/types.h>
+
+namespace types {
+
+using ptr32_t = uint32_t;
+
+struct iovec32 {
+    ptr32_t iov_base;
+    ptr32_t iov_len;
+};
+
+} // namespace types

+ 1 - 1
init_script.sh

@@ -19,7 +19,7 @@ export PATH="/bin"
 echo ok > /dev/console
 
 mkdir -p /etc /root /proc
-mount -t proc proc proc
+mount -t procfs proc proc
 
 cat > /etc/passwd <<EOF
 root:x:0:0:root:/root:/mnt/busybox sh

+ 7 - 22
pretty-print.py

@@ -94,31 +94,16 @@ class stringPrinter:
         self.val = val
 
     def to_string(self):
-        return self.val['m_data']
+        if self.val['m_data']['stackdata']['end'] == 0:
+            return self.val['m_data']['stackdata']['str'].string()
+        return self.val['m_data']['heapdata']['m_ptr'].string()
+
+    def num_children(self):
+        return 0
 
     def display_hint(self):
         return 'string'
 
-    def children(self):
-        return
-
-        yield 'str', self.val['m_data']
-
-        if self.val['m_data'] == 0:
-            return
-
-        yield 'length', self.val['m_size'] - 1
-
-        ptr = self.val['m_data']
-        i = 0
-
-        while ptr.dereference() != 0:
-            yield '[%d]' % i, ptr.dereference()
-            ptr += 1
-            i += 1
-
-        yield '[%d]' % i, 0
-
 class listPrinter:
     def __init__(self, val):
         self.val: gdb.Field = val
@@ -292,7 +277,7 @@ def build_pretty_printer(val):
     if re.compile(r"^std::impl::rbtree<.*, .*, .*>::_iterator<.*?>$").match(typename):
         return rbtreeIteratorPrinter(val)
 
-    if re.compile(r"^types::string<.*>$").match(typename):
+    if re.compile(r"^std::basic_string<.*>$").match(typename):
         return stringPrinter(val)
 
     return None

+ 105 - 247
src/asm/interrupt.s

@@ -1,264 +1,122 @@
-.code32
-
 .text
 
-# TODO: stack alignment
-.globl int6
-.type  int6 @function
-int6:
-    pushal
-    call int6_handler
-    popal
-
-    iret
-
-# TODO: stack alignment
-.globl int8
-.type  int8 @function
-int8:
-    nop
-    iret
-
-# TODO: stack alignment
-.globl int13
-.type  int13 @function
-int13:
-    pushal
-    call int13_handler
-    popal
-
-# remove the 32bit error code from stack
-    addl $4, %esp
-    iret
-
-.globl int14
-.type  int14 @function
-int14:
-    # push general purpose registers
-    pushal
-
-    # save %cr2
-    movl %cr2, %eax
-    pushl %eax
-
-    # save current esp (also pointer to struct int14_data)
-    mov %esp, %ebx
-
-    # allocate space for mmx registers and argument
-    subl $0x210, %esp
-
-    # align stack to 16byte boundary
-    and $0xfffffff0, %esp
-
-    # save mmx registers
-    fxsave 16(%esp)
-
-    # push (interrupt_stack*)data
-    mov %ebx, (%esp)
-
-    call int14_handler
-
-    # restore mmx registers
-    fxrstor 16(%esp)
-
-    # restore stack and general purpose registers
-    leal 4(%ebx), %esp
-    popal
-
-# remove the 32bit error code from stack
-    addl $4, %esp
-    iret
-
-.globl irq0
-irq0:
-    pushal
-    mov $0, %eax
-    jmp irqstub
-.globl irq1
-irq1:
-    pushal
-    mov $1, %eax
-    jmp irqstub
-.globl irq2
-irq2:
-    pushal
-    mov $2, %eax
-    jmp irqstub
-.globl irq3
-irq3:
-    pushal
-    mov $3, %eax
-    jmp irqstub
-.globl irq4
-irq4:
-    pushal
-    mov $4, %eax
-    jmp irqstub
-.globl irq5
-irq5:
-    pushal
-    mov $5, %eax
-    jmp irqstub
-.globl irq6
-irq6:
-    pushal
-    mov $6, %eax
-    jmp irqstub
-.globl irq7
-irq7:
-    pushal
-    mov $7, %eax
-    jmp irqstub
-.globl irq8
-irq8:
-    pushal
-    mov $8, %eax
-    jmp irqstub
-.globl irq9
-irq9:
-    pushal
-    mov $9, %eax
-    jmp irqstub
-.globl irq10
-irq10:
-    pushal
-    mov $10, %eax
-    jmp irqstub
-.globl irq11
-irq11:
-    pushal
-    mov $11, %eax
-    jmp irqstub
-.globl irq12
-irq12:
-    pushal
-    mov $12, %eax
-    jmp irqstub
-.globl irq13
-irq13:
-    pushal
-    mov $13, %eax
-    jmp irqstub
-.globl irq14
-irq14:
-    pushal
-    mov $14, %eax
-    jmp irqstub
-.globl irq15
-irq15:
-    pushal
-    mov $15, %eax
-    jmp irqstub
-
-.globl irqstub
-irqstub:
-    # save current esp
-    mov %esp, %ebx
-
-    # align stack to 16byte boundary
-    and $0xfffffff0, %esp
-
-    # save mmx registers
-    sub $(512 + 16), %esp
-    fxsave 16(%esp)
-
-    # save irq number and pointers to context and mmx registers
-    mov %eax, (%esp)  # irq number
-    mov %ebx, 4(%esp) # pointer to context
-    lea 16(%esp), %eax
-    mov %eax, 8(%esp) # pointer to mmx registers
-
-    call irq_handler
-
-    # restore mmx registers
-    fxrstor 16(%esp)
-
-    # restore stack and general purpose registers
-    mov %ebx, %esp
-    popal
-
-    iret
-
-.globl syscall_stub
-.type  syscall_stub @function
-syscall_stub:
-    pushal
-
-    # save current esp
-    mov %esp, %ebx
-
-    # stack alignment
-    and $0xfffffff0, %esp
-
-    # save mmx registers
-    sub $(512 + 16), %esp
-    fxsave 16(%esp)
-
-    # save pointers to context and mmx registers
-    mov %ebx, (%esp) # pointer to context
-    lea 16(%esp), %eax
-    mov %eax, 4(%esp) # pointer to mmx registers
-
-    call syscall_entry
-
-    # restore mmx registers
-    fxrstor 16(%esp)
-
-    # restore stack
-    mov %ebx, %esp
-
-.globl _syscall_stub_fork_return
-.type  _syscall_stub_fork_return @function
-_syscall_stub_fork_return:
-    popal
-    iret
+.extern after_ctx_switch
+.globl ISR_stub_restore
+
+# Common entry for every ISRn stub generated by the build_isr macro.
+# Each stub is just `call ISR_stub`, so on entry the top of the stack is the
+# return address inside the stub that was hit.
+ISR_stub:
+	# reserve 15 * 8 bytes and save the general purpose registers
+	sub $0x78, %rsp
+	mov %rax,  0x00(%rsp)
+	mov %rbx,  0x08(%rsp)
+	mov %rcx,  0x10(%rsp)
+	mov %rdx,  0x18(%rsp)
+	mov %rdi,  0x20(%rsp)
+	mov %rsi,  0x28(%rsp)
+	mov %r8,   0x30(%rsp)
+	mov %r9,   0x38(%rsp)
+	mov %r10,  0x40(%rsp)
+	mov %r11,  0x48(%rsp)
+	mov %r12,  0x50(%rsp)
+	mov %r13,  0x58(%rsp)
+	mov %r14,  0x60(%rsp)
+	mov %r15,  0x68(%rsp)
+	mov %rbp,  0x70(%rsp)
+
+	# 0x78(%rsp) is the return address pushed by `call ISR_stub`.
+	# The stubs are emitted with `.align 8`, so (address - ISR0) >> 3 is the
+	# index of the stub; it replaces the return address in that slot.
+	mov 0x78(%rsp), %rax
+	sub $ISR0, %rax
+	shr $3, %rax
+	mov %rax, 0x78(%rsp)
+
+	# remember the address of the saved registers, then align the stack
+	# to 16 bytes for the fxsave area
+	mov %rsp, %rbx
+	and $~0xf, %rsp
+
+	sub $512, %rsp
+	fxsave (%rsp)
+
+	# arg1 = pointer to the saved registers, arg2 = pointer to the fxsave area
+	mov %rbx, %rdi
+	mov %rsp, %rsi
+	call interrupt_handler
+
+# Expects %rsp to point to an fxsave area and %rbx to the saved registers
+# (relies on the callee preserving %rbx).
+ISR_stub_restore:
+	fxrstor (%rsp)
+	mov %rbx, %rsp
+
+	mov 0x00(%rsp), %rax
+	mov 0x08(%rsp), %rbx
+	mov 0x10(%rsp), %rcx
+	mov 0x18(%rsp), %rdx
+	mov 0x20(%rsp), %rdi
+	mov 0x28(%rsp), %rsi
+	mov 0x30(%rsp), %r8
+	mov 0x38(%rsp), %r9
+	mov 0x40(%rsp), %r10
+	mov 0x48(%rsp), %r11
+	mov 0x50(%rsp), %r12
+	mov 0x58(%rsp), %r13
+	mov 0x60(%rsp), %r14
+	mov 0x68(%rsp), %r15
+	mov 0x70(%rsp), %rbp
+
+	# The slot that held the stub index is loaded as the new stack pointer.
+	# NOTE(review): presumably interrupt_handler stores the address of the
+	# iretq frame there before returning - confirm against the handler.
+	mov 0x78(%rsp), %rsp
+	iretq
 
 # parameters
-# #1: esp* curr_esp
-# #2: esp* next_esp
+# #1: sp* current_task_sp
+# #2: sp* target_task_sp
 .globl asm_ctx_switch
 .type  asm_ctx_switch @function
 asm_ctx_switch:
-    movl 4(%esp), %ecx
-    movl 8(%esp), %eax
+    pushf
+	sub $0x38, %rsp  # extra 8 bytes to align to 16 bytes
 
-    push $_ctx_switch_return
-    push %ebx
-    push %edi
-    push %esi
-    push %ebp
-    pushfl
+    mov %rbx, 0x08(%rsp)
+	mov %rbp, 0x10(%rsp)
+	mov %r12, 0x18(%rsp)
+	mov %r13, 0x20(%rsp)
+	mov %r14, 0x28(%rsp)
+	mov %r15, 0x30(%rsp)
 
-    # push esp to restore
-    pushl (%ecx)
+    push (%rdi) 	 # save sp of previous stack frame of current
+	                 # acts as saving bp
 
-    mov %esp, (%ecx)
-    mov (%eax), %esp
+    mov %rsp, (%rdi) # save sp of current stack
+    mov (%rsi), %rsp # load sp of target stack
 
-    # restore esp
-    popl (%eax)
+    pop (%rsi)       # load sp of previous stack frame of target
+	                 # acts as restoring previous bp
 
-    popfl
-    pop %ebp
-    pop %esi
-    pop %edi
-    pop %ebx
+	pop %rax         # align to 16 bytes
 
-    ret
+	call after_ctx_switch
 
-_ctx_switch_return:
-    ret
+	mov 0x28(%rsp), %r15
+	mov 0x20(%rsp), %r14
+	mov 0x18(%rsp), %r13
+	mov 0x10(%rsp), %r12
+	mov 0x08(%rsp), %rbp
+    mov 0x00(%rsp), %rbx
 
-.section .text.kinit
+	add $0x30, %rsp
+    popf
 
-.globl asm_load_idt
-.type  asm_load_idt @function
-asm_load_idt:
-    movl 4(%esp), %edx
-    lidt (%edx)
-    movl 8(%esp), %edx
-    cmpl $0, %edx
-    je asm_load_idt_skip
-    sti
-asm_load_idt_skip:
     ret
+
+.altmacro
+.macro build_isr name
+	.align 8
+	ISR\name:
+		call ISR_stub
+.endm
+
+.set i, 0
+.rept 0x80+1
+	build_isr %i
+	.set i, i+1
+.endr
+
+.section .rodata
+
+.align 8
+.globl ISR_START_ADDR
+.type  ISR_START_ADDR @object
+ISR_START_ADDR:
+	.quad ISR0

+ 0 - 56
src/asm/port_io.s

@@ -1,56 +0,0 @@
-.code32
-
-.text
-
-.globl asm_outb
-.type  asm_outb @function
-asm_outb:
-    pushl %eax
-    pushl %edx
-    movw 12(%esp), %dx
-    movb 16(%esp), %al
-    outb %al, %dx
-    popl %edx
-    popl %eax
-    ret
-
-.globl asm_inb
-.type  asm_inb @function
-asm_inb:
-    pushl %edx
-    movw 8(%esp), %dx
-    inb %dx, %al
-    popl %edx
-    ret
-
-.globl asm_hlt
-.type  asm_hlt @function
-asm_hlt:
-    hlt
-    ret
-
-.globl asm_cli
-.type  asm_cli @function
-asm_cli:
-    cli
-    ret
-
-.globl asm_sti
-.type  asm_sti @function
-asm_sti:
-    sti
-    ret
-
-.section .text.kinit
-.globl asm_enable_sse
-.type  asm_enable_sse @function
-asm_enable_sse:
-	movl %cr0, %eax
-    andl $0xfffffff3, %eax
-	orl $0b100010, %eax
-	movl %eax, %cr0
-	movl %cr4, %eax
-	orl $0b11000000000, %eax
-	movl %eax, %cr4
-    fninit
-	ret

+ 0 - 53
src/asm/sys.s

@@ -1,53 +0,0 @@
-.code32
-
-.text
-
-.global asm_switch_pd
-.type   asm_switch_pd @function
-asm_switch_pd:
-    movl 4(%esp), %eax
-    shll $12, %eax
-    movl %eax, %cr3
-    ret
-
-.global current_pd
-.type   current_pd @function
-current_pd:
-    movl %cr3, %eax
-    ret
-
-.section .text.kinit
-
-.global asm_enable_paging
-.type   asm_enable_paging @function
-asm_enable_paging:
-    cli
-    // page directory address
-    movl 4(%esp), %eax
-    movl %eax, %cr3
-
-    movl %cr0, %eax
-    // SET PE, WP, PG
-    orl $0x80010001, %eax
-    movl %eax, %cr0
-
-    ret
-
-.global asm_load_gdt
-.type   asm_load_gdt @function
-asm_load_gdt:
-    cli
-    leal 6(%esp), %eax
-    lgdt (%eax)
-    ljmp $0x08, $_asm_load_gdt_fin
-_asm_load_gdt_fin:
-    ret
-
-.global asm_load_tr
-.type   asm_load_tr @function
-asm_load_tr:
-    cli
-    movl 4(%esp), %eax
-    orl $0, %eax
-    ltr %ax
-    ret

+ 178 - 287
src/boot.s

@@ -1,294 +1,185 @@
 .section .stage1
-.code16
-loader_start:
-# set segment registers
-    movw %cs, %ax
-    movw %ax, %ds
-
-_clear_screen:
-    mov $0x00, %ah
-    mov $0x03, %al
-    int $0x10
-
-# get memory size info and storage it
-_get_memory_size:
-    xorw %cx, %cx
-    xorw %dx, %dx
-    movw $0xe801, %ax
-
-    int $0x15
-    jc _get_memory_size_error
-
-    cmpb $0x86, %ah # unsupported function
-    je _get_memory_size_error
-    cmpb $0x80, %ah # invalid command
-    je _get_memory_size_error
-
-    jcxz _get_memory_size_use_ax
-    movw %cx, %ax
-    movw %dx, %bx
-
-_get_memory_size_use_ax:
-    movl $asm_mem_size_info, %edx
-    movw %ax, (%edx)
-    addw $2, %dx
-    movw %bx, (%edx)
-    jmp _e820_mem_map_load
-
-_get_memory_size_error:
-    xchgw %bx, %bx
-    jmp __stage1_halt
-
-_e820_mem_map_load:
-    addl $4, %esp
-    movl $0, (%esp)
-
-    # save the destination address to es:di
-    movw %cs, %ax
-    movw %ax, %es
-
-    movl $asm_e820_mem_map, %edi
-
-    # clear ebx
-    xorl %ebx, %ebx
-
-    # set the magic number to edx
-    movl $0x534D4150, %edx
-
-_e820_mem_map_load_loop:
-    # set function number to eax
-    movl $0xe820, %eax
-
-    # set default entry size
-    movl $24, %ecx
-
-    int $0x15
-
-    incl (%esp)
-    addl %ecx, %edi
-
-    jc _e820_mem_map_load_fin
-    cmpl $0, %ebx
-    jz _e820_mem_map_load_fin
-    jmp _e820_mem_map_load_loop
-
-_e820_mem_map_load_fin:
-    movl (%esp), %eax
-    movl $asm_e820_mem_map_count, %edi
-    movl %eax, (%edi)
-
-    movl $asm_e820_mem_map_entry_size, %edi
-    movl %ecx, (%edi)
-
-    jmp _load_gdt
-
-_load_gdt:
-    cli
-    lgdt asm_gdt_descriptor
-
-# enable protection enable (PE) bit
-    movl %cr0, %eax
-    orl $1, %eax
-    movl %eax, %cr0
-
-    ljmp $0x08, $start_32bit
-
 .code32
-
+.globl start_32bit
 start_32bit:
-    movw $0x10, %ax
-    movw %ax, %ds
-    movw %ax, %es
-    movw %ax, %fs
-    movw %ax, %gs
-    movw %ax, %ss
-
-    movl $0, %esp
-    movl $0, %ebp
-
-setup_early_kernel_page_table:
-# memory map:
-# 0x0000-0x1000: empty page
-# 0x1000-0x2000: early kernel pd
-# 0x2000-0x6000: 4 pts
-# 0x6000-0x8000: early kernel stack
-# so we fill the first 8KiB with zero
-    movl $0x00000000, %eax
-    movl $0x8000, %ecx
-
-_fill_zero:
-    cmpl $0, %ecx
-    jz _fill_zero_end
-    subl $4, %ecx
-    movl $0, (%eax)
-    addl $4, %eax
-    jmp _fill_zero
-_fill_zero_end:
-
-# pt#0: 0x00000000 to 0x00400000
-    movl $0x00001000, %eax
-    movl $0x00002003, (%eax)
-# pt#1: 0xc0000000 to 0xc0400000
-    movl $0x00001c00, %eax
-    movl $0x00003003, (%eax)
-# pt#2: 0xff000000 to 0xff400000
-    movl $0x00001ff0, %eax
-    movl $0x00004003, (%eax)
-# pt#3: 0xffc00000 to 0xffffffff
-    movl $0x00001ffc, %eax
-    movl $0x00005003, (%eax)
-
-# map early kernel page directory to 0xff000000
-    movl $0x00004000, %eax
-    movl $0x00001003, (%eax)
-
-# map kernel pt#2 to 0xff001000
-    movl $0x00004004, %eax
-    movl $0x00004003, (%eax)
-
-# map __stage1_start ---- __kinit_end identically
-    movl $__stage1_start, %ebx
-    movl $__kinit_end, %ecx
-    movl %ebx, %edx
-    shrl $12, %edx
-    andl $0x3ff, %edx
-
-
-__map_stage1_kinit:
-    leal 3(%ebx), %eax
-    movl %eax, 0x00002000(, %edx, 4)
-    addl $0x1000, %ebx
-    incl %edx
-    cmpl %ebx, %ecx
-    jne __map_stage1_kinit
-
-# map __text_start ---- __data_end to 0xc0000000
-    movl %ecx, %ebx
-    movl $__text_start, %edx
-    shrl $12, %edx
-    andl $0x3ff, %edx
-
-    movl $__data_end, %ecx
-    subl $__text_start, %ecx
-    addl %ebx, %ecx
-
-__map_kernel_space:
-    leal 3(%ebx), %eax
-    movl %eax, 0x00003000(, %edx, 4)
-    addl $0x1000, %ebx
-    incl %edx
-    cmpl %ebx, %ecx
-    jne __map_kernel_space
-
-# map __data_end ---- __bss_end from 0x100000
-    movl $0x100000, %ebx
-    movl $__bss_end, %ecx
-    subl $__data_end, %ecx
-    addl %ebx, %ecx
-
-__map_kernel_bss:
-    leal 3(%ebx), %eax
-    movl %eax, 0x00003000(, %edx, 4)
-    addl $0x1000, %ebx
-    incl %edx
-    cmpl %ebx, %ecx
-    jne __map_kernel_bss
-
-# map kernel stack 0xffffe000-0xffffffff
-    movl $0x6000, %ebx
-    movl $0x8000, %ecx
-    movl $0x0ffffe, %edx
-    andl $0x3ff, %edx
-
-__map_kernel_stack:
-    leal 3(%ebx), %eax
-    movl %eax, 0x00005000(, %edx, 4)
-    addl $0x1000, %ebx
-    incl %edx
-    cmpl %ebx, %ecx
-    jne __map_kernel_stack
-
-load_early_kernel_page_table:
-    movl $0x00001000, %eax
-    movl %eax, %cr3
-
-    movl %cr0, %eax
+    mov $0x10, %ax
+    mov %ax, %ds
+    mov %ax, %es
+    mov %ax, %fs
+    mov %ax, %gs
+    mov %ax, %ss
+
+    cld
+    xor %eax, %eax
+
+    # clear paging structures
+    mov $0x100000, %edi
+    mov %edi, %ecx
+    shr $2, %ecx # %ecx /= 4
+    rep stosl
+
+    # set P, RW, G
+    mov $0x00000103, %ebx
+	xor %edx, %edx
+    mov $0x00101000, %esi
+
+    # PML4E 0x000
+    # we need the first 1GB identically mapped
+    # so that we won't trigger a triple fault after
+    # enabling paging
+	lea -0x1000(%esi), %edi # %edi = 0x100000
+    call fill_pxe
+
+    # PML4E 0xff0
+	mov $0x80000000, %edx
+	lea 0xff0(%edi), %edi
+	call fill_pxe
+	xor %edx, %edx
+
+    # setup PDPT for physical memory mapping
+    mov %esi, %edi
+
+    # set PS
+    or $0x00000080, %ebx
+    mov $256, %ecx
+    xor %esi, %esi
+_fill_loop1:
+    call fill_pxe
+    lea 8(%edi), %edi
+    add $0x40000000, %esi # 1GB
+    adc $0, %edx
+    loop _fill_loop1
+
+	mov $0x80000000, %edx
+
+    # set PCD, PWT
+    or $0x00000018, %ebx
+    mov $256, %ecx
+    xor %esi, %esi
+_fill_loop2:
+    call fill_pxe
+    lea 8(%edi), %edi
+    add $0x40000000, %esi # 1GB
+    adc $0, %edx
+    loop _fill_loop2
+
+	xor %edx, %edx
+
+    # PML4E 0xff8
+    mov %edi, %esi # 0x102000
+    mov $0x100ff8, %edi
+    # clear PCD, PWT, PS
+    and $(~0x00000098), %ebx
+    call fill_pxe
+
+    # PDPTE 0xff8
+    lea 0xff8(%esi), %edi  # 0x102ff8
+    lea 0x1000(%esi), %esi # 0x103000
+    call fill_pxe
+
+    # PDE 0xff0
+    lea 0xff0(%esi), %edi  # 0x103ff0
+    lea 0x1000(%esi), %esi # 0x104000
+    call fill_pxe
+
+    # fill PT (kernel image)
+    # map KIMAGE_PAGES consecutive 4KiB pages, starting at physical
+    # address 0x2000, into the page table at 0x104000
+    mov %esi, %edi # 0x104000
+    mov $0x2000, %esi
+
+    # the declaration must name the symbol that is actually used below
+.extern KIMAGE_PAGES
+    mov $KIMAGE_PAGES, %ecx
+
+_fill_loop3:
+    call fill_pxe
+    lea 8(%edi), %edi      # next page table entry
+    lea 0x1000(%esi), %esi # next physical page
+    loop _fill_loop3
+
+    # set msr
+    mov $0xc0000080, %ecx
+    rdmsr
+    or $0x901, %eax # set LME, NXE, SCE
+    wrmsr
+
+    # set cr4
+    mov %cr4, %eax
+    or $0xa0, %eax # set PAE, PGE
+    mov %eax, %cr4
+
+    # load new page table
+	xor %eax, %eax
+	inc %eax
+	shl $20, %eax # %eax = 0x100000
+    mov %eax, %cr3
+
+    mov %cr0, %eax
     // SET PE, WP, PG
-    orl $0x80010001, %eax
-    movl %eax, %cr0
-
-# set stack pointer and clear stack bottom
-    movl $0xfffffff0, %esp
-    movl $0xfffffff0, %ebp
-
-    movl $0x00, (%esp)
-    movl $0x00, 4(%esp)
-    movl $0x00, 8(%esp)
-    movl $0x00, 12(%esp)
+    or $0x80010001, %eax
+    mov %eax, %cr0
+
+    # create gdt
+	xor %eax, %eax # at 0x0000
+	mov %eax, 0x00(%eax)
+	mov %eax, 0x04(%eax) # null descriptor
+	mov %eax, 0x08(%eax) # code segment lower
+	mov %eax, 0x10(%eax) # data segment lower
+	mov $0x00209a00, %ecx
+	mov %ecx, 0x0c(%eax) # code segment higher
+	mov $0x00009200, %ecx
+	mov %ecx, 0x14(%eax) # data segment higher
+
+    # gdt descriptor
+	push %eax
+	push %eax
+
+    # pad with a word
+	mov $0x00170000, %eax
+	push %eax
+
+	lgdt 2(%esp)
+	add $12, %esp
+
+    ljmp $0x08, $_64bit_entry
+
+# %ebx: attribute low
+# %edx: attribute high
+# %esi: page physical address
+# %edi: page x entry address
+fill_pxe:
+    lea (%ebx, %esi, 1), %eax
+    mov %eax, (%edi)
+    mov %edx, 4(%edi)
+
+    ret
+
+.code64
+_64bit_entry:
+	jmp start_64bit
+
+.section .text.kinit
+start_64bit:
+    # set stack pointer and clear stack bottom
+	movzw %sp, %rdi
+	xor %rsp, %rsp
+	inc %rsp
+	neg %rsp
+	shr $40, %rsp
+	shl $40, %rsp
+
+	add %rdi, %rsp
+	mov %rsp, %rdi
+
+    # make stack frame
+	lea -16(%rsp), %rsp
+	mov %rsp, %rbp
+
+	xor %rax, %rax
+	mov %rax, (%rsp)
+	mov %rax, 8(%rsp)
 
     call kernel_init
 
-__stage1_halt:
-    hlt
-    jmp __stage1_halt
-
-asm_gdt_descriptor:
-    .word (5 * 8) - 1 # size
-    .long asm_gdt_table  # address
-asm_gdt_table:
-    .8byte 0         # null descriptor
-
-    # kernel code segment
-    .word 0xffff     # limit 0 :15
-    .word 0x0000     # base  0 :15
-    .byte 0x00       # base  16:23
-    .byte 0x9a       # access
-    .byte 0b11001111 # flag and limit 16:20
-    .byte 0x00       # base 24:31
-
-    # kernel data segment
-    .word 0xffff     # limit 0 :15
-    .word 0x0000     # base  0 :15
-    .byte 0x00       # base  16:23
-    .byte 0x92       # access
-    .byte 0b11001111 # flag and limit 16:20
-    .byte 0x00       # base 24:31
-
-    # user code segment
-    .word 0xffff     # limit 0 :15
-    .word 0x0000     # base  0 :15
-    .byte 0x00       # base  16:23
-    .byte 0xfa       # access
-    .byte 0b11001111 # flag and limit 16:20
-    .byte 0x00       # base 24:31
-
-    # user data segment
-    .word 0xffff     # limit 0 :15
-    .word 0x0000     # base  0 :15
-    .byte 0x00       # base  16:23
-    .byte 0xf2       # access
-    .byte 0b11001111 # flag and limit 16:20
-    .byte 0x00       # base 24:31
-
-.globl asm_mem_size_info
-.type  asm_mem_size_info @object
-.size  asm_mem_size_info, (.-asm_mem_size_info)
-asm_mem_size_info:
-    .word 0x12
-    .word 0x34
-
-.globl asm_e820_mem_map
-.type  asm_e820_mem_map @object
-.size  asm_e820_mem_map, (.-asm_e820_mem_map)
-asm_e820_mem_map:
-    .space 1024
-
-.globl asm_e820_mem_map_count
-.type  asm_e820_mem_map_count @object
-asm_e820_mem_map_count:
-    .long 0
-
-.globl asm_e820_mem_map_entry_size
-.type  asm_e820_mem_map_entry_size @object
-asm_e820_mem_map_entry_size:
-    .long 0
+_64bit_hlt:
+	cli
+	hlt
+	jmp _64bit_hlt

+ 29 - 51
src/fs/fat.cpp

@@ -7,11 +7,10 @@
 #include <stdio.h>
 
 #include <types/allocator.hpp>
-#include <types/status.h>
 
 #include <fs/fat.hpp>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
 #include <kernel/module.hpp>
 #include <kernel/vfs.hpp>
 
@@ -70,14 +69,15 @@ char* fat32::read_cluster(cluster_t no)
         ++buf.ref;
         return buf.data;
     }
-    auto* data = new char[sectors_per_cluster * SECTOR_SIZE];
+    // TODO: page buffer class
+    using namespace kernel::mem;
+    using namespace paging;
+    assert(sectors_per_cluster * SECTOR_SIZE <= 0x1000);
+
+    char* data = physaddr<char>{page_to_pfn(alloc_page())};
     _raw_read_cluster(data, no);
-    buf.emplace(no,
-        buf_object {
-            data,
-            1,
-            // false,
-        });
+    buf.emplace(no, buf_object { data, 1 });
+
     return data;
 }
 
@@ -142,7 +142,7 @@ int fat32::readdir(fs::inode* dir, size_t offset, const fs::vfs::filldir_func& f
             }
             auto ret = filldir(fname.c_str(), 0, ind, ind->mode & S_IFMT);
 
-            if (ret != GB_OK) {
+            if (ret != 0) {
                 release_cluster(next);
                 return nread;
             }
@@ -210,51 +210,29 @@ fat32::fat32(dev_t _device)
 
 size_t fat32::read(inode* file, char* buf, size_t buf_size, size_t offset, size_t n)
 {
-    cluster_t next = cl(file);
     uint32_t cluster_size = SECTOR_SIZE * sectors_per_cluster;
     size_t orig_n = n;
 
-    do {
-        if (offset == 0) {
-            if (n > cluster_size) {
-                auto* data = read_cluster(next);
-                memcpy(buf, data, cluster_size);
-                release_cluster(next);
-
-                buf_size -= cluster_size;
-                buf += cluster_size;
-                n -= cluster_size;
-            } else {
-                auto* data = read_cluster(next);
-                auto read = _write_buf_n(buf, buf_size, data, n);
-                release_cluster(next);
+    for (cluster_t cno = cl(file); n && cno < EOC; cno = fat[cno]) {
+        if (offset >= cluster_size) {
+            offset -= cluster_size;
+            continue;
+        }
 
-                return orig_n - n + read;
-            }
-        } else {
-            if (offset > cluster_size) {
-                offset -= cluster_size;
-            } else {
-                auto* data = read_cluster(next);
+        auto* data = read_cluster(cno);
+        data += offset;
 
-                auto to_read = cluster_size - offset;
-                if (to_read > n)
-                    to_read = n;
+        auto to_copy = std::min(n, cluster_size - offset);
+        auto ncopied = _write_buf_n(buf, buf_size, data, to_copy);
 
-                auto read = _write_buf_n(buf, buf_size, data + offset, to_read);
-                buf += read;
-                n -= read;
+        buf += ncopied, n -= ncopied;
 
-                release_cluster(next);
-                if (read != to_read) {
-                    return orig_n - n;
-                }
+        release_cluster(cno);
+        if (ncopied != to_copy)
+            break;
 
-                offset = 0;
-            }
-        }
-        next = fat[next];
-    } while (n && next < EOC);
+        offset = 0;
+    }
 
     return orig_n - n;
 }
@@ -268,7 +246,7 @@ int fat32::inode_statx(dentry* ent, statx* st, unsigned int mask)
     }
 
     if (mask & STATX_BLOCKS) {
-        st->stx_blocks = align_up<12>(ent->ind->size) / 512;
+        st->stx_blocks = ((ent->ind->size + 0xfff) & ~0xfff) / 512;
         st->stx_blksize = 4096;
         st->stx_mask |= STATX_BLOCKS;
     }
@@ -304,7 +282,7 @@ int fat32::inode_statx(dentry* ent, statx* st, unsigned int mask)
         st->stx_mask |= STATX_GID;
     }
 
-    return GB_OK;
+    return 0;
 }
 
 int fat32::inode_stat(dentry* dent, struct stat* st)
@@ -319,7 +297,7 @@ int fat32::inode_stat(dentry* dent, struct stat* st)
     st->st_blksize = 4096;
     st->st_blocks = (ind->size + 511) / 512;
     st->st_ino = ind->ino;
-    return GB_OK;
+    return 0;
 }
 
 static fat32* create_fat32(const char* source, unsigned long, const void*)

+ 26 - 3
src/fs/procfs.cc

@@ -5,9 +5,9 @@
 #include <sys/mount.h>
 #include <unistd.h>
 
-#include <types/status.h>
-
+#include <kernel/hw/timer.hpp>
 #include <kernel/module.hpp>
+#include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
 #include <kernel/vfs/vfs.hpp>
 
@@ -64,6 +64,28 @@ static ssize_t mounts_read(char* page, size_t n)
     return orig_n - n;
 }
 
+// procfs read callback for "schedstat".
+//
+// Writes the current tick count on the first line, then one line per thread
+// of every process in `procs`: "<proc key> <tid in hex> <elected_times>".
+//
+// page: output buffer, n: its size in bytes.
+// Returns the number of bytes placed in `page`.
+static ssize_t schedstat_read(char* page, size_t n)
+{
+    auto orig_n = n;
+
+    if (n == 0)
+        return n;
+
+    // Account for one snprintf call. snprintf reports the length the text
+    // would have had, which may exceed the space left; subtracting that from
+    // the unsigned `n` would wrap around and push `page` out of the buffer.
+    // Clamp to what fits and report false once the buffer is exhausted.
+    // NOTE(review): on truncation this assumes snprintf stored n - 1
+    // characters plus a terminator, as the C standard specifies.
+    auto consume = [&page, &n](int nwrote) -> bool {
+        if (nwrote < 0)
+            return false;
+
+        size_t len = nwrote;
+        bool fits = len < n;
+        if (!fits)
+            len = n - 1;
+
+        n -= len, page += len;
+        return fits;
+    };
+
+    if (!consume(snprintf(page, n, "%d\n", kernel::hw::timer::current_ticks())))
+        return orig_n - n;
+
+    for (const auto& proc : *procs) {
+        for (const auto& thd : proc.second.thds) {
+            if (!consume(snprintf(page, n, "%d %x %d\n", proc.first, thd.tid(), thd.elected_times)))
+                return orig_n - n;
+        }
+    }
+
+    return orig_n - n;
+}
+
 namespace fs::proc {
 
 struct proc_file {
@@ -107,6 +129,7 @@ public:
         auto* ind = cache_inode(0, 0, S_IFDIR | 0777, 0, 0);
 
         create_file("mounts", mounts_read, nullptr);
+        create_file("schedstat", schedstat_read, nullptr);
 
         register_root_node(ind);
     }
@@ -160,7 +183,7 @@ public:
         for (const auto& [ ino, pf ] : files) {
             auto* ind = get_inode(ino);
             int ret = callback(pf.name.c_str(), 0, ind, ind->mode);
-            if (ret != GB_OK)
+            if (ret != 0)
                 return -EIO;
             ++nread;
         }

+ 62 - 52
src/kernel.ld

@@ -1,36 +1,56 @@
-OUTPUT_FORMAT(elf32-i386)
-OUTPUT_ARCH(i386:i386)
+OUTPUT_FORMAT(elf64-x86-64)
 
 MEMORY
 {
-    MEM : org = 0x00000000, l = 4096M
+    MBR    (wx) : org = 0x0e00, l = 512
+    STAGE1 (wx) : org = 0x1000, l = 4K
+    PHYMEM (w)  : org = 0xffffff0000000000, len = 512 * 1024M
+    PARRAY (w)  : org = 0xffffff8000000000, len = 128 * 1024M
+    KBSS   (w)  : org = 0xffffffffc0200000, len = 2M
+    KIMAGE (wx) : org = 0xffffffffffc00000, len = 2M
 }
 
 SECTIONS
 {
-    .stage1 0x8000 : AT(0x00000000)
+    .mbr : AT(0)
+    {
+        *(.mbr)
+
+        . = 510;
+        BYTE(0x55);
+        BYTE(0xaa);
+    } > MBR
+
+    .stage1 : AT(LOADADDR(.mbr) + SIZEOF(.mbr))
     {
         __stage1_start = .;
         *(.stage1)
 
         . = ALIGN(0x1000);
         __stage1_end = .;
-    } > MEM
+    } > STAGE1
 
     .kinit :
         AT(LOADADDR(.stage1) + SIZEOF(.stage1))
     {
-        __kinit_start = .;
-        *(.text.kinit)
+        KIMAGE_START = .;
+        KINIT_START = .;
 
-        LONG(0x00000000)
-        LONG(0x19191919)
-        LONG(0x00000000)
+        *(.text.kinit)
 
+        . = ALIGN(16);
         *(.rodata.kinit)
 
-        . = ALIGN(16);
+        KINIT_START_ADDR = .;
+        QUAD(ABSOLUTE(KINIT_START));
+
+        KINIT_END_ADDR = .;
+        QUAD(ABSOLUTE(KINIT_END));
 
+        KINIT_PAGES = .;
+        QUAD((KINIT_END - KINIT_START) / 0x1000);
+
+        . = ALIGN(16);
         start_ctors = .;
         KEEP(*(.init_array));
         KEEP(*(SORT_BY_INIT_PRIORITY(.init_array*)));
@@ -38,89 +58,79 @@ SECTIONS
         KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
         end_ctors = .;
 
-        LONG(0x00000000)
-        LONG(0x19191919)
-        LONG(0x00000000)
-
+        . = ALIGN(16);
         *(.data.kinit)
 
-        LONG(0x00000000)
-        LONG(0x19191919)
-        LONG(0x00000000)
-
+        . = ALIGN(16);
         *(.bss.kinit)
 
-        LONG(0x00000000)
-        LONG(0x19191919)
-        LONG(0x00000000)
-
         . = ALIGN(0x1000);
-        __kinit_end = .;
-    } > MEM
+        KINIT_END = .;
+    } > KIMAGE
 
-    .text 0xc0000000 :
+    .text :
         AT(LOADADDR(.kinit) + SIZEOF(.kinit))
     {
-        __text_start = .;
+        TEXT_START = .;
         *(.text)
         *(.text*)
 
         . = ALIGN(0x1000);
-        __text_end = .;
-    } > MEM
+        TEXT_END = .;
+    } > KIMAGE
 
     .rodata :
         AT(LOADADDR(.text) + SIZEOF(.text))
     {
-        __rodata_start = .;
+        RODATA_START = .;
         *(.rodata)
         *(.rodata*)
 
         . = ALIGN(16);
-        kmod_loaders_start = .;
+        KMOD_LOADERS_START = .;
 
         *(.kmods)
-
-        __kmod_loaders_end = .;
-        LONG(0);
+        QUAD(0);
 
         . = ALIGN(16);
 
-        bss_addr = .;
-        LONG(ABSOLUTE(__bss_start));
-        bss_len = .;
-        LONG(__bss_end - __bss_start);
-        kernel_size = .;
-        LONG(__data_end - __kinit_start);
+        BSS_ADDR = .;
+        QUAD(ABSOLUTE(BSS_START));
+        BSS_LENGTH = .;
+        QUAD(BSS_END - BSS_START);
 
         . = ALIGN(0x1000);
-        __rodata_end = .;
-    } > MEM
+        RODATA_END = .;
+    } > KIMAGE
 
     .data :
         AT(LOADADDR(.rodata) + SIZEOF(.rodata))
     {
-        __data_start = .;
+        DATA_START = .;
         *(.data)
         *(.data*)
 
         . = ALIGN(0x1000);
-        __data_end = .;
-    } > MEM
+        DATA_END = .;
+        KIMAGE_END = .;
+    } > KIMAGE
+
+    .sentry :
+        AT(0x78000 - 0x4)
+    { LONG(0x01145140); } > KIMAGE
 
     .bss :
     {
-        __bss_start = .;
+        BSS_START = .;
         *(.bss)
         *(.bss*)
 
         . = ALIGN(0x1000);
-        __bss_end = .;
-    } > MEM
+        BSS_END = .;
+    } > KBSS
 
-    .sentry :
-        AT(0x60000)
-    { LONG(0x01145140); } > MEM
+    KIMAGE_PAGES = (KIMAGE_END - KIMAGE_START) / 0x1000;
+    BSS_PAGES = (BSS_END - BSS_START) / 0x1000;
 
     .eh_frame :
         AT(LOADADDR(.sentry) + SIZEOF(.sentry))
@@ -129,7 +139,7 @@ SECTIONS
         *(.eh_frame*)
         . = ALIGN(0x1000);
         __eh_frame_end = .;
-    } > MEM
+    } > KIMAGE
 
     /* Stabs debugging sections.  */
     .stab          0 : { *(.stab) }

+ 121 - 36
src/kernel/allocator.cc

@@ -7,14 +7,18 @@
 #include <stdint.h>
 
 #include <kernel/async/lock.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/slab.hpp>
+
+constexpr uintptr_t KERNEL_HEAP_START = 0xffff'ff81'8000'0000;
+constexpr uintptr_t KERNEL_HEAP_END   = 0xffff'ffbf'ffff'ffff;
+constexpr uintptr_t KERNEL_HEAP_SIZE  = KERNEL_HEAP_END - KERNEL_HEAP_START;
 
 namespace types::memory {
 
 struct mem_blk_flags {
-    uint8_t is_free;
-    uint8_t has_next;
-    uint8_t : 8; // unused1
-    uint8_t : 8; // unused2
+    unsigned long is_free  : 8;
+    unsigned long has_next : 8;
 };
 
 struct mem_blk {
@@ -84,7 +88,7 @@ constexpr void split_block(mem_blk* blk, std::size_t this_size)
     // block is too small to get split
     // that is, the block to be split should have enough room
     // for "this_size" bytes and also could contain a new block
-    if (blk->size < this_size + sizeof(mem_blk) + 8)
+    if (blk->size < this_size + sizeof(mem_blk) + 1024)
         return;
 
     mem_blk* blk_next = next(blk, this_size);
@@ -100,13 +104,52 @@ constexpr void split_block(mem_blk* blk, std::size_t this_size)
     blk->size = this_size;
 }
 
+// Set the heap break to `addr`, mapping any 2 MiB (0x200000) regions needed
+// between the highest address mapped so far and the new break.
+//
+// Returns `addr` (also stored in p_break), or nullptr when `addr` is at or
+// beyond p_limit.
+std::byte* brk_memory_allocator::brk(byte* addr)
+{
+    if (addr >= p_limit)
+        return nullptr;
+
+    uintptr_t current_allocated = reinterpret_cast<uintptr_t>(p_allocated);
+    uintptr_t new_brk = reinterpret_cast<uintptr_t>(addr);
+
+    // round both addresses down to a 2 MiB boundary
+    current_allocated &= ~(0x200000-1);
+    new_brk &= ~(0x200000-1);
+
+    using namespace kernel::mem::paging;
+    // map every 2 MiB region up to and including the one that holds `addr`
+    while (current_allocated <= new_brk) {
+        auto idx = idx_all(current_allocated);
+        auto pdpt = KERNEL_PAGE_TABLE[std::get<1>(idx)].parse();
+
+        // create the next-level table on demand
+        auto pdpte = pdpt[std::get<2>(idx)];
+        if (!pdpte.pfn())
+            pdpte.set(PA_KERNEL_PAGE_TABLE, alloc_page_table());
+
+        // the entry must not be present yet
+        auto pde = pdpte.parse()[std::get<3>(idx)];
+        assert(!(pde.attributes() & PA_P));
+        // NOTE(review): alloc_pages(9) is presumably an order-9 allocation
+        // (512 pages = 2 MiB) backing the huge mapping -- confirm in paging.hpp
+        pde.set(PA_KERNEL_DATA_HUGE, page_to_pfn(alloc_pages(9)));
+
+        current_allocated += 0x200000;
+    }
+    // p_allocated now points at the first 2 MiB boundary not visited above
+    p_allocated = (std::byte*)current_allocated;
+
+    return p_break = addr;
+}
+
+// Move the break `increment` bytes past its current position.
+// Returns whatever brk() returns for the resulting address.
+std::byte* brk_memory_allocator::sbrk(size_type increment)
+{
+    return brk(p_break + increment);
+}
+
 brk_memory_allocator::brk_memory_allocator(byte* start, size_type size)
     : p_start(start)
     , p_limit(start + size)
+    , p_break(start)
+    , p_allocated(start)
 {
-    brk(p_start);
-    auto* p_blk = aspblk(sbrk(0));
-    p_blk->size = 8;
+    auto* p_blk = aspblk(brk(p_start));
+    sbrk(sizeof(mem_blk) + 1024); // 1024 bytes (minimum size for a block)
+
+    p_blk->size = 1024;
     p_blk->flags.has_next = 0;
     p_blk->flags.is_free = 1;
 }
@@ -114,8 +157,8 @@ brk_memory_allocator::brk_memory_allocator(byte* start, size_type size)
 void* brk_memory_allocator::allocate(size_type size)
 {
     kernel::async::lock_guard_irq lck(mtx);
-    // align to 8 bytes boundary
-    size = (size + 7) & ~7;
+    // align to 1024 bytes boundary
+    size = (size + 1024-1) & ~(1024-1);
 
     auto* block_allocated = find_blk(&p_start, size);
     if (!block_allocated->flags.has_next
@@ -156,59 +199,101 @@ void brk_memory_allocator::deallocate(void* ptr)
     unite_afterwards(blk);
 }
 
-static std::byte ki_heap[0x100000];
-static brk_memory_allocator ki_alloc(ki_heap, sizeof(ki_heap));
-static brk_memory_allocator* k_alloc;
-
-void* kimalloc(std::size_t size)
+bool brk_memory_allocator::allocated(void* ptr) const noexcept
 {
-    return ki_alloc.allocate(size);
+    return (void*)KERNEL_HEAP_START <= aspbyte(ptr) && aspbyte(ptr) < sbrk();
 }
 
-void kifree(void* ptr)
-{
-    ki_alloc.deallocate(ptr);
-}
+static brk_memory_allocator* k_alloc;
 
 } // namespace types::memory
 
-SECTION(".text.kinit")
-void kernel::kinit::init_kernel_heap(void *start, std::size_t size)
+static kernel::mem::slab_cache caches[7];
+
+static constexpr int __cache_index(std::size_t size)
 {
-    using namespace types::memory;
-    k_alloc = kinew<brk_memory_allocator>((std::byte*)start, size);
+    if (size <= 32)
+        return 0;
+    if (size <= 64)
+        return 1;
+    if (size <= 96)
+        return 2;
+    if (size <= 128)
+        return 3;
+    if (size <= 192)
+        return 4;
+    if (size <= 256)
+        return 5;
+    if (size <= 512)
+        return 6;
+    return -1;
 }
 
-void* operator new(size_t sz)
+SECTION(".text.kinit")
+void kernel::kinit::init_allocator()
 {
-    void* ptr = types::memory::k_alloc->allocate(sz);
-    assert(ptr);
-    return ptr;
+    mem::init_slab_cache(caches+0, 32);
+    mem::init_slab_cache(caches+1, 64);
+    mem::init_slab_cache(caches+2, 96);
+    mem::init_slab_cache(caches+3, 128);
+    mem::init_slab_cache(caches+4, 192);
+    mem::init_slab_cache(caches+5, 256);
+    mem::init_slab_cache(caches+6, 512);
+
+    types::memory::k_alloc = new types::memory::brk_memory_allocator(
+        (std::byte*)KERNEL_HEAP_START, KERNEL_HEAP_SIZE);
 }
 
-void* operator new[](size_t sz)
+void* operator new(size_t size)
 {
-    void* ptr = types::memory::k_alloc->allocate(sz);
+    int idx = __cache_index(size);
+    void* ptr = nullptr;
+    if (idx < 0)
+        ptr = types::memory::k_alloc->allocate(size);
+    else
+        ptr = kernel::mem::slab_alloc(&caches[idx]);
+
     assert(ptr);
     return ptr;
 }
 
 void operator delete(void* ptr)
 {
-    types::memory::k_alloc->deallocate(ptr);
+    if (!ptr)
+        return;
+
+    if (types::memory::k_alloc->allocated(ptr))
+        types::memory::k_alloc->deallocate(ptr);
+    else
+        kernel::mem::slab_free(ptr);
 }
 
-void operator delete(void* ptr, size_t)
+void operator delete(void* ptr, std::size_t size)
+{
+    if (!ptr)
+        return;
+
+    if (types::memory::k_alloc->allocated(ptr)) {
+        types::memory::k_alloc->deallocate(ptr);
+        return;
+    }
+    int idx = __cache_index(size);
+    assert(idx >= 0);
+
+    kernel::mem::slab_free(ptr);
+}
+
+void* operator new[](size_t sz)
 {
-    types::memory::k_alloc->deallocate(ptr);
+    return ::operator new(sz);
 }
 
 void operator delete[](void* ptr)
 {
-    types::memory::k_alloc->deallocate(ptr);
+    ::operator delete(ptr);
 }
 
-void operator delete[](void* ptr, size_t)
+void operator delete[](void* ptr, std::size_t size)
 {
-    types::memory::k_alloc->deallocate(ptr);
+    ::operator delete(ptr, size);
 }

+ 16 - 16
src/kernel/async/lock.cc

@@ -9,8 +9,8 @@ static inline void _raw_spin_lock(spinlock_t* lock_addr)
 {
     asm volatile(
         "%=:\n\t\
-         movl $1, %%eax\n\t\
-         xchgl %%eax, (%0)\n\t\
+         mov $1, %%eax\n\t\
+         xchg %%eax, (%0)\n\t\
          cmp $0, %%eax\n\t\
          jne %=b\n\t\
         "
@@ -22,19 +22,19 @@ static inline void _raw_spin_lock(spinlock_t* lock_addr)
 static inline void _raw_spin_unlock(spinlock_t* lock_addr)
 {
     asm volatile(
-        "movl $0, %%eax\n\
-         xchgl %%eax, (%0)"
+        "mov $0, %%eax\n\
+         xchg %%eax, (%0)"
         :
         : "r"(lock_addr)
         : "eax", "memory");
 }
 
-static inline uint32_t _save_interrupt_state()
+static inline lock_context_t _save_interrupt_state()
 {
-    uint32_t retval;
+    lock_context_t retval;
     asm volatile(
-        "pushfl\n\t"
-        "popl %0\n\t"
+        "pushf\n\t"
+        "pop %0\n\t"
         "cli"
         : "=g"(retval)
         :
@@ -44,13 +44,13 @@ static inline uint32_t _save_interrupt_state()
     return retval;
 }
 
-static inline void _restore_interrupt_state(uint32_t flags)
+static inline void _restore_interrupt_state(lock_context_t context)
 {
     asm volatile(
-        "pushl %0\n\t"
-        "popfl"
+        "push %0\n\t"
+        "popf"
         :
-        : "g"(flags)
+        : "g"(context)
         :
         );
 }
@@ -90,7 +90,7 @@ void spin_unlock(spinlock_t& lock)
     preempt_enable();
 }
 
-uint32_t spin_lock_irqsave(spinlock_t& lock)
+lock_context_t spin_lock_irqsave(spinlock_t& lock)
 {
     auto state = _save_interrupt_state();
     preempt_disable();
@@ -100,7 +100,7 @@ uint32_t spin_lock_irqsave(spinlock_t& lock)
     return state;
 }
 
-void spin_unlock_irqrestore(spinlock_t& lock, uint32_t state)
+void spin_unlock_irqrestore(spinlock_t& lock, lock_context_t state)
 {
     _raw_spin_unlock(&lock);
     preempt_enable();
@@ -122,12 +122,12 @@ void mutex::unlock()
     spin_unlock(m_lock);
 }
 
-uint32_t mutex::lock_irq()
+lock_context_t mutex::lock_irq()
 {
     return spin_lock_irqsave(m_lock);
 }
 
-void mutex::unlock_irq(uint32_t state)
+void mutex::unlock_irq(lock_context_t state)
 {
     spin_unlock_irqrestore(m_lock, state);
 }

+ 43 - 52
src/kernel/hw/ahci.cc

@@ -2,14 +2,13 @@
 #include <cstddef>
 #include <algorithm>
 
-#include <kernel/vfs.hpp>
-#include <kernel/log.hpp>
-#include <kernel/mm.hpp>
-#include <kernel/module.hpp>
 #include <kernel/hw/pci.hpp>
 #include <kernel/irq.hpp>
-
-#include <types/size.h>
+#include <kernel/log.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
+#include <kernel/module.hpp>
+#include <kernel/vfs.hpp>
 
 #include <stdint.h>
 #include <errno.h>
@@ -21,6 +20,9 @@
 
 using namespace kernel::module;
 using namespace kernel::hw::pci;
+using namespace kernel::mem::paging;
+
+using kernel::mem::physaddr;
 
 constexpr uint32_t MAX_SPINS = 100000;
 
@@ -40,11 +42,8 @@ constexpr uint32_t PORT_CMD_CR = 0x00008000;
 namespace ahci {
 
 typedef volatile struct hba_port_t {
-    uint32_t command_list_base;
-    uint32_t command_list_base_upper;
-
-    uint32_t fis_base;
-    uint32_t fis_base_upper;
+    uint64_t command_list_base;
+    uint64_t fis_base;
 
     uint32_t interrupt_status;
     uint32_t interrupt_enable;
@@ -102,8 +101,7 @@ struct command_header {
 
     uint32_t volatile bytes_transferred;
 
-    uint32_t command_table_base;
-    uint32_t command_table_base_upper;
+    uint64_t command_table_base;
 
     uint32_t reserved1[4];
 };
@@ -220,8 +218,7 @@ struct received_fis {
 };
 
 struct prdt_entry {
-    uint32_t data_base;
-    uint32_t data_base_upper;
+    uint64_t data_base;
 
     uint32_t reserved0;
 
@@ -291,13 +288,12 @@ struct quick_queue {
 struct ahci_port {
 private:
     // quick_queue<32> qu;
-    page_t page;
+    physaddr<command_header, false> cmd_header;
     hba_port* port;
-    command_header* cmd_header { };
     received_fis* fis { };
     std::size_t sectors { -1U };
 
-    int send_command(char* buf, uint64_t lba, uint32_t count, uint8_t cmd, bool write)
+    int send_command(physaddr<void> buf, uint64_t lba, uint32_t count, uint8_t cmd, bool write)
     {
         // count must be a multiple of 512
         if (count & (512 - 1))
@@ -307,9 +303,10 @@ private:
         int n = 0;
         // auto n = qu.pop();
 
-        // for now, we read 3.5KB at most at a time
         // command fis and prdt will take up the lower 128+Bytes
-        auto cmdtable_page = __alloc_raw_page();
+        // TODO: buffer array
+        pfn_t command_table_pfn = page_to_pfn(alloc_page());
+        physaddr<command_table, false> cmdtable{command_table_pfn};
 
         // construct command header
         memset(cmd_header + n, 0x00, sizeof(command_header));
@@ -318,9 +315,8 @@ private:
 
         cmd_header[n].write = write;
         cmd_header[n].prdt_length = 1;
-        cmd_header[n].command_table_base = cmdtable_page << 12;
+        cmd_header[n].command_table_base = cmdtable.phys();
 
-        auto* cmdtable = (command_table*)kernel::pmap(cmdtable_page);
         memset(cmdtable, 0x00, sizeof(command_table) + sizeof(prdt_entry));
 
         // first, set up command fis
@@ -340,7 +336,7 @@ private:
 
         // fill in prdt
         auto* pprdt = cmdtable->prdt;
-        pprdt->data_base = (cmdtable_page << 12) + 512;
+        pprdt->data_base = buf.phys();
         pprdt->byte_count = count;
         pprdt->interrupt = 1;
 
@@ -359,17 +355,17 @@ private:
         SPIN(port->command_issue & (1 << n), spins)
             return -1;
 
-        memcpy(buf, (char*)cmdtable + 512, count);
-
-        kernel::pfree(cmdtable_page);
-        __free_raw_page(cmdtable_page);
+        free_page(command_table_pfn);
         return 0;
     }
 
     int identify()
     {
-        char buf[512];
-        int ret = send_command(buf, 0, 512, 0xEC, false);
+        pfn_t buffer_page = page_to_pfn(alloc_page());
+        int ret = send_command(physaddr<void>{buffer_page},
+                0, 512, 0xEC, false);
+
+        free_page(buffer_page);
         if (ret != 0)
             return -1;
         return 0;
@@ -377,40 +373,43 @@ private:
 
 public:
     explicit ahci_port(hba_port* port)
-        : page(__alloc_raw_page()), port(port) { }
+        : cmd_header{page_to_pfn(alloc_page())}, port(port) { }
 
     ~ahci_port()
     {
         if (!cmd_header)
             return;
-        kernel::pfree(page);
-        __free_raw_page(page);
+        free_page(cmd_header.phys());
     }
 
     ssize_t read(char* buf, std::size_t buf_size, std::size_t offset, std::size_t cnt)
     {
         cnt = std::min(buf_size, cnt);
 
-        constexpr size_t READ_BUF_SECTORS = 6;
+        pfn_t buffer_page = page_to_pfn(alloc_page());
+        physaddr<void> buffer_ptr{buffer_page};
 
-        char b[READ_BUF_SECTORS * 512] {};
         char* orig_buf = buf;
         size_t start = offset / 512;
         size_t end = std::min((offset + cnt + 511) / 512, sectors);
 
         offset -= start * 512;
-        for (size_t i = start; i < end; i += READ_BUF_SECTORS) {
-            size_t n_read = std::min(end - i, READ_BUF_SECTORS) * 512;
-            int status = send_command(b, i, n_read, 0xC8, false);
-            if (status != 0)
+        for (size_t i = start; i < end; i += 4096UL / 512) {
+            size_t n_read = std::min(end - i, 4096UL / 512) * 512;
+            int status = send_command(buffer_ptr, i, n_read, 0xC8, false);
+            if (status != 0) {
+                free_page(buffer_page);
                 return -EIO;
+            }
 
             size_t to_copy = std::min(cnt, n_read - offset);
-            memcpy(buf, b + offset, to_copy);
+            memcpy(buf, (std::byte*)(void*)buffer_ptr + offset, to_copy);
             offset = 0;
             buf += to_copy;
             cnt -= to_copy;
         }
+
+        free_page(buffer_page);
         return buf - orig_buf;
     }
 
@@ -425,13 +424,9 @@ public:
         //
         // port->interrupt_enable = 1;
 
-        port->command_list_base = page << 12;
-        port->command_list_base_upper = 0;
-
-        port->fis_base = (page << 12) + 0x400;
-        port->fis_base_upper = 0;
+        port->command_list_base = cmd_header.phys();
+        port->fis_base = cmd_header.phys() + 0x400;
 
-        cmd_header = (command_header*)kernel::pmap(page, false);
         fis = (received_fis*)(cmd_header + 1);
 
         if (start_command(port) != 0)
@@ -455,9 +450,6 @@ public:
     ~ahci_module()
     {
         // TODO: release PCI device
-        if (ghc)
-            kernel::pfree(dev->reg[PCI_REG_ABAR] >> 12);
-
         for (auto& item : ports) {
             if (!item)
                 continue;
@@ -481,7 +473,7 @@ public:
             auto* port = new ahci_port(ghc_port);
             if (port->init() != 0) {
                 delete port;
-                kmsg("An error occurred while configuring an ahci port\n");
+                kmsg("An error occurred while configuring an ahci port");
                 continue;
             }
 
@@ -506,10 +498,9 @@ public:
         auto ret = kernel::hw::pci::register_driver(VENDOR_INTEL, DEVICE_AHCI,
             [this](pci_device* dev) -> int {
                 this->dev = dev;
-                uint32_t abar_address = dev->reg[PCI_REG_ABAR];
 
-                void* base = kernel::pmap(abar_address >> 12, false);
-                this->ghc = (hba_ghc*)base;
+                physaddr<hba_ghc, false> pp_base{dev->reg[PCI_REG_ABAR]};
+                this->ghc = pp_base;
 
                 this->ghc->global_host_control =
                     this->ghc->global_host_control | 2; // set interrupt enable

+ 0 - 31
src/kernel/hw/keyboard.cpp

@@ -1,31 +0,0 @@
-#include <asm/port_io.h>
-#include <kernel/hw/keyboard.h>
-#include <kernel/input/input_event.h>
-
-extern "C" void
-handle_keyboard_interrupt(void)
-{
-    input_event evt {
-        .type = input_event::input_event_type::KEYBOARD,
-        .code = KEY_DOWN,
-        .data = 0
-    };
-
-    uint8_t keycode = asm_inb(PORT_KEYDATA);
-    if (keycode >= 0xd8) {
-        // TODO: report not_supported event
-        return;
-    }
-
-    // key release
-    if (keycode >= 0x80) {
-        evt.code = KEY_UP;
-        keycode -= 0x80;
-    }
-
-    evt.data = keycode;
-
-    // TODO: fix it
-    // commit_input_event(&evt);
-    (void)evt;
-}

+ 124 - 0
src/kernel/hw/serial.cc

@@ -0,0 +1,124 @@
+#include <errno.h>
+#include <stdio.h>
+
+#include <kernel/hw/port.hpp>
+#include <kernel/irq.hpp>
+#include <kernel/log.hpp>
+#include <kernel/module.hpp>
+#include <kernel/tty.hpp>
+
+using namespace kernel::tty;
+using namespace kernel::hw;
+
+constexpr int PORT0 = 0x3f8;
+constexpr int PORT1 = 0x2f8;
+
+using port_group = const p8[6];
+
+constexpr p8 port0[] = {
+    p8{PORT0+0},
+    p8{PORT0+1},
+    p8{PORT0+2},
+    p8{PORT0+3},
+    p8{PORT0+4},
+    p8{PORT0+5},
+};
+
+constexpr p8 port1[] = {
+    p8{PORT1+0},
+    p8{PORT1+1},
+    p8{PORT1+2},
+    p8{PORT1+3},
+    p8{PORT1+4},
+    p8{PORT1+5},
+};
+
+// Receive handler for the first serial port: while bit 0 of the register at
+// PORT0+5 is set, read a byte from PORT0+0 and hand it to the console tty.
+static void _serial0_receive_data_interrupt()
+{
+    while (*port0[5] & 1)
+        console->commit_char(*port0[0]);
+}
+
+// Receive handler for the second serial port: while bit 0 of the register at
+// PORT1+5 is set, read a byte from PORT1+0 and hand it to the console tty.
+static void _serial1_receive_data_interrupt()
+{
+    while (*port1[5] & 1)
+        console->commit_char(*port1[0]);
+}
+
+// Configure one serial port and verify it with a loopback test.
+//
+// ports: the six consecutive I/O ports of the device (base+0 .. base+5).
+// Returns 0 on success, -EIO when the byte read back in loopback mode differs
+// from the one sent.
+static inline int _init_port(port_group ports)
+{
+    // taken from osdev.org
+
+    ports[1] = 0x00; // Disable all interrupts
+    ports[3] = 0x80; // Enable DLAB (set baud rate divisor)
+    // TODO: set baud rate
+    ports[0] = 0x00; // Set divisor to 0 -3- (lo byte) 115200 -38400- baud
+    ports[1] = 0x00; //                  (hi byte)
+    ports[3] = 0x03; // 8 bits, no parity, one stop bit
+    ports[2] = 0xC7; // Enable FIFO, clear them, with 14-byte threshold
+    // TODO: IRQ disabled
+    ports[4] = 0x0B; // IRQs enabled, RTS/DSR set
+    ports[4] = 0x1E; // Set in loopback mode, test the serial chip
+    ports[0] = 0xAE; // Test serial chip (send byte 0xAE and check if serial returns same byte)
+
+    // Check if serial is faulty (i.e: not same byte as sent)
+    if (*ports[0] != 0xAE)
+        return -EIO;
+
+    // If serial is not faulty set it in normal operation mode
+    // (not-loopback with IRQs enabled and OUT#1 and OUT#2 bits enabled)
+    ports[4] = 0x0F;
+
+    ports[1] = 0x01; // Enable interrupts #0: Received Data Available
+
+    return 0;
+}
+
+// A tty that sends its output to one serial port.
+class serial_tty : public virtual tty {
+    // the port group this tty transmits on (base+0 .. base+5)
+    const p8* ports;
+
+public:
+    // The tty name is "ttyS" followed by the digit for `id`.
+    serial_tty(port_group ports, int id)
+        : tty{"ttyS"}, ports(ports)
+    {
+        name += '0'+id;
+    }
+
+    // Busy-wait until bit 0x20 of the register at base+5 is set, then write
+    // `c` to the register at base+0.
+    virtual void putchar(char c) override
+    {
+        while (!(*ports[5] & 0x20))
+            ; // nop
+        ports[0] = c;
+    }
+};
+
+// Kernel module that probes both serial ports and exposes every port that
+// passes _init_port() as a tty.
+class serial_module : public virtual kernel::module::module {
+public:
+    serial_module() : module("serial-tty") { }
+
+    // Initialize port0 and port1 independently. For each port that comes up,
+    // create its tty, install the receive handler (number 4 for port0, 3 for
+    // port1) and register the tty. A failing port is skipped; a failed tty
+    // registration is only logged.
+    //
+    // Always returns MODULE_SUCCESS.
+    virtual int init() override
+    {
+        if (int ret = _init_port(port0); ret == 0) {
+            auto* dev = new serial_tty(port0, 0);
+            kernel::irq::register_handler(4, _serial0_receive_data_interrupt);
+
+            if (int ret = register_tty(dev); ret != 0)
+                kmsg("[serial] cannot register ttyS0");
+        }
+
+        if (int ret = _init_port(port1); ret == 0) {
+            // id 1, not 0: otherwise this tty would also be named "ttyS0"
+            // and clash with the first port
+            auto* dev = new serial_tty(port1, 1);
+            kernel::irq::register_handler(3, _serial1_receive_data_interrupt);
+
+            if (int ret = register_tty(dev); ret != 0)
+                kmsg("[serial] cannot register ttyS1");
+        }
+
+        return kernel::module::MODULE_SUCCESS;
+    }
+
+};
+
+kernel::module::module* serial_module_init()
+{ return new serial_module(); }
+INTERNAL_MODULE(serial_module_loader, serial_module_init);

+ 0 - 71
src/kernel/hw/serial.cpp

@@ -1,71 +0,0 @@
-#include <asm/port_io.h>
-#include <kernel/hw/serial.h>
-#include <kernel/irq.hpp>
-#include <kernel/tty.hpp>
-#include <stdio.h>
-#include <types/status.h>
-
-static void serial_receive_data_interrupt(void)
-{
-    while (is_serial_has_data(PORT_SERIAL0)) {
-        uint8_t data = serial_read_data(PORT_SERIAL0);
-        console->commit_char(data);
-    }
-}
-
-SECTION(".text.kinit")
-int32_t init_serial_port(port_id_t port)
-{
-    // taken from osdev.org
-
-    asm_outb(port + 1, 0x00); // Disable all interrupts
-    asm_outb(port + 3, 0x80); // Enable DLAB (set baud rate divisor)
-    // TODO: set baud rate
-    asm_outb(port + 0, 0x00); // Set divisor to 0 -3- (lo byte) 115200 -38400- baud
-    asm_outb(port + 1, 0x00); //                  (hi byte)
-    asm_outb(port + 3, 0x03); // 8 bits, no parity, one stop bit
-    asm_outb(port + 2, 0xC7); // Enable FIFO, clear them, with 14-byte threshold
-    // TODO: IRQ disabled
-    asm_outb(port + 4, 0x0B); // IRQs enabled, RTS/DSR set
-    asm_outb(port + 4, 0x1E); // Set in loopback mode, test the serial chip
-    asm_outb(port + 0, 0xAE); // Test serial chip (send byte 0xAE and check if serial returns same byte)
-
-    // Check if serial is faulty (i.e: not same byte as sent)
-    if (asm_inb(port + 0) != 0xAE) {
-        return GB_FAILED;
-    }
-
-    // If serial is not faulty set it in normal operation mode
-    // (not-loopback with IRQs enabled and OUT#1 and OUT#2 bits enabled)
-    asm_outb(port + 4, 0x0F);
-
-    asm_outb(port + 1, 0x01); // Enable interrupts #0: Received Data Available
-
-    kernel::irq::register_handler(4, serial_receive_data_interrupt);
-
-    return GB_OK;
-}
-
-int32_t is_serial_has_data(port_id_t port)
-{
-    return asm_inb(port + 5) & 1;
-}
-
-uint8_t serial_read_data(port_id_t port)
-{
-    while (is_serial_has_data(port) == 0)
-        ;
-    return asm_inb(port);
-}
-
-int32_t is_serial_ready_for_transmition(port_id_t port)
-{
-    return asm_inb(port + 5) & 0x20;
-}
-
-void serial_send_data(port_id_t port, uint8_t data)
-{
-    while (is_serial_ready_for_transmition(port) == 0)
-        ;
-    return asm_outb(port, data);
-}

+ 0 - 26
src/kernel/hw/timer.c

@@ -1,26 +0,0 @@
-#include <asm/port_io.h>
-#include <kernel/hw/timer.h>
-
-static size_t _current_ticks = 0;
-
-SECTION(".text.kinit")
-void init_pit(void)
-{
-    // set interval
-    asm_outb(PORT_PIT_CONTROL, 0x34);
-
-    // send interval number
-    // 0x2e9a = 11930 = 100Hz
-    asm_outb(PORT_PIT_COUNT, 0x9a);
-    asm_outb(PORT_PIT_COUNT, 0x2e);
-}
-
-void inc_tick(void)
-{
-    ++_current_ticks;
-}
-
-size_t current_ticks(void)
-{
-    return _current_ticks;
-}

+ 31 - 0
src/kernel/hw/timer.cc

@@ -0,0 +1,31 @@
+#include <types/types.h>
+
+#include <kernel/hw/port.hpp>
+#include <kernel/hw/timer.hpp>
+
+constexpr kernel::hw::p8 port_control(0x43);
+constexpr kernel::hw::p8 port_count(0x40);
+
+static std::size_t _current_ticks = 0;
+
+SECTION(".text.kinit")
+void kernel::hw::timer::init_pit(void)
+{
+    // interval number to program
+    // 0x2e9a = 11930 = 100Hz
+    constexpr int reload = 0x2e9a;
+
+    // set interval
+    port_control = 0x34;
+
+    // send interval number, low byte first and high byte second
+    port_count = reload & 0xff;
+    port_count = (reload >> 8) & 0xff;
+}
+
+void kernel::hw::timer::inc_tick(void)
+{
+    // advance the file-local tick counter by one
+    _current_ticks += 1;
+}
+
+size_t kernel::hw::timer::current_ticks(void)
+{
+    // snapshot of the counter that inc_tick() advances
+    const auto ticks = _current_ticks;
+    return ticks;
+}

+ 100 - 268
src/kernel/interrupt.cpp

@@ -5,325 +5,157 @@
 #include <stdint.h>
 #include <stdio.h>
 
-#include <types/size.h>
 #include <types/types.h>
 
-#include <asm/port_io.h>
-#include <kernel/hw/keyboard.h>
-#include <kernel/hw/serial.h>
-#include <kernel/hw/timer.h>
-#include <kernel/interrupt.h>
+#include <kernel/hw/port.hpp>
+#include <kernel/hw/timer.hpp>
+#include <kernel/interrupt.hpp>
 #include <kernel/irq.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
 #include <kernel/process.hpp>
+#include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
-#include <kernel/vga.hpp>
 
-struct IDT_entry {
-    uint16_t offset_low;
-    uint16_t selector;
-    uint8_t zero;
-    uint8_t type_attr;
-    uint16_t offset_high;
-};
+#define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
+#define USER_INTERRUPT_GATE_TYPE (0xee)
 
-// interrupt stubs
-extern "C" void irq0(); extern "C" void irq1(); extern "C" void irq2();
-extern "C" void irq3(); extern "C" void irq4(); extern "C" void irq5();
-extern "C" void irq6(); extern "C" void irq7(); extern "C" void irq8();
-extern "C" void irq9(); extern "C" void irq10(); extern "C" void irq11();
-extern "C" void irq12(); extern "C" void irq13(); extern "C" void irq14();
-extern "C" void irq15(); extern "C" void int6(); extern "C" void int8();
-extern "C" void int13(); extern "C" void int14();
-extern "C" void syscall_stub();
+constexpr kernel::hw::p8 port_pic1_command{0x20};
+constexpr kernel::hw::p8 port_pic1_data{0x21};
+constexpr kernel::hw::p8 port_pic2_command{0xa0};
+constexpr kernel::hw::p8 port_pic2_data{0xa1};
 
-#define SET_UP_IRQ(N, SELECTOR)                   \
-    ptr_t addr_irq##N = (ptr_t)irq##N;            \
-    set_idt_entry(IDT, 0x20 + (N), (addr_irq##N), \
-        (SELECTOR), KERNEL_INTERRUPT_GATE_TYPE);
+struct IDT_entry {
+    uint16_t offset_low;
+    uint16_t segment;
 
-#define SET_IDT_ENTRY_FN(N, FUNC_NAME, SELECTOR, TYPE) \
-    ptr_t addr_##FUNC_NAME = (ptr_t)FUNC_NAME;         \
-    set_idt_entry(IDT, (N), (addr_##FUNC_NAME), (SELECTOR), (TYPE));
+    uint8_t IST;
+    uint8_t attributes;
 
-SECTION(".text.kinit")
-static void set_idt_entry(IDT_entry (&idt)[256], int n,
-    uintptr_t offset, uint16_t selector, uint8_t type)
-{
-    idt[n].offset_low = offset & 0xffff;
-    idt[n].selector = selector;
-    idt[n].zero = 0;
-    idt[n].type_attr = type;
-    idt[n].offset_high = (offset >> 16) & 0xffff;
-}
-
-// idt_descriptor: uint16_t[3]
-// [0] bit 0 :15 => limit
-// [1] bit 16:47 => address
-extern "C" void asm_load_idt(uint16_t idt_descriptor[3], int sti);
+    uint16_t offset_mid;
+    uint32_t offset_high;
+    uint32_t reserved;
+};
 
 static struct IDT_entry IDT[256];
 
-static inline void NORETURN die(regs_32& regs, ptr_t eip)
-{
-    char buf[512] = {};
-    snprintf(
-        buf, sizeof(buf),
-        "***** KERNEL PANIC *****\n"
-        "eax: %x, ebx: %x, ecx: %x, edx: %x\n"
-        "esp: %x, ebp: %x, esi: %x, edi: %x\n"
-        "eip: %x\n",
-        regs.eax, regs.ebx, regs.ecx,
-        regs.edx, regs.esp, regs.ebp,
-        regs.esi, regs.edi, eip);
-    kmsg(buf);
-    freeze();
-}
+extern "C" uintptr_t ISR_START_ADDR;
 
 SECTION(".text.kinit")
-void init_idt()
+static inline void set_idt_entry(IDT_entry (&idt)[256], int n,
+    uintptr_t offset, uint16_t selector, uint8_t type)
 {
-    asm_cli();
-
-    memset(IDT, 0x00, sizeof(IDT));
-
-    // invalid opcode
-    SET_IDT_ENTRY_FN(6, int6, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // double fault
-    SET_IDT_ENTRY_FN(8, int8, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // general protection
-    SET_IDT_ENTRY_FN(13, int13, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // page fault
-    SET_IDT_ENTRY_FN(14, int14, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
-    // system call
-    SET_IDT_ENTRY_FN(0x80, syscall_stub, 0x08, USER_INTERRUPT_GATE_TYPE);
-
-    uint16_t idt_descriptor[3];
-    idt_descriptor[0] = sizeof(struct IDT_entry) * 256;
-    *((uint32_t*)(idt_descriptor + 1)) = (ptr_t)IDT;
-
-    asm_load_idt(idt_descriptor, 0);
+    idt[n].offset_low = offset & 0xffff;
+    idt[n].segment = selector;
+    idt[n].IST = 0;
+    idt[n].attributes = type;
+    idt[n].offset_mid = (offset >> 16) & 0xffff;
+    idt[n].offset_high = (offset >> 32) & 0xffffffff;
+    idt[n].reserved = 0;
 }
 
 using kernel::irq::irq_handler_t;
 static std::vector<std::list<irq_handler_t>> s_irq_handlers;
 
-void kernel::irq::register_handler(int irqno, irq_handler_t handler)
-{
-    s_irq_handlers[irqno].emplace_back(std::move(handler));
-}
-
 SECTION(".text.kinit")
-void init_pic(void)
+void kernel::kinit::init_interrupt()
 {
-    asm_cli();
+    // Set up interrupt delivery: fill and load the IDT, create the IRQ
+    // handler lists, register the timer tick handler and program both PICs.
+
+    // Vectors 0x00-0x2f get kernel-only gates and 0x80 gets a gate usable
+    // from user mode. The entry point of vector n is taken to be
+    // ISR_START_ADDR + 8 * n; 0x08 is the kernel code segment selector.
+    for (int i = 0; i < 0x30; ++i)
+        set_idt_entry(IDT, i, ISR_START_ADDR+8*i, 0x08, KERNEL_INTERRUPT_GATE_TYPE);
+    set_idt_entry(IDT, 0x80, ISR_START_ADDR+8*0x80, 0x08, USER_INTERRUPT_GATE_TYPE);
 
+    // Operand for lidt: a 2-byte limit immediately followed by an 8-byte
+    // base. Keeping the limit in the top 16 bits of element 0 places it at
+    // byte offset 6, directly in front of the base stored in element 1.
+    uint64_t idt_descriptor[2];
+    // the limit is the offset of the last valid byte, i.e. table size - 1
+    idt_descriptor[0] = (uint64_t)(sizeof(IDT) - 1) << 48;
+    idt_descriptor[1] = (uintptr_t)IDT;
+
+    // load the IDT; the "memory" clobber tells the compiler that the asm
+    // reads idt_descriptor, so the stores above can not be dropped or moved
+    asm volatile("lidt (%0)": :"r"((uintptr_t)idt_descriptor + 6): "memory");
+
+    // one handler list per IRQ line
     s_irq_handlers.resize(16);
 
     // TODO: move this to timer driver
     kernel::irq::register_handler(0, []() {
-        inc_tick();
+        kernel::hw::timer::inc_tick();
         schedule();
     });
 
-    asm_outb(PORT_PIC1_COMMAND, 0x11); // edge trigger mode
-    asm_outb(PORT_PIC1_DATA, 0x20); // start from int 0x20
-    asm_outb(PORT_PIC1_DATA, 0x04); // PIC1 is connected to IRQ2 (1 << 2)
-    asm_outb(PORT_PIC1_DATA, 0x01); // no buffer mode
+    // initialize PIC
+    port_pic1_command = 0x11; // edge trigger mode
+    port_pic1_data = 0x20;    // start from int 0x20
+    port_pic1_data = 0x04;    // PIC1 is connected to IRQ2 (1 << 2)
+    port_pic1_data = 0x01;    // no buffer mode
 
-    asm_outb(PORT_PIC2_COMMAND, 0x11); // edge trigger mode
-    asm_outb(PORT_PIC2_DATA, 0x28); // start from int 0x28
-    asm_outb(PORT_PIC2_DATA, 0x02); // connected to IRQ2
-    asm_outb(PORT_PIC2_DATA, 0x01); // no buffer mode
+    port_pic2_command = 0x11; // edge trigger mode
+    port_pic2_data = 0x28;    // start from int 0x28
+    port_pic2_data = 0x02;    // connected to IRQ2
+    port_pic2_data = 0x01;    // no buffer mode
 
     // allow all the interrupts
-    asm_outb(PORT_PIC1_DATA, 0x00);
-    asm_outb(PORT_PIC2_DATA, 0x00);
-
-    // 0x08 stands for kernel code segment
-    SET_UP_IRQ(0, 0x08);
-    SET_UP_IRQ(1, 0x08);
-    SET_UP_IRQ(2, 0x08);
-    SET_UP_IRQ(3, 0x08);
-    SET_UP_IRQ(4, 0x08);
-    SET_UP_IRQ(5, 0x08);
-    SET_UP_IRQ(6, 0x08);
-    SET_UP_IRQ(7, 0x08);
-    SET_UP_IRQ(8, 0x08);
-    SET_UP_IRQ(9, 0x08);
-    SET_UP_IRQ(10, 0x08);
-    SET_UP_IRQ(11, 0x08);
-    SET_UP_IRQ(12, 0x08);
-    SET_UP_IRQ(13, 0x08);
-    SET_UP_IRQ(14, 0x08);
-    SET_UP_IRQ(15, 0x08);
+    port_pic1_data = 0x00;
+    port_pic2_data = 0x00;
 }
 
-extern "C" void int6_handler(
-    regs_32 s_regs,
-    ptr_t eip,
-    uint16_t cs,
-    uint32_t eflags)
+void kernel::irq::register_handler(int irqno, irq_handler_t handler)
 {
-    if (!current_process->attr.system)
-        kill_current(SIGSEGV);
-
-    char buf[128];
-    snprintf(buf, sizeof(buf),
-        "[kernel] int6 data: cs: %x, eflags: %x\n", cs, eflags);
-    kmsg(buf);
-
-    die(s_regs, eip);
+    s_irq_handlers[irqno].emplace_back(std::move(handler));
 }
 
-// general protection
-extern "C" void int13_handler(
-    struct regs_32 s_regs,
-    uint32_t error_code,
-    ptr_t eip,
-    uint16_t cs,
-    uint32_t eflags)
+// Handle a CPU exception (vector below 0x20) using the frame that carries an
+// error code. A page fault is forwarded to the paging code and then returns;
+// the other listed faults kill the current process unless it is a system
+// process. Whatever is left over can not be recovered and freezes the kernel.
+static inline void fault_handler(
+        interrupt_stack_with_code* context,
+        mmx_registers*)
 {
-    if (!current_process->attr.system)
-        kill_current(SIGILL);
-
-    char buf[128] = {};
-    snprintf(buf, sizeof(buf),
-        "[kernel] int13 data: error_code: %x, cs: %x, eflags: %x\n",
-        error_code, cs, eflags);
-    kmsg(buf);
-
-    die(s_regs, eip);
-}
-
-struct PACKED int14_data {
-    void* l_addr;
-    struct regs_32 s_regs;
-    struct page_fault_error_code error_code;
-    void* v_eip;
-    uint32_t cs;
-    uint32_t eflags;
-};
+    switch (context->head.int_no) {
+    case 6: { // invalid opcode: the process ran an illegal instruction
+        if (!current_process->attr.system)
+            kill_current(SIGILL); // noreturn
+    } break;
+    case 8:    // double fault
+    case 13: { // general protection: reported as an invalid access
+        if (!current_process->attr.system)
+            kill_current(SIGSEGV); // noreturn
+    } break;
+    case 14: { // page fault
+        kernel::mem::paging::handle_page_fault(context->error_code);
+        // int_no is overwritten with the address 0x88 bytes past the frame
+        // start; presumably the assembly stub consumes it on return
+        // (TODO confirm against the stub)
+        context->head.int_no = (unsigned long)context + 0x88;
+        return;
+    }
+    }
 
-static inline void _int14_panic(void* eip, void* cr2, struct page_fault_error_code error_code)
-{
-    char buf[128] = {};
-    snprintf(buf, sizeof(buf),
-        "[kernel] int14 data: eip: %p, cr2: %p, error_code: %x\n"
-        "[kernel] freezing...\n",
-        eip, cr2, error_code);
-    kmsg(buf);
+    // fault can not be resolved
     freeze();
 }
 
-static inline void NORETURN _int14_kill_user(void)
-{
-    kill_current(SIGSEGV);
-}
-
-// page fault
-extern "C" void int14_handler(int14_data* d)
+static inline void irq_handler(
+        interrupt_stack_normal* context,
+        mmx_registers*)
 {
-    kernel::memory::mm_list* mms = nullptr;
-    if (current_process) [[likely]]
-        mms = &current_process->mms;
-    else
-        mms = kernel::memory::mm_list::s_kernel_mms;
-
-    auto* mm_area = mms->find(d->l_addr);
-    if (!mm_area) [[unlikely]] {
-        if (d->error_code.user) {
-            // user access of address that does not exist
-            _int14_kill_user();
-        } else {
-            _int14_panic(d->v_eip, d->l_addr, d->error_code);
-        }
-    }
-    if (d->error_code.user && mm_area->attr.system)
-        _int14_kill_user();
-
-    page* page = &(*mm_area->pgs)[vptrdiff(d->l_addr, mm_area->start) / PAGE_SIZE];
-    kernel::paccess pa(page->pg_pteidx >> 12);
-    auto pt = (pt_t)pa.ptr();
-    assert(pt);
-    pte_t* pte = *pt + (page->pg_pteidx & 0xfff);
-
-    if (unlikely(d->error_code.present == 0 && !mm_area->mapped_file))
-        _int14_panic(d->v_eip, d->l_addr, d->error_code);
-
-    if (page->attr & PAGE_COW) {
-        // if it is a dying page
-        if (*page->ref_count == 1) {
-            page->attr &= ~PAGE_COW;
-            pte->in.p = 1;
-            pte->in.a = 0;
-            pte->in.rw = mm_area->attr.write;
-            return;
-        }
-        // duplicate the page
-        page_t new_page = __alloc_raw_page();
-
-        {
-            kernel::paccess pdst(new_page), psrc(page->phys_page_id);
-            auto* new_page_data = (char*)pdst.ptr();
-            auto* src = psrc.ptr();
-            assert(new_page_data && src);
-            memcpy(new_page_data, src, PAGE_SIZE);
-        }
-
-        pte->in.page = new_page;
-        pte->in.rw = mm_area->attr.write;
-        pte->in.a = 0;
-
-        --*page->ref_count;
+    int irqno = context->head.int_no - 0x20;
 
-        page->ref_count = types::memory::kinew<size_t>(1);
-        page->attr &= ~PAGE_COW;
-        page->phys_page_id = new_page;
-    }
-
-    if (page->attr & PAGE_MMAP) {
-        pte->in.p = 1;
-
-        size_t offset = align_down<12>((uint32_t)d->l_addr);
-        offset -= (uint32_t)mm_area->start;
-
-        kernel::paccess pa(page->phys_page_id);
-        auto* data = (char*)pa.ptr();
-        assert(data);
-
-        int n = vfs_read(
-            mm_area->mapped_file,
-            data,
-            PAGE_SIZE,
-            mm_area->file_offset + offset,
-            PAGE_SIZE);
-
-        // TODO: send SIGBUS if offset is greater than real size
-        if (n != PAGE_SIZE)
-            memset(data + n, 0x00, PAGE_SIZE - n);
-
-        page->attr &= ~PAGE_MMAP;
-    }
-}
+    constexpr uint8_t PIC_EOI = 0x20;
 
-extern "C" void irq_handler(
-    int irqno,
-    interrupt_stack* context,
-    mmx_registers* mmxregs)
-{
-    asm_outb(PORT_PIC1_COMMAND, PIC_EOI);
+    port_pic1_command = PIC_EOI;
     if (irqno >= 8)
-        asm_outb(PORT_PIC2_COMMAND, PIC_EOI);
+        port_pic2_command = PIC_EOI;
 
     for (const auto& handler : s_irq_handlers[irqno])
         handler();
+}
 
-    if (context->cs != USER_CODE_SEGMENT)
-        return;
-
-    if (current_thread->signals.pending_signal())
-        current_thread->signals.handle(context, mmxregs);
+extern "C" void interrupt_handler(
+        interrupt_stack_head* context,
+        mmx_registers* mmxregs)
+{
+    // Vectors below 0x20 are faults. fault_handler works on the frame on its
+    // own (int_no included), so nothing more is done for them here.
+    if (context->int_no < 0x20) {
+        fault_handler((interrupt_stack_with_code*)context, mmxregs);
+        return;
+    }
+
+    // Everything else goes through the interrupt_stack_normal view of the
+    // same frame.
+    auto* frame = (interrupt_stack_normal*)context;
+
+    if (context->int_no == 0x80) // syscall by int 0x80
+        kernel::handle_syscall32(context->s_regs.rax, frame, mmxregs);
+    else
+        irq_handler(frame, mmxregs);
+
+    // both paths finish by storing the address 0x80 bytes past the start of
+    // the frame into int_no
+    context->int_no = (unsigned long)context + 0x80;
 }

+ 0 - 586
src/kernel/mem.cpp

@@ -1,586 +0,0 @@
-#include <cstddef>
-
-#include <asm/port_io.h>
-#include <asm/sys.h>
-#include <assert.h>
-#include <errno.h>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
-#include <kernel/process.hpp>
-#include <kernel/task.h>
-#include <kernel/vga.hpp>
-#include <stdint.h>
-#include <stdio.h>
-#include <types/allocator.hpp>
-#include <types/bitmap.hpp>
-#include <types/size.h>
-#include <types/status.h>
-
-// constant values
-
-#define EMPTY_PAGE ((page_t)0)
-
-// ---------------------
-
-static size_t mem_size;
-static uint8_t _mem_bitmap[1024 * 1024 / 8];
-static types::bitmap mem_bitmap(
-    [](unsigned char*, std::size_t){}, _mem_bitmap,
-    1024 * 1024);
-
-// global
-segment_descriptor gdt[7];
-
-uint8_t e820_mem_map[1024];
-uint32_t e820_mem_map_count;
-uint32_t e820_mem_map_entry_size;
-struct mem_size_info mem_size_info;
-
-constexpr void mark_addr_len(pptr_t start, size_t n)
-{
-    if (n == 0)
-        return;
-    page_t start_page = align_down<12>(start) >> 12;
-    page_t end_page = align_up<12>(start + n) >> 12;
-    for (page_t i = start_page; i < end_page; ++i)
-        mem_bitmap.set(i);
-}
-
-constexpr void free_addr_len(pptr_t start, size_t n)
-{
-    if (n == 0)
-        return;
-    page_t start_page = align_down<12>(start) >> 12;
-    page_t end_page = align_up<12>(start + n) >> 12;
-    for (page_t i = start_page; i < end_page; ++i)
-        mem_bitmap.clear(i);
-}
-
-constexpr void mark_addr_range(pptr_t start, pptr_t end)
-{
-    mark_addr_len(start, end - start);
-}
-
-constexpr void free_addr_range(pptr_t start, pptr_t end)
-{
-    free_addr_len(start, end - start);
-}
-
-page_t __alloc_raw_page(void)
-{
-    const auto size = mem_bitmap.size();
-    for (size_t i = 0; i < size; ++i) {
-        if (mem_bitmap.test(i) == 0) {
-            mem_bitmap.set(i);
-            return i;
-        }
-    }
-    return -1;
-}
-
-void __free_raw_page(page_t pg)
-{
-    mem_bitmap.clear(pg);
-}
-
-page allocate_page(void)
-{
-    return page {
-        .phys_page_id = __alloc_raw_page(),
-        .ref_count = types::memory::kinew<size_t>(0),
-        .pg_pteidx = 0,
-        .attr = 0,
-    };
-}
-
-void free_page(page* pg)
-{
-    if (*pg->ref_count == 1) {
-        types::memory::kidelete<size_t>(pg->ref_count);
-        __free_raw_page(pg->phys_page_id);
-    } else {
-        --*pg->ref_count;
-    }
-}
-
-void dealloc_pd(page_t pd)
-{
-    {
-        kernel::paccess pa(pd);
-        auto p_pd = (pd_t)pa.ptr();
-        assert(p_pd);
-        for (pde_t* ent = (*p_pd); ent < (*p_pd) + 768; ++ent) {
-            if (!ent->in.p)
-                continue;
-            __free_raw_page(ent->in.pt_page);
-        }
-    }
-    __free_raw_page(pd);
-}
-
-SECTION(".text.kinit")
-static inline void init_mem_layout(void)
-{
-    mem_size = 1024 * mem_size_info.n_1k_blks;
-    mem_size += 64 * 1024 * mem_size_info.n_64k_blks;
-
-    // mark empty page
-    mark_addr_range(0x00000000, 0x00001000);
-    // mark kernel page directory
-    mark_addr_range(0x00001000, 0x00002000);
-    // mark kernel page table
-    mark_addr_range(0x00002000, 0x00006000);
-    // mark kernel early stack
-    mark_addr_range(0x00006000, 0x00008000);
-    // mark EBDA and upper memory as allocated
-    mark_addr_range(0x80000, 0x100000);
-    extern char __stage1_start[];
-    extern char __kinit_end[];
-    extern char __text_start[];
-    extern char __data_end[];
-
-    constexpr pptr_t PHYS_BSS_START = 0x100000;
-    // mark .stage1 and .kinit
-    mark_addr_range((pptr_t)__stage1_start, (pptr_t)__kinit_end);
-    // mark kernel .text to .data
-    mark_addr_len((pptr_t)__kinit_end, __data_end - __text_start);
-    // mark kernel .bss
-    mark_addr_len(PHYS_BSS_START, bss_len);
-
-    if (e820_mem_map_entry_size == 20) {
-        struct e820_mem_map_entry_20* entry = (struct e820_mem_map_entry_20*)e820_mem_map;
-        for (uint32_t i = 0; i < e820_mem_map_count; ++i, ++entry) {
-            if (entry->type != 1) {
-                mark_addr_len(entry->base, entry->len);
-            }
-        }
-    } else {
-        struct e820_mem_map_entry_24* entry = (struct e820_mem_map_entry_24*)e820_mem_map;
-        for (uint32_t i = 0; i < e820_mem_map_count; ++i, ++entry) {
-            if (entry->in.type != 1) {
-                mark_addr_len(entry->in.base, entry->in.len);
-            }
-        }
-    }
-}
-
-using kernel::memory::mm_list;
-using kernel::memory::mm;
-
-mm_list::mm_list()
-    : m_areas(s_kernel_mms->m_areas)
-{
-    m_pd = __alloc_raw_page();
-    kernel::paccess pdst(m_pd), psrc(s_kernel_mms->m_pd);
-    auto* dst = pdst.ptr();
-    auto* src = psrc.ptr();
-    assert(dst && src);
-    memcpy(dst, src, PAGE_SIZE);
-}
-
-mm_list::mm_list(const mm_list& other)
-    : mm_list()
-{
-    m_brk = other.m_brk;
-    for (auto& src : other.m_areas) {
-        if (src.is_kernel_space() || src.attr.system)
-            continue;
-
-        auto& area = this->addarea(
-            src.start, src.attr.write, src.attr.system);
-
-        if (src.attr.mapped) {
-            area.attr.mapped = 1;
-            area.mapped_file = src.mapped_file;
-            area.file_offset = src.file_offset;
-        }
-
-        paccess pa(m_pd);
-        pd_t pd = (pd_t)pa.ptr();
-
-        for (const auto& pg : *src.pgs) {
-            area.append_page(pd, pg,
-                    PAGE_COW | (pg.attr & PAGE_MMAP),
-                    src.attr.system);
-        }
-    }
-}
-
-mm_list::~mm_list()
-{
-    if (!m_pd)
-        return;
-
-    clear_user();
-    dealloc_pd(m_pd);
-}
-
-void mm_list::switch_pd() const
-{
-    asm_switch_pd(m_pd);
-}
-
-int mm_list::register_brk(void* addr)
-{
-    if (!is_avail(addr))
-        return GB_FAILED;
-    m_brk = &addarea(addr, true, false);
-    return GB_OK;
-}
-
-void* mm_list::set_brk(void* addr)
-{
-    assert(m_brk);
-    void* curbrk = m_brk->end();
-
-    if (addr <= curbrk || !is_avail(curbrk, vptrdiff(addr, curbrk)))
-        return curbrk;
-
-    kernel::paccess pa(m_pd);
-    pd_t pd = (pd_t)pa.ptr();
-
-    while (curbrk < addr) {
-        m_brk->append_page(pd, empty_page, PAGE_COW, false);
-        curbrk = (char*)curbrk + PAGE_SIZE;
-    }
-
-    return curbrk;
-}
-
-void* mm_list::find_avail(void* hint, size_t len, bool priv) const
-{
-    void* addr = hint;
-    if (!addr) {
-        // default value of mmapp'ed area
-        if (!priv)
-            addr = (void*)0x40000000;
-        else
-            addr = (void*)0xe0000000;
-    }
-
-    while (!is_avail(addr, len)) {
-        auto iter = m_areas.lower_bound(addr);
-        if (iter == m_areas.end())
-            return nullptr;
-
-        addr = iter->end();
-    }
-
-    if (!priv && addr >= (void*)0xc0000000)
-        return nullptr;
-
-    return addr;
-}
-
-// TODO: write dirty pages to file
-int mm_list::unmap(void* start, size_t len, bool system)
-{
-    ptr_t addr = (ptr_t)start;
-    void* end = vptradd(start, align_up<12>(len));
-
-    // standard says that addr and len MUST be
-    // page-aligned or the call is invalid
-    if (addr % PAGE_SIZE != 0)
-        return -EINVAL;
-
-    // if doing user mode unmapping, check area privilege
-    if (!system) {
-        if (addr >= 0xc0000000 || end > (void*)0xc0000000)
-            return -EINVAL;
-    }
-
-    auto iter = m_areas.lower_bound(start);
-
-    for ( ; iter != m_areas.end() && *iter < end; ) {
-        if (!(start < *iter) && start != iter->start) {
-            mm newmm = iter->split(start);
-            unmap(newmm);
-            ++iter;
-            continue;
-        }
-        else if (!(*iter < end)) {
-            mm newmm = iter->split(end);
-            unmap(*iter);
-            m_areas.erase(iter);
-
-            bool inserted;
-            std::tie(std::ignore, inserted) = m_areas.emplace(std::move(newmm));
-            assert(inserted);
-            break;
-        }
-        else {
-            unmap(*iter);
-            iter = m_areas.erase(iter);
-        }
-    }
-
-    return GB_OK;
-}
-
-mm& mm_list::add_empty_area(void *start, std::size_t page_count,
-    uint32_t page_attr, bool w, bool system)
-{
-    auto& area = addarea(start, w, system);
-    kernel::paccess pa(m_pd);
-    pd_t pd = (pd_t)pa.ptr();
-
-    while (page_count--)
-        area.append_page(pd, empty_page, page_attr, system);
-
-    return area;
-}
-
-constexpr void map_raw_page_to_pte(
-    pte_t* pte, page_t page,
-    bool present, bool write, bool priv)
-{
-    // set P bit
-    pte->v = 0;
-    pte->in.p = present;
-    pte->in.rw = write;
-    pte->in.us = !priv;
-    pte->in.page = page;
-}
-
-void mm::append_page(pd_t pd, const page& pg, uint32_t attr, bool priv)
-{
-    assert(pd);
-
-    void* addr = this->end();
-    pde_t* pde = *pd + v_to_pdi(addr);
-
-    page_t pt_pg = 0;
-    pte_t* pte = nullptr;
-    // page table not exist
-    if (!pde->in.p) [[unlikely]] {
-        // allocate a page for the page table
-        pt_pg = __alloc_raw_page();
-        pde->in.p = 1;
-        pde->in.rw = 1;
-        pde->in.us = 1;
-        pde->in.pt_page = pt_pg;
-
-        auto pt = (pt_t)kernel::pmap(pt_pg);
-        assert(pt);
-        pte = *pt;
-
-        memset(pt, 0x00, PAGE_SIZE);
-    } else {
-        pt_pg = pde->in.pt_page;
-        auto pt = (pt_t)kernel::pmap(pt_pg);
-        assert(pt);
-        pte = *pt;
-    }
-
-    // map the page in the page table
-    int pti = v_to_pti(addr);
-    pte += pti;
-
-    map_raw_page_to_pte(
-        pte,
-        pg.phys_page_id,
-        !(attr & PAGE_MMAP),
-        false,
-        priv);
-
-    kernel::pfree(pt_pg);
-
-    if (unlikely((attr & PAGE_COW) && !(pg.attr & PAGE_COW))) {
-        kernel::paccess pa(pg.pg_pteidx >> 12);
-        auto* pg_pte = (pte_t*)pa.ptr();
-        assert(pg_pte);
-        pg_pte += (pg.pg_pteidx & 0xfff);
-        pg.attr |= PAGE_COW;
-        pg_pte->in.rw = 0;
-        pg_pte->in.a = 0;
-        invalidate_tlb(addr);
-    }
-
-    ++*pg.ref_count;
-
-    this->pgs->emplace_back(pg);
-    auto& emplaced = this->pgs->back();
-    emplaced.pg_pteidx = (pt_pg << 12) + pti;
-    emplaced.attr = attr;
-}
-
-mm mm::split(void *addr)
-{
-    assert(addr > start && addr < end());
-    assert((ptr_t)addr % PAGE_SIZE == 0);
-
-    size_t this_count = vptrdiff(addr, start) / PAGE_SIZE;
-    size_t new_count = pgs->size() - this_count;
-
-    mm newmm {
-        .start = addr,
-        .attr { attr },
-        .pgs = types::memory::kinew<mm::pages_vector>(),
-        .mapped_file = mapped_file,
-        .file_offset = attr.mapped ? file_offset + this_count * PAGE_SIZE : 0,
-    };
-
-    for (size_t i = 0; i < new_count; ++i) {
-        newmm.pgs->emplace_back(pgs->back());
-        pgs->pop_back();
-    }
-
-    return newmm;
-}
-
-int mmap(
-    void* hint,
-    size_t len,
-    fs::inode* file,
-    size_t offset,
-    int write,
-    int priv)
-{
-    auto& mms = current_process->mms;
-
-    if (file && !S_ISREG(file->mode) && !S_ISBLK(file->mode)) [[unlikely]] {
-        errno = EINVAL;
-        return GB_FAILED;
-    }
-
-    // TODO: find another address
-    assert(((uint32_t)hint & 0xfff) == 0);
-    // TODO: return failed
-    assert((offset & 0xfff) == 0);
-
-    size_t n_pgs = align_up<12>(len) >> 12;
-
-    if (!mms.is_avail(hint, len)) {
-        errno = EEXIST;
-        return GB_FAILED;
-    }
-
-    if (file) {
-        auto& mm = mms.add_empty_area(hint, n_pgs, PAGE_MMAP | PAGE_COW, write, priv);
-
-        mm.attr.mapped = 1;
-        mm.mapped_file = file;
-        mm.file_offset = offset;
-    }
-    else {
-        // private mapping of zero-filled pages
-        auto& mm = mms.add_empty_area(hint, n_pgs, PAGE_COW, write, priv);
-
-        mm.attr.mapped = 0;
-    }
-
-    return GB_OK;
-}
-
-SECTION(".text.kinit")
-void init_mem(void)
-{
-    init_mem_layout();
-
-    // TODO: replace early kernel pd
-    auto* __kernel_mms = types::memory::kinew<kernel::memory::mm_list>(EARLY_KERNEL_PD_PAGE);
-    kernel::memory::mm_list::s_kernel_mms = __kernel_mms;
-
-    // create empty_page struct
-    empty_page.attr = 0;
-    empty_page.phys_page_id = EMPTY_PAGE;
-    empty_page.ref_count = types::memory::kinew<size_t>(2);
-    empty_page.pg_pteidx = 0x00002000;
-
-    // 0xd0000000 to 0xd4000000 or 3.5GiB, size 64MiB
-    __kernel_mms->add_empty_area(KERNEL_HEAP_START,
-        64 * 1024 * 1024 / PAGE_SIZE, PAGE_COW, true, true);
-
-    kernel::kinit::init_kernel_heap(KERNEL_HEAP_START,
-        vptrdiff(KERNEL_HEAP_LIMIT, KERNEL_HEAP_START));
-}
-
-SECTION(".text.kinit")
-void create_segment_descriptor(
-    segment_descriptor* sd,
-    uint32_t base,
-    uint32_t limit,
-    uint32_t flags,
-    uint32_t access)
-{
-    sd->base_low = base & 0x0000ffff;
-    sd->base_mid = ((base & 0x00ff0000) >> 16);
-    sd->base_high = ((base & 0xff000000) >> 24);
-    sd->limit_low = limit & 0x0000ffff;
-    sd->limit_high = ((limit & 0x000f0000) >> 16);
-    sd->access = access;
-    sd->flags = flags;
-}
-
-namespace __physmapper {
-struct mapped_area {
-    size_t ref;
-    void* ptr;
-};
-
-static types::hash_map<page_t, mapped_area,
-    types::memory::ident_allocator<std::pair<page_t, mapped_area>>>
-    mapped;
-static uint8_t _freebm[0x400 / 8];
-static types::bitmap freebm(
-    [](unsigned char*, std::size_t){}, _freebm, 0x400);
-} // namespace __physmapper
-
-void* kernel::pmap(page_t pg, bool cached)
-{
-    auto* const pmap_pt = std::bit_cast<pte_t*>(0xff001000);
-    auto* const mapped_start = std::bit_cast<void*>(0xff000000);
-
-    auto iter = __physmapper::mapped.find(pg);
-    if (iter) {
-        auto [ idx, area ] = *iter;
-        ++area.ref;
-        return area.ptr;
-    }
-
-    for (int i = 2; i < 0x400; ++i) {
-        if (__physmapper::freebm.test(i) == 0) {
-            auto* pte = pmap_pt + i;
-            if (cached)
-                pte->v = 0x3;
-            else
-                pte->v = 0x13;
-            pte->in.page = pg;
-
-            void* ptr = vptradd(mapped_start, 0x1000 * i);
-            invalidate_tlb(ptr);
-
-            __physmapper::freebm.set(i);
-            __physmapper::mapped.emplace(pg,
-                __physmapper::mapped_area { 1, ptr });
-            return ptr;
-        }
-    }
-
-    return nullptr;
-}
-void kernel::pfree(page_t pg)
-{
-    auto* const pmap_pt = std::bit_cast<pte_t*>(0xff001000);
-    auto* const mapped_start = std::bit_cast<void*>(0xff000000);
-
-    auto iter = __physmapper::mapped.find(pg);
-    if (!iter)
-        return;
-    auto& [ ref, ptr ] = iter->second;
-
-    if (ref > 1) {
-        --ref;
-        return;
-    }
-
-    int i = vptrdiff(ptr, mapped_start);
-    i /= 0x1000;
-
-    auto* pte = pmap_pt + i;
-    pte->v = 0;
-    invalidate_tlb(ptr);
-
-    __physmapper::freebm.clear(i);
-    __physmapper::mapped.remove(iter);
-}

+ 348 - 0
src/kernel/mem/mm_list.cc

@@ -0,0 +1,348 @@
+#include <assert.h>
+#include <stdint.h>
+
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/vm_area.hpp>
+
+
+using namespace kernel::mem;
+
+static inline void __invalidate_all_tlb() // rewrites CR3 with the value it already holds; used below as a whole-TLB invalidation
+{
+    asm volatile(
+            "mov %%cr3, %%rax\n\t"
+            "mov %%rax, %%cr3\n\t"
+            : : : "rax", "memory" // rax is the scratch register; "memory" keeps memory accesses from moving across the reload
+            );
+}
+
+// Recursively release a paging structure together with the lower-level
+// tables reachable from it.
+//
+// pt       - frame of the table to release
+// depth    - number of table levels rooted at `pt`; with depth 1 only
+//            `pt` itself is released and its entries are not followed
+// from, to - half-open range of entries of `pt` to descend into;
+//            lower levels are always walked completely (0..512)
+static inline void __dealloc_page_table_all(
+        paging::pfn_t pt, int depth, int from, int to)
+{
+    using namespace paging;
+
+    if (depth > 1) {
+        for (int i = from; i < to; ++i) {
+            auto pse = PSE{pt}[i];
+            if (!(pse.attributes() & PA_P))
+                continue;
+
+            // keep the full width of pfn_t: storing the frame in an
+            // int would truncate it before it is passed down
+            pfn_t pfn = pse.pfn();
+            __dealloc_page_table_all(pfn, depth-1, 0, 512);
+        }
+    }
+
+    free_page(pt);
+}
+
+// Release the 4-level page table rooted at `pt`. Only the top-level
+// slots in [idx_p4(0), idx_p4(KERNEL_SPACE_START)) are descended into;
+// the root frame itself is released as well.
+static inline void __dealloc_page_table(paging::pfn_t pt)
+{
+    using namespace paging;
+
+    __dealloc_page_table_all(
+            pt, 4, idx_p4(0), idx_p4(KERNEL_SPACE_START));
+}
+
+mm_list::mm_list() // creates an address space with no areas on top of a freshly allocated top-level table
+    : m_pt{paging::alloc_page_table()}
+    , m_brk{m_areas.end()} // end() means "no break area registered"
+{
+    memcpy(physaddr<void>{m_pt}, // the new table starts as a 4KiB copy of the kernel page table
+           paging::KERNEL_PAGE_TABLE_PHYS_ADDR, 0x1000);
+}
+
+mm_list::mm_list(const mm_list& other): mm_list{} // copies the areas of `other` and maps the same frames in both lists with PA_COW set
+{
+    m_areas = other.m_areas;
+
+    using namespace paging;
+    for (auto iter = m_areas.begin(); iter != m_areas.end(); ++iter) {
+        auto& area = *iter;
+
+        if (area.flags & MM_BREAK)
+            m_brk = iter; // point m_brk at this list's own copy of the break area
+
+        auto this_iter = vaddr_range{m_pt, area.start, area.end};
+        auto other_iter = vaddr_range{other.m_pt, area.start, area.end};
+
+        while (this_iter) { // both ranges span the same addresses and are advanced together
+            auto this_pte = *this_iter, other_pte = *other_iter;
+            auto attributes = other_pte.attributes();
+            auto pfn = other_pte.pfn();
+
+            attributes &= ~(PA_RW | PA_A | PA_D); // new mapping: PA_RW, PA_A and PA_D are cleared
+            attributes |= PA_COW;
+            this_pte.set(attributes, pfn);
+
+            increase_refcount(pfn_to_page(pfn)); // one more mapping now refers to this frame
+
+            // TODO: create a function to set COW mappings
+            attributes = other_pte.attributes();
+            attributes &= ~PA_RW; // source mapping: only PA_RW is cleared
+            attributes |= PA_COW;
+            other_pte.set(attributes, pfn);
+
+            ++this_iter, ++other_iter;
+        }
+    }
+
+    __invalidate_all_tlb(); // entries of `other` were rewritten above
+}
+
+mm_list::~mm_list() // unmaps every area, then releases the page table
+{
+    if (!m_pt) // nothing to release when no page table is attached
+        return;
+
+    clear();
+    __dealloc_page_table(m_pt);
+}
+
+// Tell whether [start, start + len), widened to page boundaries, lies
+// below USER_SPACE_MEMORY_TOP and is accepted by every existing area's
+// own is_avail() check.
+//
+// Returns false as well when the range wraps around the address space.
+bool mm_list::is_avail(uintptr_t start, std::size_t len) const noexcept
+{
+    // compute the end from the caller's start: rounding start down first
+    // would drop up to one page at the tail of an unaligned request
+    const uintptr_t raw_end = start + len;
+    if (raw_end < start)
+        return false;
+
+    const uintptr_t end = (raw_end + 0xfff) & ~0xfff;
+    if (end < raw_end)
+        return false;
+
+    start &= ~0xfff;
+
+    if (end > USER_SPACE_MEMORY_TOP)
+        return false;
+
+    for (const auto& area : m_areas) {
+        if (!area.is_avail(start, end))
+            return false;
+    }
+    return true;
+}
+
+// A single address is available when it is below USER_SPACE_MEMORY_TOP
+// and m_areas.find() reports no area for it.
+bool mm_list::is_avail(uintptr_t addr) const
+{
+    return addr < USER_SPACE_MEMORY_TOP
+        && m_areas.find(addr) == m_areas.end();
+}
+
+uintptr_t mm_list::find_avail(uintptr_t hint, size_t len) const // returns an address for which is_avail(addr, len) holds, or 0 when the search runs out of areas
+{
+    auto addr = std::max(hint, MMAP_MIN_ADDR); // never start below MMAP_MIN_ADDR
+
+    while (!is_avail(addr, len)) {
+        auto iter = m_areas.lower_bound(addr);
+        if (iter == m_areas.end())
+            return 0;
+
+        addr = iter->end; // retry directly after the area found by lower_bound
+    }
+
+    return addr;
+}
+
+void mm_list::switch_pd() const noexcept // loads this list's page table (m_pt) into CR3
+{
+    asm volatile("mov %0, %%cr3": : "r"(m_pt): "memory");
+}
+
+int mm_list::register_brk(uintptr_t addr) // creates the break area at `addr`; returns 0, or -ENOMEM when `addr` is not available
+{
+    assert(m_brk == m_areas.end()); // may only be called while no break area exists
+    if (!is_avail(addr))
+        return -ENOMEM;
+
+    bool inserted;
+    std::tie(m_brk, inserted) = m_areas.emplace(
+            addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
+
+    assert(inserted);
+    return 0;
+}
+
+uintptr_t mm_list::set_brk(uintptr_t addr) // grows the break area up to `addr` (page-rounded) and returns the resulting break
+{
+    using namespace paging;
+    assert(m_brk != m_areas.end()); // register_brk must have run before
+    uintptr_t curbrk = m_brk->end;
+
+    addr += 4096-1; // round the request up to a page boundary
+    addr &= ~0xfff;
+
+    if (addr <= curbrk || !is_avail(curbrk, addr - curbrk)) // never shrinks; an unavailable range leaves the break unchanged
+        return curbrk;
+
+    for (auto pte : vaddr_range{m_pt, curbrk, addr})
+        pte.set(PA_ANONYMOUS_PAGE | PA_NXE, EMPTY_PAGE_PFN); // new pages initially point at EMPTY_PAGE_PFN
+
+    m_brk->end = addr;
+    return m_brk->end;
+}
+
+void mm_list::clear() // unmaps every area and leaves the list empty with no break area
+{
+    for (auto iter = m_areas.begin(); iter != m_areas.end(); ++iter)
+        unmap(iter, false); // per-area invalidation is skipped in favour of the single flush below
+
+    __invalidate_all_tlb();
+
+    m_areas.clear();
+    m_brk = m_areas.end();
+}
+
+mm_list::iterator mm_list::split(iterator area, uintptr_t addr) // cuts `area` at `addr`; returns the new area covering [addr, old end)
+{
+    assert(!(addr & 0xfff)); // addr must be page-aligned
+    assert(addr > area->start && addr < area->end); // and strictly inside the area
+
+    std::size_t old_len = addr - area->start;
+    std::size_t new_file_offset = 0;
+
+    if (area->mapped_file)
+        new_file_offset = area->file_offset + old_len; // the second half continues where the first one stops in the file
+
+    auto new_end = area->end;
+    area->end = addr; // the original area now ends at the cut
+
+    auto [ iter, inserted ] =
+        m_areas.emplace(addr, area->flags, new_end,
+                area->mapped_file, new_file_offset);
+
+    assert(inserted);
+    return iter;
+}
+
+int mm_list::unmap(iterator area, bool should_invalidate_tlb) // frees the pages of `area` and clears its PTEs; the area stays in m_areas; always returns 0
+{
+    using namespace paging;
+
+    bool should_use_invlpg = area->end - area->start <= 0x4000; // up to 0x4000 bytes: invalidate page by page
+    auto range = vaddr_range{m_pt, area->start, area->end};
+    uintptr_t cur_addr = area->start;
+
+    // TODO: write back dirty pages
+    for (auto pte : range) {
+        free_page(pte.pfn());
+        pte.clear();
+
+        if (should_invalidate_tlb && should_use_invlpg) {
+            asm volatile("invlpg (%0)": : "r"(cur_addr): "memory");
+            cur_addr += 0x1000; // cur_addr only advances on this path, the only one that reads it
+        }
+    }
+
+    if (should_invalidate_tlb && !should_use_invlpg)
+        __invalidate_all_tlb(); // larger areas: one full invalidation instead
+
+    return 0;
+}
+
+int mm_list::unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb) // unmaps [start, start+length) with the end rounded up to a page; returns 0, -EINVAL, -ENOMEM or the first non-zero per-area result
+{
+    // standard says that addr and len MUST be
+    // page-aligned or the call is invalid
+    if (start & 0xfff)
+        return -EINVAL;
+
+    uintptr_t end = (start + length + 0xfff) & ~0xfff;
+
+    // check address validity
+    if (end > KERNEL_SPACE_START)
+        return -EINVAL;
+    if (end > USER_SPACE_MEMORY_TOP)
+        return -ENOMEM;
+
+    auto iter = m_areas.lower_bound(start);
+    auto iter_end = m_areas.upper_bound(end);
+
+    // start <= iter <= end a.k.a. !(start > *iter) && !(*iter > end)
+    while (iter != iter_end) {
+        // start == iter:
+        // start is between (iter->start, iter->end)
+        //
+        // strip out the area before start
+        if (!(start < *iter) && start != iter->start)
+            iter = split(iter, start); // continue with the part that begins at start
+
+        // iter.end <= end
+        // it is safe to unmap the area directly
+        if (*iter < end) {
+            if (int ret = unmap(iter, should_invalidate_tlb); ret != 0)
+                return ret;
+
+            iter = m_areas.erase(iter);
+            continue;
+        }
+
+        // end == iter:
+        // end is between [iter->start, iter->end)
+        //
+        // if end == iter->start, no need to strip the area
+        if (end == iter->start) {
+            ++iter;
+            continue;
+        }
+
+        (void)split(iter, end); // the tail [end, old end) becomes its own area; iter now stops at end
+        if (int ret = unmap(iter, should_invalidate_tlb); ret != 0)
+            return ret;
+
+        iter = m_areas.erase(iter);
+
+        // no need to check areas after this
+        break;
+    }
+
+    return 0;
+}
+
+// Register a new area described by `args` and point all of its PTEs at
+// EMPTY_PAGE_PFN.
+//
+// Returns 0 on success, -EEXIST when the range is not available and
+// -EINVAL when neither MM_MAPPED nor MM_ANONYMOUS is requested.
+int mm_list::mmap(const map_args& args)
+{
+    using namespace kernel::mem::paging;
+
+    auto& vaddr = args.vaddr;
+    auto& length = args.length;
+    auto& finode = args.file_inode;
+    auto& foff = args.file_offset;
+    auto& flags = args.flags;
+
+    assert((vaddr & 0xfff) == 0 && (foff & 0xfff) == 0);
+    assert((length & 0xfff) == 0 && length != 0);
+
+    if (!is_avail(vaddr, length))
+        return -EEXIST;
+
+    const auto area_end = vaddr + length;
+
+    // PA_RW is set during page fault while PA_NXE is preserved
+    // so we set PA_NXE now
+    psattr_t attributes = PA_US;
+    if (!(flags & MM_EXECUTE))
+        attributes |= PA_NXE;
+
+    if (flags & MM_MAPPED) {
+        // file-backed mapping
+        assert(finode);
+        assert(S_ISREG(finode->mode) || S_ISBLK(finode->mode));
+
+        auto [ pos, ok ] = m_areas.emplace(
+                vaddr, flags & ~MM_INTERNAL_MASK, area_end, finode, foff);
+        assert(ok);
+
+        attributes |= PA_MMAPPED_PAGE;
+    }
+    else if (flags & MM_ANONYMOUS) {
+        // private mapping of zero-filled pages
+        // TODO: shared mapping
+        auto [ pos, ok ] = m_areas.emplace(
+                vaddr, (flags & ~MM_INTERNAL_MASK), area_end);
+        assert(ok);
+
+        attributes |= PA_ANONYMOUS_PAGE;
+    }
+    else {
+        return -EINVAL;
+    }
+
+    // both kinds of mapping are populated the same way
+    for (auto pte : vaddr_range{m_pt, vaddr, area_end})
+        pte.set(attributes, EMPTY_PAGE_PFN);
+
+    return 0;
+}

+ 448 - 0
src/kernel/mem/paging.cc

@@ -0,0 +1,448 @@
+#include <assert.h>
+#include <string.h>
+
+#include <types/list.hpp>
+
+#include <kernel/async/lock.hpp>
+#include <kernel/log.hpp>
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/slab.hpp>
+#include <kernel/mem/vm_area.hpp>
+#include <kernel/process.hpp>
+
+using namespace types::list;
+
+using namespace kernel::async;
+using namespace kernel::mem::paging;
+
+static inline void __page_fault_die(uintptr_t vaddr) // logs the faulting address as a kernel panic and calls freeze()
+{
+    kmsgf("[kernel] kernel panic: invalid memory access to %p", vaddr);
+    freeze();
+}
+
+// Follow `pse` to the table it refers to. An entry without PA_P first
+// gets a newly allocated table, with kernel or user table attributes
+// depending on `priv`.
+static inline PSE __parse_pse(PSE pse, bool priv)
+{
+    const bool present = pse.attributes() & PA_P;
+
+    if (!present) {
+        auto attr = priv ? PA_KERNEL_PAGE_TABLE : PA_USER_PAGE_TABLE;
+        pse.set(attr, alloc_page_table());
+    }
+
+    return pse.parse();
+}
+
+static struct zone_info {
+    page* next;
+    std::size_t count;
+} zones[52];
+
+static mutex zone_lock;
+
+// Index of the highest set bit of `x`; yields 0 for both 0 and 1.
+constexpr unsigned _msb(std::size_t x)
+{
+    unsigned bit = 0;
+    for (x >>= 1; x != 0; x >>= 1)
+        ++bit;
+    return bit;
+}
+
+// Frame of the buddy of the order-`order` block starting at `pfn`:
+// the block that differs only in bit (order + 12).
+constexpr pfn_t buddy(pfn_t pfn, unsigned order)
+{
+    // shift in pfn_t: an int literal overflows once order + 12 reaches 31
+    return pfn ^ (pfn_t{1} << (order + 12));
+}
+
+// Frame of the order-(order + 1) block containing the order-`order`
+// block at `pfn`: `pfn` with bit (order + 12) cleared.
+constexpr pfn_t parent(pfn_t pfn, unsigned order)
+{
+    // shift in pfn_t: an int literal overflows once order + 12 reaches 31
+    return pfn & ~(pfn_t{1} << (order + 12));
+}
+
+// call with zone_lock held
+static inline void _zone_list_insert(unsigned order, page* zone) // puts `zone` on the free list of `order` and records the order in its flags
+{
+    assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
+    assert((zone->flags & 0xff) == 0); // the order byte must be clear before it is set
+    zone->flags |= order;
+
+    zones[order].count++;
+    list_insert(&zones[order].next, zone);
+}
+
+// call with zone_lock held
+static inline void _zone_list_remove(unsigned order, page* zone) // takes `zone` off the free list of `order` and clears the order byte of its flags
+{
+    assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
+    assert(zones[order].count > 0 && (zone->flags & 0xff) == order);
+    zone->flags &= ~0xff;
+
+    zones[order].count--;
+    list_remove(&zones[order].next, zone);
+}
+
+// call with zone_lock held
+static inline page* _zone_list_get(unsigned order) // pops a block from the free list of `order`; nullptr when that list is empty
+{
+    if (zones[order].count == 0)
+        return nullptr;
+
+    zones[order].count--;
+    auto* pg = list_get(&zones[order].next);
+
+    assert((pg->flags & 0xff) == order); // unlike _zone_list_remove, the order byte is left in place
+    return pg;
+}
+
+// where order represents power of 2
+// call with zone_lock held
+static inline page* _create_zone(pfn_t pfn, unsigned order) // marks the page at `pfn` as a buddy block head and adds it to the free list of `order`
+{
+    page* zone = pfn_to_page(pfn);
+
+    assert(zone->flags & PAGE_PRESENT); // the page must have been marked present beforehand
+    zone->flags |= PAGE_BUDDY;
+
+    _zone_list_insert(order, zone);
+    return zone;
+}
+
+// call with zone_lock held
+static inline void _split_zone(page* zone, unsigned order, unsigned target_order) // shrinks `zone` from `order` to `target_order`, putting each split-off upper half on a free list
+{
+    while (order > target_order) {
+        pfn_t pfn = page_to_pfn(zone);
+        _create_zone(buddy(pfn, order - 1), order - 1); // the buddy one order down becomes a free block
+
+        order--;
+    }
+
+    zone->flags &= ~0xff; // replace the recorded order with the target order
+    zone->flags |= target_order;
+}
+
+// call with zone_lock held
+static inline page* _alloc_zone(unsigned order) // takes a block from the first non-empty free list at `order` or above; nullptr when all are empty
+{
+    for (unsigned i = order; i < 52; ++i) {
+        auto zone = _zone_list_get(i);
+        if (!zone)
+            continue;
+
+        increase_refcount(zone);
+
+        if (i > order)
+            _split_zone(zone, i, order); // hand the unused part back to the lower-order lists
+
+        assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
+        return zone;
+    }
+
+    return nullptr;
+}
+
+// Hand the physical range [start, end) to the buddy allocator.
+//
+// `start` is rounded up and `end` rounded down to a page boundary; an
+// empty result is ignored. The range is cut greedily into naturally
+// aligned power-of-two blocks, so every block stays inside the range
+// and its buddy() is well defined.
+void kernel::mem::paging::create_zone(uintptr_t start, uintptr_t end)
+{
+    start += (4096 - 1);
+    start >>= 12;
+    end >>= 12;
+
+    if (start >= end)
+        return;
+
+    lock_guard_irq lock{zone_lock};
+
+    // highest order for which a free list exists
+    constexpr unsigned max_order = sizeof(zones) / sizeof(zones[0]) - 1;
+
+    uintptr_t cur = start;
+    while (cur < end) {
+        // grow the block while the next size up is still aligned at
+        // `cur` and still fits before `end`
+        unsigned order = 0;
+        while (order < max_order) {
+            const uintptr_t next_size = uintptr_t{2} << order;
+            if ((cur & (next_size - 1)) || next_size > end - cur)
+                break;
+            ++order;
+        }
+
+        _create_zone(cur << 12, order);
+        cur += uintptr_t{1} << order;
+    }
+}
+
+// Set PAGE_PRESENT on every page touched by [start, end): the first
+// page is start rounded down, the last one end rounded up.
+void kernel::mem::paging::mark_present(uintptr_t start, uintptr_t end)
+{
+    const uintptr_t first = start >> 12;
+    const uintptr_t last = (end + (4096 - 1)) >> 12;
+
+    for (uintptr_t idx = first; idx < last; ++idx)
+        PAGE_ARRAY[idx].flags |= PAGE_PRESENT;
+}
+
+page* kernel::mem::paging::alloc_pages(unsigned order) // allocates a block of `order` under zone_lock; calls freeze() when no block is available
+{
+    lock_guard_irq lock{zone_lock};
+    auto* zone = _alloc_zone(order);
+    if (!zone)
+        freeze();
+
+    return zone;
+}
+
+page* kernel::mem::paging::alloc_page() // shorthand for alloc_pages(0)
+{
+    return alloc_pages(0);
+}
+
+pfn_t kernel::mem::paging::alloc_page_table() // allocates one page, zeroes its 0x1000 bytes and returns its pfn
+{
+    page* zone = alloc_page();
+    pfn_t pfn = page_to_pfn(zone);
+
+    memset(physaddr<void>{pfn}, 0x00, 0x1000);
+
+    return pfn;
+}
+
+void kernel::mem::paging::free_pages(page* pg, unsigned order) // drops one reference; on the last one the block goes back to the free lists, merged with free buddies
+{
+    assert((pg->flags & 0xff) == order); // the low byte of flags holds the block's order
+
+    // TODO: atomic
+    if (!(pg->flags & PAGE_BUDDY) || --pg->refcount) // note: refcount is left untouched when PAGE_BUDDY is not set
+        return;
+
+    lock_guard_irq lock{zone_lock};
+    while (order < 52) {
+        pfn_t pfn = page_to_pfn(pg);
+        pfn_t buddy_pfn = buddy(pfn, order);
+        page* buddy_page = pfn_to_page(buddy_pfn);
+
+        if (!(buddy_page->flags & PAGE_BUDDY)) // buddy is not a block head
+            break;
+
+        if ((buddy_page->flags & 0xff) != order) // buddy is a block of a different order
+            break;
+
+        if (buddy_page->refcount) // buddy is still in use
+            break;
+
+        _zone_list_remove(order, buddy_page);
+
+        if (buddy_page < pg)
+            std::swap(buddy_page, pg); // pg becomes the lower page, the head of the merged block
+
+        buddy_page->flags &= ~PAGE_BUDDY; // the upper half stops being a block head
+        order++;
+    }
+
+    pg->flags &= ~0xff; // _zone_list_insert expects a clear order byte
+    _zone_list_insert(order, pg);
+}
+
+void kernel::mem::paging::free_page(page* page) // shorthand for free_pages(page, 0)
+{
+    return free_pages(page, 0);
+}
+
+void kernel::mem::paging::free_pages(pfn_t pfn, unsigned order) // pfn flavour: looks up the page descriptor and forwards to free_pages(page*, order)
+{
+    return free_pages(pfn_to_page(pfn), order);
+}
+
+void kernel::mem::paging::free_page(pfn_t pfn) // pfn flavour: looks up the page descriptor and forwards to free_page(page*)
+{
+    return free_page(pfn_to_page(pfn));
+}
+
+pfn_t kernel::mem::paging::page_to_pfn(page* _page) // index of the descriptor within PAGE_ARRAY times 0x1000, so the result is a multiple of the page size
+{
+    return (pfn_t)(_page - PAGE_ARRAY) * 0x1000;
+}
+
+page* kernel::mem::paging::pfn_to_page(pfn_t pfn) // inverse of page_to_pfn: descriptor at index pfn / 0x1000 of PAGE_ARRAY
+{
+    return PAGE_ARRAY + pfn / 0x1000;
+}
+
+void kernel::mem::paging::increase_refcount(page* pg) // plain (non-atomic) increment of the page's reference count
+{
+    pg->refcount++;
+}
+
+// Page fault handler.
+//
+// err - page fault error code; tested against the PAGE_FAULT_* bits
+//
+// Reads the faulting address from CR2, looks up the area containing it
+// in the current process and then resolves copy-on-write (PA_COW) and
+// file-backed (PA_MMAP) entries. Faults that cannot be resolved end in
+// kill_current(SIGSEGV) or __page_fault_die().
+void kernel::mem::paging::handle_page_fault(unsigned long err)
+{
+    using namespace kernel::mem;
+    using namespace paging;
+
+    uintptr_t vaddr;
+    // "=r": a move from a control register needs a general purpose
+    // register as destination, a memory operand would not assemble
+    asm volatile("mov %%cr2, %0": "=r"(vaddr): : );
+    auto& mms = current_process->mms;
+
+    auto* mm_area = mms.find(vaddr);
+    if (!mm_area) [[unlikely]] {
+        // user access of address that does not exist
+        if (err & PAGE_FAULT_U)
+            kill_current(SIGSEGV);
+
+        __page_fault_die(vaddr);
+    }
+
+    // user access to a present page caused the fault
+    // check access rights
+    if (err & PAGE_FAULT_U && err & PAGE_FAULT_P) {
+        // write to read only pages
+        if (err & PAGE_FAULT_W && !(mm_area->flags & MM_WRITE))
+            kill_current(SIGSEGV);
+
+        // execute from non-executable pages
+        if (err & PAGE_FAULT_I && !(mm_area->flags & MM_EXECUTE))
+            kill_current(SIGSEGV);
+    }
+
+    auto idx = idx_all(vaddr);
+
+    // walk the four levels down to the entry that maps vaddr
+    auto pe = mms.get_page_table()[std::get<1>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<2>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<3>(idx)];
+    assert(pe.attributes() & PA_P);
+    pe = pe.parse()[std::get<4>(idx)];
+
+    bool mmapped = mm_area->flags & MM_MAPPED;
+    assert(!mmapped || mm_area->mapped_file);
+
+    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]]
+        __page_fault_die(vaddr);
+
+    pfn_t pfn = pe.pfn();
+    auto attr = pe.attributes();
+
+    page* pg = pfn_to_page(pfn);
+
+    if (attr & PA_COW) {
+        attr &= ~PA_COW;
+        if (mm_area->flags & MM_WRITE)
+            attr |= PA_RW;
+        else
+            attr &= ~PA_RW;
+
+        // if it is a dying page
+        // TODO: use atomic
+        if (pg->refcount == 1) {
+            // sole owner: reuse the frame in place
+            pe.set(attr, pfn);
+            return;
+        }
+
+        // duplicate the page
+        page* new_page = alloc_page();
+        pfn_t new_pfn = page_to_pfn(new_page);
+        physaddr<void> new_page_addr{new_pfn};
+
+        if (attr & PA_ANON)
+            memset(new_page_addr, 0x00, 0x1000);
+        else
+            memcpy(new_page_addr, physaddr<void>{pfn}, 0x1000);
+
+        attr &= ~(PA_A | PA_ANON);
+        --pg->refcount;
+
+        pe.set(attr, new_pfn);
+        pfn = new_pfn;
+    }
+
+    if (attr & PA_MMAP) {
+        attr |= PA_P;
+
+        size_t offset = (vaddr & ~0xfff) - mm_area->start;
+        char* data = physaddr<char>{pfn};
+
+        int n = vfs_read(
+            mm_area->mapped_file,
+            data,
+            4096,
+            mm_area->file_offset + offset,
+            4096);
+
+        // a failed read returns a negative value: treat it as "nothing
+        // read" so that the fill below stays inside the page
+        // TODO: report the read error instead of mapping a zero page
+        if (n < 0)
+            n = 0;
+
+        // TODO: send SIGBUS if offset is greater than real size
+        if (n != 4096)
+            memset(data + n, 0x00, 4096 - n);
+
+        // TODO: shared mapping
+        attr &= ~PA_MMAP;
+
+        pe.set(attr, pfn);
+    }
+}
+
+vaddr_range::vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool priv) // iterates the PTEs of [start, end) in table `pt`; an empty range zeroes every member
+    : n {start >= end ? 0 : ((end - start) >> 12)} // number of pages left to visit
+    , idx4{!n ? 0 : idx_p4(start)}
+    , idx3{!n ? 0 : idx_p3(start)}
+    , idx2{!n ? 0 : idx_p2(start)}
+    , idx1{!n ? 0 : idx_p1(start)}
+    , pml4{!n ? PSE{0} : PSE{pt}}
+    , pdpt{!n ? PSE{0} : __parse_pse(pml4[idx4], priv)} // intermediate tables are allocated by __parse_pse when not present
+    , pd{!n ? PSE{0} : __parse_pse(pdpt[idx3], priv)}
+    , pt{!n ? PSE{0} : __parse_pse(pd[idx2], priv)}
+    , m_start{!n ? 0 : start}, m_end{!n ? 0 : end}
+    , is_privilege{!n ? false : priv} { }
+
+vaddr_range::vaddr_range(std::nullptr_t) // the exhausted range (n == 0); this is what end() returns
+    : n{}
+    , idx4{}, idx3{}, idx2{}, idx1{}
+    , pml4{0}, pdpt{0}
+    , pd{0}, pt{0}
+    , m_start{}, m_end{}, is_privilege{} { }
+
+vaddr_range vaddr_range::begin() const noexcept // the range is its own iterator: begin() is a copy of it
+{
+    return *this;
+}
+
+vaddr_range vaddr_range::end() const noexcept // sentinel with n == 0
+{
+    return vaddr_range {nullptr};
+}
+
+PSE vaddr_range::operator*() const noexcept // entry idx1 of the current lowest-level table
+{
+    return pt[idx1];
+}
+
+vaddr_range& vaddr_range::operator++() // moves to the next page, reloading each table level whose index wrapped around
+{
+    --n;
+
+    if ((idx1 = (idx1+1)%512) != 0) // still inside the same lowest-level table
+        return *this;
+
+    do {
+        if ((idx2 = (idx2+1)%512) != 0)
+            break;
+        do {
+            if ((idx3 = (idx3+1)%512) != 0)
+                break;
+
+            idx4 = (idx4+1) % 512;
+
+            // if idx4 is 0 after update, we have an overflow
+            assert(idx4 != 0);
+
+            pdpt = __parse_pse(pml4[idx4], is_privilege);
+        } while (false);
+
+        pd = __parse_pse(pdpt[idx3], is_privilege);
+    } while (false);
+
+    pt = __parse_pse(pd[idx2], is_privilege); // reached on every idx1 wrap, whichever upper levels changed
+    return *this;
+}
+
+vaddr_range::operator bool() const noexcept // true while pages remain to be visited
+{
+    return n;
+}
+
+bool vaddr_range::operator==(const vaddr_range& other) const noexcept // compares the remaining page counts only, which is enough to detect the end() sentinel
+{
+    return n == other.n;
+}

+ 125 - 0
src/kernel/mem/slab.cc

@@ -0,0 +1,125 @@
+#include <cstddef>
+
+#include <assert.h>
+
+#include <types/list.hpp>
+
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/slab.hpp>
+
+using namespace kernel::mem;
+using namespace types::list;
+
+constexpr std::size_t SLAB_PAGE_SIZE = 0x1000; // 4K
+
+std::ptrdiff_t _slab_data_start_offset(std::size_t size) // offset of the first object: sizeof(slab_head) rounded up with the mask ~(size - 1)
+{
+    return (sizeof(slab_head) + size - 1) & ~(size - 1); // NOTE(review): the mask only rounds correctly when size is a power of two - confirm with callers
+}
+
+std::size_t _slab_max_count(std::size_t size) // number of `size`-byte objects that fit in one slab page after the header area
+{
+    return (SLAB_PAGE_SIZE - _slab_data_start_offset(size)) / size;
+}
+
+// Pop the first object off the slab's free list.
+// Returns nullptr when the slab has no free object left.
+void* _slab_head_alloc(slab_head* slab)
+{
+    if (slab->free_count != 0) {
+        void* obj = slab->free;
+
+        // a free object stores the pointer to the next free one
+        slab->free = *(void**)obj;
+        slab->free_count--;
+
+        return obj;
+    }
+
+    return nullptr;
+}
+
+// Set up a slab for `size`-byte objects in the page at physical
+// address `start`: fill in the header and chain all objects into the
+// free list, the last one terminating it with nullptr.
+slab_head* _make_slab(uintptr_t start, std::size_t size)
+{
+    slab_head* slab = physaddr<slab_head>{start};
+
+    slab->obj_size = size;
+    slab->free_count = _slab_max_count(size);
+    slab->next = nullptr;
+    slab->prev = nullptr;
+
+    slab->free = physaddr<void>{start + _slab_data_start_offset(size)};
+
+    std::byte* obj = (std::byte*)slab->free;
+    for (unsigned i = 0; i < slab->free_count; ++i, obj += size) {
+        const bool is_last = (i == slab->free_count-1);
+        *(void**)obj = is_last ? nullptr : (void*)(obj + size);
+    }
+
+    return slab;
+}
+
+void _slab_add_page(slab_cache* cache) { // allocates one page, formats it as a slab for this cache and puts it on slabs_empty
+    auto* new_page = paging::alloc_page();
+    auto new_page_pfn = paging::page_to_pfn(new_page);
+
+    new_page->flags |= paging::PAGE_SLAB; // tag the page as owned by the slab allocator
+
+    auto* slab = _make_slab(new_page_pfn, cache->obj_size);
+    slab->cache = cache;
+
+    list_insert(&cache->slabs_empty, slab);
+}
+
+void* kernel::mem::slab_alloc(slab_cache* cache) { // returns one object, taken from a partial slab first, then an empty one, then a newly added page
+    slab_head* slab = cache->slabs_partial;
+    if (!slab) { // no partial slabs, try to get an empty slab
+        if (!cache->slabs_empty) // no empty slabs, create a new one
+            _slab_add_page(cache);
+
+        slab = list_get(&cache->slabs_empty);
+
+        list_insert(&cache->slabs_partial, slab); // it is about to hold an object, so it becomes partial
+    }
+
+    void* ptr = _slab_head_alloc(slab);
+
+    if (slab->free_count == 0) { // slab is full
+        list_remove(&cache->slabs_partial, slab);
+        list_insert(&cache->slabs_full, slab);
+    }
+
+    return ptr;
+}
+
+// Give an object obtained from slab_alloc() back to its slab.
+//
+// The slab header sits at the start of the page containing `ptr`. The
+// slab is moved between the cache's lists so that slabs_full only holds
+// slabs without free objects, slabs_empty only slabs with every object
+// free, and slabs_partial everything in between.
+void kernel::mem::slab_free(void* ptr) {
+    slab_head* slab = (slab_head*)((uintptr_t)ptr & ~(SLAB_PAGE_SIZE-1));
+    auto* cache = slab->cache;
+
+    // slab_alloc() puts a slab on slabs_full exactly when its last free
+    // object is handed out, so the count tells which list it is on
+    const bool was_full = (slab->free_count == 0);
+    slab_head** from = was_full ? &cache->slabs_full : &cache->slabs_partial;
+
+    // push the object onto the slab's free list
+    *(void**)ptr = slab->free;
+    slab->free = ptr;
+    slab->free_count++;
+
+    if (slab->free_count == _slab_max_count(slab->obj_size)) {
+        // nothing is allocated from this slab any more
+        list_remove(from, slab);
+        list_insert(&cache->slabs_empty, slab);
+    } else if (was_full) {
+        // the slab can serve allocations again
+        list_remove(from, slab);
+        list_insert(&cache->slabs_partial, slab);
+    }
+}
+
+void kernel::mem::init_slab_cache(slab_cache* cache, std::size_t obj_size) // resets the cache for objects of `obj_size` bytes and gives it one initial empty slab
+{
+    cache->obj_size = obj_size;
+    cache->slabs_empty = nullptr;
+    cache->slabs_partial = nullptr;
+    cache->slabs_full = nullptr;
+
+    _slab_add_page(cache);
+}

+ 122 - 148
src/kernel/process.cpp

@@ -10,20 +10,13 @@
 #include <sys/wait.h>
 
 #include <types/allocator.hpp>
-#include <types/bitmap.hpp>
 #include <types/cplusplus.hpp>
 #include <types/elf.hpp>
-#include <types/size.h>
-#include <types/status.h>
 #include <types/types.h>
 
-#include <asm/port_io.h>
-#include <asm/sys.h>
 #include <kernel/async/lock.hpp>
-#include <kernel/interrupt.h>
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
 #include <kernel/module.hpp>
 #include <kernel/process.hpp>
 #include <kernel/signal.hpp>
@@ -45,7 +38,7 @@ namespace kernel {
 struct no_irq_guard {
     explicit no_irq_guard()
     {
-        asm_cli();
+        asm volatile("cli");
     }
 
     no_irq_guard(const no_irq_guard&) = delete;
@@ -53,7 +46,7 @@ struct no_irq_guard {
 
     ~no_irq_guard()
     {
-        asm_sti();
+        asm volatile("sti");
     }
 };
 
@@ -162,7 +155,7 @@ int filearr::open(const process &current,
             if (!parent)
                 return -EINVAL;
             int ret = fs::vfs_mkfile(parent, filename.c_str(), mode);
-            if (ret != GB_OK)
+            if (ret != 0)
                 return ret;
             dentry = fs::vfs_open(*current.root, filepath);
             assert(dentry);
@@ -236,7 +229,7 @@ void process::send_signal(signo_type signal)
 
 void kernel_threadd_main(void)
 {
-    kmsg("kernel thread daemon started\n");
+    kmsg("[kernel] kthread daemon started");
 
     for (;;) {
         if (kthreadd_new_thd_func) {
@@ -255,22 +248,29 @@ void kernel_threadd_main(void)
             // TODO
             (void)func, (void)data;
             assert(false);
-
-            // syscall_fork
-            // int ret = syscall(0x00);
-
-            // if (ret == 0) {
-            //     // child process
-            //     func(data);
-            //     // the function shouldn't return here
-            //     assert(false);
-            // }
         }
         // TODO: sleep here to wait for new_kernel_thread event
-        asm_hlt();
+        asm volatile("hlt");
     }
 }
 
+static inline void __spawn(kernel::task::thread& thd, uintptr_t entry) // pushes an initial frame onto the thread's kernel stack with `entry` as return address; presumably consumed by asm_ctx_switch - confirm
+{
+    auto prev_sp = thd.kstack.sp; // stack pointer before anything is pushed
+
+    // return(start) address
+    thd.kstack.pushq(entry);
+    thd.kstack.pushq(0x200);       // flags
+    thd.kstack.pushq(0);           // r15
+    thd.kstack.pushq(0);           // r14
+    thd.kstack.pushq(0);           // r13
+    thd.kstack.pushq(0);           // r12
+    thd.kstack.pushq(0);           // rbp
+    thd.kstack.pushq(0);           // rbx
+    thd.kstack.pushq(0);           // 0 for alignment
+    thd.kstack.pushq(prev_sp);     // previous sp
+}
+
 SECTION(".text.kinit")
 proclist::proclist()
 {
@@ -278,17 +278,15 @@ proclist::proclist()
     auto& init = real_emplace(1, 0);
     assert(init.pid == 1 && init.ppid == 0);
 
-    auto& thd = *init.thds.begin();
-    thd.name.assign("[kernel init]");
+    auto thd = init.thds.begin();
+    thd->name.assign("[kernel init]");
 
     current_process = &init;
     current_thread = &thd;
 
     kernel::task::dispatcher::enqueue(current_thread);
 
-    tss.ss0 = KERNEL_DATA_SEGMENT;
-    tss.esp0 = (uint32_t)current_thread->kstack.esp;
-
+    current_thread->kstack.load_interrupt_stack();
     current_process->mms.switch_pd();
 
     if (1) {
@@ -297,26 +295,10 @@ proclist::proclist()
         assert(proc.pid == 0 && proc.ppid == 0);
 
         // create thread
-        auto& thd = *proc.thds.begin();
-        thd.name.assign("[kernel thread daemon]");
-
-        auto* esp = &thd.kstack.esp;
-        auto old_esp = (uint32_t)thd.kstack.esp;
-
-        // return(start) address
-        push_stack(esp, (uint32_t)kernel_threadd_main);
-        // ebx
-        push_stack(esp, 0);
-        // edi
-        push_stack(esp, 0);
-        // esi
-        push_stack(esp, 0);
-        // ebp
-        push_stack(esp, 0);
-        // eflags
-        push_stack(esp, 0x200);
-        // original esp
-        push_stack(esp, old_esp);
+        auto thd = proc.thds.begin();
+        thd->name.assign("[kernel thread daemon]");
+
+        __spawn(*thd, (uintptr_t)kernel_threadd_main);
 
         kernel::task::dispatcher::enqueue(&thd);
     }
@@ -334,6 +316,12 @@ void proclist::kill(pid_t pid, int exit_code)
 {
     auto& proc = this->find(pid);
 
+    // init should never exit
+    if (proc.ppid == 0) {
+        kmsg("kernel panic: init exited!");
+        freeze();
+    }
+
     // put all threads into sleep
     for (auto& thd : proc.thds)
         thd.set_attr(kernel::task::thread::ZOMBIE);
@@ -342,13 +330,7 @@ void proclist::kill(pid_t pid, int exit_code)
     proc.files.close_all();
 
     // unmap all user memory areas
-    proc.mms.clear_user();
-
-    // init should never exit
-    if (proc.ppid == 0) {
-        console->print("kernel panic: init exited!\n");
-        freeze();
-    }
+    proc.mms.clear();
 
     // make child processes orphans (children of init)
     this->make_children_orphans(pid);
@@ -391,48 +373,40 @@ void proclist::kill(pid_t pid, int exit_code)
 
 static void release_kinit()
 {
-    extern char __stage1_start[];
-    extern char __kinit_end[];
-
-    kernel::paccess pa(EARLY_KERNEL_PD_PAGE);
-    auto pd = (pd_t)pa.ptr();
-    assert(pd);
-    (*pd)[0].v = 0;
+    // free .kinit
+    using namespace kernel::mem::paging;
+    extern uintptr_t volatile KINIT_START_ADDR, KINIT_END_ADDR, KINIT_PAGES;
 
-    // free pt#0
-    __free_raw_page(0x00002);
+    std::size_t pages = KINIT_PAGES;
+    auto range = vaddr_range{KERNEL_PAGE_TABLE_ADDR,
+        KINIT_START_ADDR, KINIT_END_ADDR, true};
+    for (auto pte : range)
+        pte.clear();
 
-    // free .stage1 and .kinit
-    for (uint32_t i = ((uint32_t)__stage1_start >> 12);
-            i < ((uint32_t)__kinit_end >> 12); ++i) {
-        __free_raw_page(i);
-    }
+    create_zone(0x2000, 0x2000 + 0x1000 * pages);
 }
 
-void NORETURN _kernel_init(void)
+void NORETURN _kernel_init(kernel::mem::paging::pfn_t kernel_stack_pfn)
 {
+    kernel::mem::paging::free_pages(kernel_stack_pfn, 9);
     release_kinit();
 
-    asm_sti();
+    asm volatile("sti");
 
     // ------------------------------------------
     // interrupt enabled
     // ------------------------------------------
 
     // load kmods
-    for (auto loader = kernel::module::kmod_loaders_start; *loader; ++loader) {
+    for (auto loader = kernel::module::KMOD_LOADERS_START; *loader; ++loader) {
         auto* mod = (*loader)();
         if (!mod)
             continue;
 
-        auto ret = insmod(mod);
-        if (ret == kernel::module::MODULE_SUCCESS)
+        if (auto ret = insmod(mod); ret == kernel::module::MODULE_SUCCESS)
             continue;
 
-        char buf[256];
-        snprintf(buf, sizeof(buf),
-            "[kernel] An error occured while loading \"%s\"\n", mod->name);
-        kmsg(buf);
+        kmsgf("[kernel] An error occured while loading \"%s\"", mod->name);
     }
 
     // mount fat32 /mnt directory
@@ -454,42 +428,42 @@ void NORETURN _kernel_init(void)
     }
 
     current_process->attr.system = 0;
-    current_thread->attr |= kernel::task::thread::SYSTEM;
-
-    const char* argv[] = { "/mnt/busybox", "sh", "/mnt/initsh" };
-    const char* envp[] = { "LANG=C", "HOME=/root", "PATH=/mnt", "PWD=/", nullptr };
+    current_thread->attr &= ~kernel::task::thread::SYSTEM;
 
-    types::elf::elf32_load_data d;
-    d.argv = argv;
-    d.envp = envp;
-    d.system = false;
+    types::elf::elf32_load_data d{
+        .exec_dent{},
+        .argv{ "/mnt/busybox", "sh", "/mnt/initsh" },
+        .envp{ "LANG=C", "HOME=/root", "PATH=/mnt", "PWD=/" },
+        .ip{}, .sp{}
+    };
 
-    d.exec_dent = fs::vfs_open(*fs::fs_root, types::path{argv[0]});
+    d.exec_dent = fs::vfs_open(*fs::fs_root, types::path{d.argv[0].c_str()});
     if (!d.exec_dent) {
-        console->print("kernel panic: init not found!\n");
+        kmsg("kernel panic: init not found!");
         freeze();
     }
 
-    int ret = types::elf::elf32_load(&d);
-    assert(ret == GB_OK);
+    int ret = types::elf::elf32_load(d);
+    assert(ret == 0);
+
+    int ds = 0x33, cs = 0x2b;
 
     asm volatile(
-        "movw $0x23, %%ax\n"
-        "movw %%ax, %%ds\n"
-        "movw %%ax, %%es\n"
-        "movw %%ax, %%fs\n"
-        "movw %%ax, %%gs\n"
-
-        "pushl $0x23\n"
-        "pushl %0\n"
-        "pushl $0x200\n"
-        "pushl $0x1b\n"
-        "pushl %1\n"
-
-        "iret\n"
-        :
-        : "c"(d.sp), "d"(d.eip)
-        : "eax", "memory");
+        "mov %0, %%rax\n"
+        "mov %%ax, %%ds\n"
+        "mov %%ax, %%es\n"
+        "mov %%ax, %%fs\n"
+        "mov %%ax, %%gs\n"
+
+        "push %%rax\n"
+        "push %2\n"
+        "push $0x200\n"
+        "push %1\n"
+        "push %3\n"
+
+        "iretq\n"
+        : : "g"(ds), "g"(cs), "g"(d.sp),
+            "g"(d.ip) : "eax", "memory");
 
     freeze();
 }
@@ -502,69 +476,71 @@ void k_new_thread(void (*func)(void*), void* data)
 }
 
 SECTION(".text.kinit")
-void NORETURN init_scheduler(void)
+void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn)
 {
     procs = new proclist;
 
     asm volatile(
-        "movl %0, %%esp\n"
-        "pushl %=f\n"
-        "pushl %1\n"
-
-        "movw $0x10, %%ax\n"
-        "movw %%ax, %%ss\n"
-        "movw %%ax, %%ds\n"
-        "movw %%ax, %%es\n"
-        "movw %%ax, %%fs\n"
-        "movw %%ax, %%gs\n"
-
-        "xorl %%ebp, %%ebp\n"
-        "xorl %%edx, %%edx\n"
-
-        "pushl $0x0\n"
-        "popfl\n"
+        "mov %2, %%rdi\n"
+        "mov %0, %%rsp\n"
+        "sub $24, %%rsp\n"
+        "mov %=f, %%rbx\n"
+        "mov %%rbx, (%%rsp)\n"   // return address
+        "mov %%rbx, 16(%%rsp)\n" // previous frame return address
+        "xor %%rbx, %%rbx\n"
+        "mov %%rbx, 8(%%rsp)\n"  // previous frame rbp
+        "mov %%rsp, %%rbp\n"     // current frame rbp
+
+        "push %1\n"
+
+        "mov $0x10, %%ax\n"
+        "mov %%ax, %%ss\n"
+        "mov %%ax, %%ds\n"
+        "mov %%ax, %%es\n"
+        "mov %%ax, %%fs\n"
+        "mov %%ax, %%gs\n"
+
+        "push $0x0\n"
+        "popf\n"
 
         "ret\n"
 
         "%=:\n"
         "ud2"
         :
-        : "a"(current_thread->kstack.esp), "c"(_kernel_init)
+        : "a"(current_thread->kstack.sp), "c"(_kernel_init), "g"(kernel_stack_pfn)
         : "memory");
 
     freeze();
 }
 
-extern "C" void asm_ctx_switch(uint32_t** curr_esp, uint32_t** next_esp);
+extern "C" void asm_ctx_switch(uintptr_t* curr_sp, uintptr_t* next_sp);
+
+// C-linkage hook that reloads per-thread state for `current_thread`.
+// NOTE(review): presumably invoked from the asm_ctx_switch assembly once the
+// next thread's stack is active - confirm against the assembly source.
+extern "C" void after_ctx_switch()
+{
+    current_thread->kstack.load_interrupt_stack();
+    current_thread->load_thread_area32();
+}
+
 bool schedule()
 {
     if (kernel::async::preempt_count() != 0)
         return true;
 
     auto* next_thd = kernel::task::dispatcher::next();
-    process* proc = nullptr;
-    kernel::task::thread* curr_thd = nullptr;
-
-    if (current_thread == next_thd)
-        goto _end;
-
-    proc = &procs->find(next_thd->owner);
-    if (current_process != proc) {
-        proc->mms.switch_pd();
-        current_process = proc;
-    }
 
-    curr_thd = current_thread;
-
-    current_thread = next_thd;
-    tss.esp0 = (uint32_t)next_thd->kstack.esp;
-
-    next_thd->load_thread_area();
+    if (current_thread != next_thd) {
+        auto* proc = &procs->find(next_thd->owner);
+        if (current_process != proc) {
+            proc->mms.switch_pd();
+            current_process = proc;
+        }
 
-    asm_ctx_switch(&curr_thd->kstack.esp, &next_thd->kstack.esp);
-    tss.esp0 = (uint32_t)curr_thd->kstack.esp;
+        auto* curr_thd = current_thread;
+        current_thread = next_thd;
 
-_end:
+        asm_ctx_switch(&curr_thd->kstack.sp, &next_thd->kstack.sp);
+    }
 
     return current_thread->signals.pending_signal() == 0;
 }
@@ -577,10 +553,8 @@ void NORETURN schedule_noreturn(void)
 
 void NORETURN freeze(void)
 {
-    asm_cli();
-    asm_hlt();
     for (;;)
-        ;
+        asm volatile("cli\n\thlt");
 }
 
 void NORETURN kill_current(int signo)

+ 20 - 21
src/kernel/signal.cpp

@@ -1,7 +1,7 @@
 #include <kernel/task/thread.hpp>
 #include <kernel/process.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/interrupt.h>
+#include <kernel/interrupt.hpp>
 
 #include <signal.h>
 
@@ -142,11 +142,11 @@ signo_type signal_list::pending_signal()
 
         return *iter;
     }
-    
+
     return 0;
 }
 
-void signal_list::handle(interrupt_stack* context, mmx_registers* mmxregs)
+void signal_list::handle(interrupt_stack_normal* context, mmx_registers* mmxregs)
 {
     // assume that the pending signal is at the front of the list
     auto signal = m_list.front();
@@ -178,29 +178,28 @@ void signal_list::handle(interrupt_stack* context, mmx_registers* mmxregs)
     if (!(handler.sa_flags & SA_RESTORER))
         raise(SIGSYS);
 
-    uint32_t esp = (uint32_t)context->esp;
-    esp -= (sizeof(mmx_registers) + sizeof(interrupt_stack) + 16);
-    esp &= 0xfffffff0;
-
-    auto tmpesp = esp;
-    *(uint32_t*)tmpesp = signal; // signal handler argument: int signo
-    tmpesp += 4;
-    *(uint32_t*)tmpesp = context->esp; // original esp
-    tmpesp += 4;
+    // save current interrupt context to 128 bytes above current user stack
+    uintptr_t sp = (uintptr_t)context->rsp;
+    sp -= (128 + sizeof(mmx_registers) + sizeof(interrupt_stack_normal) + 16);
+    sp &= ~0xf;
 
-    tmpesp += 8; // padding to align to 16 bytes
+    auto tmpsp = sp;
+    *(uint64_t*)tmpsp = signal; // signal handler argument: int signo
+    tmpsp += 8;
+    *(uintptr_t*)tmpsp = context->rsp; // original rsp
+    tmpsp += 8;
 
-    memcpy((void*)tmpesp, mmxregs, sizeof(mmx_registers));
-    tmpesp += sizeof(mmx_registers); // mmx registers
-    memcpy((void*)tmpesp, context, sizeof(interrupt_stack));
-    tmpesp += sizeof(interrupt_stack); // context
+    memcpy((void*)tmpsp, mmxregs, sizeof(mmx_registers));
+    tmpsp += sizeof(mmx_registers); // mmx registers
+    memcpy((void*)tmpsp, context, sizeof(interrupt_stack_normal));
+    tmpsp += sizeof(interrupt_stack_normal); // context
 
-    esp -= sizeof(void*);
+    sp -= sizeof(void*);
     // signal handler return address: restorer
-    *(uint32_t*)esp = (uint32_t)handler.sa_restorer;
+    *(uintptr_t*)sp = (uintptr_t)handler.sa_restorer;
 
-    context->esp = esp;
-    context->v_eip = (void*)handler.sa_handler;
+    context->rsp = sp;
+    context->v_rip = (uintptr_t)handler.sa_handler;
 }
 
 void signal_list::after_signal(signo_type signal)

File diff suppressed because it is too large
+ 265 - 1079
src/kernel/syscall.cpp


+ 538 - 8
src/kernel/syscall/fileops.cc

@@ -1,16 +1,93 @@
+#include <bits/ioctl.h>
 #include <errno.h>
+#include <poll.h>
+#include <sys/mman.h>
+#include <unistd.h>
 
 #include <types/path.hpp>
 
+#include <kernel/log.hpp>
+#include <kernel/mem/vm_area.hpp>
 #include <kernel/process.hpp>
 #include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
 
-int _syscall_symlink(interrupt_stack* data)
+#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
+
+// Logs the source location of an unimplemented code path together with the
+// pid of the current process, then sends SIGSYS to the current thread.
+static inline void not_implemented(const char* pos, int line)
+{
+    const auto pid = current_process->pid;
+
+    kmsgf("[kernel] the function at %s:%d is not implemented, killing the pid%d...",
+            pos, line, pid);
+    current_thread->send_signal(SIGSYS);
+}
+
+// Writes up to n bytes from buf to the file open at fd.
+// Returns whatever the file's write() returns, or -EBADF when the file table
+// has no entry for fd.
+ssize_t kernel::syscall::do_write(int fd, const char __user* buf, size_t n)
+{
+    if (auto* target = current_process->files[fd]; target)
+        return target->write(buf, n);
+
+    return -EBADF;
+}
+
+ssize_t kernel::syscall::do_read(int fd, char __user* buf, size_t n)
 {
-    SYSCALL_ARG1(const char __user*, target);
-    SYSCALL_ARG2(const char __user*, linkpath);
+    auto* file = current_process->files[fd];
+    if (!file)
+        return -EBADF;
+
+    return file->read(buf, n);
+}
 
+// Closes fd in the current process's file table.
+// Always returns 0; no error is reported from here for an fd that is not open.
+int kernel::syscall::do_close(int fd)
+{
+    current_process->files.close(fd);
+    return 0;
+}
+
+// Forwards to the current process's file table dup(); its result is returned
+// unchanged.
+int kernel::syscall::do_dup(int old_fd)
+{
+    return current_process->files.dup(old_fd);
+}
+
+// Forwards to the current process's file table dup2(); its result is returned
+// unchanged.
+int kernel::syscall::do_dup2(int old_fd, int new_fd)
+{
+    return current_process->files.dup2(old_fd, new_fd);
+}
+
+// Forwards to the current process's file table pipe(), handing it the user
+// pointer as-is (no user-memory validation happens in this function).
+int kernel::syscall::do_pipe(int __user* pipefd)
+{
+    return current_process->files.pipe(pipefd);
+}
+
+// Reads directory entries from the file open at fd into buf (at most cnt
+// bytes are offered to the file's getdents()).
+// Returns -EBADF when the file table has no entry for fd.
+ssize_t kernel::syscall::do_getdents(int fd, char __user* buf, size_t cnt)
+{
+    if (auto* entry = current_process->files[fd]; entry)
+        return entry->getdents(buf, cnt);
+
+    return -EBADF;
+}
+
+// Same as do_getdents, but dispatches to the file's getdents64().
+// Returns -EBADF when the file table has no entry for fd.
+ssize_t kernel::syscall::do_getdents64(int fd, char __user* buf, size_t cnt)
+{
+    if (auto* entry = current_process->files[fd]; entry)
+        return entry->getdents64(buf, cnt);
+
+    return -EBADF;
+}
+
+// Opens `path` (resolved against the process's pwd) through the current
+// process's file table. The bits set in the process umask are cleared from
+// `mode` before the request is forwarded.
+int kernel::syscall::do_open(const char __user* path, int flags, mode_t mode)
+{
+    auto& proc = *current_process;
+
+    mode &= ~proc.umask;
+
+    return proc.files.open(proc, proc.pwd + path, flags, mode);
+}
+
+int kernel::syscall::do_symlink(const char __user* target, const char __user* linkpath)
+{
     // TODO: use copy_from_user
     auto path = current_process->pwd + linkpath;
     auto* dent = fs::vfs_open(*current_process->root, path);
@@ -28,12 +105,8 @@ int _syscall_symlink(interrupt_stack* data)
     return dent->ind->fs->symlink(dent, linkname.c_str(), target);
 }
 
-int _syscall_readlink(interrupt_stack* data)
+int kernel::syscall::do_readlink(const char __user* pathname, char __user* buf, size_t buf_size)
 {
-    SYSCALL_ARG1(const char __user*, pathname);
-    SYSCALL_ARG2(char __user*, buf);
-    SYSCALL_ARG3(size_t, buf_size);
-
     // TODO: use copy_from_user
     auto path = current_process->pwd + pathname;
     auto* dent = fs::vfs_open(*current_process->root, path, false);
@@ -47,3 +120,460 @@ int _syscall_readlink(interrupt_stack* data)
     // TODO: use copy_to_user
     return dent->ind->fs->readlink(dent->ind, buf, buf_size);
 }
+
+// Handles a small set of terminal ioctl requests on fd.
+//
+// Returns -ENOTTY when fd has no open file or the file is not a character
+// device, -EINVAL for requests not listed in the switch, and 0 on success.
+// `arg3` is reinterpreted as a user pointer whose pointee type depends on
+// `request`; it is dereferenced directly (see the copy_*_user TODOs).
+int kernel::syscall::do_ioctl(int fd, unsigned long request, uintptr_t arg3)
+{
+    // TODO: check fd type and get tty* from fd
+    //
+    //       we use a trick for now, check whether
+    //       the file that fd points to is a pipe or
+    //       not. and we suppose that stdin will be
+    //       either a tty or a pipe.
+    auto* file = current_process->files[fd];
+    if (!file || !S_ISCHR(file->mode))
+        return -ENOTTY;
+
+    switch (request) {
+    // report the process group of the controlling tty
+    case TIOCGPGRP: {
+        auto* pgid = (pid_t __user*)arg3;
+        auto* ctrl_tty = current_process->control_tty;
+
+        if (!ctrl_tty)
+            return -ENOTTY;
+
+        // TODO: copy_to_user
+        *pgid = ctrl_tty->get_pgrp();
+        break;
+    }
+    // set the process group of the controlling tty
+    case TIOCSPGRP: {
+        // TODO: copy_from_user
+        // note: the user pointer is read before control_tty is checked
+        auto pgid = *(const pid_t __user*)arg3;
+        auto* ctrl_tty = current_process->control_tty;
+
+        if (!ctrl_tty)
+            return -ENOTTY;
+
+        ctrl_tty->set_pgrp(pgid);
+        break;
+    }
+    // report a fixed window size; only ws_col and ws_row are written
+    case TIOCGWINSZ: {
+        auto* ws = (winsize __user*)arg3;
+        // TODO: copy_to_user
+        ws->ws_col = 80;
+        ws->ws_row = 10;
+        break;
+    }
+    // copy the controlling tty's termio settings out to the caller
+    case TCGETS: {
+        auto* argp = (struct termios __user*)arg3;
+
+        auto* ctrl_tty = current_process->control_tty;
+        if (!ctrl_tty)
+            return -EINVAL;
+
+        // TODO: use copy_to_user
+        memcpy(argp, &ctrl_tty->termio, sizeof(ctrl_tty->termio));
+
+        break;
+    }
+    // overwrite the controlling tty's termio settings from the caller
+    case TCSETS: {
+        auto* argp = (const struct termios __user*)arg3;
+
+        auto* ctrl_tty = current_process->control_tty;
+        if (!ctrl_tty)
+            return -EINVAL;
+
+        // TODO: use copy_from_user
+        memcpy(&ctrl_tty->termio, argp, sizeof(ctrl_tty->termio));
+
+        break;
+    }
+    default:
+        kmsgf("[error] the ioctl() function %x is not implemented", request);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+// Scatter read: fills the iovcnt buffers described by iov, in order, from the
+// file open at fd.
+//
+// Stops at the first read that returns 0 or fewer bytes than requested and
+// returns the number of bytes read so far; a negative result from the file's
+// read() is returned immediately. Returns -EBADF when fd has no open file.
+ssize_t kernel::syscall::do_readv(int fd, const iovec* iov, int iovcnt)
+{
+    auto* in = current_process->files[fd];
+    if (in == nullptr)
+        return -EBADF;
+
+    // TODO: fix fake EOF
+    ssize_t total = 0;
+    for (int idx = 0; idx < iovcnt; ++idx) {
+        const iovec& vec = iov[idx];
+        const ssize_t nread = in->read((char*)vec.iov_base, vec.iov_len);
+
+        if (nread < 0)
+            return nread;
+
+        total += nread;
+
+        // EOF or a short read: later buffers are left untouched
+        if (nread == 0 || (size_t)nread != vec.iov_len)
+            break;
+    }
+
+    return total;
+}
+
+// Gather write: writes the iovcnt buffers described by iov, in order, to the
+// file open at fd and returns the total reported by the individual writes.
+// A negative result from any write is returned immediately.
+// Returns -EBADF when fd has no open file.
+//
+// TODO: this operation SHOULD be atomic
+ssize_t kernel::syscall::do_writev(int fd, const iovec* iov, int iovcnt)
+{
+    auto* out = current_process->files[fd];
+    if (out == nullptr)
+        return -EBADF;
+
+    ssize_t written = 0;
+    for (int idx = 0; idx < iovcnt; ++idx) {
+        const iovec& vec = iov[idx];
+        const ssize_t n = out->write((const char*)vec.iov_base, vec.iov_len);
+
+        if (n < 0)
+            return n;
+
+        written += n;
+    }
+
+    return written;
+}
+
+// Repositions the offset of the file open at fd via the file's seek().
+// Returns -EBADF when the file table has no entry for fd.
+off_t kernel::syscall::do_lseek(int fd, off_t offset, int whence)
+{
+    if (auto* target = current_process->files[fd]; target)
+        return target->seek(offset, whence);
+
+    return -EBADF;
+}
+
+// Maps (or, for PROT_NONE, unmaps) an anonymous private region in the current
+// process's address space.
+//
+// addr must be 4 KiB aligned and len non-zero; len is rounded up to a multiple
+// of 4 KiB. On success the chosen address is returned; errors are returned as
+// negative errno values converted to uintptr_t.
+//
+// NOTE(review): when MAP_ANONYMOUS is not set (and MAP_SHARED is not set
+// either) nothing is mapped and `addr` is returned as if the call succeeded -
+// confirm whether file-backed mappings should be rejected instead.
+uintptr_t kernel::syscall::do_mmap_pgoff(uintptr_t addr, size_t len,
+        int prot, int flags, int fd, off_t pgoffset)
+{
+    if (addr & 0xfff)
+        return -EINVAL;
+    if (len == 0)
+        return -EINVAL;
+
+    // round the length up to whole 4 KiB pages
+    len = (len + 0xfff) & ~0xfff;
+
+    // TODO: shared mappings
+    if (flags & MAP_SHARED)
+        return -ENOMEM;
+
+    if (flags & MAP_ANONYMOUS) {
+        // anonymous mappings take no backing file and no offset
+        if (fd != -1)
+            return -EINVAL;
+        if (pgoffset != 0)
+            return -EINVAL;
+
+        // TODO: shared mappings
+        if (!(flags & MAP_PRIVATE))
+            return -EINVAL;
+
+        auto& mms = current_process->mms;
+
+        // do unmapping, equal to munmap, MAP_FIXED set
+        if (prot == PROT_NONE) {
+            if (int ret = mms.unmap(addr, len, true); ret != 0)
+                return ret;
+        }
+        else {
+            // TODO: add NULL check in mm_list
+            // a null or unavailable hint is replaced by an address picked by
+            // mm_list, unless the caller insisted on MAP_FIXED
+            if (!addr || !mms.is_avail(addr, len)) {
+                if (flags & MAP_FIXED)
+                    return -ENOMEM;
+                addr = mms.find_avail(addr, len);
+            }
+
+            // TODO: check current cs
+            // keep the whole mapping below the 4 GiB boundary
+            if (addr + len > 0x100000000ULL)
+                return -ENOMEM;
+
+            mem::mm_list::map_args args{};
+            args.vaddr = addr;
+            args.length = len;
+            args.flags = mem::MM_ANONYMOUS;
+
+            if (prot & PROT_WRITE)
+                args.flags |= mem::MM_WRITE;
+
+            if (prot & PROT_EXEC)
+                args.flags |= mem::MM_EXECUTE;
+
+            if (int ret = mms.mmap(args); ret != 0)
+                return ret;
+        }
+    }
+
+    return addr;
+}
+
+// Unmaps [addr, addr + len) from the current process's address space.
+// addr must be 4 KiB aligned, otherwise -EINVAL is returned; any other result
+// comes straight from mm_list's unmap().
+int kernel::syscall::do_munmap(uintptr_t addr, size_t len)
+{
+    const bool page_aligned = (addr & 0xfff) == 0;
+    if (!page_aligned)
+        return -EINVAL;
+
+    auto& mms = current_process->mms;
+    return mms.unmap(addr, len, true);
+}
+
+// Copies up to `count` bytes from in_fd to out_fd through a 4096-byte
+// intermediate buffer and returns the number of bytes written.
+//
+// Returns -EBADF when either fd has no open file, -EINVAL when in_fd is
+// neither a regular file nor a block device or when `offset` is non-null
+// (unsupported; NOT_IMPLEMENTED is also triggered in that case), and -EINTR
+// when a signal is pending before anything was transferred.
+ssize_t kernel::syscall::do_sendfile(int out_fd, int in_fd,
+        off_t __user* offset, size_t count)
+{
+    auto* out_file = current_process->files[out_fd];
+    auto* in_file = current_process->files[in_fd];
+
+    if (!out_file || !in_file)
+        return -EBADF;
+
+    // TODO: check whether in_fd supports mmapping
+    if (!S_ISREG(in_file->mode) && !S_ISBLK(in_file->mode))
+        return -EINVAL;
+
+    if (offset) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    constexpr size_t bufsize = 4096;
+    std::vector<char> buf(bufsize);
+    size_t totn = 0;
+    while (totn < count) {
+        // stop early on a pending signal; report progress if there is any
+        if (current_thread->signals.pending_signal() != 0)
+            return (totn == 0) ? -EINTR : totn;
+
+        size_t n = std::min(count - totn, bufsize);
+        ssize_t ret = in_file->read(buf.data(), n);
+        if (ret < 0)
+            return ret;
+        if (ret == 0)
+            break;
+        // NOTE(review): if write() accepts fewer bytes than were read, the
+        // remainder of this chunk is not retried - confirm this is intended.
+        ret = out_file->write(buf.data(), ret);
+        if (ret < 0)
+            return ret;
+        totn += ret;
+
+        // TODO: this won't work, since when we are in the syscall handler,
+        //       interrupts are blocked.
+        //       one solution is to put the sendfile action into a kernel
+        //       worker and pause the calling thread so that the worker
+        //       thread could be interrupted normally.
+    }
+
+    return totn;
+}
+
+// statx() restricted to paths relative to the current working directory.
+//
+// Only AT_STATX_SYNC_AS_STAT and dirfd == AT_FDCWD are supported; anything
+// else triggers NOT_IMPLEMENTED and returns -EINVAL. Symlinks are followed
+// unless AT_SYMLINK_NOFOLLOW is set. Returns -ENOENT when the path cannot be
+// opened, otherwise the result of fs::vfs_stat().
+int kernel::syscall::do_statx(int dirfd, const char __user* path,
+        int flags, unsigned int mask, statx __user* statxbuf)
+{
+    // AT_STATX_SYNC_AS_STAT is the default value
+    const bool sync_as_stat =
+        (flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_AS_STAT;
+    if (!sync_as_stat) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    if (dirfd != AT_FDCWD) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    const bool follow_symlink = !(flags & AT_SYMLINK_NOFOLLOW);
+    auto* target = fs::vfs_open(*current_process->root,
+            current_process->pwd + path, follow_symlink);
+
+    if (target == nullptr)
+        return -ENOENT;
+
+    // TODO: copy to user
+    return fs::vfs_stat(target, statxbuf, mask);
+}
+
+// Minimal fcntl(): supports F_SETFD, F_DUPFD and F_DUPFD_CLOEXEC.
+//
+// Returns -EBADF when fd has no open file. Any other command triggers
+// NOT_IMPLEMENTED and returns -EINVAL. For the supported commands the result
+// of the file table operation is returned unchanged.
+int kernel::syscall::do_fcntl(int fd, int cmd, unsigned long arg)
+{
+    auto* file = current_process->files[fd];
+    if (!file)
+        return -EBADF;
+
+    switch (cmd) {
+    case F_SETFD:
+        return current_process->files.set_flags(fd, arg);
+    case F_DUPFD:
+        // plain F_DUPFD must not mark the duplicate close-on-exec; only the
+        // _CLOEXEC variant does (third argument taken to be the new fd's flags)
+        return current_process->files.dupfd(fd, arg, 0);
+    case F_DUPFD_CLOEXEC:
+        return current_process->files.dupfd(fd, arg, FD_CLOEXEC);
+    default:
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+}
+
+// Creates the directory `pathname` (resolved against the process's pwd).
+//
+// `mode` is limited to the permission bits 0777 with the process umask
+// removed. Returns -EEXIST when the path already resolves, -ENOENT when the
+// parent cannot be opened, -ENOTDIR when the parent is not a directory, and
+// otherwise the result of fs::vfs_mkdir().
+int kernel::syscall::do_mkdir(const char __user* pathname, mode_t mode)
+{
+    mode &= (~current_process->umask & 0777);
+
+    auto path = current_process->pwd + pathname;
+
+    if (fs::vfs_open(*current_process->root, path))
+        return -EEXIST;
+
+    // split into parent path and final component
+    auto dirname = path.last_name();
+    path.remove_last();
+
+    auto* parent = fs::vfs_open(*current_process->root, path);
+    if (parent == nullptr)
+        return -ENOENT;
+
+    if (!S_ISDIR(parent->ind->mode))
+        return -ENOTDIR;
+
+    return fs::vfs_mkdir(parent, dirname.c_str(), mode);
+}
+
+// Truncates the file at `pathname` (resolved against the process's pwd) to
+// `length` bytes via fs::vfs_truncate().
+// Returns -ENOENT when the path cannot be opened and -EISDIR for directories.
+int kernel::syscall::do_truncate(const char __user* pathname, long length)
+{
+    auto* target = fs::vfs_open(*current_process->root,
+            current_process->pwd + pathname);
+
+    if (target == nullptr)
+        return -ENOENT;
+
+    if (S_ISDIR(target->ind->mode))
+        return -EISDIR;
+
+    return fs::vfs_truncate(target->ind, length);
+}
+
+// Removes the file at `pathname` (resolved against the process's pwd). The
+// path is opened with the third vfs_open argument set to false, as in the
+// other no-follow lookups in this file.
+// Returns -ENOENT when the path cannot be opened and -EISDIR for directories;
+// otherwise the result of fs::vfs_rmfile() on the entry's parent.
+int kernel::syscall::do_unlink(const char __user* pathname)
+{
+    auto* victim = fs::vfs_open(*current_process->root,
+            current_process->pwd + pathname, false);
+
+    if (victim == nullptr)
+        return -ENOENT;
+
+    if (S_ISDIR(victim->ind->mode))
+        return -EISDIR;
+
+    return fs::vfs_rmfile(victim->parent, victim->name.c_str());
+}
+
+// Checks whether `pathname` (resolved against the process's pwd) exists.
+//
+// `mode` is either F_OK or a bitwise OR of R_OK, W_OK and X_OK. Permission
+// bits are not evaluated yet, so every valid mode succeeds once the path
+// resolves. Returns -ENOENT when the path cannot be opened and -EINVAL when
+// `mode` carries bits outside R_OK | W_OK | X_OK.
+int kernel::syscall::do_access(const char __user* pathname, int mode)
+{
+    auto path = current_process->pwd + pathname;
+    auto* dent = fs::vfs_open(*current_process->root, path);
+
+    if (!dent)
+        return -ENOENT;
+
+    // mode is a bitmask: combinations such as R_OK | W_OK are valid requests
+    // and must not be rejected the way a switch over single values would
+    if (mode != F_OK && (mode & ~(R_OK | W_OK | X_OK)) != 0)
+        return -EINVAL;
+
+    // TODO: check privilege
+    return 0;
+}
+
+// Creates a filesystem node at `pathname` (resolved against the process's
+// pwd) through fs::vfs_mknode().
+// Returns -EEXIST when the path already resolves and -ENOENT when the parent
+// path cannot be opened.
+int kernel::syscall::do_mknod(const char __user* pathname, mode_t mode, dev_t dev)
+{
+    auto path = current_process->pwd + pathname;
+
+    if (fs::vfs_open(*current_process->root, path))
+        return -EEXIST;
+
+    // split into parent path and final component
+    auto filename = path.last_name();
+    path.remove_last();
+
+    auto* parent = fs::vfs_open(*current_process->root, path);
+    if (parent == nullptr)
+        return -ENOENT;
+
+    return fs::vfs_mknode(parent, filename.c_str(), mode, dev);
+}
+
+// Very limited poll(): at most one pollfd is supported, and regardless of
+// which fd it names the console tty is the object that gets polled.
+//
+// Returns 0 for nfds == 0, -EINVAL (after NOT_IMPLEMENTED) for nfds > 1, a
+// negative value propagated from tty::console->poll(), or that call's
+// non-negative result after setting fds[0].revents to POLLIN.
+// `timeout` is currently ignored.
+int kernel::syscall::do_poll(pollfd __user* fds, nfds_t nfds, int timeout)
+{
+    if (nfds == 0)
+        return 0;
+
+    if (nfds > 1) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    // TODO: handle timeout
+    // if (timeout != -1) {
+    // }
+    (void)timeout;
+
+    // for now, we will poll from console only
+    int ret = tty::console->poll();
+    if (ret < 0)
+        return ret;
+
+    fds[0].revents = POLLIN;
+    return ret;
+
+    // everything below is an unreachable sketch of the intended general
+    // implementation, kept as commented-out notes
+    // TODO: check address validity
+    // TODO: poll multiple fds and other type of files
+    // for (nfds_t i = 0; i < nfds; ++i) {
+    //     auto& pfd = fds[i];
+
+    //     auto* file = current_process->files[pfd.fd];
+    //     if (!file || !S_ISCHR(file->mode))
+    //         return -EINVAL;
+
+    //     // poll the fds
+    // }
+    //
+    // return 0;
+}
+
+/* TODO: implement vfs_stat(stat*)
+int do_stat(const char __user* pathname, stat __user* buf)
+{
+    auto* dent = fs::vfs_open(*current_process->root,
+        types::make_path(pathname, current_process->pwd));
+
+    if (!dent)
+        return -ENOENT;
+
+    return fs::vfs_stat(dent, buf);
+}
+*/
+
+/* TODO: implement vfs_stat(stat*)
+int do_fstat(int fd, stat __user* buf)
+{
+    auto* file = current_process->files[fd];
+    if (!file)
+        return -EBADF;
+
+    return fs::vfs_stat(file, buf);
+}
+*/

+ 51 - 0
src/kernel/syscall/infoops.cc

@@ -0,0 +1,51 @@
+#include <bits/alltypes.h>
+#include <time.h>
+
+#include <kernel/hw/timer.hpp>
+#include <kernel/log.hpp>
+#include <kernel/process.hpp>
+#include <kernel/syscall.hpp>
+
+#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
+
+// Logs the source location of an unimplemented code path together with the
+// pid of the current process, then sends SIGSYS to the current thread.
+static inline void not_implemented(const char* pos, int line)
+{
+    const auto pid = current_process->pid;
+
+    kmsgf("[kernel] the function at %s:%d is not implemented, killing the pid%d...",
+            pos, line, pid);
+    current_thread->send_signal(SIGSYS);
+}
+
+// Reports the current timer tick count as a timespec. The conversion treats
+// 100 ticks as one second, so each tick contributes 10000000 ns.
+//
+// Only CLOCK_REALTIME and CLOCK_MONOTONIC are accepted (both read the same
+// tick counter); other clocks trigger NOT_IMPLEMENTED and return -EINVAL.
+// Returns -EFAULT when tp is null.
+int kernel::syscall::do_clock_gettime(clockid_t clk_id, timespec __user* tp)
+{
+    const bool supported =
+        clk_id == CLOCK_REALTIME || clk_id == CLOCK_MONOTONIC;
+    if (!supported) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    if (tp == nullptr)
+        return -EFAULT;
+
+    const auto ticks = hw::timer::current_ticks();
+    const auto whole_seconds = ticks / 100;
+    const auto leftover_ticks = ticks % 100;
+
+    // TODO: copy_to_user
+    tp->tv_sec = whole_seconds;
+    tp->tv_nsec = 10000000 * leftover_ticks;
+
+    return 0;
+}
+
+// Fills *tv from the timer tick counter. The conversion treats 100 ticks as
+// one second, i.e. one tick as 10 ms.
+//
+// A non-null tz is rejected with -EINVAL; a null tv is accepted and simply
+// not written. Returns 0 otherwise.
+int kernel::syscall::do_gettimeofday(timeval __user* tv, void __user* tz)
+{
+    // TODO: return time of the day, not time from this boot
+    if (tz) [[unlikely]]
+        return -EINVAL;
+
+    if (tv) {
+        // TODO: use copy_to_user
+        auto ticks = kernel::hw::timer::current_ticks();
+        tv->tv_sec = ticks / 100;
+        // only the sub-second remainder belongs in tv_usec, which has to stay
+        // below 1000000; whole seconds are already accounted for in tv_sec
+        tv->tv_usec = (ticks % 100) * 10 * 1000;
+    }
+
+    return 0;
+}

+ 6 - 7
src/kernel/syscall/mount.cc

@@ -6,14 +6,13 @@
 #include <kernel/syscall.hpp>
 #include <kernel/vfs.hpp>
 
-int _syscall_mount(interrupt_stack* data)
+int kernel::syscall::do_mount(
+        const char __user* source,
+        const char __user* target,
+        const char __user* fstype,
+        unsigned long flags,
+        const void __user* _fsdata)
 {
-    SYSCALL_ARG1(const char __user*, source);
-    SYSCALL_ARG2(const char __user*, target);
-    SYSCALL_ARG3(const char __user*, fstype);
-    SYSCALL_ARG4(unsigned long, flags);
-    SYSCALL_ARG5(const void __user*, _fsdata);
-
     if (!fstype)
         return -EINVAL;
 

+ 391 - 0
src/kernel/syscall/procops.cc

@@ -0,0 +1,391 @@
+#include <string>
+#include <vector>
+
+#include <sys/prctl.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+
+#include <types/elf.hpp>
+
+#include <kernel/log.hpp>
+#include <kernel/process.hpp>
+#include <kernel/signal.hpp>
+#include <kernel/syscall.hpp>
+#include <kernel/utsname.hpp>
+
+using namespace kernel::syscall;
+
+#define NOT_IMPLEMENTED not_implemented(__FILE__, __LINE__)
+
+// Logs the source location of an unimplemented code path together with the
+// pid of the current process, then sends SIGSYS to the current thread.
+static inline void not_implemented(const char* pos, int line)
+{
+    const auto pid = current_process->pid;
+
+    kmsgf("[kernel] the function at %s:%d is not implemented, killing the pid%d...",
+            pos, line, pid);
+    current_thread->send_signal(SIGSYS);
+}
+
+// Changes the working directory of the current process to `path` (resolved
+// against the present pwd). On success pwd is cleared and rebuilt from the
+// opened entry relative to the process root.
+// Returns -ENOENT when the path cannot be opened and -ENOTDIR when it is not
+// a directory.
+int kernel::syscall::do_chdir(const char __user* path)
+{
+    auto& proc = *current_process;
+
+    auto* target = fs::vfs_open(*proc.root, proc.pwd + path);
+    if (target == nullptr)
+        return -ENOENT;
+
+    if (!S_ISDIR(target->ind->mode))
+        return -ENOTDIR;
+
+    proc.pwd.clear();
+    target->path(*proc.root, proc.pwd);
+
+    return 0;
+}
+
+// Loads the executable `exec` (resolved against the process's pwd) into the
+// current process using the 32-bit ELF loader.
+//
+// Returns { ip, sp, 0 } as filled in by the loader on success, or
+// { 0, 0, -errno } on failure (-ENOENT when the file cannot be opened,
+// otherwise the loader's error code).
+//
+// NOTE(review): files.onexec() runs before elf32_load(), so its effects are
+// not undone if loading fails - confirm this ordering is intended.
+execve_retval kernel::syscall::do_execve(
+        const std::string& exec,
+        const std::vector<std::string>& args,
+        const std::vector<std::string>& envs)
+{
+    types::elf::elf32_load_data d{
+        .exec_dent{},
+        .argv{args},
+        .envp{envs},
+        .ip{}, .sp{},
+    };
+
+    d.exec_dent = fs::vfs_open(*current_process->root,
+            current_process->pwd + exec.c_str());
+
+    if (!d.exec_dent)
+        return { 0, 0, -ENOENT };
+
+    current_process->files.onexec();
+
+    // TODO: set cs and ss to compatibility mode
+    if (int ret = types::elf::elf32_load(d); ret != 0)
+        return { 0, 0, ret };
+
+    current_thread->signals.on_exec();
+
+    return { d.ip, d.sp, 0 };
+}
+
+
+
+// Terminates the current process with exit status `status`.
+//
+// Only single-threaded processes are supported (asserted). The low 8 bits of
+// `status` are shifted into bits 8..15 of the code handed to procs->kill().
+// Control then passes to schedule_noreturn(), so this function has no return
+// statement.
+int kernel::syscall::do_exit(int status)
+{
+    // TODO: terminating a thread only
+    assert(current_process->thds.size() == 1);
+
+    // terminating a whole process:
+    procs->kill(current_process->pid, (status & 0xff) << 8);
+
+    // switch to new process and continue
+    schedule_noreturn();
+}
+
+// Waits for any child of the current process to change state.
+//
+// Only waitpid == -1 and the options WNOHANG / WUNTRACED are supported; other
+// values return -EINVAL (unknown option bits also trigger NOT_IMPLEMENTED).
+// Returns the pid of the reaped entry after storing its code in *arg1,
+// 0 for WNOHANG with nothing queued, -ECHILD when there are no children and
+// nothing queued, or -EINTR when the wait is interrupted.
+//
+// NOTE(review): if every queued entry is a stopped child and WUNTRACED is not
+// set, the scan below skips all of them and execution reaches freeze() -
+// confirm whether that state can occur.
+int kernel::syscall::do_waitpid(pid_t waitpid, int __user* arg1, int options)
+{
+    if (waitpid != -1)
+        return -EINVAL;
+
+    auto& cv = current_process->waitlist;
+    // held for the rest of the function; cv.wait() below is given the same mutex
+    kernel::async::lock_guard lck(current_process->mtx_waitprocs);
+
+    auto& waitlist = current_process->waitprocs;
+
+    // TODO: check if it is waiting for stopped process
+    if (options & ~(WNOHANG | WUNTRACED)) {
+        NOT_IMPLEMENTED;
+        return -EINVAL;
+    }
+
+    // block until at least one entry is queued
+    while (waitlist.empty()) {
+        if (current_process->children.empty())
+            return -ECHILD;
+
+        if (options & WNOHANG)
+            return 0;
+
+        bool interrupted = cv.wait(current_process->mtx_waitprocs);
+        if (interrupted)
+            return -EINTR;
+    }
+
+    for (auto iter = waitlist.begin(); iter != waitlist.end(); ++iter) {
+        // stopped children are only reported when WUNTRACED was requested
+        if (WIFSTOPPED(iter->code) && !(options & WUNTRACED))
+            continue;
+
+        // copy the pid out before the entry is erased
+        pid_t pid = iter->pid;
+
+        // TODO: copy_to_user
+        *arg1 = iter->code;
+
+        procs->remove(pid);
+        waitlist.erase(iter);
+
+        return pid;
+    }
+
+    // we should never reach here
+    freeze();
+    return -EINVAL;
+}
+
+// Copies the current working directory's full path into buf, truncating it to
+// buf_size - 1 characters, and always NUL-terminates the result.
+//
+// Returns buf on success, or a null pointer when buf is null or buf_size is 0
+// (nothing is written in that case).
+char __user* kernel::syscall::do_getcwd(char __user* buf, size_t buf_size)
+{
+    // an empty buffer has no room even for the terminator, and
+    // buf[buf_size - 1] below would wrap around to a wild address
+    if (!buf || buf_size == 0)
+        return nullptr;
+
+    // TODO: use copy_to_user
+    auto path = current_process->pwd.full_path();
+    strncpy(buf, path.c_str(), buf_size);
+    // strncpy does not terminate when the source fills the whole buffer
+    buf[buf_size - 1] = 0;
+
+    return buf;
+}
+
+// Makes the current process the leader of a new session and process group,
+// both identified by its own pid, and attaches the console as its controlling
+// tty. Returns the new session id (the pid).
+// Fails with -EPERM when the process already leads its process group.
+pid_t kernel::syscall::do_setsid()
+{
+    auto& proc = *current_process;
+
+    if (proc.pid == proc.pgid)
+        return -EPERM;
+
+    proc.sid = proc.pid;
+    proc.pgid = proc.pid;
+
+    // TODO: get tty* from fd or block device id
+    tty::console->set_pgrp(proc.pid);
+    proc.control_tty = tty::console;
+
+    return proc.pid;
+}
+
+// Returns the session id of process `pid`.
+// Fails with -ESRCH when no such process exists and with -EPERM when the
+// process belongs to a different session than the caller.
+pid_t kernel::syscall::do_getsid(pid_t pid)
+{
+    auto [ target, exists ] = procs->try_find(pid);
+
+    if (!exists)
+        return -ESRCH;
+
+    const bool same_session = target->sid == current_process->sid;
+    if (!same_session)
+        return -EPERM;
+
+    return target->sid;
+}
+
+// Sets the process group of process `pid` to `pgid`.
+//
+// pid == 0 means the calling process; pgid == 0 means "use the (resolved)
+// pid as the group id". Returns -EINVAL for a negative pgid and -ESRCH when
+// the process does not exist.
+int kernel::syscall::do_setpgid(pid_t pid, pid_t pgid)
+{
+    if (pgid < 0)
+        return -EINVAL;
+
+    const pid_t target_pid = (pid == 0) ? current_process->pid : pid;
+    const pid_t target_pgid = (pgid == 0) ? target_pid : pgid;
+
+    auto [ target, exists ] = procs->try_find(target_pid);
+    if (!exists)
+        return -ESRCH;
+
+    // TODO: check whether pgid and the original
+    //       pgid is in the same session
+
+    target->pgid = target_pgid;
+
+    return 0;
+}
+
+// Installs the thread-local storage descriptor described by `ptr` for the
+// current thread and, when that succeeds, loads it right away.
+// A non-zero result from thread::set_thread_area() is returned unchanged.
+int kernel::syscall::do_set_thread_area(kernel::user::user_desc __user* ptr)
+{
+    if (auto err = current_thread->set_thread_area(ptr); err != 0)
+        return err;
+
+    current_thread->load_thread_area32();
+    return 0;
+}
+
+// Records `tidptr` in the current thread's set_child_tid field and returns
+// the thread's id. The pointer is stored as-is; nothing is written through it
+// here.
+pid_t kernel::syscall::do_set_tid_address(int __user* tidptr)
+{
+    // TODO: copy_from_user
+    current_thread->set_child_tid = tidptr;
+    return current_thread->tid();
+}
+
+// Supports PR_SET_NAME and PR_GET_NAME for the current thread's name; every
+// other option returns -EINVAL. `arg2` is reinterpreted as a user pointer to
+// the name buffer and accessed directly.
+int kernel::syscall::do_prctl(int option, uintptr_t arg2)
+{
+    switch (option) {
+    case PR_SET_NAME: {
+        // TODO: copy_from_user
+        auto* name = (const char __user*)arg2;
+        // NOTE(review): if `name` is a std::string-like type, assign(ptr, 15)
+        // copies exactly 15 bytes even when the user string is shorter -
+        // confirm whether the copy should stop at the first NUL.
+        current_thread->name.assign(name, 15);
+        break;
+    }
+    case PR_GET_NAME: {
+        auto* name = (char __user*)arg2;
+        // TODO: copy_to_user
+        strncpy(name, current_thread->name.c_str(), 16);
+        // force termination within the 16-byte window
+        name[15] = 0;
+        break;
+    }
+    default:
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+// Currently behaves exactly like do_prctl(): PR_SET_NAME / PR_GET_NAME act on
+// the current thread's name and every other option yields -EINVAL.
+//
+// NOTE(review): the options handled are the PR_* ones rather than any
+// architecture-specific codes - confirm whether this is a placeholder.
+int kernel::syscall::do_arch_prctl(int option, uintptr_t arg2)
+{
+    return do_prctl(option, arg2);
+}
+
+// Replaces the current process's umask with `mask` and returns the previous
+// value. `mask` is stored without any masking of its bits.
+int kernel::syscall::do_umask(mode_t mask)
+{
+    mode_t old = current_process->umask;
+    current_process->umask = mask;
+
+    return old;
+}
+
+// Sends signal `sig` to process `pid`.
+//
+// Returns -ESRCH when the process does not exist and -EINVAL when the signal
+// number fails signal_list::check_valid(). Signals aimed at processes that
+// report is_system() are dropped and the call still returns 0.
+int kernel::syscall::do_kill(pid_t pid, int sig)
+{
+    auto [ target, exists ] = procs->try_find(pid);
+
+    if (!exists)
+        return -ESRCH;
+
+    if (!kernel::signal_list::check_valid(sig))
+        return -EINVAL;
+
+    // TODO: check permission
+    if (!target->is_system())
+        procs->send_signal(pid, sig);
+
+    return 0;
+}
+
+// Reads and/or updates the signal mask of the current thread.
+//
+// When `oldset` is non-null it receives the mask in effect before the call.
+// When `set` is non-null the mask is changed according to `how`
+// (SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK).
+// Returns -EINVAL when sigsetsize differs from sizeof(sigmask_type) or when
+// `set` is given together with an unknown `how`; 0 otherwise.
+int kernel::syscall::do_rt_sigprocmask(int how, const sigmask_type __user* set,
+        sigmask_type __user* oldset, size_t sigsetsize)
+{
+    if (sigsetsize != sizeof(sigmask_type))
+        return -EINVAL;
+
+    sigmask_type sigs = current_thread->signals.get_mask();
+
+    // TODO: use copy_to_user
+    if (oldset)
+        memcpy(oldset, &sigs, sizeof(sigmask_type));
+
+    // a null `set` is a pure query; `how` is not looked at
+    if (!set)
+        return 0;
+
+    // TODO: use copy_from_user
+    switch (how) {
+    case SIG_BLOCK:
+        current_thread->signals.mask(*set);
+        break;
+    case SIG_UNBLOCK:
+        current_thread->signals.unmask(*set);
+        break;
+    case SIG_SETMASK:
+        current_thread->signals.set_mask(*set);
+        break;
+    default:
+        // an unrecognized operation must be reported instead of silently
+        // leaving the mask untouched and claiming success
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+// Queries and/or replaces the handler for signal `signum` on the current
+// thread.
+//
+// When `oldact` is non-null it receives the handler currently installed;
+// when `act` is non-null it becomes the new handler. Returns -EINVAL when
+// sigsetsize differs from sizeof(sigmask_type), when the signal number is
+// invalid, or when it is SIGKILL or SIGSTOP.
+int kernel::syscall::do_rt_sigaction(int signum, const sigaction __user* act,
+        sigaction __user* oldact, size_t sigsetsize)
+{
+    if (sigsetsize != sizeof(sigmask_type))
+        return -EINVAL;
+
+    if (!kernel::signal_list::check_valid(signum))
+        return -EINVAL;
+
+    // these two can never have their disposition changed
+    if (signum == SIGKILL || signum == SIGSTOP)
+        return -EINVAL;
+
+    auto& signals = current_thread->signals;
+
+    // TODO: use copy_to_user
+    if (oldact)
+        signals.get_handler(signum, *oldact);
+
+    // TODO: use copy_from_user
+    if (act)
+        signals.set_handler(signum, *act);
+
+    return 0;
+}
+
+// Copies the kernel's utsname record (sys_utsname) into *buf.
+// Returns -EFAULT when buf is null, 0 otherwise. The copy is a plain memcpy
+// of sizeof(new_utsname) bytes into the user pointer.
+int kernel::syscall::do_newuname(new_utsname __user* buf)
+{
+    if (!buf)
+        return -EFAULT;
+
+    // TODO: use copy_to_user
+    memcpy(buf, sys_utsname, sizeof(new_utsname));
+
+    return 0;
+}
+
+// Returns the process group id of process `pid`; pid == 0 refers to the
+// calling process. Fails with -ESRCH when no such process exists.
+pid_t kernel::syscall::do_getpgid(pid_t pid)
+{
+    const bool self = (pid == 0);
+    if (self)
+        return current_process->pgid;
+
+    auto [ target, exists ] = procs->try_find(pid);
+    if (exists)
+        return target->pgid;
+
+    return -ESRCH;
+}
+
+// Returns the pid of the current process.
+pid_t kernel::syscall::do_getpid()
+{
+    return current_process->pid;
+}
+
+// Returns the parent pid recorded in the current process.
+pid_t kernel::syscall::do_getppid()
+{
+    return current_process->ppid;
+}
+
+// Returns the real user id; hard-coded to 0 because user accounts are not
+// distinguished yet.
+uid_t kernel::syscall::do_getuid()
+{
+    return 0; // all users are root for now
+}
+
+// Returns the effective user id; hard-coded to 0 because user accounts are
+// not distinguished yet.
+uid_t kernel::syscall::do_geteuid()
+{
+    return 0; // all users are root for now
+}
+
+// Returns the real group id; hard-coded to 0 because groups are not
+// distinguished yet.
+gid_t kernel::syscall::do_getgid()
+{
+    return 0; // all users are root for now
+}
+
+// Returns the id of the current thread as reported by thread::tid().
+pid_t kernel::syscall::do_gettid()
+{
+    return current_thread->tid();
+}
+
+// Forwards the requested program break to the current process's mm_list and
+// returns whatever set_brk() reports.
+uintptr_t kernel::syscall::do_brk(uintptr_t addr)
+{
+    return current_process->mms.set_brk(addr);
+}

+ 17 - 0
src/kernel/task/readyqueue.cc

@@ -28,11 +28,28 @@ void dispatcher::dequeue(thread* thd)
+// Picks the thread to run next and bumps its elected_times counter.
+//
+// With one queued thread that thread is returned. With two, the queue is not
+// rotated: the back thread is returned unless its owner is pid 0, in which
+// case the front thread is chosen. With more, the front thread is moved to
+// the back and returned (round robin).
+// NOTE(review): owner 0 is presumably the idle/kernel process - confirm.
+// NOTE(review): back()/front() are called unconditionally, so the queue is
+// assumed to be non-empty here - confirm against callers.
 thread* dispatcher::next()
 {
     lock_guard_irq lck(dispatcher_mtx);
+    auto back = dispatcher_thds.back();
+
+    if (dispatcher_thds.size() == 1) {
+        back->elected_times++;
+        return back;
+    }
+
+    if (dispatcher_thds.size() == 2) {
+        if (back->owner == 0) {
+            auto front = dispatcher_thds.front();
+            front->elected_times++;
+            return front;
+        }
+        back->elected_times++;
+        return back;
+    }
 
     auto* retval = dispatcher_thds.front();
 
     dispatcher_thds.pop_front();
     dispatcher_thds.push_back(retval);
 
+    retval->elected_times++;
     return retval;
 }

+ 78 - 60
src/kernel/task/thread.cc

@@ -1,28 +1,42 @@
-#include <kernel/task/thread.hpp>
-
 #include <queue>
 
+#include <stdint.h>
+
+#include <types/types.h>
+
+#include <kernel/async/lock.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mm.hpp>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
 #include <kernel/signal.hpp>
-#include <kernel/async/lock.hpp>
 #include <kernel/task/readyqueue.hpp>
+#include <kernel/task/thread.hpp>
+
+constexpr std::size_t KERNEL_STACK_ORDER = 3; // 2^3 * 4096 = 32KB
 
 using namespace kernel::task;
+using namespace kernel::mem;
+using namespace kernel::mem::paging;
+
+// In-memory image of the 64-bit task state segment. The field order follows
+// the x86-64 TSS format (reserved dword, RSP0-2, reserved qword, IST1-7,
+// reserved); the fields add up to 4 + 24 + 8 + 56 + 8 + 4 = 104 bytes,
+// assuming PACKED removes all padding.
+// NOTE(review): the final dword overlays what the architecture defines as a
+// reserved word plus the I/O map base -- confirm nothing needs the latter.
+struct PACKED tss64_t {
+    uint32_t _reserved1;
+    uint64_t rsp[3];
+    uint64_t _reserved2;
+    uint64_t ist[7];
+    uint64_t _reserved3;
+    uint32_t _reserved4;
+};
+// The TSS instance is addressed through physical address 0x70.
+constexpr physaddr<tss64_t> tss{0x00000070};
 
 thread::thread(std::string name, pid_t owner)
-    : owner { owner }, attr { READY | SYSTEM }, name { name }
-{
-}
+    : owner { owner }, attr { READY | SYSTEM }, name { name } { }
 
 thread::thread(const thread& val, pid_t owner)
-    : owner { owner }, attr { val.attr }, name { val.name }
-{
-}
+    : owner { owner }, attr { val.attr }, name { val.name }, tls_desc32{val.tls_desc32} { }
 
 tid_t thread::tid() const
 {
-    return (tid_t)kstack.stack_base;
+    return (tid_t)kstack.pfn;
 }
 
 bool thread::operator<(const thread& rhs) const
@@ -35,56 +49,55 @@ bool thread::operator==(const thread& rhs) const
     return tid() == rhs.tid();
 }
 
-static std::priority_queue<std::byte*> s_kstacks;
-static kernel::async::mutex s_mtx_kstacks;
+// Address just past the end of the kernel stack block that starts at `pfn`:
+// `pfn` plus 2^KERNEL_STACK_ORDER pages, run through physaddr<void>'s pointer
+// conversion. This is the value a fresh stack pointer starts from.
+static inline uintptr_t __stack_bottom(pfn_t pfn)
+{
+    constexpr auto stack_bytes = (1 << KERNEL_STACK_ORDER) * 0x1000;
+    auto* bottom = (void*)kernel::mem::physaddr<void>{pfn + stack_bytes};
+
+    return (uintptr_t)bottom;
+}
 
 thread::kernel_stack::kernel_stack()
 {
-    static int allocated;
-    kernel::async::lock_guard_irq lck(s_mtx_kstacks);
-
-    if (!s_kstacks.empty()) {
-        stack_base = s_kstacks.top();
-        esp = (uint32_t*)stack_base;
-        s_kstacks.pop();
-        return;
-    }
-
-    // kernel stack pt is at page#0x00005
-    kernel::paccess pa(0x00005);
-    auto pt = (pt_t)pa.ptr();
-    assert(pt);
-
-    int cnt = THREAD_KERNEL_STACK_SIZE / PAGE_SIZE;
-    pte_t* pte = *pt + allocated * cnt;
-
-    for (int i = 0; i < cnt; ++i) {
-        pte[i].v = 0x3;
-        pte[i].in.page = __alloc_raw_page();
-    }
-
-    stack_base = (std::byte*)(0xffc00000 + THREAD_KERNEL_STACK_SIZE * (allocated + 1));
-    esp = (uint32_t*)stack_base;
-
-    ++allocated;
+    pfn = page_to_pfn(alloc_pages(KERNEL_STACK_ORDER));
+    sp = __stack_bottom(pfn);
 }
 
 thread::kernel_stack::kernel_stack(const kernel_stack& other)
     : kernel_stack()
 {
-    auto offset = vptrdiff(other.stack_base, other.esp);
-    esp = (uint32_t*)(stack_base - offset);
-    memcpy(esp, other.esp, offset);
+    auto offset = __stack_bottom(other.pfn) - other.sp;
+
+    sp -= offset;
+    memcpy((void*)sp, (void*)other.sp, offset);
 }
 
 thread::kernel_stack::kernel_stack(kernel_stack&& other)
-    : stack_base(std::exchange(other.stack_base, nullptr))
-    , esp(std::exchange(other.esp, nullptr)) { }
+    : pfn(std::exchange(other.pfn, 0))
+    , sp(std::exchange(other.sp, 0)) { }
 
 thread::kernel_stack::~kernel_stack()
 {
-    kernel::async::lock_guard_irq lck(s_mtx_kstacks);
-    s_kstacks.push(stack_base);
+    if (!pfn)
+        return;
+    free_pages(pfn, KERNEL_STACK_ORDER);
+}
+
+// Push a 64-bit value: lower `sp` by 8 bytes, store `val` at the new `sp`
+// and hand `val` back to the caller.
+uint64_t thread::kernel_stack::pushq(uint64_t val)
+{
+    sp -= sizeof(uint64_t);
+
+    auto* slot = (uint64_t*)sp;
+    *slot = val;
+
+    return val;
+}
+
+// Push a 32-bit value: lower `sp` by 4 bytes, store `val` at the new `sp`
+// and hand `val` back to the caller.
+uint32_t thread::kernel_stack::pushl(uint32_t val)
+{
+    sp -= sizeof(uint32_t);
+
+    auto* slot = (uint32_t*)sp;
+    *slot = val;
+
+    return val;
+}
+
+// Publish this stack's current `sp` in slot 0 of the TSS stack-pointer array
+// (`tss->rsp[0]`), i.e. the stack the CPU switches to when entering ring 0.
+void thread::kernel_stack::load_interrupt_stack() const
+{
+    tss->rsp[0] = sp;
+}
 
 void thread::set_attr(thd_attr_t new_attr)
@@ -142,7 +155,8 @@ void thread::send_signal(signal_list::signo_type signal)
 int thread::set_thread_area(kernel::user::user_desc* ptr)
 {
     if (ptr->read_exec_only && ptr->seg_not_present) {
-        void* dst = (void*)ptr->base_addr;
+        // TODO: use copy_to_user
+        auto* dst = (void*)(uintptr_t)ptr->base_addr;
         std::size_t len = ptr->limit;
         if (len > 0 && dst)
             memset(dst, 0x00, len);
@@ -150,25 +164,29 @@ int thread::set_thread_area(kernel::user::user_desc* ptr)
     }
 
     if (ptr->entry_number == -1U)
-        ptr->entry_number = 6;
+        ptr->entry_number = 7;
     else
         return -1;
 
-    tls_desc.limit_low = ptr->limit & 0xFFFF;
-    tls_desc.base_low = ptr->base_addr & 0xFFFF;
-    tls_desc.base_mid = (ptr->base_addr >> 16) & 0xFF;
-    tls_desc.access = SD_TYPE_DATA_USER;
-    tls_desc.limit_high = (ptr->limit >> 16) & 0xF;
-    tls_desc.flags = (ptr->limit_in_pages << 3) | (ptr->seg_32bit << 2);
-    tls_desc.base_high = (ptr->base_addr >> 24) & 0xFF;
+    if (!ptr->seg_32bit)
+        return -1;
+
+    if ((ptr->limit & 0xffff) != 0xffff) {
+        asm volatile("nop": : : "memory");
+    }
+
+    tls_desc32  = ptr->limit & 0x0'ffff;
+    tls_desc32 |= (ptr->base_addr & 0x00'ffffffULL) << 16;
+    tls_desc32 |= 0x4'0'f2'000000'0000;
+    tls_desc32 |= (ptr->limit & 0xf'0000ULL) << (48-16);
+    tls_desc32 |= ((ptr->limit_in_pages + 0ULL) << 55);
+    tls_desc32 |= (ptr->base_addr & 0xff'000000ULL) << (56-24);
 
     return 0;
 }
 
-int thread::load_thread_area() const
+int thread::load_thread_area32() const
 {
-    if (tls_desc.flags == 0)
-        return -1;
-    kernel::user::load_thread_area(tls_desc);
+    kernel::user::load_thread_area32(tls_desc32);
     return 0;
 }

+ 14 - 17
src/kernel/tty.cpp

@@ -5,7 +5,6 @@
 #include <termios.h>
 
 #include <kernel/async/lock.hpp>
-#include <kernel/hw/serial.h>
 #include <kernel/process.hpp>
 #include <kernel/tty.hpp>
 #include <kernel/vga.hpp>
@@ -20,7 +19,9 @@
 
 #define TERMIOS_TESTCC(c, termios, cc) ((c != 0xff) && (c == ((termios).c_cc[cc])))
 
-tty::tty()
+using namespace kernel::tty;
+
+tty::tty(std::string name)
     : termio {
         .c_iflag = ICRNL | IXOFF,
         .c_oflag = OPOST | ONLCR,
@@ -32,6 +33,7 @@ tty::tty()
         .c_ispeed = 38400,
         .c_ospeed = 38400,
     }
+    , name{name}
     , buf(BUFFER_SIZE)
     , fg_pgroup { 0 }
 {
@@ -280,21 +282,7 @@ void tty::show_char(int c)
     this->putchar(c);
 }
 
-vga_tty::vga_tty()
-{
-    snprintf(this->name, sizeof(this->name), "ttyVGA");
-}
-
-serial_tty::serial_tty(int id)
-    : id(id)
-{
-    snprintf(this->name, sizeof(this->name), "ttyS%x", (int)id);
-}
-
-void serial_tty::putchar(char c)
-{
-    serial_send_data(id, c);
-}
+vga_tty::vga_tty(): tty{"ttyVGA"} { }
 
 void vga_tty::putchar(char c)
 {
@@ -307,3 +295,12 @@ void tty::clear_read_buf(void)
 {
     this->buf.clear();
 }
+
+// Register a tty device. The device is installed as `console` only when no
+// console is set yet; otherwise the existing console is kept. Always yields 0.
+int kernel::tty::register_tty(tty* tty_dev)
+{
+    // TODO: manage all ttys
+    if (console)
+        return 0;
+
+    console = tty_dev;
+    return 0;
+}

+ 14 - 13
src/kernel/user/thread_local.cc

@@ -1,22 +1,23 @@
-#include <kernel/process.hpp>
-#include <kernel/mem.h>
-#include <kernel/user/thread_local.hpp>
-
-#include <string.h>
 #include <cstddef>
 
-namespace kernel::user {
+#include <stdint.h>
+
+#include <kernel/mem/phys.hpp>
+#include <kernel/mem/types.hpp>
+#include <kernel/user/thread_local.hpp>
+
+using namespace kernel::user;
 
-void load_thread_area(const segment_descriptor& desc)
+void kernel::user::load_thread_area32(uint64_t desc)
 {
-    gdt[6] = desc;
+    if (!desc)
+        return;
+
+    kernel::mem::gdt[7] = desc;
+
     asm volatile(
         "mov %%gs, %%ax\n\t"
         "mov %%ax, %%gs\n\t"
-        :
-        :
-        : "ax"
+        : : : "ax"
     );
 }
-
-} // namespace kernel::user

+ 11 - 11
src/kernel/vfs.cpp

@@ -15,10 +15,8 @@
 
 #include <types/allocator.hpp>
 #include <types/path.hpp>
-#include <types/status.h>
 
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
 #include <kernel/process.hpp>
 #include <kernel/tty.hpp>
 #include <kernel/vfs.hpp>
@@ -55,7 +53,7 @@ int dentry::load()
                 else
                     append(ind, dentry::name_type(name, len));
 
-                return GB_OK;
+                return 0;
             });
 
         if (ret == 0)
@@ -157,7 +155,8 @@ fs::inode* vfs::cache_inode(size_t size, ino_t ino,
 
 void vfs::free_inode(ino_t ino)
 {
-    assert(_inodes.erase(ino) == 1);
+    int n = _inodes.erase(ino);
+    assert(n == 1);
 }
 
 fs::inode* vfs::get_inode(ino_t ino)
@@ -296,7 +295,7 @@ ssize_t fs::regular_file::do_write(const char* __user buf, size_t n)
     return n_wrote;
 }
 
-ssize_t fs::regular_file::seek(off_t n, int whence)
+off_t fs::regular_file::seek(off_t n, int whence)
 {
     if (!S_ISREG(mode))
         return -ESPIPE;
@@ -337,7 +336,7 @@ int fs::regular_file::getdents(char* __user buf, size_t cnt)
 
             size_t reclen = sizeof(fs::user_dirent) + 1 + len;
             if (cnt < reclen)
-                return GB_FAILED;
+                return -EFAULT;
 
             auto* dirp = (fs::user_dirent*)buf;
             dirp->d_ino = ind->ino;
@@ -351,7 +350,7 @@ int fs::regular_file::getdents(char* __user buf, size_t cnt)
 
             buf += reclen;
             cnt -= reclen;
-            return GB_OK;
+            return 0;
         });
 
     if (nread > 0)
@@ -373,7 +372,7 @@ int fs::regular_file::getdents64(char* __user buf, size_t cnt)
 
             size_t reclen = sizeof(fs::user_dirent64) + len;
             if (cnt < reclen)
-                return GB_FAILED;
+                return -EFAULT;
 
             auto* dirp = (fs::user_dirent64*)buf;
             dirp->d_ino = ind->ino;
@@ -386,7 +385,7 @@ int fs::regular_file::getdents64(char* __user buf, size_t cnt)
 
             buf += reclen;
             cnt -= reclen;
-            return GB_OK;
+            return 0;
         });
 
     if (nread > 0)
@@ -784,13 +783,14 @@ ssize_t b_null_write(const char*, size_t n)
 
 static ssize_t console_read(char* buf, size_t buf_size, size_t n)
 {
-    return console->read(buf, buf_size, n);
+    return kernel::tty::console->read(buf, buf_size, n);
 }
+
 static ssize_t console_write(const char* buf, size_t n)
 {
     size_t orig_n = n;
     while (n--)
-        console->putchar(*(buf++));
+        kernel::tty::console->putchar(*(buf++));
 
     return orig_n;
 }

+ 17 - 16
src/kernel/vfs/tmpfs.cc

@@ -1,10 +1,11 @@
-#include <kernel/vfs.hpp>
-#include <kernel/mm.hpp>
-#include <kernel/log.hpp>
-
 #include <algorithm>
-#include <vector>
 #include <map>
+#include <vector>
+
+#include <stdint.h>
+
+#include <kernel/log.hpp>
+#include <kernel/vfs.hpp>
 
 using fs::vfs, fs::inode, fs::dentry;
 
@@ -37,9 +38,9 @@ private:
     {
         return static_cast<fdata_t*>(data);
     }
-    static constexpr ptr_t as_val(void* data)
+    static constexpr uintptr_t as_val(void* data)
     {
-        return std::bit_cast<ptr_t>(data);
+        return std::bit_cast<uintptr_t>(data);
     }
     inline void* _getdata(ino_t ino) const
     {
@@ -51,7 +52,7 @@ private:
         inode_data.insert(std::make_pair(ino, data));
         return ino;
     }
-    inline ino_t _savedata(ptr_t data)
+    inline ino_t _savedata(uintptr_t data)
     {
         return _savedata((void*)data);
     }
@@ -93,7 +94,7 @@ protected:
 
             // inode mode filetype is compatible with user dentry filetype
             auto ret = filldir(entry.filename, 0, ind, ind->mode & S_IFMT);
-            if (ret != GB_OK)
+            if (ret != 0)
                 break;
         }
 
@@ -158,7 +159,7 @@ public:
         if (dir->flags.present)
             dir->append(get_inode(file.ino), filename);
 
-        return GB_OK;
+        return 0;
     }
 
     virtual int inode_mknode(dentry* dir, const char* filename, mode_t mode, dev_t dev) override
@@ -175,7 +176,7 @@ public:
         if (dir->flags.present)
             dir->append(get_inode(node.ino), filename);
 
-        return GB_OK;
+        return 0;
     }
 
     virtual int inode_mkdir(dentry* dir, const char* dirname, mode_t mode) override
@@ -192,7 +193,7 @@ public:
         if (dir->flags.present)
             dir->append(new_dir, dirname);
 
-        return GB_OK;
+        return 0;
     }
 
     virtual int symlink(dentry* dir, const char* linkname, const char* target) override
@@ -273,7 +274,7 @@ public:
         }
 
         if (mask & STATX_BLOCKS) {
-            st->stx_blocks = align_up<9>(ind->size) / 512;
+            st->stx_blocks = ((ind->size + 0x1ff) & ~0x1ff) / 512;
             st->stx_blksize = 4096;
             st->stx_mask |= STATX_BLOCKS;
         }
@@ -288,7 +289,7 @@ public:
             st->stx_mask |= STATX_GID;
         }
 
-        return GB_OK;
+        return 0;
     }
 
     virtual int inode_rmfile(dentry* dir, const char* filename) override
@@ -326,7 +327,7 @@ public:
             return 0;
         }
 
-        kmsg("[tmpfs] warning: file entry not found in vfe\n");
+        kmsg("[tmpfs] warning: file entry not found in vfe");
         return -EIO;
     }
 
@@ -344,7 +345,7 @@ public:
         auto* data = as_fdata(_getdata(file->ino));
         data->resize(size);
         file->size = size;
-        return GB_OK;
+        return 0;
     }
 };
 

+ 208 - 96
src/kinit.cpp

@@ -1,144 +1,256 @@
-#include <asm/port_io.h>
-#include <asm/sys.h>
-
 #include <assert.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <sys/utsname.h>
 
-#include <types/status.h>
+#include <types/allocator.hpp>
 #include <types/types.h>
 
-#include <kernel/hw/keyboard.h>
 #include <kernel/hw/pci.hpp>
-#include <kernel/hw/serial.h>
-#include <kernel/hw/timer.h>
-#include <kernel/interrupt.h>
+#include <kernel/hw/timer.hpp>
+#include <kernel/interrupt.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mem.h>
+#include <kernel/mem/paging.hpp>
+#include <kernel/mem/phys.hpp>
+#include <kernel/mem/types.hpp>
 #include <kernel/process.hpp>
 #include <kernel/syscall.hpp>
-#include <kernel/task.h>
-#include <kernel/tty.hpp>
 #include <kernel/utsname.hpp>
-#include <kernel/vga.hpp>
 
-typedef void (*constructor)(void);
-extern constructor const SECTION(".rodata.kinit") start_ctors;
-extern constructor const SECTION(".rodata.kinit") end_ctors;
+using constructor = void (*)();
+extern "C" constructor const start_ctors, end_ctors;
+extern "C" uint64_t BSS_ADDR, BSS_LENGTH;
+
+struct PACKED bootloader_data {
+    uint32_t meminfo_entry_count;
+    uint32_t meminfo_entry_length;
+
+    // don't forget to add the initial 1m to the total
+    uint32_t meminfo_1k_blocks;
+    uint32_t meminfo_64k_blocks;
 
-extern struct mem_size_info SECTION(".stage1") asm_mem_size_info;
-extern uint8_t SECTION(".stage1") asm_e820_mem_map[1024];
-extern uint32_t SECTION(".stage1") asm_e820_mem_map_count;
-extern uint32_t SECTION(".stage1") asm_e820_mem_map_entry_size;
+    // meminfo entries
+    kernel::mem::e820_mem_map_entry
+        meminfo_entries[(1024-4*4)/24];
+};
+
+extern void init_vfs();
+
+namespace kernel::kinit {
 
 SECTION(".text.kinit")
-static inline void save_loader_data(void)
+static inline void enable_sse()
 {
-    memcpy(e820_mem_map, asm_e820_mem_map, sizeof(e820_mem_map));
-    e820_mem_map_count = asm_e820_mem_map_count;
-    e820_mem_map_entry_size = asm_e820_mem_map_entry_size;
-    memcpy(&mem_size_info, &asm_mem_size_info, sizeof(struct mem_size_info));
+    asm volatile(
+            "mov %%cr0, %%rax\n\t"
+            "and $(~0xc), %%rax\n\t"
+            "or $0x22, %%rax\n\t"
+            "mov %%rax, %%cr0\n\t"
+            "\n\t"
+            "mov %%cr4, %%rax\n\t"
+            "or $0x600, %%rax\n\t"
+            "mov %%rax, %%cr4\n\t"
+            "fninit\n\t"
+            ::: "rax"
+            );
 }
 
 SECTION(".text.kinit")
-static inline void load_new_gdt(void)
+static inline void set_uname()
 {
-    create_segment_descriptor(gdt + 0, 0, 0, 0, 0);
-    create_segment_descriptor(gdt + 1, 0, ~0, 0b1100, SD_TYPE_CODE_SYSTEM);
-    create_segment_descriptor(gdt + 2, 0, ~0, 0b1100, SD_TYPE_DATA_SYSTEM);
-    create_segment_descriptor(gdt + 3, 0, ~0, 0b1100, SD_TYPE_CODE_USER);
-    create_segment_descriptor(gdt + 4, 0, ~0, 0b1100, SD_TYPE_DATA_USER);
-    create_segment_descriptor(gdt + 5, (uint32_t)&tss, sizeof(tss), 0b0000, SD_TYPE_TSS);
-    create_segment_descriptor(gdt + 6, 0, 0, 0b1100, SD_TYPE_DATA_USER);
-
-    asm_load_gdt((7 * 8 - 1) << 16, (pptr_t)gdt);
-    asm_load_tr((6 - 1) * 8);
-
-    asm_cli();
+    kernel::sys_utsname = new new_utsname;
+    strcpy(kernel::sys_utsname->sysname, "Linux"); // linux compatible
+    strcpy(kernel::sys_utsname->nodename, "(none)");
+    strcpy(kernel::sys_utsname->release, "1.0.0");
+    strcpy(kernel::sys_utsname->version, "1.0.0");
+    strcpy(kernel::sys_utsname->machine, "x86");
+    strcpy(kernel::sys_utsname->domainname, "(none)");
 }
 
 SECTION(".text.kinit")
-static inline void init_bss_section(void)
+void NORETURN real_kernel_init(mem::paging::pfn_t kernel_stack_pfn)
 {
-    memset(bss_addr, 0x00, bss_len);
+    // call global constructors
+    // NOTE: the initializer of global objects MUST NOT contain
+    // all kinds of memory allocations
+    for (auto* ctor = &start_ctors; ctor != &end_ctors; ++ctor)
+        (*ctor)();
+
+    set_uname();
+
+    init_interrupt();
+    hw::timer::init_pit();
+
+    init_pci();
+
+    // TODO: remove this
+    init_vfs();
+    init_syscall_table();
+
+    init_scheduler(kernel_stack_pfn);
 }
 
 SECTION(".text.kinit")
-static inline int init_console(const char* name)
+static inline void setup_early_kernel_page_table()
 {
-    if (name[0] == 't' && name[1] == 't' && name[2] == 'y') {
-        if (name[3] == 'S' || name[3] == 's') {
-            if (name[4] == '0') {
-                console = types::memory::kinew<serial_tty>(PORT_SERIAL0);
-                return GB_OK;
-            }
-            if (name[4] == '1') {
-                console = types::memory::kinew<serial_tty>(PORT_SERIAL1);
-                return GB_OK;
-            }
-        }
-        if (name[3] == 'V' && name[3] == 'G' && name[3] == 'A') {
-            console = types::memory::kinew<vga_tty>();
-            return GB_OK;
-        }
-    }
-    return GB_FAILED;
-}
+    using namespace kernel::mem::paging;
 
-extern void init_vfs();
+    // remove temporary mapping
+    KERNEL_PAGE_TABLE[0x000].clear();
 
-namespace kernel::kinit {
+    constexpr auto idx = idx_all(0xffffffffc0200000ULL);
+
+    auto pdpt = KERNEL_PAGE_TABLE[std::get<1>(idx)].parse();
+    auto pd = pdpt[std::get<2>(idx)].parse();
+
+    // kernel bss, size 2M
+    pd[std::get<3>(idx)].set(PA_KERNEL_DATA_HUGE, 0x200000);
+
+    // clear kernel bss
+    memset((void*)BSS_ADDR, 0x00, BSS_LENGTH);
+
+    // clear empty page
+    memset(mem::physaddr<void>{EMPTY_PAGE_PFN}, 0x00, 0x1000);
+}
 
 SECTION(".text.kinit")
-static void init_uname()
+static inline void setup_buddy(uintptr_t addr_max)
 {
-    kernel::sys_utsname = new new_utsname;
-    strcpy(kernel::sys_utsname->sysname, "Linux"); // linux compatible
-    strcpy(kernel::sys_utsname->nodename, "(none)");
-    strcpy(kernel::sys_utsname->release, "1.0.0");
-    strcpy(kernel::sys_utsname->version, "1.0.0");
-    strcpy(kernel::sys_utsname->machine, "x86");
-    strcpy(kernel::sys_utsname->domainname, "(none)");
+    using namespace kernel::mem;
+    using namespace kernel::mem::paging;
+    constexpr auto idx = idx_all(0xffffff8040000000ULL);
+
+    addr_max += 0xfff;
+    addr_max >>= 12;
+    int count = (addr_max * sizeof(page) + 0x200000 - 1) / 0x200000;
+
+    pfn_t start_pfn = 0x400000;
+
+    memset(physaddr<void>{0x105000}, 0x00, 4096);
+
+    auto pdpte = KERNEL_PAGE_TABLE[std::get<1>(idx)].parse()[std::get<2>(idx)];
+    pdpte.set(PA_KERNEL_PAGE_TABLE, 0x105000);
+
+    auto pd = pdpte.parse();
+    for (int i = 0; i < count; ++i, start_pfn += 0x200000)
+        pd[std::get<3>(idx)+i].set(PA_KERNEL_DATA_HUGE, start_pfn);
+
+    PAGE_ARRAY = (page*)0xffffff8040000000ULL;
+    memset(PAGE_ARRAY, 0x00, addr_max * sizeof(page));
+
+    for (int i = 0; i < (int)info::e820_entry_count; ++i) {
+        auto& ent = info::e820_entries[i];
+
+        if (ent.type != 1) // type == 1: free area
+            continue;
+        mark_present(ent.base, ent.base + ent.len);
+
+        auto start = ent.base;
+        auto end = start + ent.len;
+        if (end <= start_pfn)
+            continue;
+
+        if (start < start_pfn)
+            start = start_pfn;
+
+        if (start > end)
+            continue;
+
+        mem::paging::create_zone(start, end);
+    }
+
+    // free .stage1
+    create_zone(0x1000, 0x2000);
+    // unused space
+    create_zone(0x106000, 0x200000);
 }
 
-} // namespace kernel::kinit
+// Copy the memory description handed over by the bootloader into
+// kernel::mem::info:
+//  - memory_size: 1 MiB plus the reported 1 KiB and 64 KiB block counts,
+//    converted to bytes;
+//  - e820_entry_count / e820_entry_length: taken over verbatim;
+//  - e820_entries: raw copy of the bootloader's entry table.
+// NOTE(review): the memcpy length is the size of the destination array; this
+// assumes it is no larger than `data->meminfo_entries` (42 entries of
+// 24 bytes), otherwise the copy reads past the bootloader block -- confirm.
+SECTION(".text.kinit")
+static inline void save_memory_info(bootloader_data* data)
+{
+    kernel::mem::info::memory_size = 1ULL * 1024ULL * 1024ULL + // initial 1M
+        1024ULL * data->meminfo_1k_blocks + 64ULL * 1024ULL * data->meminfo_64k_blocks;
+    kernel::mem::info::e820_entry_count = data->meminfo_entry_count;
+    kernel::mem::info::e820_entry_length = data->meminfo_entry_length;
+
+    memcpy(kernel::mem::info::e820_entries, data->meminfo_entries,
+        sizeof(kernel::mem::info::e820_entries));
+}
 
-extern "C" SECTION(".text.kinit") void NORETURN kernel_init(void)
+SECTION(".text.kinit")
+void setup_gdt()
 {
-    asm_enable_sse();
+    // user code
+    mem::gdt[3]  = 0x0020'fa00'0000'0000;
+    // user data
+    mem::gdt[4]  = 0x0000'f200'0000'0000;
+    // user code32
+    mem::gdt[5]  = 0x00cf'fa00'0000'ffff;
+    // user data32
+    mem::gdt[6]  = 0x00cf'f200'0000'ffff;
+    // thread load 32bit
+    mem::gdt[7]  = 0x0000'0000'0000'0000;
 
-    init_bss_section();
+    // TSS descriptor
+    mem::gdt[8]  = 0x0000'8900'0070'0067;
+    mem::gdt[9]  = 0x0000'0000'ffff'ff00;
 
-    save_loader_data();
+    // LDT descriptor
+    mem::gdt[10] = 0x0000'8200'0060'001f;
+    mem::gdt[11] = 0x0000'0000'ffff'ff00;
 
-    load_new_gdt();
+    // null segment
+    mem::gdt[12] = 0x0000'0000'0000'0000;
+    // thread local 64bit
+    mem::gdt[13] = 0x0000'0000'0000'0000;
 
-    // call global ctors
-    // NOTE:
-    // the initializer of global objects MUST NOT contain
-    // all kinds of memory allocations
-    for (const constructor* ctor = &start_ctors; ctor != &end_ctors; ++ctor) {
-        (*ctor)();
-    }
+    uint64_t descriptor[] = {
+        0x005f'0000'0000'0000, (uintptr_t)(uint64_t*)mem::gdt
+    };
 
-    init_idt();
-    init_mem();
-    init_pic();
-    init_pit();
+    asm volatile(
+            "lgdt (%0)\n\t"
+            "mov $0x50, %%ax\n\t"
+            "lldt %%ax\n\t"
+            "mov $0x40, %%ax\n\t"
+            "ltr %%ax\n\t"
+            : : "r"((uintptr_t)descriptor+6): "ax", "memory"
+    );
+}
 
-    kernel::kinit::init_uname();
+// Early C++ entry point. `data` points at the memory-info block prepared by
+// the boot code (presumably passed in from the assembly boot path).
+// Steps: enable SSE, fix up the early page table and GDT, record the memory
+// map, size and initialise the buddy allocator from the highest usable
+// address, then allocate a 2^9-page kernel stack, switch to it and call
+// real_kernel_init(kernel_stack_pfn). Never returns.
+extern "C" SECTION(".text.kinit")
+void NORETURN kernel_init(bootloader_data* data)
+{
+    enable_sse();
 
-    int ret = init_serial_port(PORT_SERIAL0);
-    assert(ret == GB_OK);
+    setup_early_kernel_page_table();
+    setup_gdt();
+    save_memory_info(data);
 
-    ret = init_console("ttyS0");
-    assert(ret == GB_OK);
+    // highest end address among the e820 entries of type 1
+    uintptr_t addr_max = 0;
+    for (int i = 0; i < (int)kernel::mem::info::e820_entry_count; ++i) {
+        auto& ent = kernel::mem::info::e820_entries[i];
+        if (ent.type != 1)
+            continue;
+        addr_max = std::max(addr_max, ent.base + ent.len);
+    }
 
-    kernel::kinit::init_pci();
-    init_vfs();
-    init_syscall();
+    setup_buddy(addr_max);
+    init_allocator();
 
-    kmsg("switching execution to the scheduler...\n");
-    init_scheduler();
+    using namespace mem::paging;
+    auto kernel_stack_pfn = page_to_pfn(alloc_pages(9));
+    auto kernel_stack_ptr =
+        mem::physaddr<std::byte>{kernel_stack_pfn} + (1<<9) * 0x1000;
+
+    // Switch to the new stack and call real_kernel_init(kernel_stack_pfn).
+    // The operands are pinned to fixed registers: the argument is bound to
+    // %rdi through the "D" constraint and the callee to %rax through "a".
+    // With a plain "r" the compiler was free to place the callee in %rdi or
+    // %rbp, which the template overwrites before the call.
+    asm volatile(
+            "mov %2, %%rsp\n\t"
+            "xor %%rbp, %%rbp\n\t"
+            "call *%0\n\t"
+            : : "a"(real_kernel_init), "D"(kernel_stack_pfn), "g"(kernel_stack_ptr):
+    );
+
+    freeze();
 }
+
+} // namespace kernel::kinit

+ 152 - 59
src/mbr.S

@@ -1,79 +1,172 @@
-.section .text.bootsect
+.section .mbr
 .code16
 
-.globl mbr_start
-mbr_start:
-    movw %cs, %ax
-    movw %ax, %ds
-    movw %ax, %es
-    movw %ax, %ss
-
-# perform a temporary stack
-    movw $stack_base, %ax
-    movw %ax, %bp
-    movw %ax, %sp
-
-# read the first 64k
-    call read_data
-
-# read the following 128k
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-# read the 128k more
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-# read 64k more
-    addw $(0x100 * 16), read_data_segment
-    addl $(8 * 16), read_data_lba
-    call read_data
-
-# loader start
-    jmp 0x8000
-
-read_data:
-    movw $read_data_pack, %si
+# Relocate the 512-byte boot sector from 0x7c00 to 0x0e00 and continue
+# execution in the copy at mbr_start.
+move_mbr:
+    # flat real-mode segments: ds = es = ss = 0
+    xor %ax, %ax
+    mov %ax, %ds
+    mov %ax, %es
+    mov %ax, %ss
+
+    # build a temporary stack
+    # (it grows down from 0x0e00, directly below the relocated copy)
+    mov $0x0e00, %esp
+    mov %esp, %ebp
+
+    # copy 128 dwords from ds:si = 0x7c00 to es:di = 0x0e00
+    # NOTE(review): relies on the direction flag being clear on entry
+    mov $128, %cx # 512 / 4
+    mov $0x7c00, %si
+    mov $0x0e00, %di
+    rep movsl
+
+    # far jump reloads %cs with 0 and lands in the relocated code
+    ljmp $0x00, $mbr_start
+
+# %eax: lba lower 4bytes
+# %edx: destination address
+read_disk:
+	push %eax
+	push %edx
+	push %ecx
+
+	mov %eax, read_data_lba
+	shr $4, %edx
+	mov %dx, read_data_segment
+
+    mov $read_data_pack, %si
     mov $0x42, %ah
     mov $0x80, %dl
     int $0x13
-    jc read_data_error
-    ret
+    jc halt
+
+	pop %ecx
+	pop %edx
+	pop %eax
+	ret
+
+mbr_start:
+    # clear screen
+    mov $0x00, %ah
+    mov $0x03, %al
+    int $0x10
+
+    # read kernel image: 32K * 15 = 480K
+	xor %eax, %eax
+	inc %eax # %eax = 1
+	mov %eax, %edx
+	shl $12, %edx # %edx = 0x1000
+
+	mov $15, %ecx
+_loop_read_kernel:
+	call read_disk
+	add $64, %eax # %eax += 64
+
+	shr $12, %edx
+	add $8, %edx
+	shl $12, %edx # %edx += 32K
+
+	loop _loop_read_kernel
+
+    # get memory size info and storage it
+    xor %ecx, %ecx
+    xor %edx, %edx
+	xor %eax, %eax
+    mov $0xe801, %ax
+
+    int $0x15
+    jc halt
+
+    cmp $0x86, %ah # unsupported function
+    je halt
+    cmp $0x80, %ah # invalid command
+    je halt
+
+    jcxz _get_memory_size_use_ax
+    mov %cx, %ax
+    mov %dx, %bx
+
+_get_memory_size_use_ax:
+    sub $1024, %esp
+    movzw %ax, %eax
+    mov %eax, 8(%esp)  # 1k blocks
+    movzw %bx, %ebx
+    mov %ebx, 12(%esp) # 64k blocks
+
+    # save the destination address to es:di
+    lea 16(%esp), %di # buffer is 1024 - 16 bytes
 
-read_data_error:
+    # clear %ebx, len
+    xor %ebx, %ebx
+    mov %ebx, (%esp)
+
+    # set default entry size
+    movl $20, 4(%esp)
+
+# Fetch the BIOS memory map one entry per iteration (int 0x15, eax = 0xe820).
+# In:  es:di -> slot for the next entry, ebx = continuation value (0 at start)
+#      (%esp) = number of entries stored so far, 4(%esp) = entry size
+# Each stored entry bumps the count and advances the slot by 24 bytes.
+# The loop ends when the BIOS sets CF or returns ebx = 0.
+_e820_mem_map_load_loop:
+    # set the magic number to edx
+    mov $0x534D4150, %edx
+
+    # set function number to eax
+    mov $0xe820, %eax
+
+    # set default entry size
+    mov $24, %ecx
+
+    int $0x15
+
+    # CF reports an error or a call past the end of the list, in which case
+    # no entry was stored. Test it right here: the `add` below overwrites CF,
+    # so a later `jc` would look at the pointer arithmetic instead.
+    jc _e820_mem_map_load_fin
+
+    incl (%esp)
+    add $24, %edi
+
+    # ebx = 0 marks the entry just stored as the last one
+    cmp $0, %ebx
+    jz _e820_mem_map_load_fin
+
+    # keep the previously recorded entry size unless the BIOS returned 24
+    cmp $24, %ecx
+    cmovnz 4(%esp), %ecx
+    mov %ecx, 4(%esp)
+
+    jmp _e820_mem_map_load_loop
+
+_e820_mem_map_load_fin:
+    # load GDT and IDT
+    cli
+    lidt null_idt_descriptor
+    lgdt _32bit_gdt_descriptor
+
+    # enable protection enable (PE) bit
+    mov %cr0, %eax
+    or $1, %eax
+    mov %eax, %cr0
+
+    ljmp $0x08, $start_32bit
+
+halt:
     hlt
-    jmp read_data_error
+    jmp halt
 
-.align 4
+.align 16
 read_data_pack:
     .byte 0x10, 0
 read_data_count:
-    .word 128    # sector count (read 64k)
+    .word 64     # sector count (read 32k)
 read_data_offset:
     .word 0x0000 # offset address
 read_data_segment:
-    .word 0x0800 # segment address
+    .word 0x0100 # segment address
 read_data_lba:
     .long 1      # lower 4 bytes of the LBA to read
     .long 0      # higher 2 bytes of the LBA to read
 
-__mbr_code_border__:
-    .long 0xffffffff
+# null IDT descriptor
+# so that exceptions will cause the system to reset
+.align 4
+null_idt_descriptor:
+    .word 0 # size
+    .long 0 # base
 
-.align 16
-stack_edge:
-.space 128
-stack_base:
+.align 4
+_32bit_gdt_descriptor:
+    .word (3 * 8) - 1 # size
+    .long _32bit_gdt  # address
 
-. = 510
-.byte 0x55, 0xaa
+.align 16
+_32bit_gdt:
+    .8byte 0x0                # null selector
+    .8byte 0x00cf9a000000ffff # code selector
+    .8byte 0x00cf92000000ffff # data selector

+ 0 - 15
src/mbr.ld

@@ -1,15 +0,0 @@
-OUTPUT_FORMAT(binary)
-OUTPUT_ARCH(i386:i386)
-
-SECTIONS
-{
-    .text 0x7c00 :
-    {
-        *(.text.bootsect)
-    }
-
-    /DISCARD/ :
-    {
-        *(.note*)
-    }
-}

+ 98 - 87
src/types/elf.cpp

@@ -9,116 +9,121 @@
 
 #include <types/elf.hpp>
 
-#include <kernel/mem.h>
+#include <kernel/mem/mm_list.hpp>
+#include <kernel/mem/vm_area.hpp>
 #include <kernel/process.hpp>
 #include <kernel/vfs.hpp>
 
-#define align16_down(sp) (sp = ((char*)((uint32_t)(sp)&0xfffffff0)))
-
-template <typename T>
-inline void _user_push(char** sp, T d)
+static inline void __user_push32(uintptr_t* sp, uint32_t d)
 {
-    *sp -= sizeof(T);
-    *(T*)*sp = d;
+    // TODO: use copy_to_user
+    *(--*(uint32_t**)sp) = d;
 }
-template <>
-inline void _user_push(char** sp, const char* str)
+
+static inline void __user_push_string32(uintptr_t* sp, const char* str)
 {
     size_t len = strlen(str);
+
     *sp -= (len + 1);
-    align16_down(*sp);
-    memcpy(*sp, str, len + 1);
+    *sp &= ~0xf; // align to 16 bytes
+
+    memcpy((void*)*sp, str, len + 1);
 }
 
-int types::elf::elf32_load(types::elf::elf32_load_data* d)
+int types::elf::elf32_load(types::elf::elf32_load_data& d)
 {
-    auto* ent_exec = d->exec_dent;
-    if (!ent_exec) {
-        d->errcode = ENOENT;
-        return GB_FAILED;
-    }
+    auto& exec = d.exec_dent;
+    if (!exec)
+        return -ENOENT;
 
-    // TODO: detect file format
     types::elf::elf32_header hdr {};
     auto n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)&hdr,
         sizeof(types::elf::elf32_header),
         0, sizeof(types::elf::elf32_header));
 
-    if (n_read != sizeof(types::elf::elf32_header)) {
-        d->errcode = EINVAL;
-        return GB_FAILED;
-    }
+    if (n_read != sizeof(types::elf::elf32_header))
+        return -EINVAL;
+
+    if (hdr.magic[0] != 0x7f || hdr.magic[1] != 'E'
+            || hdr.magic[2] != 'L' || hdr.magic[3] != 'F')
+        return -EINVAL;
 
     size_t phents_size = hdr.phentsize * hdr.phnum;
     size_t shents_size = hdr.shentsize * hdr.shnum;
     std::vector<types::elf::elf32_program_header_entry> phents(hdr.phnum);
     n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)phents.data(),
         phents_size,
         hdr.phoff, phents_size);
 
     // broken file or I/O error
-    if (n_read != phents_size) {
-        d->errcode = EINVAL;
-        return GB_FAILED;
-    }
+    if (n_read != phents_size)
+        return -EINVAL;
 
     std::vector<types::elf::elf32_section_header_entry> shents(hdr.shnum);
     n_read = fs::vfs_read(
-        ent_exec->ind,
+        exec->ind,
         (char*)shents.data(),
         shents_size,
         hdr.shoff, shents_size);
 
     // broken file or I/O error
-    if (n_read != shents_size) {
-        d->errcode = EINVAL;
-        return GB_FAILED;
-    }
-
-    // copy argv and envp
-    std::vector<std::string> argv, envp;
-    for (const char* const* p = d->argv; *p; ++p)
-        argv.emplace_back(*p);
-    for (const char* const* p = d->envp; *p; ++p)
-        envp.emplace_back(*p);
+    if (n_read != shents_size)
+        return -EINVAL;
 
-    // from now on, caller process is recycled.
+    // from now on, caller process is gone.
     // so we can't just simply return to it on error.
-    current_process->mms.clear_user();
+    auto& mms = current_process->mms;
+    mms.clear();
 
-    uint32_t data_segment_end = 0;
+    uintptr_t data_segment_end = 0;
 
     for (const auto& phent : phents) {
         if (phent.type != types::elf::elf32_program_header_entry::PT_LOAD)
             continue;
 
-        auto vaddr = align_down<12>(phent.vaddr);
-        auto vlen = align_up<12>(phent.vaddr + phent.memsz) - vaddr;
-        auto flen = align_up<12>(phent.vaddr + phent.filesz) - vaddr;
-        auto fileoff = align_down<12>(phent.offset);
+        auto vaddr = phent.vaddr & ~0xfff;
+        auto vlen = ((phent.vaddr + phent.memsz + 0xfff) & ~0xfff) - vaddr;
+        auto flen = ((phent.vaddr + phent.filesz + 0xfff) & ~0xfff) - vaddr;
+        auto fileoff = phent.offset & ~0xfff;
 
+        using namespace kernel::mem;
         if (flen) {
-            auto ret = mmap(
-                (char*)vaddr,
-                phent.filesz + (phent.vaddr & 0xfff),
-                ent_exec->ind,
-                fileoff,
-                1,
-                d->system);
-
-            if (ret != GB_OK)
+            mm_list::map_args args{};
+
+            args.vaddr = vaddr;
+            args.length = flen;
+            args.file_inode = exec->ind;
+            args.file_offset = fileoff;
+
+            args.flags = MM_MAPPED;
+            if (phent.flags & elf32_program_header_entry::PF_W)
+                args.flags |= MM_WRITE;
+
+            if (phent.flags & elf32_program_header_entry::PF_X)
+                args.flags |= MM_EXECUTE;
+
+            if (auto ret = mms.mmap(args); ret != 0)
                 kill_current(SIGSEGV);
         }
 
         if (vlen > flen) {
-            auto ret = mmap((char*)vaddr + flen, vlen - flen,
-                nullptr, 0, true, d->system);
+            mm_list::map_args args{};
+
+            args.vaddr = vaddr + flen;
+            args.length = vlen - flen;
 
-            if (ret != GB_OK)
+            args.flags = MM_ANONYMOUS;
+            if (phent.flags & elf32_program_header_entry::PF_W)
+                args.flags |= MM_WRITE;
+
+            if (phent.flags & elf32_program_header_entry::PF_X)
+                args.flags |= MM_EXECUTE;
+
+            if (auto ret = mms.mmap(args); ret != 0)
                 kill_current(SIGSEGV);
         }
 
@@ -126,60 +131,66 @@ int types::elf::elf32_load(types::elf::elf32_load_data* d)
             data_segment_end = vaddr + vlen;
     }
 
-    current_process->mms.register_brk((char*)data_segment_end + 0x10000);
+    current_process->mms.register_brk(data_segment_end + 0x10000);
 
     for (const auto& shent : shents) {
         if (shent.sh_type == elf32_section_header_entry::SHT_NOBITS)
-            memset((char*)shent.sh_addr, 0x00, shent.sh_size);
+            memset((char*)(uintptr_t)shent.sh_addr, 0x00, shent.sh_size);
     }
 
     // map stack area
-    auto ret = mmap((void*)types::elf::ELF_STACK_TOP,
-        types::elf::ELF_STACK_SIZE, nullptr, 0, true, false);
+    if (1) {
+        using namespace kernel::mem;
+        mm_list::map_args args{};
 
-    // TODO: destruct local variables before calling kill_current
-    if (ret != GB_OK)
-        kill_current(SIGSEGV);
+        args.vaddr = ELF32_STACK_TOP;
+        args.length = ELF32_STACK_SIZE;
+        args.flags = MM_ANONYMOUS | MM_WRITE;
+
+        if (auto ret = mms.mmap(args); ret != 0)
+            kill_current(SIGSEGV);
+        // TODO: destruct local variables before calling kill_current
+    }
 
-    d->eip = (void*)hdr.entry;
-    d->sp = reinterpret_cast<uint32_t*>(types::elf::ELF_STACK_BOTTOM);
+    d.ip = hdr.entry;
+    d.sp = ELF32_STACK_BOTTOM;
 
-    auto* sp = (char**)&d->sp;
+    auto* sp = &d.sp;
 
     // fill information block area
-    std::vector<char*> args, envs;
-    for (const auto& env : envp) {
-        _user_push(sp, env.c_str());
-        envs.push_back(*sp);
+    std::vector<elf32_addr_t> args, envs;
+    for (const auto& env : d.envp) {
+        __user_push_string32(sp, env.c_str());
+        envs.push_back((uintptr_t)*sp);
     }
-    for (const auto& arg : argv) {
-        _user_push(sp, arg.c_str());
-        args.push_back(*sp);
+    for (const auto& arg : d.argv) {
+        __user_push_string32(sp, arg.c_str());
+        args.push_back((uintptr_t)*sp);
     }
 
     // push null auxiliary vector entry
-    _user_push(sp, 0);
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
+    __user_push32(sp, 0);
 
     // push 0 for envp
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
 
     // push envp
-    *sp -= sizeof(void*) * envs.size();
-    memcpy(*sp, envs.data(), sizeof(void*) * envs.size());
+    for (auto ent : envs)
+        __user_push32(sp, ent);
 
     // push 0 for argv
-    _user_push(sp, 0);
+    __user_push32(sp, 0);
 
     // push argv
-    *sp -= sizeof(void*) * args.size();
-    memcpy(*sp, args.data(), sizeof(void*) * args.size());
+    for (int i = args.size()-1; i >= 0; --i)
+        __user_push32(sp, args[i]);
 
     // push argc
-    _user_push(sp, args.size());
+    __user_push32(sp, args.size());
 
     // rename current thread
-    current_thread->name = ent_exec->name;
+    current_thread->name = exec->name;
 
-    return GB_OK;
+    return 0;
 }

+ 1 - 6
src/types/libstdcpp.cpp

@@ -1,8 +1,6 @@
-#include <asm/port_io.h>
 #include <assert.h>
 #include <kernel/log.hpp>
 #include <kernel/process.hpp>
-#include <stdio.h>
 #include <types/types.h>
 
 extern "C" void NORETURN __stack_chk_fail(void)
@@ -20,9 +18,6 @@ extern "C" void NORETURN __cxa_pure_virtual(void)
 void NORETURN
 __assert_fail(const char* statement, const char* file, int line, const char* func)
 {
-    char buf[256];
-    snprintf(buf, sizeof(buf), "Kernel assertion failed: (%s), %s:%d, %s\n",
-        statement, file, line, func);
-    kmsg(buf);
+    kmsgf("Kernel assertion failed: (%s), %s:%d, %s", statement, file, line, func);
     freeze();
 }

+ 3 - 3
user-space-program/CMakeLists.txt

@@ -1,10 +1,10 @@
 cmake_minimum_required(VERSION 3.15)
 project(user_space_program C ASM)
 
-set(CMAKE_C_FLAGS "-nostdlib -nostdinc -static -m32 -W -Wall -Wextra -Werror -mstack-protector-guard=global")
-set(CMAKE_ASM_FLAGS "-nostdlib -m32 -static -mstack-protector-guard=global -g0")
+set(CMAKE_C_FLAGS "-nostdlib -nostdinc -m32 -static -W -Wall -mstack-protector-guard=global")
+set(CMAKE_ASM_FLAGS "-nostdlib -static -m32 -mstack-protector-guard=global")
 
-link_libraries(gblibc crt0)
+link_libraries(gblibc_32 crt0_32)
 add_link_options("LINKER:-melf_i386")
 
 set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "")

Some files were not shown because too many files changed in this diff