ソースを参照

Merge branch 'smp'

greatbridf 3 週間 前
コミット
07cdd43e60
100 ファイル変更4739 行追加3279 行削除
  1. 1 0
      .clang-format
  2. 3 3
      .rustfmt.toml
  3. 12 49
      CMakeLists.txt
  4. 45 14
      Cargo.lock
  5. 11 0
      Cargo.toml
  6. 7 1
      Makefile.src
  7. 65 0
      arch/Cargo.lock
  8. 8 0
      arch/Cargo.toml
  9. 53 0
      arch/percpu/Cargo.lock
  10. 8 0
      arch/percpu/Cargo.toml
  11. 47 0
      arch/percpu/macros/Cargo.lock
  12. 12 0
      arch/percpu/macros/Cargo.toml
  13. 22 0
      arch/percpu/macros/src/arch.rs
  14. 117 0
      arch/percpu/macros/src/lib.rs
  15. 25 0
      arch/percpu/src/arch.rs
  16. 6 0
      arch/percpu/src/lib.rs
  17. 98 0
      arch/src/lib.rs
  18. 6 0
      arch/x86_64/Cargo.toml
  19. 92 0
      arch/x86_64/src/gdt.rs
  20. 27 0
      arch/x86_64/src/interrupt.rs
  21. 93 0
      arch/x86_64/src/io.rs
  22. 70 0
      arch/x86_64/src/lib.rs
  23. 172 0
      arch/x86_64/src/task.rs
  24. 2 2
      doc/mem_layout.txt
  25. 4 1
      gblibc/CMakeLists.txt
  26. 2 0
      gblibc/include/errno.h
  27. 0 19
      include/kernel/async/lock.hpp
  28. 0 29
      include/kernel/async/waitlist.hpp
  29. 0 6
      include/kernel/hw/pci.hpp
  30. 0 20
      include/kernel/hw/serial.hpp
  31. 0 11
      include/kernel/hw/timer.hpp
  32. 0 5
      include/kernel/interrupt.hpp
  33. 0 11
      include/kernel/irq.hpp
  34. 2 15
      include/kernel/log.hpp
  35. 0 112
      include/kernel/mem/mm_list.hpp
  36. 5 47
      include/kernel/mem/paging.hpp
  37. 1 2
      include/kernel/mem/paging_asm.h
  38. 1 19
      include/kernel/mem/phys.hpp
  39. 0 60
      include/kernel/mem/vm_area.hpp
  40. 0 37
      include/kernel/module.hpp
  41. 2 156
      include/kernel/process.hpp
  42. 0 74
      include/kernel/signal.hpp
  43. 0 118
      include/kernel/syscall.hpp
  44. 0 5
      include/kernel/task/current.hpp
  45. 0 16
      include/kernel/task/readyqueue.hpp
  46. 0 76
      include/kernel/task/thread.hpp
  47. 0 73
      include/kernel/tty.hpp
  48. 0 21
      include/kernel/user/thread_local.hpp
  49. 0 84
      include/kernel/vfs.hpp
  50. 0 28
      include/kernel/vfs/dentry.hpp
  51. 0 106
      include/kernel/vfs/file.hpp
  52. 0 51
      include/kernel/vfs/filearr.hpp
  53. 0 25
      include/kernel/vfs/vfsfwd.hpp
  54. 0 293
      include/types/elf.hpp
  55. 6 5
      init_script.sh
  56. 11 53
      src/asm/interrupt.s
  57. 81 6
      src/boot.s
  58. 0 71
      src/dev/builtin-chardev.cc
  59. 20 0
      src/driver.rs
  60. 5 4
      src/driver/ahci/command.rs
  61. 35 38
      src/driver/ahci/control.rs
  62. 33 33
      src/driver/ahci/defs.rs
  63. 108 57
      src/driver/ahci/mod.rs
  64. 266 58
      src/driver/ahci/port.rs
  65. 23 11
      src/driver/e1000e.rs
  66. 145 0
      src/driver/serial.rs
  67. 370 0
      src/elf.rs
  68. 137 148
      src/fs/fat32.rs
  69. 170 140
      src/fs/procfs.rs
  70. 207 243
      src/fs/tmpfs.rs
  71. 34 51
      src/io.rs
  72. 79 89
      src/kernel.ld
  73. 16 0
      src/kernel.rs
  74. 0 1
      src/kernel/allocator.cc
  75. 5 0
      src/kernel/arch.rs
  76. 82 0
      src/kernel/arch/x86_64.rs
  77. 126 0
      src/kernel/arch/x86_64/init.rs
  78. 129 0
      src/kernel/arch/x86_64/interrupt.rs
  79. 7 27
      src/kernel/async/lock.cc
  80. 0 57
      src/kernel/async/waitlist.cc
  81. 12 10
      src/kernel/block.rs
  82. 155 0
      src/kernel/chardev.rs
  83. 60 11
      src/kernel/console.rs
  84. 39 0
      src/kernel/constants.rs
  85. 2 2
      src/kernel/hw/pci.cc
  86. 0 115
      src/kernel/hw/serial.cc
  87. 0 28
      src/kernel/hw/timer.cc
  88. 0 147
      src/kernel/interrupt.cpp
  89. 77 26
      src/kernel/interrupt.rs
  90. 10 0
      src/kernel/mem.rs
  91. 102 0
      src/kernel/mem/mm_area.rs
  92. 13 15
      src/kernel/mem/mm_list.cc
  93. 357 0
      src/kernel/mem/mm_list.rs
  94. 206 0
      src/kernel/mem/mm_list/page_fault.rs
  95. 307 0
      src/kernel/mem/page_table.rs
  96. 4 197
      src/kernel/mem/paging.cc
  97. 91 35
      src/kernel/mem/paging.rs
  98. 2 2
      src/kernel/mem/phys.rs
  99. 20 10
      src/kernel/mem/slab.cc
  100. 168 0
      src/kernel/mem/vrange.rs

+ 1 - 0
.clang-format

@@ -6,6 +6,7 @@ AllowShortFunctionsOnASingleLine: Inline
 AllowShortIfStatementsOnASingleLine: Never
 AllowShortLoopsOnASingleLine: 'false'
 BreakConstructorInitializers: BeforeComma
+ColumnLimit: '100'
 FixNamespaceComments: 'true'
 IncludeBlocks: Regroup
 IndentWidth: '4'

+ 3 - 3
.rustfmt.toml

@@ -1,4 +1,4 @@
-max_width = 80
+max_width = 100
 hard_tabs = false
 tab_spaces = 4
 newline_style = "Auto"
@@ -10,8 +10,8 @@ struct_lit_width = 18
 struct_variant_width = 35
 array_width = 60
 chain_width = 60
-single_line_if_else_max_width = 50
-single_line_let_else_max_width = 50
+single_line_if_else_max_width = 60
+single_line_let_else_max_width = 60
 wrap_comments = false
 format_code_in_doc_comments = false
 doc_comment_code_block_width = 100

+ 12 - 49
CMakeLists.txt

@@ -38,75 +38,37 @@ set(BOOTLOADER_SOURCES src/boot.s
                        src/asm/interrupt.s
                        )
 
-set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
-                        src/kinit.cpp
-                        src/kernel/async/waitlist.cc
+set(KERNEL_MAIN_SOURCES src/kinit.cpp
                         src/kernel/async/lock.cc
                         src/kernel/allocator.cc
-                        src/kernel/interrupt.cpp
                         src/kernel/process.cpp
-                        src/kernel/tty.cpp
-                        src/kernel/syscall.cpp
-                        src/kernel/syscall/fileops.cc
-                        src/kernel/syscall/infoops.cc
-                        src/kernel/syscall/mount.cc
-                        src/kernel/syscall/procops.cc
-                        src/kernel/mem/mm_list.cc
                         src/kernel/mem/paging.cc
                         src/kernel/mem/slab.cc
-                        src/kernel/module.cc
-                        src/kernel/vfs.cpp
                         src/kernel/vga.cpp
                         src/kernel/hw/acpi.cc
                         src/kernel/hw/pci.cc
-                        src/kernel/hw/serial.cc
-                        src/kernel/hw/timer.cc
-                        src/kernel/task/thread.cc
-                        src/kernel/task/readyqueue.cc
-                        src/kernel/user/thread_local.cc
-                        src/kernel/vfs/filearr.cc
-                        src/kernel/signal.cpp
                         src/net/ethernet.cc
                         src/types/crc.cc
-                        src/types/elf.cpp
                         src/types/libstdcpp.cpp
                         include/defs.hpp
-                        include/kernel/async/waitlist.hpp
                         include/kernel/async/lock.hpp
-                        include/kernel/tty.hpp
                         include/kernel/interrupt.hpp
-                        include/kernel/irq.hpp
                         include/kernel/process.hpp
-                        include/kernel/syscall.hpp
-                        include/kernel/mem/mm_list.hpp
                         include/kernel/mem/paging.hpp
                         include/kernel/mem/slab.hpp
                         include/kernel/mem/types.hpp
-                        include/kernel/mem/vm_area.hpp
-                        include/kernel/module.hpp
                         include/kernel/utsname.hpp
-                        include/kernel/vfs.hpp
-                        include/kernel/vfs/dentry.hpp
-                        include/kernel/vfs/file.hpp
-                        include/kernel/vfs/filearr.hpp
                         include/kernel/vga.hpp
-                        include/kernel/signal.hpp
                         include/kernel/task/forward.hpp
-                        include/kernel/task/thread.hpp
-                        include/kernel/task/readyqueue.hpp
                         include/kernel/hw/acpi.hpp
                         include/kernel/hw/pci.hpp
                         include/kernel/hw/port.hpp
-                        include/kernel/hw/serial.hpp
-                        include/kernel/hw/timer.hpp
                         include/kernel/input/keycodes.h
-                        include/kernel/user/thread_local.hpp
                         include/net/arp.hpp
                         include/net/ethernet.hpp
                         include/net/netdev.hpp
                         include/types/bitmap.hpp
                         include/types/buffer.hpp
-                        include/types/elf.hpp
                         include/types/list.hpp
                         include/types/types.h
                         include/types/allocator.hpp
@@ -121,6 +83,7 @@ target_include_directories(kernel.out PRIVATE ${PROJECT_SOURCE_DIR}/include)
 target_link_options(kernel.out PRIVATE
     -T "${CMAKE_SOURCE_DIR}/src/kernel.ld"
     -L "${CMAKE_BINARY_DIR}/x86_64-unknown-none/${CARGO_BUILD_TYPE}"
+    --no-check-sections
     )
 set_target_properties(kernel.out PROPERTIES LINK_DEPENDS "${CMAKE_SOURCE_DIR}/src/kernel.ld")
 set_source_files_properties(src/mbr.S PROPERTIES OBJECT_DEPENDS
@@ -141,16 +104,16 @@ add_custom_target(boot.img
     DEPENDS user_space_programs
     COMMAND dd if=mbr_hole.bin of=boot.img
     COMMAND dd if=/dev/zero of=boot.img bs=`expr 512 \\* 1024 \\* 1024` count=0 seek=1
-    COMMAND sh -c \"echo n\; echo\; echo\; echo\; echo\; echo a\; echo w\" | ${FDISK_BIN} boot.img
-    COMMAND mkfs.fat --offset=2048 -v -n SYSTEM boot.img
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/hello-world.out ::hello
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/interrupt-test.out ::int
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/stack-test.out ::stack
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/init.out ::init
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/priv-test.out ::priv
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_SOURCE_DIR}/busybox-minimal ::busybox_
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_SOURCE_DIR}/busybox ::busybox
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_SOURCE_DIR}/init_script.sh ::initsh
+    COMMAND sh -c \"echo n\; echo\; echo \; echo 8192\; echo\; echo a\; echo w\" | ${FDISK_BIN} boot.img
+    COMMAND mkfs.fat --offset=8192 -v -n SYSTEM boot.img
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/hello-world.out ::hello
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/interrupt-test.out ::int
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/stack-test.out ::stack
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/init.out ::init
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/priv-test.out ::priv
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_SOURCE_DIR}/busybox-minimal ::busybox_
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_SOURCE_DIR}/busybox ::busybox
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_SOURCE_DIR}/init_script.sh ::initsh
 )
 
 add_custom_command(OUTPUT run

+ 45 - 14
Cargo.lock

@@ -11,6 +11,14 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "arch"
+version = "0.1.0"
+dependencies = [
+ "percpu",
+ "x86_64",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.4.0"
@@ -79,7 +87,9 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 name = "gbos-rust-part"
 version = "0.1.0"
 dependencies = [
+ "arch",
  "bindgen",
+ "bitflags",
  "itertools",
  "lazy_static",
  "spin",
@@ -111,9 +121,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.159"
+version = "0.2.164"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
+checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f"
 
 [[package]]
 name = "libloading"
@@ -163,11 +173,28 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "percpu"
+version = "0.1.0"
+dependencies = [
+ "percpu-macros",
+ "x86_64",
+]
+
+[[package]]
+name = "percpu-macros"
+version = "0.1.0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "prettyplease"
-version = "0.2.22"
+version = "0.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba"
+checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033"
 dependencies = [
  "proc-macro2",
  "syn",
@@ -175,9 +202,9 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.87"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
 dependencies = [
  "unicode-ident",
 ]
@@ -193,9 +220,9 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.11.0"
+version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
+checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -205,9 +232,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.4.8"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -249,9 +276,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.79"
+version = "2.0.89"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590"
+checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -260,9 +287,9 @@ dependencies = [
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.13"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
 [[package]]
 name = "windows-targets"
@@ -327,3 +354,7 @@ name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "x86_64"
+version = "0.1.0"

+ 11 - 0
Cargo.toml

@@ -7,16 +7,27 @@ edition = "2021"
 crate-type = ["staticlib"]
 
 [dependencies]
+arch = { path="./arch" }
+bitflags = "2.6.0"
 itertools = { version = "0.13.0", default-features = false }
 lazy_static = { version = "1.5.0", features = ["spin_no_std"] }
 spin = "0.9.8"
 
+[features]
+default = ["smp"]
+debug_syscall = []
+smp = []
+
 [build-dependencies]
 bindgen = "0.70.1"
 
 [profile.dev]
 panic = "abort"
 
+[profile.dev.package.core]
+opt-level = 2
+debug = true
+
 [profile.dev.package."*"]
 opt-level = 2
 debug = false

+ 7 - 1
Makefile.src

@@ -42,7 +42,13 @@ clean-all: clean
 
 .PHONY: debug
 debug:
-	-$(GDB_BIN) --symbols=build/kernel.out --init-eval-command 'source pretty-print.py' --init-eval-command 'set pagination off' --init-eval-command 'set output-radix 16' --init-eval-command 'set print pretty on' --init-eval-command 'target remote:1234'
+	-$(GDB_BIN) --symbols=build/kernel.out \
+		-iex 'source pretty-print.py' \
+		-iex 'set pagination off' \
+		-iex 'set output-radix 16' \
+		-iex 'set print asm-demangle on' \
+		-iex 'set print pretty on' \
+		-iex 'target remote:1234'
 	-killall $(QEMU_BIN)
 
 build/boot.vdi: build/boot.img

+ 65 - 0
arch/Cargo.lock

@@ -0,0 +1,65 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "arch"
+version = "0.1.0"
+dependencies = [
+ "percpu",
+ "x86_64",
+]
+
+[[package]]
+name = "percpu"
+version = "0.1.0"
+dependencies = [
+ "percpu-macros",
+]
+
+[[package]]
+name = "percpu-macros"
+version = "0.1.0"
+dependencies = [
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
+
+[[package]]
+name = "x86_64"
+version = "0.1.0"

+ 8 - 0
arch/Cargo.toml

@@ -0,0 +1,8 @@
+[package]
+name = "arch"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+x86_64 = { path="./x86_64" }
+percpu = { path="./percpu" }

+ 53 - 0
arch/percpu/Cargo.lock

@@ -0,0 +1,53 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "percpu"
+version = "0.1.0"
+dependencies = [
+ "percpu-macros",
+]
+
+[[package]]
+name = "percpu-macros"
+version = "0.1.0"
+dependencies = [
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"

+ 8 - 0
arch/percpu/Cargo.toml

@@ -0,0 +1,8 @@
+[package]
+name = "percpu"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+percpu-macros = { path = "macros" }
+x86_64 = { path = "../x86_64" }

+ 47 - 0
arch/percpu/macros/Cargo.lock

@@ -0,0 +1,47 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "percpu-macros"
+version = "0.1.0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"

+ 12 - 0
arch/percpu/macros/Cargo.toml

@@ -0,0 +1,12 @@
+[package]
+name = "percpu-macros"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+proc-macro = true
+
+[dependencies]
+syn = { version = "2.0", features = ["full"] }
+quote = "1.0"
+proc-macro2 = "1.0"

+ 22 - 0
arch/percpu/macros/src/arch.rs

@@ -0,0 +1,22 @@
+use proc_macro2::TokenStream;
+use quote::quote;
+use syn::{Ident, Type};
+
+/// Get the base address for percpu variables of the current thread.
+pub fn get_percpu_pointer(percpu: &Ident, ty: &Type) -> TokenStream {
+    quote! {
+        #[cfg(target_arch = "x86_64")]
+        {
+            let base: *mut #ty;
+            ::core::arch::asm!(
+                "mov %gs:0, {address}",
+                "add ${percpu_pointer}, {address}",
+                percpu_pointer = sym #percpu,
+                address = out(reg) base,
+                options(att_syntax)
+            );
+            base
+        }
+    }
+    .into()
+}

+ 117 - 0
arch/percpu/macros/src/lib.rs

@@ -0,0 +1,117 @@
+extern crate proc_macro;
+
+use proc_macro::TokenStream;
+use quote::{format_ident, quote};
+use syn::{parse_macro_input, ItemStatic};
+
+mod arch;
+
+#[proc_macro_attribute]
+pub fn define_percpu(attrs: TokenStream, item: TokenStream) -> TokenStream {
+    if !attrs.is_empty() {
+        panic!("`define_percpu` attribute does not take any arguments");
+    }
+
+    let item = parse_macro_input!(item as ItemStatic);
+    let vis = &item.vis;
+    let ident = &item.ident;
+    let ty = &item.ty;
+    let expr = &item.expr;
+
+    let is_bool = quote!(#ty).to_string().as_str() == "bool";
+    let is_integer =
+        ["u8", "u16", "u32", "u64", "usize"].contains(&quote!(#ty).to_string().as_str());
+
+    let is_atomic_like = is_bool || is_integer || quote!(#ty).to_string().contains("NonNull");
+
+    let inner_ident = format_ident!("_percpu_inner_{}", ident);
+    let access_ident = format_ident!("_access_{}", ident);
+
+    let integer_methods = if is_integer {
+        quote! {
+            pub fn add(&self, value: #ty) {
+                *unsafe { self.as_mut() } += value;
+            }
+
+            pub fn sub(&self, value: #ty) {
+                *unsafe { self.as_mut() } -= value;
+            }
+        }
+    } else {
+        quote! {}
+    };
+
+    let preempt_disable = if !is_atomic_like {
+        quote! { crate::sync::preempt::disable(); }
+    } else {
+        quote! {}
+    };
+
+    let preempt_enable = if !is_atomic_like {
+        quote! { crate::sync::preempt::enable(); }
+    } else {
+        quote! {}
+    };
+
+    let as_ptr = arch::get_percpu_pointer(&inner_ident, &ty);
+
+    quote! {
+        #[link_section = ".percpu"]
+        #[allow(non_upper_case_globals)]
+        static mut #inner_ident: #ty = #expr;
+        #[allow(non_camel_case_types)]
+        #vis struct #access_ident;
+        #vis static #ident: #access_ident = #access_ident;
+
+        impl #access_ident {
+            /// # Safety
+            /// This function is unsafe because it allows for mutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_ptr(&self) -> *mut #ty {
+                #as_ptr
+            }
+
+            pub fn get(&self) -> #ty {
+                #preempt_disable
+                let value = unsafe { self.as_ptr().read() };
+                #preempt_enable
+                value
+            }
+
+            pub fn set(&self, value: #ty) {
+                #preempt_disable
+                unsafe { self.as_ptr().write(value) }
+                #preempt_enable
+            }
+
+            pub fn swap(&self, mut value: #ty) -> #ty {
+                #preempt_disable
+                unsafe { self.as_ptr().swap(&mut value) }
+                #preempt_enable
+                value
+            }
+
+            /// # Safety
+            /// This function is unsafe because it allows for immutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_ref(&self) -> & #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                self.as_ptr().as_ref().unwrap()
+            }
+
+            /// # Safety
+            /// This function is unsafe because it allows for mutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_mut(&self) -> &mut #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                self.as_ptr().as_mut().unwrap()
+            }
+
+            #integer_methods
+        }
+    }
+    .into()
+}

+ 25 - 0
arch/percpu/src/arch.rs

@@ -0,0 +1,25 @@
+pub unsafe fn save_percpu_pointer(percpu_area_base: *mut ()) {
+    #[cfg(target_arch = "x86_64")]
+    x86_64::task::wrmsr(0xC0000101, percpu_area_base as u64);
+
+    #[cfg(not(target_arch = "x86_64"))]
+    compile_error!("unsupported architecture");
+}
+
+pub unsafe fn set_percpu_area_thiscpu(percpu_area_base: *mut ()) {
+    use core::arch::asm;
+
+    save_percpu_pointer(percpu_area_base);
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        asm!(
+            "movq {}, %gs:0",
+            in(reg) percpu_area_base,
+            options(att_syntax)
+        );
+    }
+
+    #[cfg(not(target_arch = "x86_64"))]
+    compile_error!("unsupported architecture");
+}

+ 6 - 0
arch/percpu/src/lib.rs

@@ -0,0 +1,6 @@
+#![no_std]
+
+mod arch;
+
+pub use arch::set_percpu_area_thiscpu;
+pub use percpu_macros::define_percpu;

+ 98 - 0
arch/src/lib.rs

@@ -0,0 +1,98 @@
+#![no_std]
+
+pub mod vm {
+    pub fn invlpg(vaddr: usize) {
+        x86_64::vm::invlpg(vaddr)
+    }
+
+    pub fn invlpg_all() {
+        x86_64::vm::invlpg_all()
+    }
+
+    pub fn current_page_table() -> usize {
+        x86_64::vm::get_cr3()
+    }
+
+    pub fn switch_page_table(pfn: usize) {
+        x86_64::vm::set_cr3(pfn)
+    }
+}
+
+pub mod task {
+    #[inline(always)]
+    pub fn halt() {
+        x86_64::task::halt()
+    }
+
+    #[inline(always)]
+    pub fn pause() {
+        x86_64::task::pause()
+    }
+
+    #[inline(always)]
+    pub fn freeze() -> ! {
+        x86_64::task::freeze()
+    }
+
+    /// Switch to the `next` task. `IF` state is also switched.
+    ///
+    /// This function should only be used to switch between tasks that do not need SMP synchronization.
+    ///
+    /// # Arguments
+    /// * `current_task_sp` - Pointer to the stack pointer of the current task.
+    /// * `next_task_sp` - Pointer to the stack pointer of the next task.
+    #[inline(always)]
+    pub fn context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize) {
+        x86_64::task::context_switch_light(current_task_sp, next_task_sp);
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    pub use x86_64::task::{rdmsr, wrmsr};
+}
+
+pub mod interrupt {
+    #[inline(always)]
+    pub fn enable() {
+        x86_64::interrupt::enable()
+    }
+
+    #[inline(always)]
+    pub fn disable() {
+        x86_64::interrupt::disable()
+    }
+}
+
+pub mod io {
+    #[inline(always)]
+    pub fn inb(port: u16) -> u8 {
+        x86_64::io::inb(port)
+    }
+
+    #[inline(always)]
+    pub fn outb(port: u16, data: u8) {
+        x86_64::io::outb(port, data)
+    }
+
+    #[inline(always)]
+    pub fn inw(port: u16) -> u16 {
+        x86_64::io::inw(port)
+    }
+
+    #[inline(always)]
+    pub fn outw(port: u16, data: u16) {
+        x86_64::io::outw(port, data)
+    }
+
+    #[inline(always)]
+    pub fn inl(port: u16) -> u32 {
+        x86_64::io::inl(port)
+    }
+
+    #[inline(always)]
+    pub fn outl(port: u16, data: u32) {
+        x86_64::io::outl(port, data)
+    }
+}
+
+pub use percpu::{define_percpu, set_percpu_area_thiscpu};
+pub use x86_64;

+ 6 - 0
arch/x86_64/Cargo.toml

@@ -0,0 +1,6 @@
+[package]
+name = "x86_64"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]

+ 92 - 0
arch/x86_64/src/gdt.rs

@@ -0,0 +1,92 @@
+use core::arch::asm;
+
+use crate::task::TSS;
+
+#[repr(transparent)]
+#[derive(Debug, Clone, Copy)]
+pub struct GDTEntry(u64);
+
+pub struct GDT([GDTEntry; GDT::LEN]);
+
+impl GDTEntry {
+    const NULL: Self = Self(0);
+
+    const KERNEL_CODE64: Self = Self::new(0, 0, 0x9a, 0x2);
+    const KERNEL_DATA64: Self = Self::new(0, 0, 0x92, 0x0);
+
+    const USER_CODE64: Self = Self::new(0, 0, 0xfa, 0x2);
+    const USER_DATA64: Self = Self::new(0, 0, 0xf2, 0x0);
+
+    const USER_CODE32: Self = Self::new(0, 0xfffff, 0xfa, 0xc);
+    const USER_DATA32: Self = Self::new(0, 0xfffff, 0xf2, 0xc);
+
+    pub const fn new(base: u32, limit: u32, access: u8, flags: u8) -> Self {
+        let mut entry = 0u64;
+        entry |= (limit & 0x0000_ffff) as u64;
+        entry |= ((limit & 0x000f_0000) as u64) << 32;
+        entry |= ((base & 0x00ff_ffff) as u64) << 16;
+        entry |= ((base & 0xff00_0000) as u64) << 32;
+        entry |= (access as u64) << 40;
+        entry |= (flags as u64) << 52;
+
+        GDTEntry(entry)
+    }
+
+    pub const fn new_ldt(base: u64, limit: u32) -> [Self; 2] {
+        let first = Self::new(base as u32, limit, 0x82, 0x0);
+        let second = Self(base >> 32);
+        [first, second]
+    }
+
+    pub const fn new_tss(base: u64, limit: u32) -> [Self; 2] {
+        let first = Self::new(base as u32, limit, 0x89, 0x0);
+        let second = Self(base >> 32);
+        [first, second]
+    }
+}
+
+impl GDT {
+    const LEN: usize = 10;
+    const TLS32_INDEX: usize = 7;
+    const TSS_INDEX: usize = 8;
+
+    pub fn new() -> Self {
+        Self([
+            GDTEntry::NULL,
+            GDTEntry::KERNEL_CODE64,
+            GDTEntry::KERNEL_DATA64,
+            GDTEntry::USER_CODE64,
+            GDTEntry::USER_DATA64,
+            GDTEntry::USER_CODE32,
+            GDTEntry::USER_DATA32,
+            GDTEntry::NULL, // User TLS 32bit
+            GDTEntry::NULL, // TSS Descriptor Low
+            GDTEntry::NULL, // TSS Descriptor High
+        ])
+    }
+
+    pub fn set_tss(&mut self, base: u64) {
+        let tss = GDTEntry::new_tss(base, size_of::<TSS>() as u32 - 1);
+        self.0[Self::TSS_INDEX] = tss[0];
+        self.0[Self::TSS_INDEX + 1] = tss[1];
+    }
+
+    pub fn set_tls32(&mut self, desc: GDTEntry) {
+        self.0[Self::TLS32_INDEX] = desc;
+    }
+
+    pub unsafe fn load(&self) {
+        let len = Self::LEN * 8 - 1;
+        let descriptor: [u64; 2] = [(len as u64) << 48, self.0.as_ptr() as u64];
+        assert!(len < 0x10000, "GDT too large");
+
+        let descriptor_address = &descriptor as *const _ as usize + 6;
+        asm!(
+            "lgdt ({})",
+            "ltr %ax",
+            in(reg) descriptor_address,
+            in("ax") Self::TSS_INDEX as u16 * 8,
+            options(att_syntax)
+        );
+    }
+}

+ 27 - 0
arch/x86_64/src/interrupt.rs

@@ -0,0 +1,27 @@
+use core::arch::asm;
+
+pub fn enable() {
+    unsafe {
+        asm!("sti");
+    }
+}
+
+pub fn disable() {
+    unsafe {
+        asm!("cli");
+    }
+}
+
+pub fn lidt(base: usize, limit: u16) {
+    let mut idt_descriptor = [0u16; 5];
+
+    idt_descriptor[0] = limit;
+    idt_descriptor[1] = base as u16;
+    idt_descriptor[2] = (base >> 16) as u16;
+    idt_descriptor[3] = (base >> 32) as u16;
+    idt_descriptor[4] = (base >> 48) as u16;
+
+    unsafe {
+        asm!("lidt ({})", in(reg) &idt_descriptor, options(att_syntax));
+    }
+}

+ 93 - 0
arch/x86_64/src/io.rs

@@ -0,0 +1,93 @@
+use core::arch::asm;
+
/// Enable SSE and native x87 handling on the current CPU.
///
/// CR0: clears EM (bit 2, x87 emulation) and TS (bit 3, task-switched),
/// then sets MP (bit 1, monitor coprocessor) and NE (bit 5, native FPU
/// error reporting). CR4: sets OSFXSR (bit 9, FXSAVE/FXRSTOR support) and
/// OSXMMEXCPT (bit 10, unmasked SIMD FP exceptions). `fninit` then resets
/// the x87 state to its power-on defaults.
pub fn enable_sse() {
    unsafe {
        asm!(
            "mov %cr0, %rax",
            "and $(~0xc), %rax", // clear CR0.EM | CR0.TS
            "or $0x22, %rax",    // set CR0.MP | CR0.NE
            "mov %rax, %cr0",
            "mov %cr4, %rax",
            "or $0x600, %rax",   // set CR4.OSFXSR | CR4.OSXMMEXCPT
            "mov %rax, %cr4",
            "fninit",
            out("rax") _,
            options(att_syntax, nomem, nostack)
        )
    }
}
+
/// Read one byte from I/O port `no`.
pub fn inb(no: u16) -> u8 {
    unsafe {
        let value: u8;
        asm!(
            "inb %dx, %al",
            in("dx") no,
            out("al") value,
            options(att_syntax, nomem, nostack)
        );
        value
    }
}
+
/// Read one 16-bit word from I/O port `no`.
pub fn inw(no: u16) -> u16 {
    unsafe {
        let value: u16;
        asm!(
            "inw %dx, %ax",
            in("dx") no,
            out("ax") value,
            options(att_syntax, nomem, nostack)
        );
        value
    }
}
+
/// Read one 32-bit doubleword from I/O port `no`.
pub fn inl(no: u16) -> u32 {
    unsafe {
        let value: u32;
        asm!(
            "inl %dx, %eax",
            in("dx") no,
            out("eax") value,
            options(att_syntax, nomem, nostack)
        );
        value
    }
}
+
/// Write the byte `data` to I/O port `no`.
pub fn outb(no: u16, data: u8) {
    unsafe {
        asm!(
            "outb %al, %dx",
            in("al") data,
            in("dx") no,
            options(att_syntax, nomem, nostack)
        );
    }
}
+
/// Write the 16-bit word `data` to I/O port `no`.
pub fn outw(no: u16, data: u16) {
    unsafe {
        asm!(
            "outw %ax, %dx",
            in("ax") data,
            in("dx") no,
            options(att_syntax, nomem, nostack)
        );
    }
}
+
/// Write the 32-bit doubleword `data` to I/O port `no`.
pub fn outl(no: u16, data: u32) {
    unsafe {
        asm!(
            "outl %eax, %dx",
            in("eax") data,
            in("dx") no,
            options(att_syntax, nomem, nostack)
        );
    }
}

+ 70 - 0
arch/x86_64/src/lib.rs

@@ -0,0 +1,70 @@
+#![no_std]
+
pub mod vm {
    //! Thin wrappers around the x86-64 virtual-memory control instructions.
    use core::arch::asm;

    /// Invalidate the TLB entry that maps `vaddr` (`invlpg`).
    #[inline(always)]
    pub fn invlpg(vaddr: usize) {
        unsafe {
            asm!("invlpg ({})", in(reg) vaddr, options(att_syntax));
        }
    }

    /// Flush all non-global TLB entries by rewriting CR3 with its current
    /// value.
    #[inline(always)]
    pub fn invlpg_all() {
        unsafe {
            asm!(
                "mov %cr3, %rax",
                "mov %rax, %cr3",
                out("rax") _,
                options(att_syntax)
            );
        }
    }

    /// Read the raw value of CR3 (current top-level page-table register).
    #[inline(always)]
    pub fn get_cr3() -> usize {
        unsafe {
            let value: usize;
            asm!("mov %cr3, {0}", out(reg) value, options(att_syntax));
            value
        }
    }

    /// Write `pfn` into CR3, switching the active top-level page table.
    #[inline(always)]
    pub fn set_cr3(pfn: usize) {
        unsafe {
            asm!("mov {0}, %cr3", in(reg) pfn, options(att_syntax));
        }
    }

    /// Read CR2, which holds the faulting linear address after a page
    /// fault.
    #[inline(always)]
    pub fn get_cr2() -> usize {
        unsafe {
            let value: usize;
            asm!("mov %cr2, {}", out(reg) value, options(att_syntax));
            value
        }
    }
}
+
+pub mod gdt;
+pub mod interrupt;
+pub mod io;
+pub mod task;

+ 172 - 0
arch/x86_64/src/task.rs

@@ -0,0 +1,172 @@
+use core::arch::{asm, global_asm};
+
+use crate::interrupt;
+
// A 64-bit pointer stored as two 32-bit halves, matching the unaligned
// split fields of the hardware TSS layout.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct SP {
    low: u32,
    high: u32,
}

/// The x86-64 hardware task-state segment (0x68 bytes): three privilege
/// stack pointers, seven interrupt-stack-table entries, and the I/O
/// permission bitmap base.
#[repr(C)]
pub struct TSS {
    _reserved1: u32,
    rsp: [SP; 3],
    _reserved2: u32,
    _reserved3: u32,
    ist: [SP; 7],
    _reserved4: u32,
    _reserved5: u32,
    _reserved6: u16,
    iomap_base: u16,
}

impl TSS {
    /// Create a zero-filled TSS.
    ///
    /// NOTE(review): `iomap_base` is left at 0, which points inside the
    /// TSS rather than past its limit — confirm user-mode I/O is never
    /// granted through this TSS.
    pub fn new() -> Self {
        let zero = SP { low: 0, high: 0 };
        Self {
            _reserved1: 0,
            rsp: [zero; 3],
            _reserved2: 0,
            _reserved3: 0,
            ist: [zero; 7],
            _reserved4: 0,
            _reserved5: 0,
            _reserved6: 0,
            iomap_base: 0,
        }
    }

    /// Set RSP0, the stack pointer loaded on a ring-3 → ring-0 transition.
    pub fn set_rsp0(&mut self, rsp: u64) {
        self.rsp[0] = SP {
            low: rsp as u32,
            high: (rsp >> 32) as u32,
        };
    }
}
+
/// Execute `hlt`, idling this CPU until the next interrupt arrives.
#[inline(always)]
pub fn halt() {
    unsafe { asm!("hlt", options(att_syntax, nostack)) };
}
+
/// Execute `pause`, the spin-wait hint that reduces power use and bus
/// contention inside busy-wait loops.
#[inline(always)]
pub fn pause() {
    unsafe { asm!("pause", options(att_syntax, nostack)) };
}
+
/// Halt this CPU forever: interrupts are disabled and `hlt` is re-issued
/// in a loop (the loop matters because events such as NMIs can still wake
/// a halted CPU).
#[inline(always)]
pub fn freeze() -> ! {
    loop {
        interrupt::disable();
        halt();
    }
}
+
/// Read the model-specific register `msr`, combining the `edx:eax` halves
/// into a single 64-bit value.
#[inline(always)]
pub fn rdmsr(msr: u32) -> u64 {
    let lo: u32;
    let hi: u32;

    unsafe {
        asm!(
            "rdmsr",
            in("ecx") msr,
            out("eax") lo,
            out("edx") hi,
            options(att_syntax),
        );
    }

    ((hi as u64) << 32) | (lo as u64)
}
+
/// Write `value` into the model-specific register `msr`, split into the
/// `edx:eax` register pair the instruction expects.
#[inline(always)]
pub fn wrmsr(msr: u32, value: u64) {
    let lo = value as u32;
    let hi = (value >> 32) as u32;

    unsafe {
        asm!(
            "wrmsr",
            in("ecx") msr,
            in("eax") lo,
            in("edx") hi,
            options(att_syntax),
        );
    }
}
+
// Assembly implementation of the light (kernel-to-kernel) context switch.
//
// Outgoing-task stack layout after the prologue: `pushf` saves RFLAGS,
// then 0x38 bytes are reserved — rbx/rbp/r12-r15 are stored at offsets
// 0x08..0x30 while the slot at 0x00 is padding to keep the stack 16-byte
// aligned (it is discarded via `pop %rax` on the restore path). Finally
// the previous frame pointer read from (%rdi) is pushed, acting as a
// saved base pointer. The current stack pointer is stored through %rdi
// and the new one loaded from %rsi; the epilogue mirrors the prologue on
// the incoming task's stack. The `movcfi`/`movrst` macros pair each
// register move with its CFI directive so unwinders can step across the
// switch.
global_asm!(
    r"
    .macro movcfi reg, offset
        mov \reg, \offset(%rsp)
        .cfi_rel_offset \reg, \offset
    .endm

    .macro movrst reg, offset
        mov \offset(%rsp), \reg
        .cfi_restore \reg
    .endm

    .globl __context_switch_light
    .type __context_switch_light @function
    __context_switch_light:
    .cfi_startproc

        pushf
    .cfi_def_cfa_offset 0x10

        sub $0x38, %rsp  # extra 8 bytes to align to 16 bytes
    .cfi_def_cfa_offset 0x48

        movcfi %rbx, 0x08
        movcfi %rbp, 0x10
        movcfi %r12, 0x18
        movcfi %r13, 0x20
        movcfi %r14, 0x28
        movcfi %r15, 0x30

        push (%rdi)      # save sp of previous stack frame of current
                         # acts as saving bp
    .cfi_def_cfa_offset 0x50

        mov %rsp, (%rdi) # save sp of current stack
        mov (%rsi), %rsp # load sp of target stack

        pop (%rsi)       # load sp of previous stack frame of target
                         # acts as restoring previous bp
    .cfi_def_cfa_offset 0x48

        pop %rax         # align to 16 bytes
    .cfi_def_cfa_offset 0x40

        mov 0x28(%rsp), %r15
        mov 0x20(%rsp), %r14
        mov 0x18(%rsp), %r13
        mov 0x10(%rsp), %r12
        mov 0x08(%rsp), %rbp
        mov 0x00(%rsp), %rbx

        add $0x30, %rsp
    .cfi_def_cfa_offset 0x10

        popf
    .cfi_def_cfa_offset 0x08

        ret
    .cfi_endproc
    ",
    options(att_syntax),
);
+
extern "C" {
    // Implemented in the `global_asm!` block in this file: saves
    // callee-saved state on the current stack, stores the resulting stack
    // pointer through `current_task_sp`, and resumes execution from the
    // stack pointer read from `next_task_sp`.
    fn __context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize);
}

/// Switch from the current task's kernel stack to the one saved in
/// `*next_task_sp`, saving the current stack pointer into
/// `*current_task_sp`.
///
/// NOTE(review): both pointers must be valid and writable, and
/// `*next_task_sp` must have been produced by a previous switch (or
/// hand-crafted to match the asm frame layout) — consider marking this
/// function `unsafe`; TODO confirm with callers.
#[inline(always)]
pub fn context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize) {
    unsafe { __context_switch_light(current_task_sp, next_task_sp) }
}

+ 2 - 2
doc/mem_layout.txt

@@ -1,8 +1,8 @@
 physical memory
 
-0x0000 - 0x1000 : GDT, TSS, LDT and some early kernel data
+0x0000 - 0x1000 : GDT for kernel initialization use and some early kernel data
 0x1000 - 0x2000 : kernel stage1
-0x2000 - 0x3000 : kernel PML4
+0x2000 - 0x3000 : kernel space PML4
 0x3000 - 0x4000 : kernel PDPT for physical memory mappings
 0x4000 - 0x5000 : kernel PDPT for kernel space
 0x5000 - 0x6000 : kernel PD for kernel image

+ 4 - 1
gblibc/CMakeLists.txt

@@ -45,9 +45,12 @@ add_library(crt0_32 OBJECT
     src/crt0.s
 )
 
-target_compile_options(gblibc_32 PRIVATE "-m32")
+target_compile_options(gblibc_32 PRIVATE "-fno-pic")
 target_compile_options(gblibc_32 PRIVATE "-mcmodel=32")
+target_compile_options(gblibc_32 PRIVATE "-m32")
+target_compile_options(crt0_32 PRIVATE "-fno-pic")
 target_compile_options(crt0_32 PRIVATE "-m32")
+target_compile_options(crt0_32 PRIVATE "-mcmodel=32")
 target_link_options(gblibc_32 PRIVATE "LINKER:-melf_i386")
 target_link_options(crt0_32 PRIVATE "LINKER:-melf_i386")
 

+ 2 - 0
gblibc/include/errno.h

@@ -30,7 +30,9 @@ extern int* __errno_location(void);
 #define ESPIPE 29
 #define EROFS 30
 #define EPIPE 32
+#define ERANGE 34
 #define ELOOP 40
+#define EOVERFLOW 75
 
 #ifdef __cplusplus
 }

+ 0 - 19
include/kernel/async/lock.hpp

@@ -8,11 +8,6 @@ namespace kernel::async {
 
 using spinlock_t = unsigned long volatile;
 using lock_context_t = unsigned long;
-using preempt_count_t = std::size_t;
-
-void preempt_disable();
-void preempt_enable();
-preempt_count_t preempt_count();
 
 void init_spinlock(spinlock_t& lock);
 
@@ -31,24 +26,10 @@ class mutex {
     mutex(const mutex&) = delete;
     ~mutex();
 
-    void lock();
-    void unlock();
-
     lock_context_t lock_irq();
     void unlock_irq(lock_context_t state);
 };
 
-class lock_guard {
-   private:
-    mutex& m_mtx;
-
-   public:
-    explicit inline lock_guard(mutex& mtx) : m_mtx{mtx} { m_mtx.lock(); }
-    lock_guard(const lock_guard&) = delete;
-
-    inline ~lock_guard() { m_mtx.unlock(); }
-};
-
 class lock_guard_irq {
    private:
     mutex& m_mtx;

+ 0 - 29
include/kernel/async/waitlist.hpp

@@ -1,29 +0,0 @@
-#pragma once
-
-#include <set>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/task/forward.hpp>
-
-namespace kernel::async {
-
-class wait_list {
-   private:
-    mutex m_mtx;
-    std::set<task::thread*> m_subscribers;
-
-    wait_list(const wait_list&) = delete;
-
-   public:
-    explicit wait_list() = default;
-
-    // @return whether the wait is interrupted
-    bool wait(mutex& lck);
-
-    void subscribe();
-
-    void notify_one();
-    void notify_all();
-};
-
-} // namespace kernel::async

+ 0 - 6
include/kernel/hw/pci.hpp

@@ -9,12 +9,6 @@
 
 #include <kernel/mem/phys.hpp>
 
-namespace kernel::kinit {
-
-void init_pci();
-
-} // namespace kernel::kinit
-
 namespace kernel::hw::pci {
 
 struct PACKED device_header_base {

+ 0 - 20
include/kernel/hw/serial.hpp

@@ -1,20 +0,0 @@
-#pragma once
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define PORT_SERIAL0 (0x3f8)
-#define PORT_SERIAL1 (0x2f8)
-
-int32_t init_serial_port(port_id_t port);
-
-int32_t is_serial_has_data(port_id_t port);
-uint8_t serial_read_data(port_id_t port);
-
-int32_t is_serial_ready_for_transmition(port_id_t port);
-void serial_send_data(port_id_t port, uint8_t data);
-
-#ifdef __cplusplus
-}
-#endif

+ 0 - 11
include/kernel/hw/timer.hpp

@@ -1,11 +0,0 @@
-#pragma once
-
-#include <cstddef>
-
-namespace kernel::hw::timer {
-void init_pit(void);
-void inc_tick(void);
-
-std::size_t current_ticks(void);
-
-} // namespace kernel::hw::timer

+ 0 - 5
include/kernel/interrupt.hpp

@@ -36,8 +36,3 @@ struct interrupt_stack {
 struct mmx_registers {
     uint8_t data[512]; // TODO: list of content
 };
-
-namespace kernel::kinit {
-void init_interrupt();
-
-} // namespace kernel::kinit

+ 0 - 11
include/kernel/irq.hpp

@@ -1,11 +0,0 @@
-#pragma once
-
-#include <functional>
-
-namespace kernel::irq {
-
-using irq_handler_t = std::function<void()>;
-
-void register_handler(int irqno, irq_handler_t handler);
-
-}; // namespace kernel::irq

+ 2 - 15
include/kernel/log.hpp

@@ -1,20 +1,7 @@
 #pragma once
 
-#include <stdio.h>
-
-#include <kernel/tty.hpp>
-
-#define kmsgf(fmt, ...)                                                  \
-    if (1) {                                                             \
-        char buf[512];                                                   \
-        snprintf(buf, sizeof(buf), fmt "\n" __VA_OPT__(, ) __VA_ARGS__); \
-        if (kernel::tty::console)                                        \
-            kernel::tty::console->print(buf);                            \
-    }
-
-#define kmsg(msg)             \
-    if (kernel::tty::console) \
-    kernel::tty::console->print(msg "\n")
+#define kmsgf(fmt, ...)
+#define kmsg(msg)
 
 #ifdef NDEBUG
 #define kmsgf_debug(...)

+ 0 - 112
include/kernel/mem/mm_list.hpp

@@ -1,112 +0,0 @@
-#pragma once
-
-#include "paging.hpp"
-#include "vm_area.hpp"
-
-#include <set>
-
-#include <stdint.h>
-
-#include <kernel/vfs/dentry.hpp>
-
-namespace kernel::mem {
-
-constexpr uintptr_t KERNEL_SPACE_START = 0x8000000000000000ULL;
-constexpr uintptr_t USER_SPACE_MEMORY_TOP = 0x0000800000000000ULL;
-constexpr uintptr_t MMAP_MIN_ADDR = 0x0000000000001000ULL;
-constexpr uintptr_t STACK_MIN_ADDR = 0x0000700000000000ULL;
-
-class mm_list {
-   private:
-    struct comparator {
-        constexpr bool operator()(const vm_area& lhs,
-                                  const vm_area& rhs) const noexcept {
-            return lhs < rhs;
-        }
-        constexpr bool operator()(const vm_area& lhs,
-                                  uintptr_t rhs) const noexcept {
-            return lhs < rhs;
-        }
-        constexpr bool operator()(uintptr_t lhs,
-                                  const vm_area& rhs) const noexcept {
-            return lhs < rhs;
-        }
-    };
-
-   public:
-    using list_type = std::set<vm_area, comparator>;
-    using iterator = list_type::iterator;
-    using const_iterator = list_type::const_iterator;
-
-    struct map_args {
-        // MUSE BE aligned to 4kb boundary
-        uintptr_t vaddr;
-        // MUSE BE aligned to 4kb boundary
-        std::size_t length;
-
-        unsigned long flags;
-
-        const fs::rust_inode_handle* file_inode;
-        // MUSE BE aligned to 4kb boundary
-        std::size_t file_offset;
-    };
-
-   private:
-    list_type m_areas;
-    paging::pfn_t m_pt;
-    iterator m_brk{};
-
-   public:
-    // default constructor copies kernel_mms
-    explicit mm_list();
-    // copies kernel_mms and mirrors user space
-    explicit mm_list(const mm_list& other);
-
-    constexpr mm_list(mm_list&& v)
-        : m_areas(std::move(v.m_areas))
-        , m_pt(std::exchange(v.m_pt, 0))
-        , m_brk{std::move(v.m_brk)} {}
-
-    ~mm_list();
-
-    void switch_pd() const noexcept;
-
-    int register_brk(uintptr_t addr);
-    uintptr_t set_brk(uintptr_t addr);
-
-    void clear();
-
-    // split the memory block at the specified address
-    // return: iterator to the new block
-    iterator split(iterator area, uintptr_t at);
-
-    bool is_avail(uintptr_t addr) const;
-    bool is_avail(uintptr_t start, std::size_t length) const noexcept;
-
-    uintptr_t find_avail(uintptr_t hint, size_t length) const;
-
-    int unmap(iterator area, bool should_invalidate_tlb);
-    int unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb);
-
-    int mmap(const map_args& args);
-
-    constexpr vm_area* find(uintptr_t lp) {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &iter;
-    }
-
-    constexpr const vm_area* find(uintptr_t lp) const {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &iter;
-    }
-
-    constexpr paging::PSE get_page_table() const noexcept {
-        return paging::PSE{m_pt};
-    }
-};
-
-} // namespace kernel::mem

+ 5 - 47
include/kernel/mem/paging.hpp

@@ -6,6 +6,7 @@
 
 #include <stdint.h>
 
+#include <kernel/interrupt.hpp>
 #include <kernel/mem/paging_asm.h>
 #include <kernel/mem/phys.hpp>
 
@@ -27,10 +28,8 @@ constexpr int idx_p1(uintptr_t vaddr) noexcept {
     return (vaddr >> 12) & 0x1ff;
 }
 
-constexpr std::tuple<int, int, int, int, int> idx_all(
-    uintptr_t vaddr) noexcept {
-    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr),
-            idx_p1(vaddr)};
+constexpr std::tuple<int, int, int, int, int> idx_all(uintptr_t vaddr) noexcept {
+    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr), idx_p1(vaddr)};
 }
 
 // page frame number
@@ -46,15 +45,11 @@ constexpr psattr_t PA_USER_DATA = PA_DATA | PA_G | PA_US;
 
 constexpr psattr_t PA_PAGE_TABLE = PA_P | PA_RW;
 constexpr psattr_t PA_KERNEL_PAGE_TABLE = PA_PAGE_TABLE | PA_G;
-constexpr psattr_t PA_USER_PAGE_TABLE = PA_PAGE_TABLE | PA_US;
 
 constexpr psattr_t PA_DATA_HUGE = PA_DATA | PA_PS;
 constexpr psattr_t PA_KERNEL_DATA_HUGE = PA_DATA_HUGE | PA_G;
 constexpr psattr_t PA_USER_DATA_HUGE = PA_DATA_HUGE | PA_US;
 
-constexpr psattr_t PA_ANONYMOUS_PAGE = PA_P | PA_US | PA_COW | PA_ANON;
-constexpr psattr_t PA_MMAPPED_PAGE = PA_US | PA_COW | PA_ANON | PA_MMAP;
-
 namespace __inner {
     using pse_t = uint64_t;
 
@@ -74,9 +69,7 @@ class PSE {
 
     constexpr pfn_t pfn() const noexcept { return *m_ptrbase & ~PA_MASK; }
 
-    constexpr psattr_t attributes() const noexcept {
-        return *m_ptrbase & PA_MASK;
-    }
+    constexpr psattr_t attributes() const noexcept { return *m_ptrbase & PA_MASK; }
 
     constexpr PSE operator[](std::size_t nth) const noexcept {
         return PSE{m_ptrbase.phys() + 8 * nth};
@@ -135,41 +128,6 @@ constexpr unsigned long PAGE_FAULT_PK = 0x00000020;
 constexpr unsigned long PAGE_FAULT_SS = 0x00000040;
 constexpr unsigned long PAGE_FAULT_SGX = 0x00008000;
 
-void handle_page_fault(unsigned long err);
-
-class vaddr_range {
-    std::size_t n;
-
-    int idx4;
-    int idx3;
-    int idx2;
-    int idx1;
-
-    PSE pml4;
-    PSE pdpt;
-    PSE pd;
-    PSE pt;
-
-    uintptr_t m_start;
-    uintptr_t m_end;
-
-    bool is_privilege;
-
-   public:
-    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end,
-                         bool is_privilege = false);
-    explicit vaddr_range(std::nullptr_t);
-
-    vaddr_range begin() const noexcept;
-    vaddr_range end() const noexcept;
-
-    PSE operator*() const noexcept;
-
-    vaddr_range& operator++();
-    operator bool() const noexcept;
-
-    // compares remaining pages to iterate
-    bool operator==(const vaddr_range& other) const noexcept;
-};
+void handle_page_fault(interrupt_stack* int_stack);
 
 } // namespace kernel::mem::paging

+ 1 - 2
include/kernel/mem/paging_asm.h

@@ -1,4 +1,3 @@
-
 #define KERNEL_IMAGE_PADDR         0x400000
 #define KERNEL_STAGE1_PADDR        0x001000
 #define KERNEL_PML4                0x002000
@@ -7,10 +6,10 @@
 #define KERNEL_PD_KIMAGE           0x005000
 #define KERNEL_PT_KIMAGE           0x006000
 #define KERNEL_PD_STRUCT_PAGE_ARR  0x007000
-#define EMPTY_PAGE_PFN             0x008000
 
 #define KERNEL_BSS_HUGE_PAGE       0x200000
 
+
 #define PA_P    0x0000000000000001
 #define PA_RW   0x0000000000000002
 #define PA_US   0x0000000000000004

+ 1 - 19
include/kernel/mem/phys.hpp

@@ -13,8 +13,7 @@ namespace kernel::mem {
 
 template <typename T, bool Cached = true>
 class physaddr {
-    static constexpr uintptr_t PHYS_OFFSET =
-        Cached ? 0xffffff0000000000ULL : 0xffffff4000000000ULL;
+    static constexpr uintptr_t PHYS_OFFSET = Cached ? 0xffffff0000000000ULL : 0xffffff4000000000ULL;
 
     uintptr_t m_ptr;
 
@@ -33,21 +32,4 @@ class physaddr {
     constexpr uintptr_t phys() const noexcept { return m_ptr; }
 };
 
-//  gdt[0]:  null
-//  gdt[1]:  kernel code
-//  gdt[2]:  kernel data
-//  gdt[3]:  user code
-//  gdt[4]:  user data
-//  gdt[5]:  user code compability mode
-//  gdt[6]:  user data compability mode
-//  gdt[7]:  thread local 32bit
-//  gdt[8]:  tss descriptor low
-//  gdt[9]:  tss descriptor high
-//  gdt[10]: ldt descriptor low
-//  gdt[11]: ldt descriptor high
-//  gdt[12]: null segment(in ldt)
-//  gdt[13]: thread local 64bit(in ldt)
-// &gdt[14]: tss of 0x68 bytes from here
-constexpr physaddr<uint64_t> gdt{0x00000000 + 1 - 1};
-
 } // namespace kernel::mem

+ 0 - 60
include/kernel/mem/vm_area.hpp

@@ -1,60 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-#include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-namespace kernel::mem {
-
-constexpr unsigned long MM_WRITE = 0x00000000'00000001;
-constexpr unsigned long MM_EXECUTE = 0x00000000'00000002;
-constexpr unsigned long MM_MAPPED = 0x00000000'00000004;
-constexpr unsigned long MM_ANONYMOUS = 0x00000000'00000008;
-constexpr unsigned long MM_INTERNAL_MASK = 0xffffffff'00000000;
-constexpr unsigned long MM_BREAK = 0x80000000'00000000;
-
-struct vm_area {
-    uintptr_t start;
-    uintptr_t end;
-
-    unsigned long flags;
-
-    const fs::rust_inode_handle* mapped_file;
-    std::size_t file_offset;
-
-    constexpr bool is_avail(uintptr_t ostart, uintptr_t oend) const noexcept {
-        return (ostart >= end || oend <= start);
-    }
-
-    constexpr bool operator<(const vm_area& rhs) const noexcept {
-        return end <= rhs.start;
-    }
-    constexpr bool operator<(uintptr_t rhs) const noexcept {
-        return end <= rhs;
-    }
-    friend constexpr bool operator<(uintptr_t lhs,
-                                    const vm_area& rhs) noexcept {
-        return lhs < rhs.start;
-    }
-
-    constexpr vm_area(uintptr_t start, unsigned long flags, uintptr_t end,
-                      const fs::rust_inode_handle* mapped_file = nullptr,
-                      std::size_t offset = 0)
-        : start{start}
-        , end{end}
-        , flags{flags}
-        , mapped_file{mapped_file}
-        , file_offset{offset} {}
-
-    constexpr vm_area(uintptr_t start, unsigned long flags,
-                      const fs::rust_inode_handle* mapped_file = nullptr,
-                      std::size_t offset = 0)
-        : start{start}
-        , end{start}
-        , flags{flags}
-        , mapped_file{mapped_file}
-        , file_offset{offset} {}
-};
-
-} // namespace kernel::mem

+ 0 - 37
include/kernel/module.hpp

@@ -1,37 +0,0 @@
-#pragma once
-
-#include <memory>
-
-#include <types/types.h>
-
-#define MODULE_LOADER(name) \
-    static std::unique_ptr<kernel::kmod::kmod> __module##name##_loader()
-
-#define INTERNAL_MODULE(name, type)                                         \
-    MODULE_LOADER(name);                                                    \
-    SECTION(".kmods")                                                       \
-    __attribute__((used))                                                   \
-    std::unique_ptr<kernel::kmod::kmod> (*const __module##name##_entry)() = \
-        __module##name##_loader;                                            \
-    MODULE_LOADER(name) {                                                   \
-        return std::make_unique<type>();                                    \
-    }
-
-namespace kernel::kmod {
-
-struct kmod {
-    const char* const name;
-
-    explicit kmod(const char* name);
-
-    virtual ~kmod() = default;
-    kmod(const kmod&) = delete;
-    kmod& operator=(const kmod&) = delete;
-
-    virtual int init() = 0;
-};
-
-extern "C" std::unique_ptr<kmod> (*const KMOD_LOADERS_START[])();
-void load_internal_modules();
-
-} // namespace kernel::kmod

+ 2 - 156
include/kernel/process.hpp

@@ -1,13 +1,9 @@
 #pragma once
 
-#include <list>
-#include <map>
-#include <set>
-#include <tuple>
-#include <utility>
-
 #include <assert.h>
+#include <errno.h>
 #include <fcntl.h>
+#include <signal.h>
 #include <stdint.h>
 #include <sys/types.h>
 
@@ -16,158 +12,8 @@
 #include <types/path.hpp>
 #include <types/types.h>
 
-#include <kernel/async/waitlist.hpp>
 #include <kernel/interrupt.hpp>
-#include <kernel/mem/mm_list.hpp>
 #include <kernel/mem/paging.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/task/current.hpp>
-#include <kernel/task/thread.hpp>
-#include <kernel/tty.hpp>
-#include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/filearr.hpp>
-
-class process;
-
-class proclist;
-
-inline process* volatile current_process;
-inline proclist* procs;
-
-struct process_attr {
-    uint16_t system : 1;
-    uint16_t zombie : 1 = 0;
-};
-
-class process {
-   public:
-    struct wait_obj {
-        pid_t pid;
-        int code;
-    };
-
-   public:
-    kernel::mem::mm_list mms{};
-    std::set<kernel::task::thread> thds;
-    kernel::async::wait_list waitlist;
-
-    kernel::async::mutex mtx_waitprocs;
-    std::list<wait_obj> waitprocs;
-
-    process_attr attr{};
-    fs::filearray files;
-    fs::dentry_pointer cwd{};
-    mode_t umask{0022};
-
-    pid_t pid{};
-    pid_t ppid{};
-    pid_t pgid{};
-    pid_t sid{};
-
-    kernel::tty::tty* control_tty{};
-    struct fs::fs_context fs_context;
-    std::set<pid_t> children;
-
-   public:
-    process(const process&) = delete;
-    explicit process(const process& parent, pid_t pid);
-
-    // this function is used for system initialization
-    // DO NOT use this after the system is on
-    explicit process(pid_t pid, pid_t ppid);
-
-    constexpr bool is_system(void) const { return attr.system; }
-    constexpr bool is_zombie(void) const { return attr.zombie; }
-
-    void send_signal(kernel::signal_list::signo_type signal);
-};
-
-class proclist final {
-   private:
-    std::map<pid_t, process> m_procs;
-    pid_t m_nextpid = 2;
-
-    constexpr pid_t next_pid() { return m_nextpid++; }
-    process& real_emplace(pid_t pid, pid_t ppid);
-
-   public:
-    proclist();
-
-    constexpr process& copy_from(process& proc) {
-        pid_t pid = next_pid();
-        auto [iter, inserted] = m_procs.try_emplace(pid, proc, pid);
-        assert(inserted);
-
-        proc.children.insert(pid);
-        return iter->second;
-    }
-
-    constexpr void remove(pid_t pid) {
-        make_children_orphans(pid);
-
-        auto proc_iter = m_procs.find(pid);
-
-        auto ppid = proc_iter->second.ppid;
-        find(ppid).children.erase(pid);
-
-        m_procs.erase(proc_iter);
-    }
-
-    constexpr std::pair<process*, bool> try_find(pid_t pid) const {
-        auto iter = m_procs.find(pid);
-        if (iter)
-            return {(process*)&iter->second, true};
-        else
-            return {nullptr, false};
-    }
-
-    // if process doesn't exist, the behavior is undefined
-    constexpr process& find(pid_t pid) {
-        auto [ptr, found] = try_find(pid);
-        assert(found);
-        return *ptr;
-    }
-
-    constexpr void make_children_orphans(pid_t pid) {
-        auto& children = find(pid).children;
-        auto& init_children = find(1).children;
-
-        for (auto item : children) {
-            init_children.insert(item);
-            find(item).ppid = 1;
-        }
-
-        children.clear();
-    }
-
-    // the process MUST exist, or the behavior is undefined
-    void send_signal(pid_t pid, kernel::signal_list::signo_type signal) {
-        auto& proc = find(pid);
-        proc.send_signal(signal);
-    }
-    void send_signal_grp(pid_t pgid, kernel::signal_list::signo_type signal) {
-        // TODO: find processes that are in the same session quickly
-        for (auto& [pid, proc] : m_procs) {
-            if (proc.pgid != pgid)
-                continue;
-            proc.send_signal(signal);
-        }
-    }
-
-    void kill(pid_t pid, int exit_code);
-
-    constexpr auto begin() const { return m_procs.begin(); }
-    constexpr auto end() const { return m_procs.end(); }
-};
-
-void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn);
-/// @return true if returned normally, false if being interrupted
-bool schedule(void);
-void NORETURN schedule_noreturn(void);
 
 void NORETURN freeze(void);
-void NORETURN kill_current(int signo);
-
-void check_signal(void);

+ 0 - 74
include/kernel/signal.hpp

@@ -1,74 +0,0 @@
-#pragma once
-
-#include <list>
-#include <map>
-
-#include <signal.h>
-#include <stdint.h>
-
-#include <types/cplusplus.hpp>
-#include <types/types.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/interrupt.hpp>
-
-namespace kernel {
-
-using sigmask_type = uint64_t;
-
-struct sigaction {
-    sighandler_t sa_handler;
-    unsigned long sa_flags;
-    sigrestorer_t sa_restorer;
-    sigmask_type sa_mask;
-};
-
-class signal_list {
-   public:
-    using signo_type = uint32_t;
-    using list_type = std::list<signo_type>;
-
-   private:
-    list_type m_list;
-    sigmask_type m_mask{};
-    std::map<signo_type, sigaction> m_handlers;
-    async::mutex m_mtx;
-
-   public:
-    static constexpr bool check_valid(signo_type sig) {
-        return sig >= 1 && sig <= 64;
-    }
-
-   public:
-    constexpr signal_list() = default;
-    constexpr signal_list(const signal_list& val)
-        : m_list{val.m_list}
-        , m_mask{val.m_mask}
-        , m_handlers{val.m_handlers}
-        , m_mtx{} {}
-
-    constexpr signal_list(signal_list&& val)
-        : m_list{std::move(val.m_list)}
-        , m_mask{std::move(val.m_mask)}
-        , m_handlers{std::move(val.m_handlers)}
-        , m_mtx{} {}
-
-    void on_exec();
-
-    sigmask_type get_mask() const;
-    void set_mask(sigmask_type mask);
-    void mask(sigmask_type mask);
-    void unmask(sigmask_type mask);
-
-    void set_handler(signo_type signal, const sigaction& action);
-    void get_handler(signo_type signal, sigaction& action) const;
-
-    signo_type pending_signal();
-
-    // return value: whether the thread should wake up
-    bool raise(signo_type signal);
-    void handle(interrupt_stack* context, mmx_registers* mmxregs);
-    void after_signal(signo_type signal);
-};
-
-} // namespace kernel

+ 0 - 118
include/kernel/syscall.hpp

@@ -1,118 +0,0 @@
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include <bits/alltypes.h>
-#include <poll.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/uio.h>
-#include <sys/utsname.h>
-#include <time.h>
-
-#include <types/types.h>
-
-#include <kernel/interrupt.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/user/thread_local.hpp>
-
-#define SYSCALL64_ARG1(type, name) type name = (type)((data)->head.s_regs.rdi)
-#define SYSCALL64_ARG2(type, name) type name = (type)((data)->head.s_regs.rsi)
-#define SYSCALL64_ARG3(type, name) type name = (type)((data)->head.s_regs.rdx)
-#define SYSCALL64_ARG4(type, name) type name = (type)((data)->head.s_regs.r10)
-#define SYSCALL64_ARG5(type, name) type name = (type)((data)->head.s_regs.r8)
-#define SYSCALL64_ARG6(type, name) type name = (type)((data)->head.s_regs.r9)
-
-namespace kernel {
-void init_syscall_table();
-
-void handle_syscall32(int no, interrupt_stack* data, mmx_registers* mmxregs);
-void handle_syscall64(int no, interrupt_stack* data, mmx_registers* mmxregs);
-
-namespace syscall {
-    // in fileops.cc
-    ssize_t do_write(int fd, const char __user* buf, size_t n);
-    ssize_t do_read(int fd, char __user* buf, size_t n);
-    int do_close(int fd);
-    int do_dup(int old_fd);
-    int do_dup2(int old_fd, int new_fd);
-    int do_pipe(int __user* pipefd);
-    ssize_t do_getdents(int fd, char __user* buf, size_t cnt);
-    ssize_t do_getdents64(int fd, char __user* buf, size_t cnt);
-    int do_open(const char __user* path, int flags, mode_t mode);
-    int do_symlink(const char __user* target, const char __user* linkpath);
-    int do_readlink(const char __user* pathname, char __user* buf,
-                    size_t buf_size);
-    int do_ioctl(int fd, unsigned long request, uintptr_t arg3);
-    ssize_t do_readv(int fd, const iovec* iov, int iovcnt);
-    ssize_t do_writev(int fd, const iovec* iov, int iovcnt);
-    off_t do_lseek(int fd, off_t offset, int whence);
-    uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags,
-                            int fd, off_t pgoffset);
-    int do_munmap(uintptr_t addr, size_t len);
-    ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset,
-                        size_t count);
-    int do_statx(int dirfd, const char __user* path, int flags,
-                 unsigned int mask, statx __user* statxbuf);
-    int do_fcntl(int fd, int cmd, unsigned long arg);
-    int do_poll(pollfd __user* fds, nfds_t nfds, int timeout);
-    int do_mknod(const char __user* pathname, mode_t mode, dev_t dev);
-    int do_access(const char __user* pathname, int mode);
-    int do_unlink(const char __user* pathname);
-    int do_truncate(const char __user* pathname, long length);
-    int do_mkdir(const char __user* pathname, mode_t mode);
-    int do_socket(int domain, int type, int protocol);
-
-    // in procops.cc
-    int do_chdir(const char __user* path);
-    [[noreturn]] int do_exit(int status);
-    int do_waitpid(pid_t waitpid, int __user* arg1, int options);
-    pid_t do_getsid(pid_t pid);
-    pid_t do_setsid();
-    pid_t do_getpgid(pid_t pid);
-    int do_setpgid(pid_t pid, pid_t pgid);
-    int do_set_thread_area(user::user_desc __user* ptr);
-    pid_t do_set_tid_address(int __user* tidptr);
-    int do_prctl(int option, uintptr_t arg2);
-    int do_arch_prctl(int option, uintptr_t arg2);
-    pid_t do_getpid();
-    pid_t do_getppid();
-    uid_t do_getuid();
-    uid_t do_geteuid();
-    gid_t do_getgid();
-    pid_t do_gettid();
-    int do_getcwd(char __user* buf, size_t buf_size);
-    uintptr_t do_brk(uintptr_t addr);
-    int do_umask(mode_t mask);
-    int do_kill(pid_t pid, int sig);
-    int do_tkill(pid_t pid, int sig);
-    int do_rt_sigprocmask(int how, const kernel::sigmask_type __user* set,
-                          kernel::sigmask_type __user* oldset,
-                          size_t sigsetsize);
-    int do_rt_sigaction(int signum, const sigaction __user* act,
-                        sigaction __user* oldact, size_t sigsetsize);
-    int do_newuname(new_utsname __user* buf);
-
-    struct execve_retval {
-        uintptr_t ip;
-        uintptr_t sp;
-        int status;
-    };
-
-    execve_retval do_execve(const std::string& exec,
-                            const std::vector<std::string>& args,
-                            const std::vector<std::string>& envs);
-
-    // in mount.cc
-    int do_mount(const char __user* source, const char __user* target,
-                 const char __user* fstype, unsigned long flags,
-                 const void __user* _fsdata);
-
-    // in infoops.cc
-    int do_clock_gettime(clockid_t clk_id, timespec __user* tp);
-    int do_gettimeofday(timeval __user* tv, void __user* tz);
-
-} // namespace syscall
-
-} // namespace kernel

+ 0 - 5
include/kernel/task/current.hpp

@@ -1,5 +0,0 @@
-#pragma once
-
-#include <kernel/task/thread.hpp>
-
-inline kernel::task::thread* volatile current_thread;

+ 0 - 16
include/kernel/task/readyqueue.hpp

@@ -1,16 +0,0 @@
-#pragma once
-
-#include <list>
-
-#include <kernel/task/thread.hpp>
-
-namespace kernel::task::dispatcher {
-
-void enqueue(thread* thd);
-void dequeue(thread* thd);
-
-void setup_idle(thread* idle_thd);
-
-thread* next();
-
-} // namespace kernel::task::dispatcher

+ 0 - 76
include/kernel/task/thread.hpp

@@ -1,76 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <string>
-
-#include <stdint.h>
-#include <sys/types.h>
-
-#include <types/types.h>
-
-#include <kernel/mem/paging.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/user/thread_local.hpp>
-
-namespace kernel::task {
-
-using tid_t = std::size_t;
-
-struct thread {
-   public:
-    using thd_attr_t = uint32_t;
-    static constexpr thd_attr_t SYSTEM = 0x01;
-    static constexpr thd_attr_t READY = 0x02;
-    static constexpr thd_attr_t STOPPED = 0x04;
-    static constexpr thd_attr_t ZOMBIE = 0x08;
-    static constexpr thd_attr_t ISLEEP = 0x10;
-    static constexpr thd_attr_t USLEEP = 0x20;
-
-   private:
-    struct kernel_stack {
-        mem::paging::pfn_t pfn;
-        uintptr_t sp;
-
-        kernel_stack();
-        kernel_stack(const kernel_stack& other);
-        kernel_stack(kernel_stack&& other);
-        ~kernel_stack();
-
-        uint64_t pushq(uint64_t val);
-        uint32_t pushl(uint32_t val);
-
-        void load_interrupt_stack() const;
-    };
-
-   public:
-    kernel_stack kstack;
-    pid_t owner;
-    thd_attr_t attr;
-    signal_list signals;
-
-    int* __user set_child_tid{};
-    int* __user clear_child_tid{};
-
-    std::string name{};
-    uint64_t tls_desc32{};
-    std::size_t elected_times{};
-
-    explicit thread(std::string name, pid_t owner);
-    thread(const thread& val, pid_t owner);
-
-    int set_thread_area(user::user_desc* ptr);
-    int load_thread_area32() const;
-
-    void set_attr(thd_attr_t new_attr);
-
-    void send_signal(signal_list::signo_type signal);
-
-    thread(thread&& val) = default;
-
-    tid_t tid() const;
-
-    bool operator<(const thread& rhs) const;
-    bool operator==(const thread& rhs) const;
-};
-
-} // namespace kernel::task

+ 0 - 73
include/kernel/tty.hpp

@@ -1,73 +0,0 @@
-#pragma once
-
-#include <string>
-
-#include <stdint.h>
-#include <sys/types.h>
-#include <termios.h>
-
-#include <types/allocator.hpp>
-#include <types/buffer.hpp>
-#include <types/cplusplus.hpp>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-
-namespace kernel::tty {
-
-class tty : public types::non_copyable {
-   public:
-    static constexpr size_t BUFFER_SIZE = 4096;
-
-   private:
-    void _real_commit_char(int c);
-    void _echo_char(int c);
-
-    int _do_erase(bool should_echo);
-
-   public:
-    explicit tty(std::string name);
-    virtual void putchar(char c) = 0;
-    void print(const char* str);
-    ssize_t read(char* buf, size_t buf_size, size_t n);
-    ssize_t write(const char* buf, size_t n);
-
-    // characters committed to buffer will be handled
-    // by the input line discipline (N_TTY)
-    void commit_char(int c);
-
-    // print character to the output
-    // characters will be handled by the output line discipline
-    void show_char(int c);
-
-    void clear_read_buf(void);
-
-    // TODO: formal poll support
-    int poll();
-
-    constexpr void set_pgrp(pid_t pgid) { fg_pgroup = pgid; }
-
-    constexpr pid_t get_pgrp(void) const { return fg_pgroup; }
-
-    termios termio;
-    std::string name;
-
-   protected:
-    async::mutex mtx_buf;
-    types::buffer buf;
-    async::wait_list waitlist;
-
-    pid_t fg_pgroup;
-};
-
-class vga_tty : public virtual tty {
-   public:
-    vga_tty();
-    virtual void putchar(char c) override;
-};
-
-inline tty* console;
-
-int register_tty(tty* tty_dev);
-
-} // namespace kernel::tty

+ 0 - 21
include/kernel/user/thread_local.hpp

@@ -1,21 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-namespace kernel::user {
-
-struct user_desc {
-    uint32_t entry_number;
-    uint32_t base_addr;
-    uint32_t limit;
-    uint32_t seg_32bit : 1;
-    uint32_t contents : 2;
-    uint32_t read_exec_only : 1;
-    uint32_t limit_in_pages : 1;
-    uint32_t seg_not_present : 1;
-    uint32_t useable : 1;
-};
-
-void load_thread_area32(uint64_t desc);
-
-} // namespace kernel::user

+ 0 - 84
include/kernel/vfs.hpp

@@ -5,12 +5,6 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 
-#include <types/path.hpp>
-
-#include <kernel/mem/paging.hpp>
-#include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/file.hpp>
-
 #define NODE_MAJOR(node) (((node) >> 8) & 0xFFU)
 #define NODE_MINOR(node) ((node) & 0xFFU)
 
@@ -20,82 +14,4 @@ constexpr dev_t make_device(uint32_t major, uint32_t minor) {
     return ((major << 8) & 0xFF00U) | (minor & 0xFFU);
 }
 
-// buf, buf_size, cnt
-using chrdev_read = std::function<ssize_t(char*, std::size_t, std::size_t)>;
-
-// buf, cnt
-using chrdev_write = std::function<ssize_t(const char*, std::size_t)>;
-
-struct chrdev_ops {
-    chrdev_read read;
-    chrdev_write write;
-};
-
-struct PACKED user_dirent {
-    ino_t d_ino;       // inode number
-    uint32_t d_off;    // ignored
-    uint16_t d_reclen; // length of this struct user_dirent
-    char d_name[1];    // file name with a padding zero
-    // uint8_t d_type; // file type, with offset of (d_reclen - 1)
-};
-
-struct PACKED user_dirent64 {
-    ino64_t d_ino;     // inode number
-    uint64_t d_off;    // implementation-defined field, ignored
-    uint16_t d_reclen; // length of this struct user_dirent
-    uint8_t d_type;    // file type, with offset of (d_reclen - 1)
-    char d_name[1];    // file name with a padding zero
-};
-
-struct fs_context {
-    dentry_pointer root;
-};
-
-int register_char_device(dev_t node, const chrdev_ops& ops);
-ssize_t char_device_read(dev_t node, char* buf, size_t buf_size, size_t n);
-ssize_t char_device_write(dev_t node, const char* buf, size_t n);
-
-extern "C" int fs_creat(struct dentry* at, mode_t mode);
-extern "C" int fs_mkdir(struct dentry* at, mode_t mode);
-extern "C" int fs_mknod(struct dentry* at, mode_t mode, dev_t sn);
-extern "C" int fs_unlink(struct dentry* at);
-extern "C" int fs_symlink(struct dentry* at, const char* target);
-
-extern "C" int fs_statx(const struct rust_inode_handle* inode,
-                        struct statx* stat, unsigned int mask);
-extern "C" int fs_readlink(const struct rust_inode_handle* inode, char* buf,
-                           size_t buf_size);
-extern "C" int fs_truncate(const struct rust_inode_handle* file, size_t size);
-extern "C" size_t fs_read(const struct rust_inode_handle* file, char* buf,
-                          size_t buf_size, size_t offset, size_t n);
-extern "C" size_t fs_write(const struct rust_inode_handle* file,
-                           const char* buf, size_t offset, size_t n);
-
-using readdir_callback_fn = std::function<int(const char*, size_t, ino_t)>;
-
-extern "C" ssize_t fs_readdir(const struct rust_inode_handle* file,
-                              size_t offset,
-                              const readdir_callback_fn* callback);
-
-extern "C" int fs_mount(dentry* mnt, const char* source,
-                        const char* mount_point, const char* fstype,
-                        unsigned long flags, const void* data);
-
-extern "C" mode_t r_get_inode_mode(struct rust_inode_handle* inode);
-extern "C" size_t r_get_inode_size(struct rust_inode_handle* inode);
-extern "C" bool r_dentry_is_directory(struct dentry* dentry);
-extern "C" bool r_dentry_is_invalid(struct dentry* dentry);
-
-// borrow from dentry->inode
-extern "C" struct rust_inode_handle* r_dentry_get_inode(struct dentry* dentry);
-extern "C" struct dentry* r_get_root_dentry();
-
-#define current_open(...) \
-    fs::open(current_process->fs_context, current_process->cwd, __VA_ARGS__)
-
-std::pair<dentry_pointer, int> open(const fs_context& context,
-                                    const dentry_pointer& cwd,
-                                    types::string_view path,
-                                    bool follow_symlinks = true);
-
 } // namespace fs

+ 0 - 28
include/kernel/vfs/dentry.hpp

@@ -1,28 +0,0 @@
-#pragma once
-
-#include <string>
-
-#include <bits/alltypes.h>
-
-#include <types/path.hpp>
-
-#include <kernel/async/lock.hpp>
-
-struct dentry;
-
-namespace fs {
-
-struct rust_vfs_handle {
-    void* data[2];
-};
-
-struct dentry_deleter {
-    void operator()(struct dentry* dentry) const;
-};
-
-using dentry_pointer = std::unique_ptr<struct dentry, dentry_deleter>;
-extern "C" int d_path(struct dentry* dentry, struct dentry* root,
-                      char* out_path, size_t buflen);
-dentry_pointer d_get(const dentry_pointer& dp);
-
-} // namespace fs

+ 0 - 106
include/kernel/vfs/file.hpp

@@ -1,106 +0,0 @@
-#pragma once
-
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/types.h>
-
-#include <types/buffer.hpp>
-#include <types/types.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-namespace fs {
-
-class pipe : public types::non_copyable {
-   private:
-    static constexpr size_t PIPE_SIZE = 4096;
-    static constexpr uint32_t READABLE = 1;
-    static constexpr uint32_t WRITABLE = 2;
-
-   private:
-    types::buffer buf;
-    uint32_t flags;
-    kernel::async::mutex mtx;
-
-    kernel::async::wait_list waitlist_r;
-    kernel::async::wait_list waitlist_w;
-
-   public:
-    pipe();
-
-    void close_read();
-    void close_write();
-
-    int write(const char* buf, size_t n);
-    int read(char* buf, size_t n);
-
-    constexpr bool is_readable() const { return flags & READABLE; }
-
-    constexpr bool is_writeable() const { return flags & WRITABLE; }
-};
-
-struct file {
-    struct file_flags {
-        uint32_t read : 1;
-        uint32_t write : 1;
-        uint32_t append : 1;
-    } flags{};
-
-    file(file_flags flags) : flags(flags) {}
-
-    virtual ~file() = default;
-
-    virtual ssize_t read(char* __user buf, size_t n) = 0;
-    virtual ssize_t do_write(const char* __user buf, size_t n) = 0;
-
-    virtual off_t seek(off_t n, int whence) {
-        return (void)n, (void)whence, -ESPIPE;
-    }
-
-    ssize_t write(const char* __user buf, size_t n) {
-        if (!flags.write)
-            return -EBADF;
-
-        if (flags.append) {
-            seek(0, SEEK_END);
-        }
-
-        return do_write(buf, n);
-    }
-
-    // regular files should override this method
-    virtual int getdents(char* __user buf, size_t cnt) {
-        return (void)buf, (void)cnt, -ENOTDIR;
-    }
-    virtual int getdents64(char* __user buf, size_t cnt) {
-        return (void)buf, (void)cnt, -ENOTDIR;
-    }
-};
-
-struct regular_file : public virtual file {
-    virtual ~regular_file() = default;
-    std::size_t cursor{};
-    struct rust_inode_handle* ind{};
-
-    regular_file(file_flags flags, size_t cursor, rust_inode_handle* ind);
-
-    virtual ssize_t read(char* __user buf, size_t n) override;
-    virtual ssize_t do_write(const char* __user buf, size_t n) override;
-    virtual off_t seek(off_t n, int whence) override;
-    virtual int getdents(char* __user buf, size_t cnt) override;
-    virtual int getdents64(char* __user buf, size_t cnt) override;
-};
-
-struct fifo_file : public virtual file {
-    virtual ~fifo_file() override;
-    std::shared_ptr<pipe> ppipe;
-
-    fifo_file(file_flags flags, std::shared_ptr<fs::pipe> ppipe);
-
-    virtual ssize_t read(char* __user buf, size_t n) override;
-    virtual ssize_t do_write(const char* __user buf, size_t n) override;
-};
-
-} // namespace fs

+ 0 - 51
include/kernel/vfs/filearr.hpp

@@ -1,51 +0,0 @@
-#pragma once
-
-#include "dentry.hpp"
-#include "file.hpp"
-
-#include <memory>
-
-#include <types/path.hpp>
-
-#include <kernel/vfs.hpp>
-
-namespace fs {
-
-class filearray {
-   private:
-    struct impl;
-    std::shared_ptr<impl> pimpl;
-    filearray(std::shared_ptr<impl>);
-
-   public:
-    filearray(const fs_context* ctx);
-    filearray(filearray&& other) = default;
-
-    filearray copy() const;
-    filearray share() const;
-
-    // dup old_fd to some random fd
-    int dup(int old_fd);
-
-    // dup old_fd to new_fd, close new_fd if it is already open
-    int dup(int old_fd, int new_fd, int flags);
-
-    // dup old_fd to the first available fd starting from min_fd
-    int dupfd(int fd, int min_fd, int flags);
-
-    fs::file* operator[](int i) const;
-    int set_flags(int fd, int flags);
-
-    int pipe(int (&pipefd)[2]);
-    int open(const dentry_pointer& cwd, types::string_view filepath, int flags,
-             mode_t mode);
-    int open(types::string_view filepath, int flags, mode_t mode);
-
-    int close(int fd);
-
-    // any call to member methods will be invalid after clear()
-    void clear();
-    void onexec();
-};
-
-} // namespace fs

+ 0 - 25
include/kernel/vfs/vfsfwd.hpp

@@ -1,25 +0,0 @@
-#pragma once
-
-namespace fs {
-
-// in dentry.hpp
-struct dcache;
-struct dentry;
-
-// in file.hpp
-struct file;
-struct regular_file;
-struct fifo_file;
-
-class pipe;
-
-// in filearray.hpp
-class file_array;
-
-// in inode.hpp
-struct inode;
-
-// in vfs.hpp
-class vfs;
-
-} // namespace fs

+ 0 - 293
include/types/elf.hpp

@@ -1,293 +0,0 @@
-#pragma once
-
-#include <vector>
-
-#include <stdint.h>
-
-#include <kernel/interrupt.hpp>
-#include <kernel/process.hpp>
-#include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-namespace types::elf {
-
-using elf32_addr_t = uint32_t;
-using elf32_off_t = uint32_t;
-
-using elf64_addr_t = uint64_t;
-using elf64_off_t = uint64_t;
-
-constexpr elf32_addr_t ELF32_STACK_BOTTOM = 0xbffff000;
-constexpr elf32_off_t ELF32_STACK_SIZE = 8 * 1024 * 1024;
-constexpr elf32_addr_t ELF32_STACK_TOP = ELF32_STACK_BOTTOM - ELF32_STACK_SIZE;
-
-constexpr int ELF_LOAD_FAIL_NORETURN = 0x114514;
-
-struct PACKED elf32_header {
-    // 0x7f, "ELF"
-    char magic[4];
-
-    enum : uint8_t {
-        FORMAT_32 = 1,
-        FORMAT_64 = 2,
-    } format;
-    enum : uint8_t {
-        ENDIAN_LITTLE = 1,
-        ENDIAN_BIG = 2,
-    } endian;
-    // should be 1
-    uint8_t _version1;
-    enum : uint8_t {
-        ABI_SYSTEM_V = 0x00,
-        // TODO:
-        ABI_LINUX = 0x03,
-    } abi;
-    uint8_t abi_version;
-    uint8_t _reserved[7];
-    enum : uint16_t {
-        ET_NONE = 0x00,
-        ET_REL = 0x01,
-        ET_EXEC = 0x02,
-        ET_DYN = 0x03,
-        ET_CORE = 0x04,
-        ET_LOOS = 0xfe00,
-        ET_HIOS = 0xfeff,
-        ET_LOPROC = 0xff00,
-        ET_HIPROC = 0xffff,
-    } type;
-    enum : uint16_t {
-        ARCH_NONE = 0x00,
-        ARCH_X86 = 0x03,
-        ARCH_ARM = 0x28,
-        ARCH_IA64 = 0x32,
-        ARCH_X86_64 = 0x3e,
-        ARCH_ARM64 = 0xb7,
-        ARCH_RISCV = 0xf3,
-    } arch;
-    // should be 1
-    uint32_t _version2;
-    // entry address
-    elf32_addr_t entry;
-    // program header table offset
-    elf32_off_t phoff;
-    // section header table offset
-    elf32_off_t shoff;
-    // architecture dependent flags
-    uint32_t flags;
-    // elf header size
-    uint16_t ehsize;
-    // program header table entry size
-    uint16_t phentsize;
-    // program header table entries number
-    uint16_t phnum;
-    // section header table entry size
-    uint16_t shentsize;
-    // section header table entries number
-    uint16_t shnum;
-    // section header table entry index that contains section names
-    uint16_t shstrndx;
-};
-
-struct PACKED elf32_program_header_entry {
-    enum : uint32_t {
-        PT_NULL = 0x00,
-        PT_LOAD = 0x01,
-        PT_DYNAMIC = 0x02,
-        PT_INTERP = 0x03,
-        PT_NOTE = 0x04,
-        PT_SHLIB = 0x05,
-        PT_PHDR = 0x06,
-        PT_TLS = 0x07,
-        PT_LOOS = 0x60000000,
-        PT_HIOS = 0x6fffffff,
-        PT_LIPROC = 0x70000000,
-        PT_HIPROC = 0x7fffffff,
-    } type;
-    elf32_off_t offset;
-    elf32_addr_t vaddr;
-    elf32_addr_t paddr;
-    elf32_off_t filesz;
-    elf32_off_t memsz;
-    // segment dependent
-    enum : uint32_t {
-        PF_X = 0x1,
-        PF_W = 0x2,
-        PF_R = 0x4,
-    } flags;
-    // 0 and 1 for no alignment, otherwise power of 2
-    uint32_t align;
-};
-
-struct PACKED elf32_section_header_entry {
-    elf32_off_t sh_name;
-    enum : uint32_t {
-        SHT_NULL = 0x00,
-        SHT_PROGBITS = 0x01,
-        SHT_RELA = 0x04,
-        SHT_DYNAMIC = 0x06,
-        SHT_NOTE = 0x07,
-        SHT_NOBITS = 0x08,
-        SHT_REL = 0x09,
-        SHT_DYNSYM = 0x0b,
-        SHT_INIT_ARRAY = 0x0e,
-        SHT_FINI_ARRAY = 0x0f,
-        SHT_PREINIT_ARRAY = 0x0f,
-    } sh_type;
-    enum : uint32_t {
-        SHF_WRITE = 0x01,
-        SHF_ALLOC = 0x02,
-        SHF_EXECINSTR = 0x04,
-    } sh_flags;
-    elf32_addr_t sh_addr;
-    elf32_off_t sh_offset;
-    elf32_off_t sh_size;
-    uint32_t sh_link;
-    uint32_t sh_info;
-    elf32_off_t sh_addralign;
-    elf32_off_t sh_entsize;
-};
-
-struct elf32_load_data {
-    fs::dentry_pointer exec_dent;
-    const std::vector<std::string>& argv;
-    const std::vector<std::string>& envp;
-    uintptr_t ip;
-    uintptr_t sp;
-};
-
-// TODO: environment variables
-int elf32_load(elf32_load_data& data);
-
-struct PACKED elf64_header {
-    // 0x7f, "ELF"
-    char magic[4];
-
-    enum : uint8_t {
-        FORMAT_32 = 1,
-        FORMAT_64 = 2,
-    } format;
-    enum : uint8_t {
-        ENDIAN_LITTLE = 1,
-        ENDIAN_BIG = 2,
-    } endian;
-    // should be 1
-    uint8_t _version1;
-    enum : uint8_t {
-        ABI_SYSTEM_V = 0x00,
-        // TODO:
-        ABI_LINUX = 0x03,
-    } abi;
-    uint8_t abi_version;
-    uint8_t _reserved[7];
-    enum : uint16_t {
-        ET_NONE = 0x00,
-        ET_REL = 0x01,
-        ET_EXEC = 0x02,
-        ET_DYN = 0x03,
-        ET_CORE = 0x04,
-        ET_LOOS = 0xfe00,
-        ET_HIOS = 0xfeff,
-        ET_LOPROC = 0xff00,
-        ET_HIPROC = 0xffff,
-    } type;
-    enum : uint16_t {
-        ARCH_NONE = 0x00,
-        ARCH_X86 = 0x03,
-        ARCH_ARM = 0x28,
-        ARCH_IA64 = 0x32,
-        ARCH_X86_64 = 0x3e,
-        ARCH_ARM64 = 0xb7,
-        ARCH_RISCV = 0xf3,
-    } arch;
-    // should be 1
-    uint32_t _version2;
-    // entry address
-    elf64_addr_t entry;
-    // program header table offset
-    elf64_off_t phoff;
-    // section header table offset
-    elf64_off_t shoff;
-    // architecture dependent flags
-    uint32_t flags;
-    // elf header size
-    uint16_t ehsize;
-    // program header table entry size
-    uint16_t phentsize;
-    // program header table entries number
-    uint16_t phnum;
-    // section header table entry size
-    uint16_t shentsize;
-    // section header table entries number
-    uint16_t shnum;
-    // section header table entry index that contains section names
-    uint16_t shstrndx;
-};
-
-struct PACKED elf64_program_header_entry {
-    enum : uint32_t {
-        PT_NULL = 0x00,
-        PT_LOAD = 0x01,
-        PT_DYNAMIC = 0x02,
-        PT_INTERP = 0x03,
-        PT_NOTE = 0x04,
-        PT_SHLIB = 0x05,
-        PT_PHDR = 0x06,
-        PT_TLS = 0x07,
-        PT_LOOS = 0x60000000,
-        PT_HIOS = 0x6fffffff,
-        PT_LIPROC = 0x70000000,
-        PT_HIPROC = 0x7fffffff,
-    } type;
-    // segment dependent
-    enum : uint32_t {
-        PF_X = 0x1,
-        PF_W = 0x2,
-        PF_R = 0x4,
-    } flags;
-    elf64_off_t offset;
-    elf64_addr_t vaddr;
-    elf64_addr_t paddr;
-    elf64_off_t filesz;
-    elf64_off_t memsz;
-    // 0 and 1 for no alignment, otherwise power of 2
-    uint64_t align;
-};
-
-struct PACKED elf64_section_header_entry {
-    uint32_t sh_name;
-    enum : uint32_t {
-        SHT_NULL = 0x00,
-        SHT_PROGBITS = 0x01,
-        SHT_RELA = 0x04,
-        SHT_DYNAMIC = 0x06,
-        SHT_NOTE = 0x07,
-        SHT_NOBITS = 0x08,
-        SHT_REL = 0x09,
-        SHT_DYNSYM = 0x0b,
-        SHT_INIT_ARRAY = 0x0e,
-        SHT_FINI_ARRAY = 0x0f,
-        SHT_PREINIT_ARRAY = 0x0f,
-    } sh_type;
-    enum : uint64_t {
-        SHF_WRITE = 0x01,
-        SHF_ALLOC = 0x02,
-        SHF_EXECINSTR = 0x04,
-    } sh_flags;
-    elf64_addr_t sh_addr;
-    elf64_off_t sh_offset;
-    elf64_off_t sh_size;
-    uint32_t sh_link;
-    uint32_t sh_info;
-    elf64_off_t sh_addralign;
-    elf64_off_t sh_entsize;
-};
-
-struct elf64_load_data {
-    fs::dentry_pointer exec_dent;
-    std::vector<std::string> argv;
-    std::vector<std::string> envp;
-    unsigned long ip;
-    unsigned long sp;
-};
-
-} // namespace types::elf

+ 6 - 5
init_script.sh

@@ -3,7 +3,7 @@
 BUSYBOX=/mnt/busybox
 
 freeze() {
-    echo "an error occurred while executing '''$@''', freezing..." > /dev/console
+    echo "an error occurred while executing '''$@''', freezing..." >&2
 
     while true; do
         true
@@ -25,15 +25,17 @@ do_or_freeze $BUSYBOX mknod -m 666 /dev/null c 1 3
 do_or_freeze $BUSYBOX mknod -m 666 /dev/zero c 1 5
 do_or_freeze $BUSYBOX mknod -m 666 /dev/sda b 8 0
 do_or_freeze $BUSYBOX mknod -m 666 /dev/sda1 b 8 1
+do_or_freeze $BUSYBOX mknod -m 666 /dev/ttyS0 c 4 64
+do_or_freeze $BUSYBOX mknod -m 666 /dev/ttyS1 c 4 65
 
-echo -n -e "deploying busybox... " > /dev/console
+echo -n -e "deploying busybox... " >&2
 
 do_or_freeze $BUSYBOX mkdir -p /bin
 do_or_freeze $BUSYBOX --install -s /bin
 
 export PATH="/bin"
 
-echo ok > /dev/console
+echo ok >&2
 
 do_or_freeze mkdir -p /etc /root /proc
 do_or_freeze mount -t procfs proc proc
@@ -57,5 +59,4 @@ alias ll="ls -l "
 alias la="ls -la "
 EOF
 
-exec /mnt/init /bin/sh -l \
-    < /dev/console > /dev/console 2> /dev/console
+exec /mnt/init /bin/sh -c 'exec sh -l < /dev/ttyS0 > /dev/ttyS0 2> /dev/ttyS0'

+ 11 - 53
src/asm/interrupt.s

@@ -33,8 +33,8 @@
 	.cfi_restore \reg
 .endm
 
-.extern after_ctx_switch
 .globl ISR_stub_restore
+.type ISR_stub_restore @function
 
 ISR_stub:
 	.cfi_startproc
@@ -42,6 +42,11 @@ ISR_stub:
 	.cfi_def_cfa_offset 0x18
 	.cfi_offset %rsp, 0x10
 
+	cmpq $0x08, 24(%rsp)
+	je 1f
+	swapgs
+
+1:
 	sub $0x78, %rsp
 	.cfi_def_cfa_offset 0x90
 
@@ -101,59 +106,12 @@ ISR_stub_restore:
 	add $0x88, %rsp
 	.cfi_def_cfa_offset 0x08
 
-	iretq
-	.cfi_endproc
-
-# parameters
-# #1: sp* current_task_sp
-# #2: sp* target_task_sp
-.globl asm_ctx_switch
-.type  asm_ctx_switch @function
-asm_ctx_switch:
-	.cfi_startproc
-    pushf
-	.cfi_def_cfa_offset 0x10
-
-	sub $0x38, %rsp  # extra 8 bytes to align to 16 bytes
-	.cfi_def_cfa_offset 0x48
-
-	movcfi %rbx, 0x08
-	movcfi %rbp, 0x10
-	movcfi %r12, 0x18
-	movcfi %r13, 0x20
-	movcfi %r14, 0x28
-	movcfi %r15, 0x30
-
-    push (%rdi) 	 # save sp of previous stack frame of current
-	                 # acts as saving bp
-	.cfi_def_cfa_offset 0x50
-
-    mov %rsp, (%rdi) # save sp of current stack
-    mov (%rsi), %rsp # load sp of target stack
+	cmpq $0x08, 8(%rsp)
+	je 1f
+	swapgs
 
-    pop (%rsi)       # load sp of previous stack frame of target
-	                 # acts as restoring previous bp
-	.cfi_def_cfa_offset 0x48
-
-	pop %rax         # align to 16 bytes
-	.cfi_def_cfa_offset 0x40
-
-	call after_ctx_switch
-
-	mov 0x28(%rsp), %r15
-	mov 0x20(%rsp), %r14
-	mov 0x18(%rsp), %r13
-	mov 0x10(%rsp), %r12
-	mov 0x08(%rsp), %rbp
-    mov 0x00(%rsp), %rbx
-
-	add $0x30, %rsp
-	.cfi_def_cfa_offset 0x10
-
-    popf
-	.cfi_def_cfa_offset 0x08
-
-    ret
+1:
+	iretq
 	.cfi_endproc
 
 .altmacro

+ 81 - 6
src/boot.s

@@ -128,8 +128,8 @@ start_32bit:
     # read kimage into memory
 	lea -16(%esp), %esp
     mov $KIMAGE_32K_COUNT, %ecx
-    mov $KERNEL_IMAGE_PADDR, 4(%esp) # destination address
-	mov $9, (%esp) # LBA
+    movl $KERNEL_IMAGE_PADDR, 4(%esp) # destination address
+	movl $9, (%esp) # LBA
 
 .Lread_kimage:
 	mov (%esp), %edi
@@ -139,8 +139,8 @@ start_32bit:
     call read_disk
 	mov %ebx, %ecx
 
-    add $0x8000, 4(%esp)
-	add $64, (%esp)
+    addl $0x8000, 4(%esp)
+	addl $64, (%esp)
 
     loop .Lread_kimage
 
@@ -293,9 +293,10 @@ fill_pxe:
 .L64bit_entry:
     jmp start_64bit
 
-.section .text.kinit
+.section .text
 start_64bit:
-    # set stack pointer and clear stack bottom
+    # We map the first 1GB identically to the first 1GB of physical memory,
+    # move sp to the correct position in identically mapped area of kernel space.
     mov %rsp, %rdi
     xor %rsp, %rsp
     inc %rsp
@@ -320,3 +321,77 @@ start_64bit:
     cli
     hlt
     jmp .L64bit_hlt
+
+.section .stage1.smp
+.code16
+
+.globl ap_bootstrap
+.type ap_bootstrap, @function
+ap_bootstrap:
+	ljmp $0x0, $.Lap1
+
+.Lap1:
+    # we use a shared gdt for now
+	lgdt shared_gdt_desc
+
+    # set msr
+    mov $0xc0000080, %ecx
+    rdmsr
+    or $0x901, %eax # set LME, NXE, SCE
+    wrmsr
+
+    # set cr4
+    mov %cr4, %eax
+    or $0xa0, %eax # set PAE, PGE
+    mov %eax, %cr4
+
+    # load new page table
+    mov $KERNEL_PML4, %eax
+    mov %eax, %cr3
+
+    mov %cr0, %eax
+    // SET PE, WP, PG
+    or $0x80010001, %eax
+    mov %eax, %cr0
+
+	ljmp $0x08, $.Lap_bootstrap_end
+
+.align 16
+shared_gdt_desc:
+	.8byte 0x0000000000005f
+
+.code64
+.Lap_bootstrap_end:
+    mov $0x10, %ax
+	mov %ax, %ds
+	mov %ax, %es
+	mov %ax, %ss
+
+	xor %rsp, %rsp
+	xor %rax, %rax
+	inc %rax
+1:
+	xchg %rax, BOOT_SEMAPHORE
+	cmp $0, %rax
+	je 1f
+	pause
+	jmp 1b
+
+1:
+	mov BOOT_STACK, %rsp # Acquire
+	cmp $0, %rsp
+	jne 1f
+	pause
+	jmp 1b
+
+1:
+	xor %rax, %rax
+	mov %rax, BOOT_STACK # Release
+	xchg %rax, BOOT_SEMAPHORE
+
+	xor %rbp, %rbp
+	mov %rsp, %rdi # stack area start address as the first argument
+
+	add $0x200000, %rsp # kernel stack order 9
+	push %rbp # NULL return address
+	jmp ap_entry

+ 0 - 71
src/dev/builtin-chardev.cc

@@ -1,71 +0,0 @@
-#include <kernel/module.hpp>
-#include <kernel/tty.hpp>
-#include <kernel/vfs.hpp>
-
-using namespace kernel::kmod;
-using namespace kernel::tty;
-
-static ssize_t null_read(char*, size_t, size_t) {
-    return 0;
-}
-
-static ssize_t null_write(const char*, size_t n) {
-    return n;
-}
-
-static ssize_t zero_read(char* buf, size_t buf_size, size_t n) {
-    if (n > buf_size)
-        n = buf_size;
-
-    memset(buf, 0, n);
-    return n;
-}
-
-static ssize_t zero_write(const char*, size_t n) {
-    return n;
-}
-
-// TODO: add interface to bind console device to other devices
-ssize_t console_read(char* buf, size_t buf_size, size_t n) {
-    return console->read(buf, buf_size, n);
-}
-
-ssize_t console_write(const char* buf, size_t n) {
-    size_t orig_n = n;
-    while (n--)
-        console->putchar(*(buf++));
-
-    return orig_n;
-}
-
-class builtin_chardev : public virtual kmod {
-   public:
-    builtin_chardev() : kmod("builtin-chardev") {}
-    int init() override {
-        using namespace fs;
-        // null
-        chrdev_ops null_ops{
-            .read = null_read,
-            .write = null_write,
-        };
-        register_char_device(make_device(1, 3), null_ops);
-
-        // zero
-        chrdev_ops zero_ops{
-            .read = zero_read,
-            .write = zero_write,
-        };
-        register_char_device(make_device(1, 5), zero_ops);
-
-        // console
-        chrdev_ops console_ops{
-            .read = console_read,
-            .write = console_write,
-        };
-        register_char_device(make_device(5, 1), console_ops);
-
-        return 0;
-    }
-};
-
-INTERNAL_MODULE(builtin_chardev, builtin_chardev);

+ 20 - 0
src/driver.rs

@@ -1,2 +1,22 @@
 pub mod ahci;
 pub mod e1000e;
+pub mod serial;
+
+// TODO!!!: Put it somewhere else.
+pub struct Port8 {
+    no: u16,
+}
+
+impl Port8 {
+    pub const fn new(no: u16) -> Self {
+        Self { no }
+    }
+
+    pub fn read(&self) -> u8 {
+        arch::io::inb(self.no)
+    }
+
+    pub fn write(&self, data: u8) {
+        arch::io::outb(self.no, data)
+    }
+}

+ 5 - 4
src/driver/ahci/command.rs

@@ -16,19 +16,20 @@ pub trait Command {
 }
 
 pub struct IdentifyCommand {
-    pages: [Page; 1],
+    page: Page,
 }
 
 impl IdentifyCommand {
     pub fn new() -> Self {
-        let page = Page::alloc_one();
-        Self { pages: [page] }
+        Self {
+            page: Page::alloc_one(),
+        }
     }
 }
 
 impl Command for IdentifyCommand {
     fn pages(&self) -> &[Page] {
-        &self.pages
+        core::slice::from_ref(&self.page)
     }
 
     fn lba(&self) -> u64 {

+ 35 - 38
src/driver/ahci/control.rs

@@ -1,9 +1,6 @@
-use crate::{
-    kernel::mem::phys::{NoCachePP, PhysPtr},
-    prelude::*,
-};
+use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
 
-use super::{vread, vwrite, GHC_IE};
+use super::{BitsIterator, GHC_IE};
 
 /// An `AdapterControl` is an HBA device Global Host Control block
 ///
@@ -12,7 +9,7 @@ use super::{vread, vwrite, GHC_IE};
 /// All reads and writes to this struct is volatile
 ///
 #[repr(C)]
-pub struct AdapterControl {
+struct AdapterControlData {
     capabilities: u32,
     global_host_control: u32,
     interrupt_status: u32,
@@ -29,50 +26,50 @@ pub struct AdapterControl {
     vendor: [u8; 96],
 }
 
+const CONTROL_CAP: usize = 0;
+const CONTROL_GHC: usize = 1;
+const CONTROL_IS: usize = 2;
+const CONTROL_PI: usize = 3;
+
+pub struct AdapterControl {
+    inner: *mut u32,
+}
+
+/// # Safety
+/// At the same time, exactly one instance of this struct may exist.
+unsafe impl Send for AdapterControl {}
+
 impl AdapterControl {
-    pub fn new<'lt>(addr: usize) -> &'lt mut Self {
-        NoCachePP::new(addr).as_mut()
+    pub fn new(addr: usize) -> Self {
+        Self {
+            inner: NoCachePP::new(addr).as_ptr(),
+        }
     }
 }
 
 impl AdapterControl {
-    pub fn enable_interrupts(&mut self) {
-        let ghc = vread(&self.global_host_control);
-        vwrite(&mut self.global_host_control, ghc | GHC_IE);
+    fn read(&self, off: usize) -> u32 {
+        unsafe { self.inner.offset(off as isize).read_volatile() }
     }
 
-    pub fn implemented_ports(&self) -> ImplementedPortsIter {
-        ImplementedPortsIter::new(vread(&self.ports_implemented))
+    fn write(&self, off: usize, value: u32) {
+        unsafe { self.inner.offset(off as isize).write_volatile(value) }
     }
-}
 
-pub struct ImplementedPortsIter {
-    ports: u32,
-    n: u32,
-}
-
-impl ImplementedPortsIter {
-    fn new(ports: u32) -> Self {
-        Self { ports, n: 0 }
+    pub fn enable_interrupts(&self) {
+        let ghc = self.read(CONTROL_GHC);
+        self.write(CONTROL_GHC, ghc | GHC_IE);
     }
-}
-
-impl Iterator for ImplementedPortsIter {
-    type Item = u32;
 
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.n == 32 {
-            return None;
-        }
+    pub fn implemented_ports(&self) -> BitsIterator {
+        BitsIterator::new(self.read(CONTROL_PI))
+    }
 
-        let have: bool = self.ports & 1 != 0;
-        self.ports >>= 1;
-        self.n += 1;
+    pub fn pending_interrupts(&self) -> BitsIterator {
+        BitsIterator::new(self.read(CONTROL_IS))
+    }
 
-        if have {
-            Some(self.n - 1)
-        } else {
-            self.next()
-        }
+    pub fn clear_interrupt(&self, no: u32) {
+        self.write(CONTROL_IS, 1 << no)
     }
 }

+ 33 - 33
src/driver/ahci/defs.rs

@@ -17,6 +17,33 @@ pub const PORT_CMD_FRE: u32 = 0x00000010;
 pub const PORT_CMD_FR: u32 = 0x00004000;
 pub const PORT_CMD_CR: u32 = 0x00008000;
 
+pub const PORT_IE_DHRE: u32 = 0x00000001;
+pub const PORT_IE_UFE: u32 = 0x00000010;
+pub const PORT_IE_INFE: u32 = 0x04000000;
+pub const PORT_IE_IFE: u32 = 0x08000000;
+pub const PORT_IE_HBDE: u32 = 0x10000000;
+pub const PORT_IE_IBFE: u32 = 0x20000000;
+pub const PORT_IE_TFEE: u32 = 0x40000000;
+
+pub const PORT_IE_DEFAULT: u32 = PORT_IE_DHRE
+    | PORT_IE_UFE
+    | PORT_IE_INFE
+    | PORT_IE_IFE
+    | PORT_IE_HBDE
+    | PORT_IE_IBFE
+    | PORT_IE_TFEE;
+
+pub const PORT_IS_DHRS: u32 = 0x00000001;
+pub const PORT_IS_UFS: u32 = 0x00000010;
+pub const PORT_IS_INFS: u32 = 0x04000000;
+pub const PORT_IS_IFS: u32 = 0x08000000;
+pub const PORT_IS_HBDS: u32 = 0x10000000;
+pub const PORT_IS_IBFS: u32 = 0x20000000;
+pub const PORT_IS_TFES: u32 = 0x40000000;
+
+pub const PORT_IS_ERROR: u32 =
+    PORT_IS_UFS | PORT_IS_INFS | PORT_IS_IFS | PORT_IS_HBDS | PORT_IS_IBFS;
+
 /// A `CommandHeader` is used to send commands to the HBA device
 ///
 /// # Access
@@ -29,47 +56,20 @@ pub struct CommandHeader {
     // [5]: ATAPI
     // [6]: Write
     // [7]: Prefetchable
-    first: u8,
+    pub first: u8,
 
     // [0]: Reset
     // [1]: BIST
     // [2]: Clear busy upon ok
     // [3]: Reserved
     // [4:7]: Port multiplier
-    second: u8,
-
-    prdt_length: u16,
-    bytes_transferred: u32,
-    command_table_base: u64,
-
-    _reserved: [u32; 4],
-}
-
-impl CommandHeader {
-    pub fn clear(&mut self) {
-        self.first = 0;
-        self.second = 0;
-        self.prdt_length = 0;
-        self.bytes_transferred = 0;
-        self.command_table_base = 0;
-        self._reserved = [0; 4];
-    }
-
-    pub fn setup(&mut self, cmdtable_base: u64, prdtlen: u16, write: bool) {
-        self.first = 0x05; // FIS type
-
-        if write {
-            self.first |= 0x40;
-        }
+    pub second: u8,
 
-        self.second = 0x04; // Clear busy upon ok
+    pub prdt_length: u16,
+    pub bytes_transferred: u32,
+    pub command_table_base: u64,
 
-        self.prdt_length = prdtlen;
-        self.bytes_transferred = 0;
-        self.command_table_base = cmdtable_base;
-
-        self._reserved = [0; 4];
-    }
+    pub _reserved: [u32; 4],
 }
 
 pub enum FisType {

+ 108 - 57
src/driver/ahci/mod.rs

@@ -1,9 +1,13 @@
 use crate::{
-    kernel::block::{make_device, BlockDevice},
+    fs::procfs,
+    kernel::{
+        block::{make_device, BlockDevice},
+        interrupt::register_irq_handler,
+    },
     prelude::*,
 };
 
-use alloc::sync::Arc;
+use alloc::{format, sync::Arc};
 use bindings::{
     kernel::hw::pci::{self, pci_device},
     EIO,
@@ -17,100 +21,149 @@ mod control;
 mod defs;
 mod port;
 
-fn vread<T: Sized + Copy>(refval: &T) -> T {
-    unsafe { core::ptr::read_volatile(refval) }
+pub struct BitsIterator {
+    data: u32,
+    n: u32,
 }
 
-fn vwrite<T: Sized + Copy>(refval: &mut T, val: T) {
-    unsafe { core::ptr::write_volatile(refval, val) }
+impl BitsIterator {
+    fn new(data: u32) -> Self {
+        Self { data, n: 0 }
+    }
 }
 
-fn spinwait_clear(refval: &u32, mask: u32) -> KResult<()> {
-    const SPINWAIT_MAX: usize = 1000;
+impl Iterator for BitsIterator {
+    type Item = u32;
 
-    let mut spins = 0;
-    while vread(refval) & mask != 0 {
-        if spins == SPINWAIT_MAX {
-            return Err(EIO);
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.n == 32 {
+            return None;
         }
 
-        spins += 1;
-    }
-
-    Ok(())
-}
-
-fn spinwait_set(refval: &u32, mask: u32) -> KResult<()> {
-    const SPINWAIT_MAX: usize = 1000;
+        let have: bool = self.data & 1 != 0;
+        self.data >>= 1;
+        self.n += 1;
 
-    let mut spins = 0;
-    while vread(refval) & mask != mask {
-        if spins == SPINWAIT_MAX {
-            return Err(EIO);
+        if have {
+            Some(self.n - 1)
+        } else {
+            self.next()
         }
-
-        spins += 1;
     }
+}
+
+fn vread<T: Sized + Copy>(refval: *const T) -> T {
+    unsafe { refval.read_volatile() }
+}
 
-    Ok(())
+fn vwrite<T: Sized + Copy>(refval: *mut T, val: T) {
+    unsafe { refval.write_volatile(val) }
 }
 
-struct Device<'lt, 'port> {
+struct Device {
     control_base: usize,
-    control: &'lt mut AdapterControl,
+    control: AdapterControl,
     // TODO: impl Drop to free pci device
     pcidev: *mut pci_device,
-    ports: Vec<Option<Arc<Mutex<AdapterPort<'port>>>>>,
+    /// # Lock
+    /// Might be accessed from irq handler, use with `lock_irq()`
+    ports: Spin<[Option<Arc<AdapterPort>>; 32]>,
 }
 
-impl<'lt, 'port: 'static> Device<'lt, 'port> {
-    fn probe_ports(&mut self) -> KResult<()> {
-        for nport in self.control.implemented_ports() {
-            let mut port = AdapterPort::<'port>::new(self.control_base, nport);
+/// # Safety
+/// `pcidev` is never accessed from Rust code
+/// TODO!!!: place *mut pci_device in a safe wrapper
+unsafe impl Send for Device {}
+unsafe impl Sync for Device {}
 
+impl Device {
+    fn probe_ports(&self) -> KResult<()> {
+        for nport in self.control.implemented_ports() {
+            let port = Arc::new(AdapterPort::new(self.control_base, nport));
             if !port.status_ok() {
                 continue;
             }
 
-            port.init()?;
+            self.ports.lock_irq()[nport as usize] = Some(port.clone());
+            if let Err(e) = (|| -> KResult<()> {
+                port.init()?;
+
+                {
+                    let port = port.clone();
+                    let name = format!("ahci-p{}-stats", port.nport);
+                    procfs::populate_root(name.into_bytes().into(), move |buffer| {
+                        writeln!(buffer, "{:?}", port.stats.lock().as_ref()).map_err(|_| EIO)
+                    })?;
+                }
+
+                let port = BlockDevice::register_disk(
+                    make_device(8, nport * 16),
+                    2147483647, // TODO: get size from device
+                    port,
+                )?;
+
+                port.partprobe()?;
+
+                Ok(())
+            })() {
+                self.ports.lock_irq()[nport as usize] = None;
+                println_warn!("probe port {nport} failed with {e}");
+            }
+        }
+
+        Ok(())
+    }
+
+    fn handle_interrupt(&self) {
+        // Safety
+        // `self.ports` is accessed inside irq handler
+        let ports = self.ports.lock();
+        for nport in self.control.pending_interrupts() {
+            if let None = ports[nport as usize] {
+                println_warn!("port {nport} not found");
+                continue;
+            }
+
+            let port = ports[nport as usize].as_ref().unwrap();
+            let status = vread(port.interrupt_status());
 
-            let port = Arc::new(Mutex::new(port));
+            if status & PORT_IS_ERROR != 0 {
+                println_warn!("port {nport} SATA error");
+                continue;
+            }
 
-            self.ports[nport as usize] = Some(port.clone());
+            debug_assert!(status & PORT_IS_DHRS != 0);
+            vwrite(port.interrupt_status(), PORT_IS_DHRS);
 
-            let port = BlockDevice::register_disk(
-                make_device(8, nport * 16),
-                2147483647, // TODO: get size from device
-                port,
-            )?;
+            self.control.clear_interrupt(nport);
 
-            port.partprobe()?;
+            port.handle_interrupt();
         }
-
-        Ok(())
     }
 }
 
-impl<'lt: 'static, 'port: 'static> Device<'lt, 'port> {
-    pub fn new(pcidev: *mut pci_device) -> KResult<Self> {
+impl Device {
+    pub fn new(pcidev: *mut pci_device) -> KResult<Arc<Self>> {
         let base = unsafe { *(*pcidev).header_type0() }.bars[PCI_REG_ABAR];
+        let irqno = unsafe { *(*pcidev).header_type0() }.interrupt_line;
 
         // use MMIO
         if base & 0xf != 0 {
             return Err(EIO);
         }
 
-        let mut ports = Vec::with_capacity(32);
-        ports.resize_with(32, || None);
-
-        let mut device = Device {
+        let device = Arc::new(Device {
             control_base: base as usize,
             control: AdapterControl::new(base as usize),
             pcidev,
-            ports,
-        };
+            ports: Spin::new([const { None }; 32]),
+        });
 
         device.control.enable_interrupts();
+
+        let device_irq = device.clone();
+        register_irq_handler(irqno as i32, move || device_irq.handle_interrupt())?;
+
         device.probe_ports()?;
 
         Ok(device)
@@ -123,15 +176,13 @@ unsafe extern "C" fn probe_device(pcidev: *mut pci_device) -> i32 {
             // TODO!!!: save device to pci_device
             Box::leak(Box::new(device));
             0
-        },
+        }
         Err(e) => -(e as i32),
     }
 }
 
 pub fn register_ahci_driver() {
-    let ret = unsafe {
-        pci::register_driver_r(VENDOR_INTEL, DEVICE_AHCI, Some(probe_device))
-    };
+    let ret = unsafe { pci::register_driver_r(VENDOR_INTEL, DEVICE_AHCI, Some(probe_device)) };
 
     assert_eq!(ret, 0);
 }

+ 266 - 58
src/driver/ahci/port.rs

@@ -1,4 +1,5 @@
-use bindings::EINVAL;
+use alloc::collections::vec_deque::VecDeque;
+use bindings::{EINVAL, EIO};
 
 use crate::prelude::*;
 
@@ -6,14 +7,29 @@ use crate::kernel::block::{BlockDeviceRequest, BlockRequestQueue};
 use crate::kernel::mem::paging::Page;
 
 use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
+use crate::sync::UCondVar;
 
 use super::command::{Command, IdentifyCommand, ReadLBACommand};
 use super::{
-    spinwait_clear, vread, vwrite, CommandHeader, PRDTEntry, ReceivedFis,
-    ATA_DEV_BSY, ATA_DEV_DRQ, FISH2D, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE,
-    PORT_CMD_ST,
+    vread, vwrite, CommandHeader, PRDTEntry, FISH2D, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE,
+    PORT_CMD_ST, PORT_IE_DEFAULT,
 };
 
+fn spinwait_clear(refval: *const u32, mask: u32) -> KResult<()> {
+    const SPINWAIT_MAX: usize = 1000;
+
+    let mut spins = 0;
+    while vread(refval) & mask != 0 {
+        if spins == SPINWAIT_MAX {
+            return Err(EIO);
+        }
+
+        spins += 1;
+    }
+
+    Ok(())
+}
+
 /// An `AdapterPort` is an HBA device in AHCI mode.
 ///
 /// # Access
@@ -49,92 +65,289 @@ pub struct AdapterPortData {
     vendor: [u32; 4],
 }
 
-pub struct AdapterPort<'lt> {
-    nport: u32,
-    data: &'lt mut AdapterPortData,
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum SlotState {
+    Idle,
+    Working,
+    Finished,
+    Error,
+}
+
+struct CommandSlotInner {
+    state: SlotState,
+    /// # Usage
+    /// `cmdheader` might be used in irq handler. So in order to wait for
+    /// commands to finish, we should use `lock_irq` on `cmdheader`
+    cmdheader: *mut CommandHeader,
+}
+
+/// # Safety
+/// This is safe because the `cmdheader` is not shared between threads
+unsafe impl Send for CommandSlotInner {}
+
+impl CommandSlotInner {
+    pub fn setup(&mut self, cmdtable_base: u64, prdtlen: u16, write: bool) {
+        let cmdheader = unsafe { self.cmdheader.as_mut().unwrap() };
+        cmdheader.first = 0x05; // FIS type
+
+        if write {
+            cmdheader.first |= 0x40;
+        }
+
+        cmdheader.second = 0x00;
+
+        cmdheader.prdt_length = prdtlen;
+        cmdheader.bytes_transferred = 0;
+        cmdheader.command_table_base = cmdtable_base;
+
+        cmdheader._reserved = [0; 4];
+    }
+}
+
+struct CommandSlot {
+    inner: Spin<CommandSlotInner>,
+    cv: UCondVar,
+}
+
+impl CommandSlot {
+    fn new(cmdheader: *mut CommandHeader) -> Self {
+        Self {
+            inner: Spin::new(CommandSlotInner {
+                state: SlotState::Idle,
+                cmdheader,
+            }),
+            cv: UCondVar::new(),
+        }
+    }
+}
+
+struct FreeList {
+    free: VecDeque<u32>,
+    working: VecDeque<u32>,
+}
+
+impl FreeList {
+    fn new() -> Self {
+        Self {
+            free: (0..32).collect(),
+            working: VecDeque::new(),
+        }
+    }
+}
+
+#[derive(Default, Debug)]
+pub struct AdapterPortStats {
+    /// Number of commands sent
+    cmd_sent: u64,
+
+    /// Number of transmission errors
+    cmd_error: u64,
+
+    /// Number of interrupts fired
+    int_fired: u64,
+}
+
+pub struct AdapterPort {
+    pub nport: u32,
+    regs: *mut (),
     page: Page,
-    cmdheaders: &'lt mut [CommandHeader; 32],
-    recv_fis: &'lt mut ReceivedFis,
+    slots: [CommandSlot; 32],
+    free_list: Spin<FreeList>,
+    free_list_cv: UCondVar,
+
+    /// Statistics for this port
+    pub stats: Spin<AdapterPortStats>,
 }
 
-impl<'lt> AdapterPort<'lt> {
+/// # Safety
+/// This is safe because the `AdapterPort` can be accessed by only one thread at the same time
+unsafe impl Send for AdapterPort {}
+unsafe impl Sync for AdapterPort {}
+
+impl AdapterPort {
     pub fn new(base: usize, nport: u32) -> Self {
         let page = Page::alloc_one();
+        let cmdheaders_start = page.as_cached().as_ptr::<CommandHeader>();
+
         Self {
             nport,
-            data: NoCachePP::new(base + 0x100 + 0x80 * nport as usize).as_mut(),
-            cmdheaders: page.as_cached().as_mut(),
-            recv_fis: page.as_cached().offset(0x400).as_mut(),
+            regs: NoCachePP::new(base + 0x100 + 0x80 * nport as usize).as_ptr(),
+            slots: core::array::from_fn(|index| {
+                CommandSlot::new(unsafe { cmdheaders_start.offset(index as isize) })
+            }),
+            free_list: Spin::new(FreeList::new()),
+            free_list_cv: UCondVar::new(),
             page,
+            stats: Spin::default(),
         }
     }
 }
 
-impl<'lt> AdapterPort<'lt> {
+impl AdapterPort {
+    fn command_list_base(&self) -> *mut u64 {
+        unsafe { self.regs.byte_offset(0x00).cast() }
+    }
+
+    fn fis_base(&self) -> *mut u64 {
+        unsafe { self.regs.byte_offset(0x08).cast() }
+    }
+
+    fn sata_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x28).cast() }
+    }
+
+    fn command_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x18).cast() }
+    }
+
+    fn command_issue(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x38).cast() }
+    }
+
+    pub fn interrupt_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x10).cast() }
+    }
+
+    pub fn interrupt_enable(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x14).cast() }
+    }
+
     pub fn status_ok(&self) -> bool {
-        self.data.sata_status & 0xf == 0x3
+        vread(self.sata_status()) & 0xf == 0x3
+    }
+
+    fn get_free_slot(&self) -> u32 {
+        let mut free_list = self.free_list.lock_irq();
+
+        loop {
+            match free_list.free.pop_front() {
+                Some(slot) => break slot,
+                None => self.free_list_cv.wait(&mut free_list),
+            };
+        }
+    }
+
+    fn save_working(&self, slot: u32) {
+        self.free_list.lock().working.push_back(slot);
+    }
+
+    fn release_free_slot(&self, slot: u32) {
+        self.free_list.lock().free.push_back(slot);
+        self.free_list_cv.notify_one();
+    }
+
+    pub fn handle_interrupt(&self) {
+        let ci = vread(self.command_issue());
+
+        // no need to use `lock_irq()` inside interrupt handler
+        let mut free_list = self.free_list.lock();
+
+        free_list.working.retain(|&n| {
+            if ci & (1 << n) != 0 {
+                return true;
+            }
+
+            let slot = &self.slots[n as usize];
+
+            // TODO: check error
+            let mut slot_inner = slot.inner.lock();
+            debug_assert_eq!(slot_inner.state, SlotState::Working);
+            slot_inner.state = SlotState::Finished;
+            slot.cv.notify_all();
+            self.stats.lock().int_fired += 1;
+
+            false
+        });
     }
 
-    fn stop_command(&mut self) -> KResult<()> {
-        let cmd_status = vread(&self.data.command_status);
+    fn stop_command(&self) -> KResult<()> {
         vwrite(
-            &mut self.data.command_status,
-            cmd_status & !(PORT_CMD_ST | PORT_CMD_FRE),
+            self.command_status(),
+            vread(self.command_status()) & !(PORT_CMD_ST | PORT_CMD_FRE),
         );
 
-        spinwait_clear(&self.data.command_status, PORT_CMD_CR | PORT_CMD_FR)
+        spinwait_clear(self.command_status(), PORT_CMD_CR | PORT_CMD_FR)
     }
 
-    fn start_command(&mut self) -> KResult<()> {
-        spinwait_clear(&self.data.command_status, PORT_CMD_CR)?;
+    fn start_command(&self) -> KResult<()> {
+        spinwait_clear(self.command_status(), PORT_CMD_CR)?;
 
-        let cmd_status = vread(&self.data.command_status);
+        let cmd_status = vread(self.command_status());
         vwrite(
-            &mut self.data.command_status,
+            self.command_status(),
             cmd_status | PORT_CMD_ST | PORT_CMD_FRE,
         );
 
         Ok(())
     }
 
-    fn send_command(&mut self, cmd: &impl Command) -> KResult<()> {
-        let pages = cmd.pages();
-
-        // TODO: get an available command slot
-        let cmdslot = 0;
+    /// # Might Sleep
+    /// This function **might sleep**, so call it in a preemptible context
+    fn send_command(&self, cmd: &impl Command) -> KResult<()> {
+        might_sleep!();
 
+        let pages = cmd.pages();
         let cmdtable_page = Page::alloc_one();
-        self.cmdheaders[cmdslot].clear();
-        self.cmdheaders[cmdslot].setup(
-            cmdtable_page.as_phys() as u64,
-            pages.len() as u16,
-            cmd.write(),
-        );
 
         let command_fis: &mut FISH2D = cmdtable_page.as_cached().as_mut();
         command_fis.setup(cmd.cmd(), cmd.lba(), cmd.count());
 
-        let prdt: &mut [PRDTEntry; 248] =
-            cmdtable_page.as_cached().offset(0x80).as_mut();
+        let prdt: &mut [PRDTEntry; 248] = cmdtable_page.as_cached().offset(0x80).as_mut();
 
         for (idx, page) in pages.iter().enumerate() {
             prdt[idx].setup(page);
         }
 
-        // clear received fis?
+        let slot_index = self.get_free_slot() as usize;
+        let slot_object = &self.slots[slot_index];
 
-        // wait until port is not busy
-        spinwait_clear(&self.data.task_file_data, ATA_DEV_BSY | ATA_DEV_DRQ)?;
+        let mut slot = slot_object.inner.lock_irq();
 
-        vwrite(&mut self.data.command_issue, 1 << cmdslot);
-        spinwait_clear(&self.data.command_issue, 1 << cmdslot)?;
+        slot.setup(
+            cmdtable_page.as_phys() as u64,
+            pages.len() as u16,
+            cmd.write(),
+        );
+        slot.state = SlotState::Working;
+
+        // should we clear received fis here?
+        debug_assert!(vread(self.command_issue()) & (1 << slot_index) == 0);
+        vwrite(self.command_issue(), 1 << slot_index);
+
+        if spinwait_clear(self.command_issue(), 1 << slot_index).is_err() {
+            let mut saved = false;
+            while slot.state == SlotState::Working {
+                if !saved {
+                    saved = true;
+                    self.save_working(slot_index as u32);
+                }
+                slot_object.cv.wait(&mut slot);
+            }
+        } else {
+            // TODO: check error
+            slot.state = SlotState::Finished;
+        }
 
-        // TODO: check and wait interrupt
+        let state = slot.state;
+        slot.state = SlotState::Idle;
 
-        Ok(())
+        debug_assert_ne!(state, SlotState::Working);
+        self.release_free_slot(slot_index as u32);
+
+        match state {
+            SlotState::Finished => {
+                self.stats.lock().cmd_sent += 1;
+                Ok(())
+            }
+            SlotState::Error => {
+                self.stats.lock().cmd_error += 1;
+                Err(EIO)
+            }
+            _ => panic!("Invalid slot state"),
+        }
     }
 
-    fn identify(&mut self) -> KResult<()> {
+    fn identify(&self) -> KResult<()> {
         let cmd = IdentifyCommand::new();
 
         // TODO: check returned data
@@ -143,43 +356,38 @@ impl<'lt> AdapterPort<'lt> {
         Ok(())
     }
 
-    pub fn init(&mut self) -> KResult<()> {
+    pub fn init(&self) -> KResult<()> {
         self.stop_command()?;
 
-        // TODO: use interrupt
-        // this is the PxIE register, setting bits here will make
-        //      it generate corresponding interrupts in PxIS
-        //
-        // port->interrupt_enable = 1;
+        vwrite(self.interrupt_enable(), PORT_IE_DEFAULT);
 
-        vwrite(&mut self.data.command_list_base, self.page.as_phys() as u64);
-        vwrite(&mut self.data.fis_base, self.page.as_phys() as u64 + 0x400);
+        vwrite(self.command_list_base(), self.page.as_phys() as u64);
+        vwrite(self.fis_base(), self.page.as_phys() as u64 + 0x400);
 
         self.start_command()?;
 
         match self.identify() {
             Err(err) => {
                 self.stop_command()?;
-                return Err(err);
+                Err(err)
             }
             Ok(_) => Ok(()),
         }
     }
 }
 
-impl<'lt> BlockRequestQueue for AdapterPort<'lt> {
+impl BlockRequestQueue for AdapterPort {
     fn max_request_pages(&self) -> u64 {
         1024
     }
 
-    fn submit(&mut self, req: BlockDeviceRequest) -> KResult<()> {
+    fn submit(&self, req: BlockDeviceRequest) -> KResult<()> {
         // TODO: check disk size limit using newtype
         if req.count > 65535 {
             return Err(EINVAL);
         }
 
-        let command =
-            ReadLBACommand::new(req.buffer, req.sector, req.count as u16)?;
+        let command = ReadLBACommand::new(req.buffer, req.sector, req.count as u16)?;
 
         self.send_command(&command)
     }

+ 23 - 11
src/driver/e1000e.rs

@@ -1,3 +1,5 @@
+use crate::prelude::*;
+
 use crate::bindings::root::kernel::hw::pci;
 use crate::kernel::interrupt::register_irq_handler;
 use crate::kernel::mem::paging::copy_to_page;
@@ -56,6 +58,23 @@ fn test(val: u32, bit: u32) -> bool {
     (val & bit) == bit
 }
 
+struct PrintableBytes<'a>(&'a [u8]);
+
+impl core::fmt::Debug for PrintableBytes<'_> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "PrintableBytes {{")?;
+        for chunk in self.0.chunks(16) {
+            for &byte in chunk {
+                write!(f, "{byte} ")?;
+            }
+            write!(f, "\n")?;
+        }
+        write!(f, "}}")?;
+
+        Ok(())
+    }
+}
+
 impl netdev::Netdev for E1000eDev {
     fn mac(&self) -> netdev::Mac {
         self.mac
@@ -151,17 +170,10 @@ impl netdev::Netdev for E1000eDev {
                 )
             };
 
-            use crate::{dont_check, print, println};
-            dont_check!(println!("==== e1000e: received packet ===="));
-
-            for i in 0..len {
-                if i % 16 == 0 {
-                    dont_check!(println!());
-                }
-                dont_check!(print!("{:02x} ", data[i]));
-            }
-
-            dont_check!(println!("\n\n====  e1000e: end of packet  ===="));
+            println_debug!(
+                "e1000e: received {len} bytes, {:?}",
+                PrintableBytes(data)
+            );
             self.rx_tail = Some(next_tail);
         }
 

+ 145 - 0
src/driver/serial.rs

@@ -0,0 +1,145 @@
+use alloc::{format, sync::Arc};
+use bindings::EIO;
+
+use crate::{
+    kernel::{
+        block::make_device, interrupt::register_irq_handler, CharDevice, CharDeviceType, Console,
+        Terminal, TerminalDevice,
+    },
+    prelude::*,
+};
+
+use super::Port8;
+
+struct Serial {
+    id: u32,
+    name: Arc<str>,
+
+    terminal: Option<Arc<Terminal>>,
+
+    tx_rx: Port8,
+    int_ena: Port8,
+    int_ident: Port8,
+    line_control: Port8,
+    modem_control: Port8,
+    line_status: Port8,
+    modem_status: Port8,
+    scratch: Port8,
+}
+
+impl Serial {
+    const COM0_BASE: u16 = 0x3f8;
+    const COM1_BASE: u16 = 0x2f8;
+
+    const COM0_IRQ: u8 = 4;
+    const COM1_IRQ: u8 = 3;
+
+    fn enable_interrupts(&self) {
+        // Enable interrupt #0: Received data available
+        self.int_ena.write(0x01);
+    }
+
+    pub fn new(id: u32, base_port: u16) -> KResult<Self> {
+        let port = Self {
+            id,
+            name: Arc::from(format!("ttyS{id}")),
+            terminal: None,
+            tx_rx: Port8::new(base_port),
+            int_ena: Port8::new(base_port + 1),
+            int_ident: Port8::new(base_port + 2),
+            line_control: Port8::new(base_port + 3),
+            modem_control: Port8::new(base_port + 4),
+            line_status: Port8::new(base_port + 5),
+            modem_status: Port8::new(base_port + 6),
+            scratch: Port8::new(base_port + 7),
+        };
+
+        port.int_ena.write(0x00); // Disable all interrupts
+        port.line_control.write(0x80); // Enable DLAB (set baud rate divisor)
+        port.tx_rx.write(0x00); // Set divisor to 0 (lo byte) 115200 baud rate
+        port.int_ena.write(0x00); //              0 (hi byte)
+        port.line_control.write(0x03); // 8 bits, no parity, one stop bit
+        port.int_ident.write(0xc7); // Enable FIFO, clear them, with 14-byte threshold
+        port.modem_control.write(0x0b); // IRQs enabled, RTS/DSR set
+        port.modem_control.write(0x1e); // Set in loopback mode, test the serial chip
+        port.tx_rx.write(0x19); // Test serial chip (send byte 0x19 and check if serial returns
+                                // same byte)
+        if port.tx_rx.read() != 0x19 {
+            return Err(EIO);
+        }
+
+        port.modem_control.write(0x0f); // Return to normal operation mode
+        Ok(port)
+    }
+
+    fn irq_handler(&self) {
+        let terminal = self.terminal.as_ref();
+        while self.line_status.read() & 0x01 != 0 {
+            let ch = self.tx_rx.read();
+
+            if let Some(terminal) = terminal {
+                terminal.commit_char(ch);
+            }
+        }
+    }
+
+    fn register_char_device(port: Self) -> KResult<()> {
+        let mut port = Arc::new(port);
+        let terminal = Terminal::new(port.clone());
+
+        // TODO!!!!!!: This is unsafe, we should find a way to avoid this.
+        //             Under smp, we should make the publish of terminal atomic.
+        unsafe { Arc::get_mut_unchecked(&mut port) }.terminal = Some(terminal.clone());
+
+        {
+            let port = port.clone();
+            let irq_no = match port.id {
+                0 => Serial::COM0_IRQ,
+                1 => Serial::COM1_IRQ,
+                _ => unreachable!(),
+            };
+
+            register_irq_handler(irq_no as i32, move || {
+                port.irq_handler();
+            })?;
+        }
+        port.enable_interrupts();
+        dont_check!(Console::register_terminal(&terminal));
+
+        CharDevice::register(
+            make_device(4, 64 + port.id),
+            port.name.clone(),
+            CharDeviceType::Terminal(terminal),
+        )?;
+
+        Ok(())
+    }
+}
+
+impl TerminalDevice for Serial {
+    fn putchar(&self, ch: u8) {
+        loop {
+            // If we poll the status and get the corresponding bit, we should handle the action.
+            let status = self.line_status.read();
+            if status & 0x20 != 0 {
+                self.tx_rx.write(ch);
+                return;
+            }
+        }
+    }
+}
+
+pub fn init() -> KResult<()> {
+    let com0 = Serial::new(0, Serial::COM0_BASE);
+    let com1 = Serial::new(1, Serial::COM1_BASE);
+
+    if let Ok(port) = com0 {
+        Serial::register_char_device(port)?;
+    }
+
+    if let Ok(port) = com1 {
+        Serial::register_char_device(port)?;
+    }
+
+    Ok(())
+}

+ 370 - 0
src/elf.rs

@@ -0,0 +1,370 @@
+use alloc::{ffi::CString, sync::Arc};
+use bitflags::bitflags;
+
+use crate::{
+    io::{RawBuffer, UninitBuffer},
+    kernel::{
+        constants::ENOEXEC,
+        mem::{FileMapping, MMList, Mapping, Permission, VAddr},
+        task::Thread,
+        user::{dataflow::CheckedUserPointer, UserPointerMut},
+        vfs::dentry::Dentry,
+    },
+    prelude::*,
+};
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfFormat {
+    Elf32 = 1,
+    Elf64 = 2,
+}
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfEndian {
+    Little = 1,
+    Big = 2,
+}
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfABI {
+    // SystemV = 0,
+    Linux = 3,
+}
+
+#[repr(u16)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfType {
+    Relocatable = 1,
+    Executable = 2,
+    Dynamic = 3,
+    Core = 4,
+}
+
+#[repr(u16)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfArch {
+    X86 = 0x03,
+    Arm = 0x28,
+    IA64 = 0x32,
+    X86_64 = 0x3e,
+    AArch64 = 0xb7,
+    RiscV = 0xf3,
+}
+
+bitflags! {
+    #[derive(Default, Clone, Copy)]
+    pub struct Elf32PhFlags: u32 {
+        const Exec = 1;
+        const Write = 2;
+        const Read = 4;
+    }
+
+    #[derive(Default, Clone, Copy)]
+    pub struct Elf32ShFlags: u32 {
+        const Write = 1;
+        const Alloc = 2;
+        const Exec = 4;
+        const MaskProc = 0xf0000000;
+    }
+}
+
+#[derive(Default, Clone, Copy, PartialEq, Eq)]
+pub enum Elf32PhType {
+    #[default]
+    Null = 0,
+    Load = 1,
+    Dynamic = 2,
+    Interp = 3,
+    Note = 4,
+    Shlib = 5,
+    Phdr = 6,
+    Tls = 7,
+    Loos = 0x60000000,
+    Hios = 0x6fffffff,
+    Loproc = 0x70000000,
+    Hiproc = 0x7fffffff,
+}
+
+#[derive(Default, Clone, Copy, PartialEq, Eq)]
+pub enum Elf32ShType {
+    #[default]
+    Null = 0,
+    ProgBits = 1,
+    SymTab = 2,
+    StrTab = 3,
+    Rela = 4,
+    Hash = 5,
+    Dynamic = 6,
+    Note = 7,
+    NoBits = 8,
+    Rel = 9,
+    Shlib = 10,
+    DynSym = 11,
+    InitArray = 14,
+    FiniArray = 15,
+    PreInitArray = 16,
+    Group = 17,
+    SymTabShndx = 18,
+    Loos = 0x60000000,
+    Hios = 0x6fffffff,
+    Loproc = 0x70000000,
+    Hiproc = 0x7fffffff,
+}
+
+#[repr(C, packed)]
+#[derive(Clone, Copy)]
+pub struct Elf32Header {
+    /// ELF magic number: 0x7f, "ELF"
+    pub magic: [u8; 4],
+    pub format: ElfFormat,
+    pub endian: ElfEndian,
+    /// ELF version, should be 1
+    pub version: u8,
+    pub abi: ElfABI,
+    pub abi_version: u8,
+    padding: [u8; 7],
+    pub elf_type: ElfType,
+    pub arch: ElfArch,
+    /// ELF version, should be 1
+    pub version2: u32,
+    pub entry: u32,
+    pub ph_offset: u32,
+    pub sh_offset: u32,
+    pub flags: u32,
+    pub eh_size: u16,
+    pub ph_entry_size: u16,
+    pub ph_entry_count: u16,
+    pub sh_entry_size: u16,
+    pub sh_entry_count: u16,
+    pub sh_str_index: u16,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+pub struct Elf32PhEntry {
+    pub ph_type: Elf32PhType,
+    pub offset: u32,
+    pub vaddr: u32,
+    pub paddr: u32,
+    pub file_size: u32,
+    pub mem_size: u32,
+    pub flags: Elf32PhFlags,
+    /// `0` and `1` for no alignment, otherwise power of `2`
+    pub align: u32,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+pub struct Elf32ShEntry {
+    pub name_offset: u32,
+    pub sh_type: Elf32ShType,
+    pub flags: Elf32ShFlags,
+    pub addr: u32,
+    pub offset: u32,
+    pub size: u32,
+    pub link: u32,
+    pub info: u32,
+    pub addr_align: u32,
+    pub entry_size: u32,
+}
+
+pub struct ParsedElf32 {
+    entry: u32,
+    file: Arc<Dentry>,
+    phents: Vec<Elf32PhEntry>,
+    shents: Vec<Elf32ShEntry>,
+}
+
+const ELF_MAGIC: [u8; 4] = *b"\x7fELF";
+
+impl Elf32Header {
+    fn check_valid(&self) -> bool {
+        self.magic == ELF_MAGIC
+            && self.version == 1
+            && self.version2 == 1
+            && self.eh_size as usize == size_of::<Elf32Header>()
+            && self.ph_entry_size as usize == size_of::<Elf32PhEntry>()
+            && self.sh_entry_size as usize == size_of::<Elf32ShEntry>()
+    }
+}
+
+impl ParsedElf32 {
+    pub fn parse(file: Arc<Dentry>) -> KResult<Self> {
+        let mut header = UninitBuffer::<Elf32Header>::new();
+        file.read(&mut header, 0)?;
+
+        let header = header.assume_init().ok_or(ENOEXEC)?;
+        if !header.check_valid() {
+            return Err(ENOEXEC);
+        }
+
+        // TODO: Use `UninitBuffer` for `phents` and `shents`.
+        let mut phents = vec![Elf32PhEntry::default(); header.ph_entry_count as usize];
+        let nread = file.read(
+            &mut RawBuffer::new_from_slice(phents.as_mut_slice()),
+            header.ph_offset as usize,
+        )?;
+        if nread != header.ph_entry_count as usize * size_of::<Elf32PhEntry>() {
+            return Err(ENOEXEC);
+        }
+
+        let mut shents = vec![Elf32ShEntry::default(); header.sh_entry_count as usize];
+        let nread = file.read(
+            &mut RawBuffer::new_from_slice(shents.as_mut_slice()),
+            header.sh_offset as usize,
+        )?;
+        if nread != header.sh_entry_count as usize * size_of::<Elf32ShEntry>() {
+            return Err(ENOEXEC);
+        }
+
+        Ok(Self {
+            entry: header.entry,
+            file,
+            phents,
+            shents,
+        })
+    }
+
+    /// Load the ELF file into memory. Return the entry point address.
+    ///
+    /// We clear the user space and load the program headers into memory.
+    /// Can't make a way back if failed from now on.
+    ///
+    /// # Return
+    /// `(entry_ip, sp)`
+    pub fn load(
+        self,
+        mm_list: &MMList,
+        args: Vec<CString>,
+        envs: Vec<CString>,
+    ) -> KResult<(VAddr, VAddr)> {
+        mm_list.clear_user();
+
+        let mut data_segment_end = VAddr(0);
+        for phent in self
+            .phents
+            .into_iter()
+            .filter(|ent| ent.ph_type == Elf32PhType::Load)
+        {
+            let vaddr_start = VAddr(phent.vaddr as usize);
+            let vmem_vaddr_end = vaddr_start + phent.mem_size as usize;
+            let load_vaddr_end = vaddr_start + phent.file_size as usize;
+
+            let vaddr = vaddr_start.floor();
+            let vmem_len = vmem_vaddr_end.ceil() - vaddr;
+            let file_len = load_vaddr_end.ceil() - vaddr;
+            let file_offset = phent.offset as usize & !0xfff;
+
+            let permission = Permission {
+                write: phent.flags.contains(Elf32PhFlags::Write),
+                execute: phent.flags.contains(Elf32PhFlags::Exec),
+            };
+
+            if file_len != 0 {
+                let real_file_length = load_vaddr_end - vaddr;
+                mm_list.mmap_fixed(
+                    vaddr,
+                    file_len,
+                    Mapping::File(FileMapping::new(
+                        self.file.clone(),
+                        file_offset,
+                        real_file_length,
+                    )),
+                    permission,
+                )?;
+            }
+
+            if vmem_len > file_len {
+                mm_list.mmap_fixed(
+                    vaddr + file_len,
+                    vmem_len - file_len,
+                    Mapping::Anonymous,
+                    permission,
+                )?;
+            }
+
+            if vaddr + vmem_len > data_segment_end {
+                data_segment_end = vaddr + vmem_len;
+            }
+        }
+
+        mm_list.register_break(data_segment_end + 0x10000);
+
+        // Map stack area
+        mm_list.mmap_fixed(
+            VAddr(0xc0000000 - 0x800000), // Stack bottom is at 0xc0000000
+            0x800000,                     // 8MB stack size
+            Mapping::Anonymous,
+            Permission {
+                write: true,
+                execute: false,
+            },
+        )?;
+
+        // TODO!!!!!: A temporary workaround.
+        mm_list.switch_page_table();
+
+        let mut sp = 0xc0000000u32;
+        let arg_addrs = args
+            .into_iter()
+            .map(|arg| push_string(&mut sp, arg))
+            .collect::<Vec<_>>();
+
+        let env_addrs = envs
+            .into_iter()
+            .map(|env| push_string(&mut sp, env))
+            .collect::<Vec<_>>();
+
+        let longs = 2 // Null auxiliary vector entry
+            + env_addrs.len() + 1 // Envs + null
+            + arg_addrs.len() + 1 // Args + null
+            + 1; // argc
+
+        sp -= longs as u32 * 4;
+        sp &= !0xf; // Align to 16 bytes
+
+        let mut cursor = (0..longs)
+            .map(|idx| UserPointerMut::<u32>::new_vaddr(sp as usize + size_of::<u32>() * idx));
+
+        // argc
+        cursor.next().unwrap()?.write(arg_addrs.len() as u32)?;
+
+        // args
+        for arg_addr in arg_addrs.into_iter() {
+            cursor.next().unwrap()?.write(arg_addr)?;
+        }
+        cursor.next().unwrap()?.write(0)?; // null
+
+        // envs
+        for env_addr in env_addrs.into_iter() {
+            cursor.next().unwrap()?.write(env_addr)?;
+        }
+        cursor.next().unwrap()?.write(0)?; // null
+
+        // Null auxiliary vector
+        cursor.next().unwrap()?.write(0)?; // AT_NULL
+        cursor.next().unwrap()?.write(0)?; // AT_NULL
+
+        // TODO!!!!!: A temporary workaround.
+        Thread::current().process.mm_list.switch_page_table();
+
+        assert!(cursor.next().is_none());
+        Ok((VAddr(self.entry as usize), VAddr(sp as usize)))
+    }
+}
+
+fn push_string(sp: &mut u32, string: CString) -> u32 {
+    let data = string.as_bytes_with_nul();
+    let new_sp = (*sp - data.len() as u32) & !0x3; // Align to 4 bytes
+
+    CheckedUserPointer::new(new_sp as *const u8, data.len())
+        .unwrap()
+        .write(data.as_ptr() as _, data.len())
+        .unwrap();
+
+    *sp = new_sp;
+    new_sp
+}

+ 137 - 148
src/fs/fat32.rs

@@ -1,4 +1,10 @@
-use alloc::{sync::Arc, vec::Vec};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+
+use alloc::{
+    collections::btree_map::BTreeMap,
+    sync::{Arc, Weak},
+    vec::Vec,
+};
 use bindings::{EINVAL, EIO, S_IFDIR, S_IFREG};
 
 use itertools::Itertools;
@@ -10,10 +16,10 @@ use crate::{
         mem::{paging::Page, phys::PhysPtr},
         vfs::{
             dentry::Dentry,
-            inode::{Ino, Inode, InodeCache, InodeOps},
+            inode::{define_struct_inode, Ino, Inode, InodeData},
             mount::{register_filesystem, Mount, MountCreator},
             vfs::Vfs,
-            DevId, ReadDirCallback,
+            DevId,
         },
     },
     prelude::*,
@@ -131,19 +137,35 @@ struct Bootsector {
     mbr_signature: u16,
 }
 
+impl_any!(FatFs);
 /// # Lock order
-/// 1. FatFs
 /// 2. FatTable
 /// 3. Inodes
 ///
 struct FatFs {
-    device: Arc<BlockDevice>,
-    icache: Mutex<InodeCache<FatFs>>,
     sectors_per_cluster: u8,
     rootdir_cluster: ClusterNo,
     data_start: u64,
-    fat: Mutex<Vec<ClusterNo>>,
-    volume_label: String,
+    volume_label: [u8; 11],
+
+    device: Arc<BlockDevice>,
+    fat: RwSemaphore<Vec<ClusterNo>>,
+    weak: Weak<FatFs>,
+    icache: BTreeMap<Ino, FatInode>,
+}
+
+impl Vfs for FatFs {
+    fn io_blksize(&self) -> usize {
+        4096
+    }
+
+    fn fs_devid(&self) -> DevId {
+        self.device.devid()
+    }
+
+    fn is_read_only(&self) -> bool {
+        true
+    }
 }
 
 impl FatFs {
@@ -151,8 +173,7 @@ impl FatFs {
         let cluster = cluster - 2;
 
         let rq = BlockDeviceRequest {
-            sector: self.data_start as u64
-                + cluster as u64 * self.sectors_per_cluster as u64,
+            sector: self.data_start as u64 + cluster as u64 * self.sectors_per_cluster as u64,
             count: self.sectors_per_cluster as u64,
             buffer: core::slice::from_ref(buf),
         };
@@ -160,57 +181,34 @@ impl FatFs {
 
         Ok(())
     }
-}
-
-impl InodeCache<FatFs> {
-    fn get_or_alloc(
-        &mut self,
-        ino: Ino,
-        is_directory: bool,
-        size: u64,
-    ) -> KResult<Arc<Inode>> {
-        self.get(ino).map(|inode| Ok(inode)).unwrap_or_else(|| {
-            let nlink;
-            let mut mode = 0o777;
-
-            let ops: Box<dyn InodeOps>;
-
-            if is_directory {
-                nlink = 2;
-                mode |= S_IFDIR;
-                ops = Box::new(DirOps);
-            } else {
-                nlink = 1;
-                mode |= S_IFREG;
-                ops = Box::new(FileOps);
-            }
-
-            let mut inode = self.alloc(ino, ops);
-            let inode_mut = unsafe { Arc::get_mut_unchecked(&mut inode) };
-            let inode_idata = inode_mut.idata.get_mut();
 
-            inode_idata.mode = mode;
-            inode_idata.nlink = nlink;
-            inode_idata.size = size;
-
-            self.submit(&inode)?;
-
-            Ok(inode)
-        })
+    fn get_or_alloc_inode(&self, ino: Ino, is_directory: bool, size: u32) -> Arc<dyn Inode> {
+        self.icache
+            .get(&ino)
+            .cloned()
+            .map(FatInode::unwrap)
+            .unwrap_or_else(|| {
+                if is_directory {
+                    DirInode::new(ino, self.weak.clone(), size)
+                } else {
+                    FileInode::new(ino, self.weak.clone(), size)
+                }
+            })
     }
 }
 
 impl FatFs {
-    pub fn create(device: DevId) -> KResult<(Arc<Self>, Arc<Inode>)> {
+    pub fn create(device: DevId) -> KResult<(Arc<Self>, Arc<dyn Inode>)> {
         let device = BlockDevice::get(device)?;
-        let mut fatfs_arc = Arc::new_cyclic(|weak| Self {
+        let mut fatfs_arc = Arc::new_cyclic(|weak: &Weak<FatFs>| Self {
             device,
-            icache: Mutex::new(InodeCache::new(weak.clone())),
             sectors_per_cluster: 0,
             rootdir_cluster: 0,
             data_start: 0,
-            fat: Mutex::new(Vec::new()),
-            volume_label: String::new(),
+            fat: RwSemaphore::new(Vec::new()),
+            weak: weak.clone(),
+            icache: BTreeMap::new(),
+            volume_label: [0; 11],
         });
 
         let fatfs = unsafe { Arc::get_mut_unchecked(&mut fatfs_arc) };
@@ -221,13 +219,13 @@ impl FatFs {
 
         fatfs.sectors_per_cluster = info.sectors_per_cluster;
         fatfs.rootdir_cluster = info.root_cluster;
-        fatfs.data_start = info.reserved_sectors as u64
-            + info.fat_copies as u64 * info.sectors_per_fat as u64;
+        fatfs.data_start =
+            info.reserved_sectors as u64 + info.fat_copies as u64 * info.sectors_per_fat as u64;
 
         let fat = fatfs.fat.get_mut();
+
         fat.resize(
-            512 * info.sectors_per_fat as usize
-                / core::mem::size_of::<ClusterNo>(),
+            512 * info.sectors_per_fat as usize / core::mem::size_of::<ClusterNo>(),
             0,
         );
 
@@ -242,51 +240,21 @@ impl FatFs {
             return Err(EIO);
         }
 
-        fatfs.volume_label = String::from(
-            str::from_utf8(&info.volume_label)
-                .map_err(|_| EINVAL)?
-                .trim_end_matches(char::from(' ')),
-        );
-
-        let root_dir_cluster_count =
-            ClusterIterator::new(&fat, fatfs.rootdir_cluster).count();
-
-        let root_inode = {
-            let icache = fatfs.icache.get_mut();
-
-            let mut inode =
-                icache.alloc(info.root_cluster as Ino, Box::new(DirOps));
-            let inode_mut = unsafe { Arc::get_mut_unchecked(&mut inode) };
-            let inode_idata = inode_mut.idata.get_mut();
-
-            inode_idata.mode = S_IFDIR | 0o777;
-            inode_idata.nlink = 2;
-            inode_idata.size = root_dir_cluster_count as u64
-                * info.sectors_per_cluster as u64
-                * 512;
+        info.volume_label
+            .iter()
+            .take_while(|&&c| c != ' ' as u8)
+            .take(11)
+            .enumerate()
+            .for_each(|(idx, c)| fatfs.volume_label[idx] = *c);
 
-            icache.submit(&inode)?;
-            inode
-        };
+        let root_dir_cluster_count = ClusterIterator::new(fat, fatfs.rootdir_cluster).count();
+        let root_dir_size = root_dir_cluster_count as u32 * info.sectors_per_cluster as u32 * 512;
+        let root_inode = DirInode::new(info.root_cluster as Ino, fatfs.weak.clone(), root_dir_size);
 
         Ok((fatfs_arc, root_inode))
     }
 }
 
-impl Vfs for FatFs {
-    fn io_blksize(&self) -> usize {
-        4096
-    }
-
-    fn fs_devid(&self) -> DevId {
-        self.device.devid()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
-
 struct ClusterIterator<'fat> {
     fat: &'fat [ClusterNo],
     cur: ClusterNo,
@@ -371,24 +339,47 @@ impl<'fat> Iterator for ClusterIterator<'fat> {
     }
 }
 
-struct FileOps;
-impl InodeOps for FileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+#[derive(Clone)]
+enum FatInode {
+    File(Arc<FileInode>),
+    Dir(Arc<DirInode>),
+}
+
+impl FatInode {
+    fn unwrap(self) -> Arc<dyn Inode> {
+        match self {
+            FatInode::File(inode) => inode,
+            FatInode::Dir(inode) => inode,
+        }
     }
+}
 
-    fn read(
-        &self,
-        inode: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
-        let vfs = inode.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+define_struct_inode! {
+    struct FileInode;
+}
+
+impl FileInode {
+    fn new(ino: Ino, weak: Weak<FatFs>, size: u32) -> Arc<Self> {
+        let inode = Arc::new(Self {
+            idata: InodeData::new(ino, weak),
+        });
+
+        // Safety: We are initializing the inode
+        inode.nlink.store(1, Ordering::Relaxed);
+        inode.mode.store(S_IFREG | 0o777, Ordering::Relaxed);
+        inode.size.store(size as u64, Ordering::Relaxed);
+
+        inode
+    }
+}
+
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
-        let iter = ClusterIterator::new(&fat, inode.ino as ClusterNo)
-            .read(vfs, offset);
+        let iter = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).read(vfs, offset);
 
         for data in iter {
             if buffer.fill(data?)?.should_stop() {
@@ -400,23 +391,32 @@ impl InodeOps for FileOps {
     }
 }
 
-struct DirOps;
-impl InodeOps for DirOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+define_struct_inode! {
+    struct DirInode;
+}
+
+impl DirInode {
+    fn new(ino: Ino, weak: Weak<FatFs>, size: u32) -> Arc<Self> {
+        let inode = Arc::new(Self {
+            idata: InodeData::new(ino, weak),
+        });
+
+        // Safety: We are initializing the inode
+        inode.nlink.store(2, Ordering::Relaxed);
+        inode.mode.store(S_IFDIR | 0o777, Ordering::Relaxed);
+        inode.size.store(size as u64, Ordering::Relaxed);
+
+        inode
     }
+}
 
-    fn lookup(
-        &self,
-        dir: &Inode,
-        dentry: &Arc<Dentry>,
-    ) -> KResult<Option<Arc<Inode>>> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+impl Inode for DirInode {
+    fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
-        let mut entries =
-            ClusterIterator::new(&fat, dir.ino as ClusterNo).dirs(vfs, 0);
+        let mut entries = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).dirs(vfs, 0);
 
         let entry = entries.find_map(|entry| {
             if entry.is_err() {
@@ -438,28 +438,27 @@ impl InodeOps for DirOps {
             Some(Ok(entry)) => {
                 let ino = entry.ino();
 
-                Ok(Some(vfs.icache.lock().get_or_alloc(
+                Ok(Some(vfs.get_or_alloc_inode(
                     ino,
                     entry.is_directory(),
-                    entry.size as u64,
-                )?))
+                    entry.size,
+                )))
             }
         }
     }
 
-    fn readdir<'cb, 'r: 'cb>(
-        &'r self,
-        dir: &'r Inode,
+    fn do_readdir(
+        &self,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
         const ENTRY_SIZE: usize = core::mem::size_of::<FatDirectoryEntry>();
         let cluster_iter =
-            ClusterIterator::new(&fat, dir.ino as ClusterNo).dirs(vfs, offset);
+            ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).dirs(vfs, offset);
 
         let mut nread = 0;
         for entry in cluster_iter {
@@ -473,13 +472,9 @@ impl InodeOps for DirOps {
             let ino = entry.ino();
             let name = entry.filename();
 
-            vfs.icache.lock().get_or_alloc(
-                ino,
-                entry.is_directory(),
-                entry.size as u64,
-            )?;
+            vfs.get_or_alloc_inode(ino, entry.is_directory(), entry.size);
 
-            if callback(name.as_ref(), ino).is_err() {
+            if callback(name.as_ref(), ino)?.is_break() {
                 break;
             }
 
@@ -493,13 +488,7 @@ impl InodeOps for DirOps {
 struct FatMountCreator;
 
 impl MountCreator for FatMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        _flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, _flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let (fatfs, root_inode) = FatFs::create(make_device(8, 1))?;
 
         Mount::new(mp, fatfs, root_inode)
@@ -507,5 +496,5 @@ impl MountCreator for FatMountCreator {
 }
 
 pub fn init() {
-    register_filesystem("fat32", Box::new(FatMountCreator)).unwrap();
+    register_filesystem("fat32", Arc::new(FatMountCreator)).unwrap();
 }

+ 170 - 140
src/fs/procfs.rs

@@ -1,7 +1,11 @@
-use core::sync::atomic::Ordering;
-
-use alloc::sync::{Arc, Weak};
+use alloc::{
+    collections::btree_map::BTreeMap,
+    sync::{Arc, Weak},
+};
 use bindings::{EACCES, ENOTDIR, S_IFDIR, S_IFREG};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use itertools::Itertools;
+use lazy_static::lazy_static;
 
 use crate::{
     io::Buffer,
@@ -9,13 +13,14 @@ use crate::{
         mem::paging::{Page, PageBuffer},
         vfs::{
             dentry::Dentry,
-            inode::{AtomicIno, Inode, InodeCache, InodeData, InodeOps},
+            inode::{define_struct_inode, AtomicIno, Ino, Inode, InodeData},
             mount::{dump_mounts, register_filesystem, Mount, MountCreator},
             vfs::Vfs,
-            DevId, ReadDirCallback,
+            DevId,
         },
     },
     prelude::*,
+    sync::Locked,
 };
 
 fn split_len_offset(data: &[u8], len: usize, offset: usize) -> Option<&[u8]> {
@@ -24,8 +29,6 @@ fn split_len_offset(data: &[u8], len: usize, offset: usize) -> Option<&[u8]> {
     real_data.split_at_checked(offset).map(|(_, data)| data)
 }
 
-pub struct ProcFsNode(Arc<Inode>);
-
 pub trait ProcFsFile: Send + Sync {
     fn can_read(&self) -> bool {
         false
@@ -44,21 +47,57 @@ pub trait ProcFsFile: Send + Sync {
     }
 }
 
-struct ProcFsFileOps {
-    file: Box<dyn ProcFsFile>,
+pub enum ProcFsNode {
+    File(Arc<FileInode>),
+    Dir(Arc<DirInode>),
 }
 
-impl InodeOps for ProcFsFileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl ProcFsNode {
+    fn unwrap(&self) -> Arc<dyn Inode> {
+        match self {
+            ProcFsNode::File(inode) => inode.clone(),
+            ProcFsNode::Dir(inode) => inode.clone(),
+        }
     }
 
-    fn read(
-        &self,
-        _: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
+    fn ino(&self) -> Ino {
+        match self {
+            ProcFsNode::File(inode) => inode.ino,
+            ProcFsNode::Dir(inode) => inode.ino,
+        }
+    }
+}
+
+define_struct_inode! {
+    pub struct FileInode {
+        file: Box<dyn ProcFsFile>,
+    }
+}
+
+impl FileInode {
+    pub fn new(ino: Ino, vfs: Weak<ProcFs>, file: Box<dyn ProcFsFile>) -> Arc<Self> {
+        let mut mode = S_IFREG;
+        if file.can_read() {
+            mode |= 0o444;
+        }
+        if file.can_write() {
+            mode |= 0o200;
+        }
+
+        let inode = Self {
+            idata: InodeData::new(ino, vfs),
+            file,
+        };
+
+        inode.idata.mode.store(mode, Ordering::Relaxed);
+        inode.idata.nlink.store(1, Ordering::Relaxed);
+
+        Arc::new(inode)
+    }
+}
+
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
         if !self.file.can_read() {
             return Err(EACCES);
         }
@@ -75,47 +114,56 @@ impl InodeOps for ProcFsFileOps {
     }
 }
 
-struct ProcFsDirectory {
-    entries: Mutex<Vec<(Arc<[u8]>, ProcFsNode)>>,
+define_struct_inode! {
+    struct DirInode {
+        entries: Locked<Vec<(Arc<[u8]>, ProcFsNode)>, ()>,
+    }
 }
 
-impl InodeOps for ProcFsDirectory {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl DirInode {
+    pub fn new(ino: Ino, vfs: Weak<ProcFs>) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, entries).write(Locked::new(vec![], rwsem));
+            addr_of_mut_field!(inode, mode).write((S_IFDIR | 0o755).into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
+}
 
-    fn lookup(
-        &self,
-        _: &Inode,
-        dentry: &Arc<Dentry>,
-    ) -> KResult<Option<Arc<Inode>>> {
-        Ok(self.entries.lock().iter().find_map(|(name, node)| {
-            name.as_ref()
-                .eq(dentry.name().as_ref())
-                .then(|| node.0.clone())
-        }))
+impl Inode for DirInode {
+    fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
+        let lock = self.rwsem.lock_shared();
+        Ok(self
+            .entries
+            .access(lock.as_ref())
+            .iter()
+            .find_map(|(name, node)| {
+                name.as_ref()
+                    .eq(dentry.name().as_ref())
+                    .then(|| node.unwrap())
+            }))
     }
 
-    fn readdir<'cb, 'r: 'cb>(
+    fn do_readdir(
         &self,
-        _: &Inode,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        Ok(self
-            .entries
-            .lock()
+        let lock = self.rwsem.lock_shared();
+        self.entries
+            .access(lock.as_ref())
             .iter()
             .skip(offset)
-            .take_while(|(name, ProcFsNode(inode))| {
-                callback(name, inode.ino).is_ok()
-            })
-            .count())
+            .map(|(name, node)| callback(name.as_ref(), node.ino()))
+            .take_while(|result| result.map_or(true, |flow| flow.is_continue()))
+            .take_while_inclusive(|result| result.is_ok())
+            .fold_ok(0, |acc, _| acc + 1)
     }
 }
 
+impl_any!(ProcFs);
 pub struct ProcFs {
-    root_node: Arc<Inode>,
+    root_node: Arc<DirInode>,
     next_ino: AtomicIno,
 }
 
@@ -128,38 +176,37 @@ impl Vfs for ProcFs {
         10
     }
 
-    fn as_any(&self) -> &dyn Any {
-        self
+    fn is_read_only(&self) -> bool {
+        false
     }
 }
 
-static mut GLOBAL_PROCFS: Option<Arc<ProcFs>> = None;
-static mut ICACHE: Option<InodeCache<ProcFs>> = None;
+lazy_static! {
+    static ref ICACHE: Spin<BTreeMap<Ino, ProcFsNode>> = Spin::new(BTreeMap::new());
+    static ref GLOBAL_PROCFS: Arc<ProcFs> = {
+        let fs: Arc<ProcFs> = Arc::new_cyclic(|weak: &Weak<ProcFs>| ProcFs {
+            root_node: DirInode::new(0, weak.clone()),
+            next_ino: AtomicIno::new(1),
+        });
 
-fn get_icache() -> &'static InodeCache<ProcFs> {
-    unsafe { ICACHE.as_ref().unwrap() }
+        fs
+    };
 }
 
 struct ProcFsMountCreator;
 
 impl ProcFsMountCreator {
     pub fn get() -> Arc<ProcFs> {
-        unsafe { GLOBAL_PROCFS.as_ref().cloned().unwrap() }
+        GLOBAL_PROCFS.clone()
     }
 
     pub fn get_weak() -> Weak<ProcFs> {
-        unsafe { GLOBAL_PROCFS.as_ref().map(Arc::downgrade).unwrap() }
+        Arc::downgrade(&GLOBAL_PROCFS)
     }
 }
 
 impl MountCreator for ProcFsMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        _flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, _flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let vfs = ProcFsMountCreator::get();
         let root_inode = vfs.root_node.clone();
         Mount::new(mp, vfs, root_inode)
@@ -170,77 +217,55 @@ pub fn root() -> ProcFsNode {
     let vfs = ProcFsMountCreator::get();
     let root = vfs.root_node.clone();
 
-    ProcFsNode(root)
+    ProcFsNode::Dir(root)
 }
 
 pub fn creat(
     parent: &ProcFsNode,
-    name: &Arc<[u8]>,
+    name: Arc<[u8]>,
     file: Box<dyn ProcFsFile>,
 ) -> KResult<ProcFsNode> {
-    let mut mode = S_IFREG;
-    if file.can_read() {
-        mode |= 0o444;
-    }
-    if file.can_write() {
-        mode |= 0o200;
-    }
-
-    let dir = parent
-        .0
-        .ops
-        .as_any()
-        .downcast_ref::<ProcFsDirectory>()
-        .ok_or(ENOTDIR)?;
+    let parent = match parent {
+        ProcFsNode::File(_) => return Err(ENOTDIR),
+        ProcFsNode::Dir(parent) => parent,
+    };
 
     let fs = ProcFsMountCreator::get();
-    let ino = fs.next_ino.fetch_add(1, Ordering::SeqCst);
+    let ino = fs.next_ino.fetch_add(1, Ordering::Relaxed);
 
-    let inode = get_icache().alloc(ino, Box::new(ProcFsFileOps { file }));
+    let inode = FileInode::new(ino, Arc::downgrade(&fs), file);
 
-    inode.idata.lock().mode = mode;
-    inode.idata.lock().nlink = 1;
-
-    dir.entries
-        .lock()
-        .push((name.clone(), ProcFsNode(inode.clone())));
+    {
+        let mut lock = parent.idata.rwsem.lock();
+        parent
+            .entries
+            .access_mut(lock.as_mut())
+            .push((name, ProcFsNode::File(inode.clone())));
+    }
 
-    Ok(ProcFsNode(inode))
+    Ok(ProcFsNode::File(inode))
 }
 
 pub fn mkdir(parent: &ProcFsNode, name: &[u8]) -> KResult<ProcFsNode> {
-    let dir = parent
-        .0
-        .ops
-        .as_any()
-        .downcast_ref::<ProcFsDirectory>()
-        .ok_or(ENOTDIR)?;
-
-    let ino = ProcFsMountCreator::get()
-        .next_ino
-        .fetch_add(1, Ordering::SeqCst);
-
-    let inode = get_icache().alloc(
-        ino,
-        Box::new(ProcFsDirectory {
-            entries: Mutex::new(vec![]),
-        }),
-    );
+    let parent = match parent {
+        ProcFsNode::File(_) => return Err(ENOTDIR),
+        ProcFsNode::Dir(parent) => parent,
+    };
 
-    {
-        let mut idata = inode.idata.lock();
-        idata.nlink = 2;
-        idata.mode = S_IFDIR | 0o755;
-    }
+    let fs = ProcFsMountCreator::get();
+    let ino = fs.next_ino.fetch_add(1, Ordering::Relaxed);
 
-    dir.entries
-        .lock()
-        .push((Arc::from(name), ProcFsNode(inode.clone())));
+    let inode = DirInode::new(ino, Arc::downgrade(&fs));
 
-    Ok(ProcFsNode(inode))
+    parent
+        .entries
+        .access_mut(inode.rwsem.lock().as_mut())
+        .push((Arc::from(name), ProcFsNode::Dir(inode.clone())));
+
+    Ok(ProcFsNode::Dir(inode))
 }
 
-struct DumpMountsFile {}
+struct DumpMountsFile;
 impl ProcFsFile for DumpMountsFile {
     fn can_read(&self) -> bool {
         true
@@ -254,43 +279,48 @@ impl ProcFsFile for DumpMountsFile {
 }
 
 pub fn init() {
-    let dir = ProcFsDirectory {
-        entries: Mutex::new(vec![]),
-    };
-
-    let fs: Arc<ProcFs> = Arc::new_cyclic(|weak: &Weak<ProcFs>| {
-        let root_node = Arc::new(Inode {
-            ino: 0,
-            vfs: weak.clone(),
-            idata: Mutex::new(InodeData::default()),
-            ops: Box::new(dir),
-        });
+    register_filesystem("procfs", Arc::new(ProcFsMountCreator)).unwrap();
 
-        ProcFs {
-            root_node,
-            next_ino: AtomicIno::new(1),
-        }
-    });
+    creat(
+        &root(),
+        Arc::from(b"mounts".as_slice()),
+        Box::new(DumpMountsFile),
+    )
+    .unwrap();
+}
 
-    {
-        let mut indata = fs.root_node.idata.lock();
-        indata.mode = S_IFDIR | 0o755;
-        indata.nlink = 1;
-    };
+pub struct GenericProcFsFile<ReadFn>
+where
+    ReadFn: Send + Sync + Fn(&mut PageBuffer) -> KResult<()>,
+{
+    read_fn: Option<ReadFn>,
+}
 
-    unsafe {
-        GLOBAL_PROCFS = Some(fs);
-        ICACHE = Some(InodeCache::new(ProcFsMountCreator::get_weak()));
-    };
+impl<ReadFn> ProcFsFile for GenericProcFsFile<ReadFn>
+where
+    ReadFn: Send + Sync + Fn(&mut PageBuffer) -> KResult<()>,
+{
+    fn can_read(&self) -> bool {
+        self.read_fn.is_some()
+    }
 
-    register_filesystem("procfs", Box::new(ProcFsMountCreator)).unwrap();
+    fn read(&self, buffer: &mut PageBuffer) -> KResult<usize> {
+        self.read_fn.as_ref().ok_or(EACCES)?(buffer).map(|_| buffer.len())
+    }
+}
 
+pub fn populate_root<F>(name: Arc<[u8]>, read_fn: F) -> KResult<()>
+where
+    F: Send + Sync + Fn(&mut PageBuffer) -> KResult<()> + 'static,
+{
     let root = root();
 
     creat(
         &root,
-        &Arc::from(b"mounts".as_slice()),
-        Box::new(DumpMountsFile {}),
+        name,
+        Box::new(GenericProcFsFile {
+            read_fn: Some(read_fn),
+        }),
     )
-    .unwrap();
+    .map(|_| ())
 }

+ 207 - 243
src/fs/tmpfs.rs

@@ -1,383 +1,347 @@
-use core::sync::atomic::Ordering;
+use alloc::sync::{Arc, Weak};
+use bindings::{EINVAL, EIO, EISDIR, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFREG};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use itertools::Itertools;
 
 use crate::{
     io::Buffer,
     kernel::vfs::{
-        dentry::Dentry,
-        inode::{AtomicIno, Ino, Inode, InodeCache, InodeOps, Mode},
+        dentry::{dcache, Dentry},
+        inode::{define_struct_inode, AtomicIno, Ino, Inode, Mode, WriteOffset},
         mount::{register_filesystem, Mount, MountCreator, MS_RDONLY},
         s_isblk, s_ischr,
         vfs::Vfs,
-        DevId, ReadDirCallback,
+        DevId,
     },
     prelude::*,
+    sync::Locked,
 };
 
-use alloc::sync::Arc;
-
-use bindings::{
-    EINVAL, EIO, EISDIR, EROFS, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFREG,
-};
-
-struct FileOps {
-    data: Mutex<Vec<u8>>,
+fn acquire(vfs: &Weak<dyn Vfs>) -> KResult<Arc<dyn Vfs>> {
+    vfs.upgrade().ok_or(EIO)
 }
 
-struct NodeOps {
-    devid: DevId,
+fn astmp(vfs: &Arc<dyn Vfs>) -> &TmpFs {
+    vfs.as_any()
+        .downcast_ref::<TmpFs>()
+        .expect("corrupted tmpfs data structure")
 }
 
-impl NodeOps {
-    fn new(devid: DevId) -> Self {
-        Self { devid }
+define_struct_inode! {
+    struct NodeInode {
+        devid: DevId,
     }
 }
 
-impl InodeOps for NodeOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl NodeInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode, devid: DevId) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, _| unsafe {
+            addr_of_mut_field!(inode, devid).write(devid);
+
+            addr_of_mut_field!(inode, mode).write(mode.into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
+}
 
-    fn devid(&self, _: &Inode) -> KResult<DevId> {
+impl Inode for NodeInode {
+    fn devid(&self) -> KResult<DevId> {
         Ok(self.devid)
     }
 }
 
-struct DirectoryOps {
-    entries: Mutex<Vec<(Arc<[u8]>, Ino)>>,
+define_struct_inode! {
+    struct DirectoryInode {
+        entries: Locked<Vec<(Arc<[u8]>, Ino)>, ()>,
+    }
 }
 
-impl DirectoryOps {
-    fn new() -> Self {
-        Self {
-            entries: Mutex::new(vec![]),
-        }
+impl DirectoryInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, entries)
+                .write(Locked::new(vec![(Arc::from(b".".as_slice()), ino)], rwsem));
+
+            addr_of_mut_field!(inode, size).write(1.into());
+            addr_of_mut_field!(inode, mode).write((S_IFDIR | (mode & 0o777)).into());
+            addr_of_mut_field!(inode, nlink).write(1.into()); // link from `.` to itself
+        })
     }
 
-    /// Locks the `inode.idata`
-    fn link(&self, dir: &Inode, file: &Inode, name: Arc<[u8]>) -> KResult<()> {
-        dir.idata.lock().size += 1;
-        self.entries.lock().push((name, file.ino));
+    fn link(&self, name: Arc<[u8]>, file: &dyn Inode, dlock: &mut ()) {
+        // SAFETY: Only `unlink` will do something based on `nlink` count
+        //         No need to synchronize here
+        file.nlink.fetch_add(1, Ordering::Relaxed);
 
-        file.idata.lock().nlink += 1;
+        // SAFETY: `rwsem` has done the synchronization
+        self.size.fetch_add(1, Ordering::Relaxed);
 
-        Ok(())
+        self.entries.access_mut(dlock).push((name, file.ino));
     }
 }
 
-impl InodeOps for DirectoryOps {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn readdir<'cb, 'r: 'cb>(
+impl Inode for DirectoryInode {
+    fn do_readdir(
         &self,
-        _: &Inode,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
-        Ok(self
-            .entries
-            .lock()
+        let lock = self.rwsem.lock_shared();
+        self.entries
+            .access(lock.as_ref())
             .iter()
             .skip(offset)
-            .take_while(|(name, ino)| callback(name, *ino).is_ok())
-            .count())
+            .map(|(name, ino)| callback(&name, *ino))
+            .take_while(|result| result.map_or(true, |flow| flow.is_continue()))
+            .take_while_inclusive(|result| result.is_ok())
+            .fold_ok(0, |acc, _| acc + 1)
     }
 
-    fn creat(&self, dir: &Inode, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn creat(&self, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut rwsem = self.rwsem.lock();
 
         let ino = vfs.assign_ino();
-        let file = vfs.icache.lock().alloc_file(ino, mode)?;
+        let file = FileInode::new(ino, self.vfs.clone(), mode);
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_reg(file)
     }
 
-    fn mknod(
-        &self,
-        dir: &Inode,
-        at: &Arc<Dentry>,
-        mode: Mode,
-        dev: DevId,
-    ) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
-
-        if vfs.readonly {
-            return Err(EROFS);
-        }
-
+    fn mknod(&self, at: &Dentry, mode: Mode, dev: DevId) -> KResult<()> {
         if !s_ischr(mode) && !s_isblk(mode) {
             return Err(EINVAL);
         }
 
-        let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-        let file = icache.alloc(ino, Box::new(NodeOps::new(dev)));
-        file.idata.lock().mode = mode & (0o777 | S_IFBLK | S_IFCHR);
-        icache.submit(&file)?;
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
+
+        let mut rwsem = self.rwsem.lock();
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        let ino = vfs.assign_ino();
+        let file = NodeInode::new(
+            ino,
+            self.vfs.clone(),
+            mode & (0o777 | S_IFBLK | S_IFCHR),
+            dev,
+        );
+
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_reg(file)
     }
 
-    fn symlink(
-        &self,
-        dir: &Inode,
-        at: &Arc<Dentry>,
-        target: &[u8],
-    ) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
-
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+    fn symlink(&self, at: &Arc<Dentry>, target: &[u8]) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
-        let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-
-        let target_len = target.len() as u64;
+        let mut rwsem = self.rwsem.lock();
 
-        let file =
-            icache.alloc(ino, Box::new(SymlinkOps::new(Arc::from(target))));
-        {
-            let mut idata = file.idata.lock();
-            idata.mode = S_IFLNK | 0o777;
-            idata.size = target_len;
-        }
-        icache.submit(&file)?;
+        let ino = vfs.assign_ino();
+        let file = SymlinkInode::new(ino, self.vfs.clone(), target.into());
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_symlink(file)
     }
 
-    fn mkdir(&self, dir: &Inode, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn mkdir(&self, at: &Dentry, mode: Mode) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut rwsem = self.rwsem.lock();
 
         let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-
-        let mut newdir_ops = DirectoryOps::new();
-        let entries = newdir_ops.entries.get_mut();
-        entries.push((Arc::from(b".".as_slice()), ino));
-        entries.push((Arc::from(b"..".as_slice()), dir.ino));
-
-        let newdir = icache.alloc(ino, Box::new(newdir_ops));
-        {
-            let mut newdir_idata = newdir.idata.lock();
-            newdir_idata.mode = S_IFDIR | (mode & 0o777);
-            newdir_idata.nlink = 1;
-            newdir_idata.size = 2;
-        }
-
-        icache.submit(&newdir)?;
-        dir.idata.lock().nlink += 1; // link from `newdir` to `dir`, (or parent)
+        let newdir = DirectoryInode::new(ino, self.vfs.clone(), mode);
 
-        self.link(dir, newdir.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), newdir.as_ref(), rwsem.as_mut());
         at.save_dir(newdir)
     }
 
-    fn unlink(&self, dir: &Inode, at: &Arc<Dentry>) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn unlink(&self, at: &Arc<Dentry>) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut dlock = self.rwsem.lock();
 
         let file = at.get_inode()?;
+        let _flock = file.rwsem.lock();
 
-        let mut file_idata = file.idata.lock();
-
-        if file_idata.mode & S_IFDIR != 0 {
+        // SAFETY: `flock` has done the synchronization
+        if file.mode.load(Ordering::Relaxed) & S_IFDIR != 0 {
             return Err(EISDIR);
         }
 
-        let mut self_idata = dir.idata.lock();
-        let mut entries = self.entries.lock();
-
-        let idx = entries
-            .iter()
-            .position(|(_, ino)| *ino == file.ino)
-            .expect("file not found in directory");
+        let entries = self.entries.access_mut(dlock.as_mut());
+        entries.retain(|(_, ino)| *ino != file.ino);
+
+        assert_eq!(
+            entries.len() as u64,
+            // SAFETY: `dlock` has done the synchronization
+            self.size.fetch_sub(1, Ordering::Relaxed) - 1
+        );
+
+        // SAFETY: `flock` has done the synchronization
+        let file_nlink = file.nlink.fetch_sub(1, Ordering::Relaxed) - 1;
+
+        if file_nlink == 0 {
+            // Remove the file inode from the inode cache
+            // The last reference to the inode is held by some dentry
+            // and will be released when the dentry is released
+            //
+            // TODO: Should we use some inode cache in tmpfs?
+            //
+            // vfs.icache.lock().retain(|ino, _| *ino != file.ino);
+        }
 
-        self_idata.size -= 1;
-        file_idata.nlink -= 1;
-        entries.remove(idx);
+        // Postpone the invalidation of the dentry and inode until the
+        // last reference to the dentry is released
+        //
+        // But we can remove it from the dentry cache immediately
+        // so later lookup will fail with ENOENT
+        dcache::d_remove(at);
 
-        at.invalidate()
+        Ok(())
     }
 }
 
-struct SymlinkOps {
-    target: Arc<[u8]>,
-}
-
-impl SymlinkOps {
-    fn new(target: Arc<[u8]>) -> Self {
-        Self { target }
+define_struct_inode! {
+    struct SymlinkInode {
+        target: Arc<[u8]>,
     }
 }
 
-impl InodeOps for SymlinkOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl SymlinkInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, target: Arc<[u8]>) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, _| unsafe {
+            let len = target.len();
+            addr_of_mut_field!(inode, target).write(target);
+
+            addr_of_mut_field!(inode, mode).write((S_IFLNK | 0o777).into());
+            addr_of_mut_field!(inode, size).write((len as u64).into());
+        })
     }
+}
 
-    fn readlink(&self, _: &Inode, buffer: &mut dyn Buffer) -> KResult<usize> {
+impl Inode for SymlinkInode {
+    fn readlink(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
         buffer
             .fill(self.target.as_ref())
             .map(|result| result.allow_partial())
     }
 }
 
-impl FileOps {
-    fn new() -> Self {
-        Self {
-            data: Mutex::new(vec![]),
-        }
+define_struct_inode! {
+    struct FileInode {
+        filedata: Locked<Vec<u8>, ()>,
     }
 }
 
-impl InodeOps for FileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl FileInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, filedata).write(Locked::new(vec![], rwsem));
+
+            addr_of_mut_field!(inode, mode).write((S_IFREG | (mode & 0o777)).into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
+}
 
-    fn read(
-        &self,
-        _: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
-        let data = self.data.lock();
-        let data = data.split_at_checked(offset).ok_or(EINVAL)?.1;
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let lock = self.rwsem.lock_shared();
 
-        buffer.fill(data).map(|result| result.allow_partial())
+        match self.filedata.access(lock.as_ref()).split_at_checked(offset) {
+            Some((_, data)) => buffer.fill(data).map(|result| result.allow_partial()),
+            None => Ok(0),
+        }
     }
 
-    fn write(
-        &self,
-        inode: &Inode,
-        buffer: &[u8],
-        offset: usize,
-    ) -> KResult<usize> {
-        let mut idata = inode.idata.lock();
-        let mut data = self.data.lock();
+    fn write(&self, buffer: &[u8], offset: WriteOffset) -> KResult<usize> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let mut lock = self.rwsem.lock();
+        let filedata = self.filedata.access_mut(lock.as_mut());
+
+        let offset = match offset {
+            WriteOffset::Position(offset) => offset,
+            // SAFETY: `lock` has done the synchronization
+            WriteOffset::End(end) => {
+                let size = self.size.load(Ordering::Relaxed) as usize;
+                *end = size + buffer.len();
+
+                size
+            }
+        };
 
-        if data.len() < offset + buffer.len() {
-            data.resize(offset + buffer.len(), 0);
+        if filedata.len() < offset + buffer.len() {
+            filedata.resize(offset + buffer.len(), 0);
         }
 
-        data[offset..offset + buffer.len()].copy_from_slice(&buffer);
-        idata.size = data.len() as u64;
+        filedata[offset..offset + buffer.len()].copy_from_slice(&buffer);
+
+        // SAFETY: `lock` has done the synchronization
+        self.size.store(filedata.len() as u64, Ordering::Relaxed);
 
         Ok(buffer.len())
     }
 
-    fn truncate(&self, inode: &Inode, length: usize) -> KResult<()> {
-        let mut idata = inode.idata.lock();
+    fn truncate(&self, length: usize) -> KResult<()> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let mut lock = self.rwsem.lock();
+        let filedata = self.filedata.access_mut(lock.as_mut());
 
-        idata.size = length as u64;
-        self.data.lock().resize(length, 0);
+        // SAFETY: `lock` has done the synchronization
+        self.size.store(length as u64, Ordering::Relaxed);
+        filedata.resize(length, 0);
 
         Ok(())
     }
 }
 
-/// # Lock order
-/// `vfs` -> `icache` -> `idata` -> `*ops`.`*data`
+impl_any!(TmpFs);
 struct TmpFs {
-    icache: Mutex<InodeCache<TmpFs>>,
     next_ino: AtomicIno,
     readonly: bool,
 }
 
-impl InodeCache<TmpFs> {
-    fn alloc_file(&mut self, ino: Ino, mode: Mode) -> KResult<Arc<Inode>> {
-        let file = self.alloc(ino, Box::new(FileOps::new()));
-        file.idata.lock().mode = S_IFREG | (mode & 0o777);
+impl Vfs for TmpFs {
+    fn io_blksize(&self) -> usize {
+        4096
+    }
 
-        self.submit(&file)?;
+    fn fs_devid(&self) -> DevId {
+        2
+    }
 
-        Ok(file)
+    fn is_read_only(&self) -> bool {
+        self.readonly
     }
 }
 
 impl TmpFs {
     fn assign_ino(&self) -> Ino {
-        self.next_ino.fetch_add(1, Ordering::SeqCst)
+        self.next_ino.fetch_add(1, Ordering::AcqRel)
     }
 
-    pub fn create(readonly: bool) -> KResult<(Arc<TmpFs>, Arc<Inode>)> {
-        let tmpfs = Arc::new_cyclic(|weak| Self {
-            icache: Mutex::new(InodeCache::new(weak.clone())),
+    pub fn create(readonly: bool) -> KResult<(Arc<dyn Vfs>, Arc<dyn Inode>)> {
+        let tmpfs = Arc::new(Self {
             next_ino: AtomicIno::new(1),
             readonly,
         });
 
-        let mut dir = DirectoryOps::new();
-        let entries = dir.entries.get_mut();
-        entries.push((Arc::from(b".".as_slice()), 0));
-        entries.push((Arc::from(b"..".as_slice()), 0));
-
-        let root_dir = {
-            let mut icache = tmpfs.icache.lock();
-            let root_dir = icache.alloc(0, Box::new(dir));
-            {
-                let mut idata = root_dir.idata.lock();
-
-                idata.mode = S_IFDIR | 0o755;
-                idata.nlink = 2;
-                idata.size = 2;
-            }
-
-            icache.submit(&root_dir)?;
-
-            root_dir
-        };
+        let weak = Arc::downgrade(&tmpfs);
+        let root_dir = DirectoryInode::new(0, weak, 0o755);
 
         Ok((tmpfs, root_dir))
     }
 }
 
-impl Vfs for TmpFs {
-    fn io_blksize(&self) -> usize {
-        4096
-    }
-
-    fn fs_devid(&self) -> DevId {
-        2
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
-
 struct TmpFsMountCreator;
 
 impl MountCreator for TmpFsMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let (fs, root_inode) = TmpFs::create(flags & MS_RDONLY != 0)?;
 
         Mount::new(mp, fs, root_inode)
@@ -385,5 +349,5 @@ impl MountCreator for TmpFsMountCreator {
 }
 
 pub fn init() {
-    register_filesystem("tmpfs", Box::new(TmpFsMountCreator)).unwrap();
+    register_filesystem("tmpfs", Arc::new(TmpFsMountCreator)).unwrap();
 }

+ 34 - 51
src/io.rs

@@ -2,8 +2,9 @@ use bindings::EFAULT;
 
 use crate::prelude::*;
 
-use core::{ffi::c_char, fmt::Write, mem::MaybeUninit};
+use core::{fmt::Write, mem::MaybeUninit};
 
+#[must_use]
 pub enum FillResult {
     Done(usize),
     Partial(usize),
@@ -33,7 +34,27 @@ impl FillResult {
 pub trait Buffer {
     fn total(&self) -> usize;
     fn wrote(&self) -> usize;
+
+    #[must_use]
     fn fill(&mut self, data: &[u8]) -> KResult<FillResult>;
+
+    fn available(&self) -> usize {
+        self.total() - self.wrote()
+    }
+}
+
+pub trait BufferFill<T: Copy> {
+    fn copy(&mut self, object: &T) -> KResult<FillResult>;
+}
+
+impl<T: Copy, B: Buffer + ?Sized> BufferFill<T> for B {
+    fn copy(&mut self, object: &T) -> KResult<FillResult> {
+        let ptr = object as *const T as *const u8;
+        let len = core::mem::size_of::<T>();
+
+        // SAFETY: `object` is a valid object.
+        self.fill(unsafe { core::slice::from_raw_parts(ptr, len) })
+    }
 }
 
 pub struct UninitBuffer<'lt, T: Copy + Sized> {
@@ -49,10 +70,7 @@ impl<'lt, T: Copy + Sized> UninitBuffer<'lt, T> {
         Self {
             data,
             buffer: RawBuffer::new_from_slice(unsafe {
-                core::slice::from_raw_parts_mut(
-                    ptr as *mut u8,
-                    core::mem::size_of::<T>(),
-                )
+                core::slice::from_raw_parts_mut(ptr as *mut u8, core::mem::size_of::<T>())
             }),
         }
     }
@@ -64,6 +82,14 @@ impl<'lt, T: Copy + Sized> UninitBuffer<'lt, T> {
 
         Ok(unsafe { self.data.assume_init_ref() })
     }
+
+    pub fn assume_init(self) -> Option<T> {
+        if self.buffer.filled() {
+            Some(unsafe { *self.data.assume_init() })
+        } else {
+            None
+        }
+    }
 }
 
 impl<'lt, T: Copy + Sized> Buffer for UninitBuffer<'lt, T> {
@@ -106,9 +132,9 @@ impl<'lt> RawBuffer<'lt> {
         }
     }
 
-    pub fn new_from_raw(buf: &'lt mut *mut u8, tot: usize) -> Self {
+    pub fn new_from_raw(buf: *mut u8, tot: usize) -> Self {
         Self {
-            buf: *buf,
+            buf,
             tot,
             cur: 0,
             _phantom: core::marker::PhantomData,
@@ -136,11 +162,7 @@ impl<'lt> RawBuffer<'lt> {
             n if n == 0 => Ok(FillResult::Full),
             n if n < data.len() => {
                 unsafe {
-                    core::ptr::copy_nonoverlapping(
-                        data.as_ptr(),
-                        self.buf.add(self.count()),
-                        n,
-                    );
+                    core::ptr::copy_nonoverlapping(data.as_ptr(), self.buf.add(self.count()), n);
                 }
                 self.cur += n;
                 Ok(FillResult::Partial(n))
@@ -227,42 +249,3 @@ impl Write for RawBuffer<'_> {
         }
     }
 }
-
-pub fn get_str_from_cstr<'a>(cstr: *const c_char) -> KResult<&'a str> {
-    if cstr.is_null() {
-        return Err(EFAULT);
-    }
-
-    let cstr = unsafe { core::ffi::CStr::from_ptr::<'a>(cstr) };
-    cstr.to_str().map_err(|_| EFAULT)
-}
-
-/// Copy data from src to dst, starting from offset, and copy at most count bytes.
-///
-/// # Return
-///
-/// The number of bytes copied.
-pub fn copy_offset_count(
-    src: &[u8],
-    dst: &mut [u8],
-    offset: usize,
-    count: usize,
-) -> usize {
-    if offset >= src.len() {
-        return 0;
-    }
-
-    let count = {
-        let count = count.min(dst.len());
-
-        if offset + count > src.len() {
-            src.len() - offset
-        } else {
-            count
-        }
-    };
-
-    dst[..count].copy_from_slice(&src[offset..offset + count]);
-
-    count
-}

+ 79 - 89
src/kernel.ld

@@ -2,12 +2,13 @@ OUTPUT_FORMAT(elf64-x86-64)
 
 MEMORY
 {
-    MBR    (wx) : org = 0x0e00, l = 512
-    STAGE1 (wx) : org = 0x1000, l = 4K
-    PHYMEM (w)  : org = 0xffffff0000000000, len = 512 * 1024M
-    PARRAY (w)  : org = 0xffffff8000000000, len = 128 * 1024M
-    KBSS   (w)  : org = 0xffffffffc0200000, len = 2M
-    KIMAGE (wx) : org = 0xffffffffffc00000, len = 2M
+    MBR           (wx) : org = 0x0e00, l = 512
+    STAGE1        (wx) : org = 0x1000, l = 4K
+    PHYMEM        (w)  : org = 0xffffff0000000000, len = 512 * 1024M
+    PARRAY        (w)  : org = 0xffffff8000000000, len = 128 * 1024M
+    KBSS          (w)  : org = 0xffffffffc0200000, len = 2M
+    KIMAGE        (wx) : org = 0xffffffffffc00000, len = 2M
+    KPERCPU       (w)  : org = 0x0000000000000000, len = 128K
 }
 
 SECTIONS
@@ -26,53 +27,16 @@ SECTIONS
 
     .stage1 : AT(LOADADDR(.mbr) + SIZEOF(.mbr))
     {
-        *(.stage1)
-        . = ALIGN(0x1000);
-    } > STAGE1
-
-    .kinit :
-        AT(LOADADDR(.stage1) + SIZEOF(.stage1))
-    {
-        KIMAGE_START = .;
-        KINIT_START = .;
-
-        *(.text.kinit)
-
-        . = ALIGN(16);
-        *(.rodata.kinit)
-
-        KINIT_START_ADDR = .;
-        QUAD(ABSOLUTE(KINIT_START));
-
-        KINIT_END_ADDR = .;
-        QUAD(ABSOLUTE(KINIT_END));
-
-        KINIT_PAGES = .;
-        QUAD((KINIT_END - KINIT_START) / 0x1000);
-
-        KIMAGE_PAGES_VALUE = .;
-        QUAD((KIMAGE_END - KIMAGE_START) / 0x1000);
-
-        . = ALIGN(16);
-        start_ctors = .;
-        KEEP(*(.init_array));
-        KEEP(*(SORT_BY_INIT_PRIORITY(.init_array*)));
-        KEEP(*(.ctors));
-        KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
-        end_ctors = .;
+        KEEP(*(.stage1.smp));
 
         . = ALIGN(16);
-        *(.data.kinit)
-
-        . = ALIGN(16);
-        *(.bss.kinit)
+        *(.stage1)
 
         . = ALIGN(0x1000);
-        KINIT_END = .;
-    } > KIMAGE
+    } > STAGE1
 
     .text :
-        AT(LOADADDR(.kinit) + SIZEOF(.kinit))
+        AT(LOADADDR(.stage1) + SIZEOF(.stage1))
     {
         TEXT_START = .;
         *(.text)
@@ -82,6 +46,8 @@ SECTIONS
         TEXT_END = .;
     } > KIMAGE
 
+    TEXT_PAGES = (TEXT_END - TEXT_START) / 0x1000;
+
     .rodata :
         AT(LOADADDR(.text) + SIZEOF(.text))
     {
@@ -90,30 +56,37 @@ SECTIONS
         *(.rodata*)
 
         . = ALIGN(16);
-        KMOD_LOADERS_START = .;
-
-        KEEP(*(.kmods));
-        QUAD(0);
+        start_ctors = .;
+        KEEP(*(.init_array));
+        KEEP(*(SORT_BY_INIT_PRIORITY(.init_array*)));
+        KEEP(*(.ctors));
+        KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
+        end_ctors = .;
 
         . = ALIGN(16);
-        late_init_start = .;
-        KEEP(*(.late_init));
-        QUAD(0);
-        late_init_end = .;
+        _fix_start = .;
+        KEEP(*(.fix));
+        _fix_end = .;
 
         . = ALIGN(16);
-
         BSS_ADDR = .;
         QUAD(ABSOLUTE(BSS_START));
         BSS_LENGTH = .;
         QUAD(BSS_END - BSS_START);
+        FIX_START = .;
+        QUAD(ABSOLUTE(_fix_start));
+        FIX_END = .;
+        QUAD(ABSOLUTE(_fix_end));
+        PERCPU_PAGES = .;
+        QUAD(_PERCPU_PAGES);
 
         . = ALIGN(0x1000);
         RODATA_END = .;
     } > KIMAGE
 
-    .data :
-        AT(LOADADDR(.rodata) + SIZEOF(.rodata))
+    RODATA_PAGES = (RODATA_END - RODATA_START) / 0x1000;
+
+    .data : AT(LOADADDR(.rodata) + SIZEOF(.rodata))
     {
         DATA_START = .;
         *(.data)
@@ -122,13 +95,30 @@ SECTIONS
         *(.got)
         *(.got.plt)
 
+        . = . + 4;
         . = ALIGN(0x1000) - 4;
         LONG(KERNEL_MAGIC);
-
         DATA_END = .;
-        KIMAGE_END = .;
     } > KIMAGE
 
+    DATA_PAGES = (DATA_END - DATA_START) / 0x1000;
+
+    _PERCPU_DATA_START = .;
+    .percpu 0 : AT(LOADADDR(.data) + SIZEOF(.data))
+    {
+        PERCPU_START = .;
+        QUAD(0); /* Reserved for x86 percpu pointer */
+        QUAD(0);
+
+        *(.percpu .percpu*)
+
+        . = ALIGN(0x1000);
+        PERCPU_END = .;
+    } > KPERCPU
+    _PERCPU_LENGTH = PERCPU_END - PERCPU_START;
+
+    _PERCPU_PAGES = _PERCPU_LENGTH / 0x1000;
+
     .bss :
     {
         BSS_START = .;
@@ -139,56 +129,56 @@ SECTIONS
         BSS_END = .;
     } > KBSS
 
-    KIMAGE_PAGES = (KIMAGE_END - KIMAGE_START) / 0x1000;
+    KIMAGE_PAGES = TEXT_PAGES + RODATA_PAGES + _PERCPU_PAGES + DATA_PAGES;
     BSS_PAGES = (BSS_END - BSS_START) / 0x1000;
     KERNEL_MAGIC = 0x01145140;
 
-    KIMAGE_32K_COUNT = ((KIMAGE_END - KIMAGE_START) + 32 * 1024 - 1) / (32 * 1024);
+    KIMAGE_32K_COUNT = (KIMAGE_PAGES * 0x1000 + 32 * 1024 - 1) / (32 * 1024);
 
     .eh_frame :
-        AT(LOADADDR(.data) + SIZEOF(.data))
+        AT(LOADADDR(.percpu) + SIZEOF(.percpu))
     {
         KEEP(*(.eh_frame*))
         . = ALIGN(0x1000);
     } > KIMAGE
 
     /* Stabs debugging sections.  */
-    .stab          0 : { *(.stab) }
-    .stabstr       0 : { *(.stabstr) }
-    .stab.excl     0 : { *(.stab.excl) }
-    .stab.exclstr  0 : { *(.stab.exclstr) }
-    .stab.index    0 : { *(.stab.index) }
-    .stab.indexstr 0 : { *(.stab.indexstr) }
-    .comment       0 : { *(.comment) }
+    .stab          0 : { KEEP(*(.stab)); }
+    .stabstr       0 : { KEEP(*(.stabstr)); }
+    .stab.excl     0 : { KEEP(*(.stab.excl)); }
+    .stab.exclstr  0 : { KEEP(*(.stab.exclstr)); }
+    .stab.index    0 : { KEEP(*(.stab.index)); }
+    .stab.indexstr 0 : { KEEP(*(.stab.indexstr)); }
+    .comment       0 : { KEEP(*(.comment)); }
     /* DWARF debug sections.
        Symbols in the DWARF debugging sections are relative to the beginning
        of the section so we begin them at 0.  */
     /* DWARF 1 */
-    .debug          0 : { *(.debug) }
-    .line           0 : { *(.line) }
+    .debug          0 : { KEEP(*(.debug)); }
+    .line           0 : { KEEP(*(.line)); }
     /* GNU DWARF 1 extensions */
-    .debug_srcinfo  0 : { *(.debug_srcinfo) }
-    .debug_sfnames  0 : { *(.debug_sfnames) }
+    .debug_srcinfo  0 : { KEEP(*(.debug_srcinfo)); }
+    .debug_sfnames  0 : { KEEP(*(.debug_sfnames)); }
     /* DWARF 1.1 and DWARF 2 */
-    .debug_aranges  0 : { *(.debug_aranges) }
-    .debug_pubnames 0 : { *(.debug_pubnames) }
+    .debug_aranges  0 : { KEEP(*(.debug_aranges)); }
+    .debug_pubnames 0 : { KEEP(*(.debug_pubnames)); }
     /* DWARF 2 */
-    .debug_info     0 : { *(.debug_info) }
-    .debug_abbrev   0 : { *(.debug_abbrev) }
-    .debug_line     0 : { *(.debug_line) }
-    .debug_frame    0 : { *(.debug_frame) }
-    .debug_str      0 : { *(.debug_str) }
-    .debug_loc      0 : { *(.debug_loc) }
-    .debug_macinfo  0 : { *(.debug_macinfo) }
+    .debug_info     0 : { KEEP(*(.debug_info)); }
+    .debug_abbrev   0 : { KEEP(*(.debug_abbrev)); }
+    .debug_line     0 : { KEEP(*(.debug_line)); }
+    .debug_frame    0 : { KEEP(*(.debug_frame)); }
+    .debug_str      0 : { KEEP(*(.debug_str)); }
+    .debug_loc      0 : { KEEP(*(.debug_loc)); }
+    .debug_macinfo  0 : { KEEP(*(.debug_macinfo)); }
     /* SGI/MIPS DWARF 2 extensions */
-    .debug_weaknames 0 : { *(.debug_weaknames) }
-    .debug_funcnames 0 : { *(.debug_funcnames) }
-    .debug_typenames 0 : { *(.debug_typenames) }
-    .debug_varnames  0 : { *(.debug_varnames) }
+    .debug_weaknames 0 : { KEEP(*(.debug_weaknames)); }
+    .debug_funcnames 0 : { KEEP(*(.debug_funcnames)); }
+    .debug_typenames 0 : { KEEP(*(.debug_typenames)); }
+    .debug_varnames  0 : { KEEP(*(.debug_varnames)); }
 
     /* DWARF Other */
-    .debug_ranges  0 : { *(.debug_ranges) }
-    .debug_line_str 0 : { *(.debug_line_str) }
+    .debug_ranges  0 : { KEEP(*(.debug_ranges)); }
+    .debug_line_str 0 : { KEEP(*(.debug_line_str)); }
     /* Rust stuff */
 
     /DISCARD/ :

+ 16 - 0
src/kernel.rs

@@ -1,5 +1,21 @@
+pub mod arch;
 pub mod block;
 pub mod console;
+pub mod constants;
 pub mod interrupt;
 pub mod mem;
+pub mod syscall;
+pub mod task;
+pub mod timer;
+pub mod user;
 pub mod vfs;
+
+#[cfg(feature = "smp")]
+pub mod smp;
+
+mod chardev;
+mod terminal;
+
+pub use chardev::{CharDevice, CharDeviceType, VirtualCharDevice};
+pub use console::Console;
+pub use terminal::{Terminal, TerminalDevice};

+ 0 - 1
src/kernel/allocator.cc

@@ -218,7 +218,6 @@ static constexpr int __cache_index(std::size_t size) {
     return -1;
 }
 
-SECTION(".text.kinit")
 void kernel::kinit::init_allocator() {
     mem::init_slab_cache(caches + 0, 32);
     mem::init_slab_cache(caches + 1, 64);

+ 5 - 0
src/kernel/arch.rs

@@ -0,0 +1,5 @@
+#[cfg(target_arch = "x86_64")]
+pub mod x86_64;
+
+#[cfg(target_arch = "x86_64")]
+pub use x86_64::*;

+ 82 - 0
src/kernel/arch/x86_64.rs

@@ -0,0 +1,82 @@
+pub mod init;
+pub mod interrupt;
+
+use arch::x86_64::{gdt::GDT, task::TSS};
+
+// TODO!!!: This can be stored in the percpu area.
+//          But we need to implement a guard that ensures that preemption is disabled
+//          while we are accessing the percpu variables.
+#[arch::define_percpu]
+static GDT_OBJECT: Option<GDT> = None;
+
+#[arch::define_percpu]
+static TSS_OBJECT: Option<TSS> = None;
+
+pub mod user {
+    use crate::sync::preempt;
+    use arch::x86_64::gdt::GDTEntry;
+
+    pub struct InterruptStack(pub u64);
+
+    #[derive(Debug, Clone)]
+    pub enum TLS {
+        /// TODO: This is not used yet.
+        #[allow(dead_code)]
+        TLS64(u64),
+        TLS32 {
+            base: u64,
+            desc: GDTEntry,
+        },
+    }
+
+    impl TLS {
+        /// # Return
+        /// Returns the TLS descriptor and the index of the TLS segment.
+        pub fn new32(base: u32, limit: u32, is_limit_in_pages: bool) -> (Self, u32) {
+            let flags = if is_limit_in_pages { 0xc } else { 0x4 };
+
+            (
+                TLS::TLS32 {
+                    base: base as u64,
+                    desc: GDTEntry::new(base, limit, 0xf2, flags),
+                },
+                7,
+            )
+        }
+
+        pub fn load(&self) {
+            match self {
+                TLS::TLS64(base) => {
+                    const IA32_KERNEL_GS_BASE: u32 = 0xc0000102;
+                    arch::x86_64::task::wrmsr(IA32_KERNEL_GS_BASE, *base);
+                }
+                TLS::TLS32 { base, desc } => {
+                    preempt::disable();
+                    let gdt = unsafe {
+                        super::GDT_OBJECT
+                            .as_mut()
+                            .as_mut()
+                            .expect("GDT should be valid")
+                    };
+                    gdt.set_tls32(*desc);
+                    preempt::enable();
+
+                    const IA32_KERNEL_GS_BASE: u32 = 0xc0000102;
+                    arch::x86_64::task::wrmsr(IA32_KERNEL_GS_BASE, *base);
+                }
+            }
+        }
+    }
+
+    pub fn load_interrupt_stack(stack: InterruptStack) {
+        preempt::disable();
+        let tss = unsafe {
+            super::TSS_OBJECT
+                .as_mut()
+                .as_mut()
+                .expect("TSS should be valid")
+        };
+        tss.set_rsp0(stack.0);
+        preempt::enable();
+    }
+}

+ 126 - 0
src/kernel/arch/x86_64/init.rs

@@ -0,0 +1,126 @@
+use super::{interrupt::setup_idt, GDT_OBJECT, TSS_OBJECT};
+use crate::{
+    kernel::{
+        arch::interrupt::APIC_BASE,
+        mem::{paging::Page, phys::PhysPtr as _},
+        smp,
+        task::{ProcessList, Scheduler, Thread},
+    },
+    println_debug, println_info,
+    sync::preempt,
+};
+use alloc::{format, sync::Arc};
+use arch::{
+    interrupt,
+    task::pause,
+    x86_64::{gdt::GDT, task::TSS},
+};
+use core::sync::atomic::{AtomicU32, AtomicUsize, Ordering};
+
+unsafe fn init_gdt_tss_thiscpu() {
+    preempt::disable();
+    let gdt_ref = unsafe { GDT_OBJECT.as_mut() };
+    let tss_ref = unsafe { TSS_OBJECT.as_mut() };
+    *gdt_ref = Some(GDT::new());
+    *tss_ref = Some(TSS::new());
+
+    if let Some(gdt) = gdt_ref.as_mut() {
+        if let Some(tss) = tss_ref.as_mut() {
+            gdt.set_tss(tss as *mut _ as u64);
+        } else {
+            panic!("TSS is not initialized");
+        }
+
+        unsafe { gdt.load() };
+    } else {
+        panic!("GDT is not initialized");
+    }
+
+    preempt::enable();
+}
+
+/// Initialization routine for all CPUs.
+pub unsafe fn init_cpu() {
+    arch::x86_64::io::enable_sse();
+
+    let area = smp::alloc_percpu_area();
+    smp::set_percpu_area(area);
+    init_gdt_tss_thiscpu();
+
+    setup_idt();
+
+    APIC_BASE.spurious().write(0x1ff);
+    APIC_BASE.task_priority().write(0);
+    APIC_BASE.timer_divide().write(0x3); // Divide by 16
+    APIC_BASE.timer_register().write(0x20040);
+
+    // TODO: Get the bus frequency from...?
+    let freq = 800;
+    let count = freq * 1_000_000 / 16 / 100;
+    APIC_BASE.timer_initial_count().write(count as u32);
+
+    let cpu = CPU_COUNT.fetch_add(1, Ordering::Relaxed);
+    if cpu != 0 {
+        // Application processor
+        println_debug!("AP{} started", cpu);
+    }
+}
+
+#[no_mangle]
+pub static BOOT_SEMAPHORE: AtomicU32 = AtomicU32::new(0);
+#[no_mangle]
+pub static BOOT_STACK: AtomicUsize = AtomicUsize::new(0);
+
+pub static CPU_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+#[no_mangle]
+pub unsafe extern "C" fn ap_entry(stack_start: u64) {
+    init_cpu();
+
+    let idle_process = ProcessList::get()
+        .try_find_process(0)
+        .expect("Idle process must exist");
+
+    let idle_thread_name = format!("[kernel idle#AP{}]", 0);
+    let idle_thread = Thread::new_for_init(Arc::from(idle_thread_name.as_bytes()), &idle_process);
+    ProcessList::get().add_thread(&idle_thread);
+    Scheduler::set_idle(idle_thread.clone());
+    Scheduler::set_current(idle_thread);
+
+    preempt::disable();
+    interrupt::enable();
+
+    // TODO!!!!!: Free the stack after having switched to idle task.
+    arch::task::context_switch_light(
+        stack_start as *mut _, // We will never come back
+        unsafe { Scheduler::idle_task().get_sp_ptr() },
+    );
+    arch::task::freeze()
+}
+
+pub unsafe fn bootstrap_cpus() {
+    let icr = APIC_BASE.interrupt_command();
+
+    icr.write(0xc4500);
+    while icr.read() & 0x1000 != 0 {
+        pause();
+    }
+
+    icr.write(0xc4601);
+    while icr.read() & 0x1000 != 0 {
+        pause();
+    }
+
+    while CPU_COUNT.load(Ordering::Acquire) != 4 {
+        if BOOT_STACK.load(Ordering::Acquire) == 0 {
+            let page = Page::alloc_many(9);
+            let stack_start = page.as_cached().as_ptr::<()>() as usize;
+            core::mem::forget(page);
+
+            BOOT_STACK.store(stack_start, Ordering::Release);
+        }
+        pause();
+    }
+
+    println_info!("Processors startup finished");
+}

+ 129 - 0
src/kernel/arch/x86_64/interrupt.rs

@@ -0,0 +1,129 @@
+use crate::kernel::mem::phys::{CachedPP, PhysPtr as _};
+use arch::task::rdmsr;
+use lazy_static::lazy_static;
+
+extern "C" {
+    static ISR_START_ADDR: usize;
+}
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+struct IDTEntry {
+    offset_low: u16,
+    selector: u16,
+
+    interrupt_stack: u8,
+    attributes: u8,
+
+    offset_mid: u16,
+    offset_high: u32,
+    reserved: u32,
+}
+
+impl IDTEntry {
+    const fn new(offset: usize, selector: u16, attributes: u8) -> Self {
+        Self {
+            offset_low: offset as u16,
+            selector,
+            interrupt_stack: 0,
+            attributes,
+            offset_mid: (offset >> 16) as u16,
+            offset_high: (offset >> 32) as u32,
+            reserved: 0,
+        }
+    }
+
+    const fn null() -> Self {
+        Self {
+            offset_low: 0,
+            selector: 0,
+            interrupt_stack: 0,
+            attributes: 0,
+            offset_mid: 0,
+            offset_high: 0,
+            reserved: 0,
+        }
+    }
+}
+
+pub struct APICReg(*mut u32);
+pub struct APICRegs {
+    base: CachedPP,
+}
+
+impl APICReg {
+    fn new(pointer: *mut u32) -> Self {
+        Self(pointer)
+    }
+
+    pub fn read(&self) -> u32 {
+        unsafe { self.0.read_volatile() }
+    }
+
+    pub fn write(&self, value: u32) {
+        unsafe { self.0.write_volatile(value) }
+    }
+}
+
+impl APICRegs {
+    pub fn spurious(&self) -> APICReg {
+        APICReg::new(self.base.offset(0xf0).as_ptr())
+    }
+
+    pub fn task_priority(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x80).as_ptr())
+    }
+
+    pub fn end_of_interrupt(&self) {
+        APICReg::new(self.base.offset(0xb0).as_ptr()).write(0)
+    }
+
+    pub fn interrupt_command(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x300).as_ptr())
+    }
+
+    pub fn timer_register(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x320).as_ptr())
+    }
+
+    pub fn timer_initial_count(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x380).as_ptr())
+    }
+
+    pub fn timer_current_count(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x390).as_ptr())
+    }
+
+    pub fn timer_divide(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x3e0).as_ptr())
+    }
+}
+
+lazy_static! {
+    static ref IDT: [IDTEntry; 256] = core::array::from_fn(|idx| match idx {
+        0..0x80 => IDTEntry::new(unsafe { ISR_START_ADDR } + 8 * idx, 0x08, 0x8e),
+        0x80 => IDTEntry::new(unsafe { ISR_START_ADDR } + 8 * idx, 0x08, 0xee),
+        _ => IDTEntry::null(),
+    });
+    pub static ref APIC_BASE: APICRegs = {
+        let apic_base = rdmsr(0x1b);
+        assert_eq!(apic_base & 0x800, 0x800, "LAPIC not enabled");
+        assert_eq!(apic_base & 0x100, 0x100, "Is not bootstrap processor");
+
+        let apic_base = apic_base & !0xfff;
+        APICRegs {
+            base: CachedPP::new(apic_base as usize),
+        }
+    };
+}
+
+pub fn setup_idt() {
+    arch::x86_64::interrupt::lidt(
+        IDT.as_ptr() as usize,
+        (size_of::<IDTEntry>() * 256 - 1) as u16,
+    );
+}
+
+pub fn end_of_interrupt() {
+    APIC_BASE.end_of_interrupt()
+}

+ 7 - 27
src/kernel/async/lock.cc

@@ -1,5 +1,4 @@
 #include <assert.h>
-#include <stdint.h>
 
 #include <kernel/async/lock.hpp>
 
@@ -49,31 +48,20 @@ static inline void _restore_interrupt_state(lock_context_t context) {
         :);
 }
 
-// TODO: mark as _per_cpu
-static inline preempt_count_t& _preempt_count() {
-    static preempt_count_t _preempt_count;
-    assert(!(_preempt_count & 0x80000000));
-    return _preempt_count;
-}
+extern "C" void r_preempt_disable();
+extern "C" void r_preempt_enable();
+extern "C" unsigned long r_preempt_count();
 
 void preempt_disable() {
-    ++_preempt_count();
+    r_preempt_disable();
 }
 
 void preempt_enable() {
-    --_preempt_count();
-}
-
-extern "C" void r_preempt_disable() {
-    ++_preempt_count();
-}
-
-extern "C" void r_preempt_enable() {
-    --_preempt_count();
+    r_preempt_enable();
 }
 
-preempt_count_t preempt_count() {
-    return _preempt_count();
+unsigned long preempt_count() {
+    return r_preempt_count();
 }
 
 void spin_lock(spinlock_t& lock) {
@@ -105,14 +93,6 @@ mutex::~mutex() {
     assert(m_lock == 0);
 }
 
-void mutex::lock() {
-    spin_lock(m_lock);
-}
-
-void mutex::unlock() {
-    spin_unlock(m_lock);
-}
-
 lock_context_t mutex::lock_irq() {
     return spin_lock_irqsave(m_lock);
 }

+ 0 - 57
src/kernel/async/waitlist.cc

@@ -1,57 +0,0 @@
-#include <assert.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-#include <kernel/process.hpp>
-#include <kernel/task/thread.hpp>
-
-using namespace kernel::async;
-
-bool wait_list::wait(mutex& lock) {
-    this->subscribe();
-
-    auto* curthd = current_thread;
-    curthd->set_attr(kernel::task::thread::ISLEEP);
-
-    lock.unlock();
-    bool has_signals = schedule();
-    lock.lock();
-
-    m_subscribers.erase(curthd);
-    return !has_signals;
-}
-
-void wait_list::subscribe() {
-    lock_guard lck(m_mtx);
-
-    auto* thd = current_thread;
-
-    bool inserted;
-    std::tie(std::ignore, inserted) = m_subscribers.insert(thd);
-
-    assert(inserted);
-}
-
-void wait_list::notify_one() {
-    lock_guard lck(m_mtx);
-
-    if (m_subscribers.empty())
-        return;
-
-    auto iter = m_subscribers.begin();
-    (*iter)->set_attr(kernel::task::thread::READY);
-
-    m_subscribers.erase(iter);
-}
-
-void wait_list::notify_all() {
-    lock_guard lck(m_mtx);
-
-    if (m_subscribers.empty())
-        return;
-
-    for (auto thd : m_subscribers)
-        thd->set_attr(kernel::task::thread::READY);
-
-    m_subscribers.clear();
-}

+ 12 - 10
src/kernel/block.rs

@@ -11,7 +11,7 @@ use alloc::{
 };
 use bindings::{EEXIST, EINVAL, EIO, ENOENT};
 
-use crate::KResult;
+use lazy_static::lazy_static;
 
 use super::{
     mem::{paging::Page, phys::PhysPtr},
@@ -27,18 +27,18 @@ pub trait BlockRequestQueue: Send + Sync {
     ///
     fn max_request_pages(&self) -> u64;
 
-    fn submit(&mut self, req: BlockDeviceRequest) -> KResult<()>;
+    fn submit(&self, req: BlockDeviceRequest) -> KResult<()>;
 }
 
 struct BlockDeviceDisk {
-    queue: Arc<Mutex<dyn BlockRequestQueue>>,
+    queue: Arc<dyn BlockRequestQueue>,
 }
 
 struct BlockDevicePartition {
     disk_dev: DevId,
     offset: u64,
 
-    queue: Arc<Mutex<dyn BlockRequestQueue>>,
+    queue: Arc<dyn BlockRequestQueue>,
 }
 
 enum BlockDeviceType {
@@ -74,8 +74,10 @@ impl Ord for BlockDevice {
     }
 }
 
-static BLOCK_DEVICE_LIST: Mutex<BTreeMap<DevId, Arc<BlockDevice>>> =
-    Mutex::new(BTreeMap::new());
+lazy_static! {
+    static ref BLOCK_DEVICE_LIST: Spin<BTreeMap<DevId, Arc<BlockDevice>>> =
+        Spin::new(BTreeMap::new());
+}
 
 #[derive(Debug, Clone, Copy)]
 #[repr(C)]
@@ -100,9 +102,9 @@ impl BlockDevice {
     pub fn register_disk(
         devid: DevId,
         size: u64,
-        queue: Arc<Mutex<dyn BlockRequestQueue>>,
+        queue: Arc<dyn BlockRequestQueue>,
     ) -> KResult<Arc<Self>> {
-        let max_pages = queue.lock().max_request_pages();
+        let max_pages = queue.max_request_pages();
         let device = Arc::new(Self {
             devid,
             size,
@@ -199,10 +201,10 @@ impl BlockDevice {
         }
 
         match self.dev_type {
-            BlockDeviceType::Disk(ref disk) => disk.queue.lock().submit(req),
+            BlockDeviceType::Disk(ref disk) => disk.queue.submit(req),
             BlockDeviceType::Partition(ref part) => {
                 req.sector += part.offset;
-                part.queue.lock().submit(req)
+                part.queue.submit(req)
             }
         }
     }

+ 155 - 0
src/kernel/chardev.rs

@@ -0,0 +1,155 @@
+use alloc::{
+    boxed::Box,
+    collections::btree_map::{BTreeMap, Entry},
+    sync::Arc,
+};
+use bindings::{EEXIST, EIO};
+
+use crate::{io::Buffer, kernel::console::CONSOLE, prelude::*};
+
+use super::{
+    block::make_device,
+    task::Thread,
+    terminal::Terminal,
+    vfs::{
+        file::{File, TerminalFile},
+        DevId,
+    },
+};
+
+use lazy_static::lazy_static;
+
+pub trait VirtualCharDevice: Send + Sync {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize>;
+    fn write(&self, data: &[u8]) -> KResult<usize>;
+}
+
+pub enum CharDeviceType {
+    Terminal(Arc<Terminal>),
+    Virtual(Box<dyn VirtualCharDevice>),
+}
+
+pub struct CharDevice {
+    name: Arc<str>,
+    device: CharDeviceType,
+}
+
+lazy_static! {
+    pub static ref CHAR_DEVICES: Spin<BTreeMap<DevId, Arc<CharDevice>>> =
+        Spin::new(BTreeMap::new());
+}
+
+impl CharDevice {
+    pub fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        match &self.device {
+            CharDeviceType::Terminal(terminal) => terminal.read(buffer),
+            CharDeviceType::Virtual(device) => device.read(buffer),
+        }
+    }
+
+    pub fn write(&self, data: &[u8]) -> KResult<usize> {
+        match &self.device {
+            CharDeviceType::Virtual(device) => device.write(data),
+            CharDeviceType::Terminal(terminal) => {
+                for &ch in data.iter() {
+                    terminal.show_char(ch);
+                }
+                Ok(data.len())
+            }
+        }
+    }
+
+    pub fn get(devid: DevId) -> Option<Arc<CharDevice>> {
+        CHAR_DEVICES.lock().get(&devid).cloned()
+    }
+
+    pub fn register(devid: DevId, name: Arc<str>, device: CharDeviceType) -> KResult<()> {
+        match CHAR_DEVICES.lock().entry(devid) {
+            Entry::Vacant(entry) => {
+                entry.insert(Arc::new(CharDevice { name, device }));
+                Ok(())
+            }
+            Entry::Occupied(_) => Err(EEXIST),
+        }
+    }
+
+    pub fn open(self: &Arc<Self>) -> KResult<Arc<File>> {
+        Ok(match &self.device {
+            CharDeviceType::Terminal(terminal) => {
+                // We only set the control terminal if the process is the session leader.
+                if Thread::current().process.sid() == Thread::current().process.pid {
+                    let session = Thread::current().process.session();
+                    // Silently fail if we can't set the control terminal.
+                    dont_check!(session.set_control_terminal(&terminal, false));
+                }
+
+                TerminalFile::new(terminal.clone())
+            }
+            CharDeviceType::Virtual(_) => Arc::new(File::CharDev(self.clone())),
+        })
+    }
+}
+
+struct NullDevice;
+impl VirtualCharDevice for NullDevice {
+    fn read(&self, _buffer: &mut dyn Buffer) -> KResult<usize> {
+        Ok(0)
+    }
+
+    fn write(&self, _data: &[u8]) -> KResult<usize> {
+        Ok(_data.len())
+    }
+}
+
+struct ZeroDevice;
+impl VirtualCharDevice for ZeroDevice {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        // TODO: Copy from empty page.
+        while let false = buffer.fill(&[0; 16])?.should_stop() {}
+        Ok(buffer.wrote())
+    }
+
+    fn write(&self, _data: &[u8]) -> KResult<usize> {
+        Ok(_data.len())
+    }
+}
+
+struct ConsoleDevice;
+impl VirtualCharDevice for ConsoleDevice {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        let console_terminal = CONSOLE.lock_irq().get_terminal().ok_or(EIO)?;
+        console_terminal.read(buffer)
+    }
+
+    fn write(&self, data: &[u8]) -> KResult<usize> {
+        let console_terminal = CONSOLE.lock_irq().get_terminal().ok_or(EIO)?;
+        for &ch in data.iter() {
+            console_terminal.show_char(ch);
+        }
+        Ok(data.len())
+    }
+}
+
+impl CharDevice {
+    pub fn init() -> KResult<()> {
+        Self::register(
+            make_device(1, 3),
+            Arc::from("null"),
+            CharDeviceType::Virtual(Box::new(NullDevice)),
+        )?;
+
+        Self::register(
+            make_device(1, 5),
+            Arc::from("zero"),
+            CharDeviceType::Virtual(Box::new(ZeroDevice)),
+        )?;
+
+        Self::register(
+            make_device(5, 1),
+            Arc::from("console"),
+            CharDeviceType::Virtual(Box::new(ConsoleDevice)),
+        )?;
+
+        Ok(())
+    }
+}

+ 60 - 11
src/kernel/console.rs

@@ -1,16 +1,34 @@
 use crate::prelude::*;
 
-pub struct Console {}
+use alloc::sync::Arc;
+use bindings::EEXIST;
+use lazy_static::lazy_static;
+
+pub struct Console {
+    terminal: Option<Arc<Terminal>>,
+}
+
+impl Console {
+    pub fn get_terminal(&self) -> Option<Arc<Terminal>> {
+        self.terminal.clone()
+    }
+
+    pub fn register_terminal(terminal: &Arc<Terminal>) -> KResult<()> {
+        let mut console = CONSOLE.lock_irq();
+        if console.terminal.is_some() {
+            return Err(EEXIST);
+        }
+
+        console.terminal = Some(terminal.clone());
+        Ok(())
+    }
+}
 
 impl Write for Console {
     fn write_str(&mut self, s: &str) -> core::fmt::Result {
-        use crate::bindings::root::kernel::tty::console as _console;
-
-        if let Some(console) = unsafe { _console.as_mut() } {
+        if let Some(console) = &self.terminal {
             for &ch in s.as_bytes() {
-                unsafe {
-                    console.show_char(ch as i32);
-                }
+                console.show_char(ch)
             }
         }
 
@@ -19,11 +37,13 @@ impl Write for Console {
 }
 
 #[doc(hidden)]
-pub fn _print(args: core::fmt::Arguments) -> core::fmt::Result {
-    CONSOLE.lock().write_fmt(args)
+pub fn _print(args: core::fmt::Arguments) {
+    dont_check!(CONSOLE.lock_irq().write_fmt(args))
 }
 
-pub static CONSOLE: spin::Mutex<Console> = spin::Mutex::new(Console {});
+lazy_static! {
+    pub static ref CONSOLE: Spin<Console> = Spin::new(Console { terminal: None });
+}
 
 macro_rules! print {
     ($($arg:tt)*) => {
@@ -40,4 +60,33 @@ macro_rules! println {
     };
 }
 
-pub(crate) use {print, println};
+macro_rules! println_warn {
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel: warn] {}", format_args!($($arg)*))
+    };
+}
+
+macro_rules! println_debug {
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel:debug] {}", format_args!($($arg)*))
+    };
+}
+
+macro_rules! println_info {
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel: info] {}", format_args!($($arg)*))
+    };
+}
+
+macro_rules! println_fatal {
+    () => {
+        $crate::println!("[kernel:fatal] ")
+    };
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel:fatal] {}", format_args!($($arg)*))
+    };
+}
+
+use super::terminal::Terminal;
+
+pub(crate) use {print, println, println_debug, println_fatal, println_info, println_warn};

+ 39 - 0
src/kernel/constants.rs

@@ -0,0 +1,39 @@
+use bitflags::bitflags;
+
+pub const TCGETS: u32 = 0x5401;
+pub const TCSETS: u32 = 0x5402;
+pub const TIOCGPGRP: u32 = 0x540f;
+pub const TIOCSPGRP: u32 = 0x5410;
+pub const TIOCGWINSZ: u32 = 0x5413;
+
+pub const PR_SET_NAME: u32 = 15;
+pub const PR_GET_NAME: u32 = 16;
+
+pub const SIG_BLOCK: u32 = 0;
+pub const SIG_UNBLOCK: u32 = 1;
+pub const SIG_SETMASK: u32 = 2;
+
+pub const SA_SIGINFO: u32 = 4;
+
+pub const CLOCK_REALTIME: u32 = 0;
+pub const CLOCK_MONOTONIC: u32 = 1;
+
+pub const ENXIO: u32 = 6;
+pub const ENOEXEC: u32 = 8;
+
+bitflags! {
+    #[derive(Debug, Clone, Copy)]
+    pub struct UserMmapFlags: u32 {
+        const MAP_SHARED = 0x01;
+        const MAP_PRIVATE = 0x02;
+        const MAP_FIXED = 0x10;
+        const MAP_ANONYMOUS = 0x20;
+    }
+
+    #[derive(Debug, Clone, Copy)]
+    pub struct UserMmapProtocol: u32 {
+        const PROT_READ = 0x01;
+        const PROT_WRITE = 0x02;
+        const PROT_EXEC = 0x04;
+    }
+}

+ 2 - 2
src/kernel/hw/pci.cc

@@ -87,11 +87,11 @@ int register_driver_r(uint16_t vendor, uint16_t device,
 
 namespace kernel::kinit {
 
-SECTION(".text.kinit")
-void init_pci() {
+extern "C" void init_pci() {
     using namespace hw::acpi;
     using namespace hw::pci;
 
+    assert(parse_acpi_tables() == 0);
     auto* mcfg = (MCFG*)get_table("MCFG");
     assert(mcfg);
 

+ 0 - 115
src/kernel/hw/serial.cc

@@ -1,115 +0,0 @@
-#include <errno.h>
-#include <stdio.h>
-
-#include <kernel/hw/port.hpp>
-#include <kernel/irq.hpp>
-#include <kernel/log.hpp>
-#include <kernel/module.hpp>
-#include <kernel/tty.hpp>
-
-using namespace kernel::tty;
-using namespace kernel::hw;
-using namespace kernel::irq;
-using namespace kernel::kmod;
-
-constexpr int PORT0 = 0x3f8;
-constexpr int PORT1 = 0x2f8;
-
-using port_group = const p8[6];
-
-constexpr p8 port0[] = {
-    p8{PORT0 + 0}, p8{PORT0 + 1}, p8{PORT0 + 2},
-    p8{PORT0 + 3}, p8{PORT0 + 4}, p8{PORT0 + 5},
-};
-
-constexpr p8 port1[] = {
-    p8{PORT1 + 0}, p8{PORT1 + 1}, p8{PORT1 + 2},
-    p8{PORT1 + 3}, p8{PORT1 + 4}, p8{PORT1 + 5},
-};
-
-static void _serial0_receive_data_interrupt() {
-    while (*port0[5] & 1)
-        console->commit_char(*port0[0]);
-}
-
-static void _serial1_receive_data_interrupt() {
-    while (*port1[5] & 1)
-        console->commit_char(*port1[0]);
-}
-
-static inline int _init_port(port_group ports) {
-    // taken from osdev.org
-
-    ports[1] = 0x00; // Disable all interrupts
-    ports[3] = 0x80; // Enable DLAB (set baud rate divisor)
-    // TODO: set baud rate
-    ports[0] = 0x00; // Set divisor to 0 -3- (lo byte) 115200 -38400- baud
-    ports[1] = 0x00; //                  (hi byte)
-    ports[3] = 0x03; // 8 bits, no parity, one stop bit
-    ports[2] = 0xC7; // Enable FIFO, clear them, with 14-byte threshold
-    // TODO: IRQ disabled
-    ports[4] = 0x0B; // IRQs enabled, RTS/DSR set
-    ports[4] = 0x1E; // Set in loopback mode, test the serial chip
-    ports[0] = 0xAE; // Test serial chip (send byte 0xAE and check if serial
-                     // returns same byte)
-
-    // Check if serial is faulty (i.e: not same byte as sent)
-    if (*ports[0] != 0xAE)
-        return -EIO;
-
-    // If serial is not faulty set it in normal operation mode
-    // (not-loopback with IRQs enabled and OUT#1 and OUT#2 bits enabled)
-    ports[4] = 0x0F;
-
-    ports[1] = 0x01; // Enable interrupts #0: Received Data Available
-
-    return 0;
-}
-
-class serial_tty : public virtual tty {
-    const p8* ports;
-
-   public:
-    serial_tty(port_group ports, int id) : tty{"ttyS"}, ports(ports) {
-        name += '0' + id;
-    }
-
-    virtual void putchar(char c) override {
-        while (true) {
-            auto status = *ports[5];
-            if (status & 0x1)
-                this->commit_char(*ports[0]);
-            if (status & 0x20)
-                break;
-        }
-
-        ports[0] = c;
-    }
-};
-
-class serial_module : public virtual kmod {
-   public:
-    serial_module() : kmod("serial-tty") {}
-
-    virtual int init() override {
-        if (int ret = _init_port(port0); ret == 0) {
-            auto* dev = new serial_tty(port0, 0);
-            register_handler(4, _serial0_receive_data_interrupt);
-
-            if (int ret = register_tty(dev); ret != 0)
-                kmsg("[serial] cannot register ttyS0");
-        }
-
-        if (int ret = _init_port(port1); ret == 0) {
-            auto* dev = new serial_tty(port1, 0);
-            register_handler(3, _serial1_receive_data_interrupt);
-
-            if (int ret = register_tty(dev); ret != 0)
-                kmsg("[serial] cannot register ttyS1");
-        }
-
-        return 0;
-    }
-};
-
-INTERNAL_MODULE(serial, serial_module);

+ 0 - 28
src/kernel/hw/timer.cc

@@ -1,28 +0,0 @@
-#include <types/types.h>
-
-#include <kernel/hw/port.hpp>
-#include <kernel/hw/timer.hpp>
-
-constexpr kernel::hw::p8 port_control(0x43);
-constexpr kernel::hw::p8 port_count(0x40);
-
-static std::size_t _current_ticks = 0;
-
-SECTION(".text.kinit")
-void kernel::hw::timer::init_pit(void) {
-    // set interval
-    port_control = 0x34;
-
-    // send interval number
-    // 0x2e9a = 11930 = 100Hz
-    port_count = 0x9a;
-    port_count = 0x2e;
-}
-
-void kernel::hw::timer::inc_tick(void) {
-    ++_current_ticks;
-}
-
-size_t kernel::hw::timer::current_ticks(void) {
-    return _current_ticks;
-}

+ 0 - 147
src/kernel/interrupt.cpp

@@ -1,147 +0,0 @@
-#include <list>
-#include <vector>
-
-#include <assert.h>
-#include <stdint.h>
-#include <stdio.h>
-
-#include <types/types.h>
-
-#include <kernel/hw/port.hpp>
-#include <kernel/hw/timer.hpp>
-#include <kernel/interrupt.hpp>
-#include <kernel/irq.hpp>
-#include <kernel/log.hpp>
-#include <kernel/mem/paging.hpp>
-#include <kernel/process.hpp>
-#include <kernel/syscall.hpp>
-#include <kernel/vfs.hpp>
-
-#define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
-#define USER_INTERRUPT_GATE_TYPE (0xee)
-
-constexpr kernel::hw::p8 port_pic1_command{0x20};
-constexpr kernel::hw::p8 port_pic1_data{0x21};
-constexpr kernel::hw::p8 port_pic2_command{0xa0};
-constexpr kernel::hw::p8 port_pic2_data{0xa1};
-
-struct IDT_entry {
-    uint16_t offset_low;
-    uint16_t segment;
-
-    uint8_t IST;
-    uint8_t attributes;
-
-    uint16_t offset_mid;
-    uint32_t offset_high;
-    uint32_t reserved;
-};
-
-static struct IDT_entry IDT[256];
-
-extern "C" uintptr_t ISR_START_ADDR;
-
-SECTION(".text.kinit")
-static inline void set_idt_entry(IDT_entry (&idt)[256], int n, uintptr_t offset,
-                                 uint16_t selector, uint8_t type) {
-    idt[n].offset_low = offset & 0xffff;
-    idt[n].segment = selector;
-    idt[n].IST = 0;
-    idt[n].attributes = type;
-    idt[n].offset_mid = (offset >> 16) & 0xffff;
-    idt[n].offset_high = (offset >> 32) & 0xffffffff;
-    idt[n].reserved = 0;
-}
-
-using kernel::irq::irq_handler_t;
-static std::vector<std::list<irq_handler_t>> s_irq_handlers;
-
-SECTION(".text.kinit")
-void kernel::kinit::init_interrupt() {
-    for (int i = 0; i < 0x30; ++i)
-        set_idt_entry(IDT, i, ISR_START_ADDR + 8 * i, 0x08,
-                      KERNEL_INTERRUPT_GATE_TYPE);
-    set_idt_entry(IDT, 0x80, ISR_START_ADDR + 8 * 0x80, 0x08,
-                  USER_INTERRUPT_GATE_TYPE);
-
-    uint64_t idt_descriptor[2];
-    idt_descriptor[0] = (sizeof(IDT_entry) * 256) << 48;
-    idt_descriptor[1] = (uintptr_t)IDT;
-
-    // initialize PIC
-    asm volatile("lidt (%0)" : : "r"((uintptr_t)idt_descriptor + 6) :);
-    s_irq_handlers.resize(16);
-
-    // TODO: move this to timer driver
-    kernel::irq::register_handler(0, []() {
-        kernel::hw::timer::inc_tick();
-        schedule();
-    });
-
-    port_pic1_command = 0x11; // edge trigger mode
-    port_pic1_data = 0x20;    // start from int 0x20
-    port_pic1_data = 0x04;    // PIC1 is connected to IRQ2 (1 << 2)
-    port_pic1_data = 0x01;    // no buffer mode
-
-    port_pic2_command = 0x11; // edge trigger mode
-    port_pic2_data = 0x28;    // start from int 0x28
-    port_pic2_data = 0x02;    // connected to IRQ2
-    port_pic2_data = 0x01;    // no buffer mode
-
-    // allow all the interrupts
-    port_pic1_data = 0x00;
-    port_pic2_data = 0x00;
-}
-
-void kernel::irq::register_handler(int irqno, irq_handler_t handler) {
-    s_irq_handlers[irqno].emplace_back(std::move(handler));
-}
-
-static inline void fault_handler(interrupt_stack* context, mmx_registers*) {
-    switch (context->int_no) {
-        case 6:
-        case 8: {
-            assert(false);
-            if (!current_process->attr.system)
-                kill_current(SIGSEGV); // noreturn
-        } break;
-        case 13: {
-            if (!current_process->attr.system)
-                kill_current(SIGILL); // noreturn
-        } break;
-        case 14: {
-            kernel::mem::paging::handle_page_fault(context->error_code);
-            return;
-        } break;
-    }
-
-    // fault can not be resolved
-    freeze();
-}
-
-extern "C" void irq_handler_rust(int irqno);
-
-static inline void irq_handler(interrupt_stack* context, mmx_registers*) {
-    int irqno = context->int_no - 0x20;
-
-    constexpr uint8_t PIC_EOI = 0x20;
-
-    for (const auto& handler : s_irq_handlers[irqno])
-        handler();
-
-    irq_handler_rust(irqno);
-
-    port_pic1_command = PIC_EOI;
-    if (irqno >= 8)
-        port_pic2_command = PIC_EOI;
-}
-
-extern "C" void interrupt_handler(interrupt_stack* context,
-                                  mmx_registers* mmxregs) {
-    if (context->int_no < 0x20) // interrupt is a fault
-        fault_handler(context, mmxregs);
-    else if (context->int_no == 0x80) // syscall by int 0x80
-        kernel::handle_syscall32(context->regs.rax, context, mmxregs);
-    else
-        irq_handler(context, mmxregs);
-}

+ 77 - 26
src/kernel/interrupt.rs

@@ -1,44 +1,95 @@
-use alloc::boxed::Box;
-use alloc::vec;
-use alloc::vec::Vec;
+use alloc::sync::Arc;
 
-use crate::bindings::root::EINVAL;
+use lazy_static::lazy_static;
 
-static mut IRQ_HANDLERS: spin::Mutex<[Option<Vec<Box<dyn Fn()>>>; 16]> =
-    spin::Mutex::new([const { None }; 16]);
+use crate::bindings::root::{interrupt_stack, mmx_registers, EINVAL};
+use crate::{driver::Port8, prelude::*};
+
+use super::mem::handle_page_fault;
+use super::syscall::handle_syscall32;
+use super::task::{ProcessList, Signal};
+use super::timer::timer_interrupt;
+
+const PIC1_COMMAND: Port8 = Port8::new(0x20);
+const PIC1_DATA: Port8 = Port8::new(0x21);
+const PIC2_COMMAND: Port8 = Port8::new(0xA0);
+const PIC2_DATA: Port8 = Port8::new(0xA1);
+
+lazy_static! {
+    static ref IRQ_HANDLERS: Spin<[Option<Arc<dyn Fn() + Send + Sync>>; 16]> =
+        Spin::new([const { None }; 16]);
+}
+
+fn irq_handler(irqno: usize) {
+    assert!(irqno < 16);
+
+    let handler = IRQ_HANDLERS.lock()[irqno as usize].as_ref().cloned();
+    if let Some(handler) = handler {
+        handler();
+    }
+
+    PIC1_COMMAND.write(0x20); // EOI
+    if irqno >= 8 {
+        PIC2_COMMAND.write(0x20); // EOI
+    }
+}
+
+fn fault_handler(int_stack: &mut interrupt_stack) {
+    match int_stack.int_no {
+        // Invalid Op or Double Fault
+        14 => handle_page_fault(int_stack),
+        13 if int_stack.ss == 0 => ProcessList::kill_current(Signal::SIGILL),
+        6 | 8 if int_stack.ss == 0 => ProcessList::kill_current(Signal::SIGSEGV),
+        _ => panic!("Unhandled fault: {}", int_stack.int_no),
+    }
+}
 
 #[no_mangle]
-pub extern "C" fn irq_handler_rust(irqno: core::ffi::c_int) {
-    assert!(irqno >= 0 && irqno < 16);
-
-    let handlers = unsafe { IRQ_HANDLERS.lock() };
-
-    match handlers[irqno as usize] {
-        Some(ref handlers) => {
-            for handler in handlers {
-                handler();
-            }
-        }
-        None => {}
+pub extern "C" fn interrupt_handler(int_stack: *mut interrupt_stack, mmxregs: *mut mmx_registers) {
+    let int_stack = unsafe { &mut *int_stack };
+    let mmxregs = unsafe { &mut *mmxregs };
+
+    match int_stack.int_no {
+        // Fault
+        0..0x20 => fault_handler(int_stack),
+        // Syscall
+        0x80 => handle_syscall32(int_stack.regs.rax as usize, int_stack, mmxregs),
+        // Timer
+        0x40 => timer_interrupt(),
+        // IRQ
+        no => irq_handler(no as usize - 0x20),
     }
 }
 
 pub fn register_irq_handler<F>(irqno: i32, handler: F) -> Result<(), u32>
 where
-    F: Fn() + 'static,
+    F: Fn() + Send + Sync + 'static,
 {
     if irqno < 0 || irqno >= 16 {
         return Err(EINVAL);
     }
 
-    let mut handlers = unsafe { IRQ_HANDLERS.lock() };
+    let old = IRQ_HANDLERS.lock_irq()[irqno as usize].replace(Arc::new(handler));
+    assert!(old.is_none(), "IRQ handler already registered");
+    Ok(())
+}
 
-    match handlers[irqno as usize] {
-        Some(ref mut handlers) => handlers.push(Box::new(handler)),
-        None => {
-            handlers[irqno as usize].replace(vec![Box::new(handler)]);
-        }
-    }
+pub fn init() -> KResult<()> {
+    // TODO: Move this to `arch`
+    // Initialize PIC
+    PIC1_COMMAND.write(0x11); // edge trigger mode
+    PIC1_DATA.write(0x20); // IRQ 0-7 offset
+    PIC1_DATA.write(0x04); // cascade with slave PIC
+    PIC1_DATA.write(0x01); // no buffer mode
+
+    PIC2_COMMAND.write(0x11); // edge trigger mode
+    PIC2_DATA.write(0x28); // IRQ 8-15 offset
+    PIC2_DATA.write(0x02); // cascade with master PIC
+    PIC2_DATA.write(0x01); // no buffer mode
+
+    // Allow all IRQs
+    PIC1_DATA.write(0x0);
+    PIC2_DATA.write(0x0);
 
     Ok(())
 }

+ 10 - 0
src/kernel/mem.rs

@@ -1,2 +1,12 @@
 pub mod paging;
 pub mod phys;
+
+mod mm_area;
+mod mm_list;
+mod page_table;
+mod vrange;
+
+pub(self) use mm_area::MMArea;
+pub use mm_list::{handle_page_fault, FileMapping, MMList, Mapping, PageFaultError, Permission};
+pub(self) use page_table::{PageTable, PTE};
+pub use vrange::{VAddr, VRange};

+ 102 - 0
src/kernel/mem/mm_area.rs

@@ -0,0 +1,102 @@
+use core::{borrow::Borrow, cell::UnsafeCell, cmp::Ordering};
+
+use super::{Mapping, Permission, VAddr, VRange};
+
+#[derive(Debug)]
+pub struct MMArea {
+    range: UnsafeCell<VRange>,
+    pub(super) mapping: Mapping,
+    pub(super) permission: Permission,
+}
+
+impl Clone for MMArea {
+    fn clone(&self) -> Self {
+        Self {
+            range: UnsafeCell::new(self.range()),
+            mapping: self.mapping.clone(),
+            permission: self.permission,
+        }
+    }
+}
+
+impl MMArea {
+    pub fn new(range: VRange, mapping: Mapping, permission: Permission) -> Self {
+        Self {
+            range: range.into(),
+            mapping,
+            permission,
+        }
+    }
+
+    fn range_borrow(&self) -> &VRange {
+        // SAFETY: The only way we get a reference to `MMArea` object is through `MMListInner`.
+        // And `MMListInner` is locked with IRQ disabled.
+        unsafe { self.range.get().as_ref().unwrap() }
+    }
+
+    pub fn range(&self) -> VRange {
+        *self.range_borrow()
+    }
+
+    pub fn len(&self) -> usize {
+        self.range_borrow().len()
+    }
+
+    /// # Safety
+    /// This function should be called only when we can guarantee that the range
+    /// won't overlap with any other range in some scope.
+    pub fn grow(&self, count: usize) {
+        let range = unsafe { self.range.get().as_mut().unwrap() };
+        range.clone_from(&self.range_borrow().grow(count));
+    }
+
+    pub fn split(mut self, at: VAddr) -> (Option<Self>, Option<Self>) {
+        assert_eq!(at.floor(), at);
+
+        match self.range_borrow().cmp(&VRange::from(at)) {
+            Ordering::Less => (Some(self), None),
+            Ordering::Greater => (None, Some(self)),
+            Ordering::Equal => {
+                let diff = at - self.range_borrow().start();
+                if diff == 0 {
+                    return (None, Some(self));
+                }
+
+                let right = Self {
+                    range: VRange::new(at, self.range_borrow().end()).into(),
+                    permission: self.permission,
+                    mapping: match &self.mapping {
+                        Mapping::Anonymous => Mapping::Anonymous,
+                        Mapping::File(mapping) => Mapping::File(mapping.offset(diff)),
+                    },
+                };
+
+                self.range.get_mut().shrink(diff);
+                (Some(self), Some(right))
+            }
+        }
+    }
+}
+
+impl Eq for MMArea {}
+impl PartialEq for MMArea {
+    fn eq(&self, other: &Self) -> bool {
+        self.range_borrow().eq(other.range_borrow())
+    }
+}
+impl PartialOrd for MMArea {
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        self.range_borrow().partial_cmp(other.range_borrow())
+    }
+}
+impl Ord for MMArea {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.range_borrow().cmp(other.range_borrow())
+    }
+}
+
+impl Borrow<VRange> for MMArea {
+    fn borrow(&self) -> &VRange {
+        self.range_borrow()
+    }
+}

+ 13 - 15
src/kernel/mem/mm_list.cc

@@ -1,4 +1,5 @@
 #include <assert.h>
+#include <errno.h>
 #include <stdint.h>
 
 #include <kernel/mem/mm_list.hpp>
@@ -16,8 +17,7 @@ static inline void __invalidate_all_tlb() {
         : "rax", "memory");
 }
 
-static inline void __dealloc_page_table_all(paging::pfn_t pt, int depth,
-                                            int from, int to) {
+static inline void __dealloc_page_table_all(paging::pfn_t pt, int depth, int from, int to) {
     using namespace paging;
 
     if (depth > 1) {
@@ -43,7 +43,8 @@ static inline void __dealloc_page_table(paging::pfn_t pt) {
 }
 
 mm_list::mm_list() : m_pt{paging::alloc_page_table()}, m_brk{m_areas.end()} {
-    memcpy(physaddr<void>{m_pt}, paging::KERNEL_PAGE_TABLE_PHYS_ADDR, 0x1000);
+    // copy only kernel space
+    memcpy(physaddr<void>{m_pt + 0x800}, physaddr<void>{KERNEL_PML4 + 0x800}, 0x800);
 }
 
 mm_list::mm_list(const mm_list& other) : mm_list{} {
@@ -138,8 +139,7 @@ int mm_list::register_brk(uintptr_t addr) {
         return -ENOMEM;
 
     bool inserted;
-    std::tie(m_brk, inserted) =
-        m_areas.emplace(addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
+    std::tie(m_brk, inserted) = m_areas.emplace(addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
 
     assert(inserted);
     return 0;
@@ -186,8 +186,8 @@ mm_list::iterator mm_list::split(iterator area, uintptr_t addr) {
     auto new_end = area->end;
     area->end = addr;
 
-    auto [iter, inserted] = m_areas.emplace(addr, area->flags, new_end,
-                                            area->mapped_file, new_file_offset);
+    auto [iter, inserted] =
+        m_areas.emplace(addr, area->flags, new_end, d_get(area->mapped_file), new_file_offset);
 
     assert(inserted);
     return iter;
@@ -217,8 +217,7 @@ int mm_list::unmap(iterator area, bool should_invalidate_tlb) {
     return 0;
 }
 
-int mm_list::unmap(uintptr_t start, std::size_t length,
-                   bool should_invalidate_tlb) {
+int mm_list::unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb) {
     // standard says that addr and len MUST be
     // page-aligned or the call is invalid
     if (start & 0xfff)
@@ -279,7 +278,7 @@ int mm_list::unmap(uintptr_t start, std::size_t length,
 int mm_list::mmap(const map_args& args) {
     auto& vaddr = args.vaddr;
     auto& length = args.length;
-    auto& finode = args.file_inode;
+    auto& file = args.file;
     auto& foff = args.file_offset;
     auto& flags = args.flags;
 
@@ -298,10 +297,10 @@ int mm_list::mmap(const map_args& args) {
         attributes |= PA_NXE;
 
     if (flags & MM_MAPPED) {
-        assert(finode);
+        assert(file);
 
-        auto [area, inserted] = m_areas.emplace(
-            vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, finode, foff);
+        auto [area, inserted] =
+            m_areas.emplace(vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, d_get(file), foff);
         assert(inserted);
 
         attributes |= PA_MMAPPED_PAGE;
@@ -310,8 +309,7 @@ int mm_list::mmap(const map_args& args) {
     } else if (flags & MM_ANONYMOUS) {
         // private mapping of zero-filled pages
         // TODO: shared mapping
-        auto [area, inserted] =
-            m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
+        auto [area, inserted] = m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
         assert(inserted);
 
         attributes |= PA_ANONYMOUS_PAGE;

+ 357 - 0
src/kernel/mem/mm_list.rs

@@ -0,0 +1,357 @@
+mod page_fault;
+
+use crate::prelude::*;
+
+use alloc::{collections::btree_set::BTreeSet, sync::Arc};
+use bindings::{EEXIST, EINVAL, ENOMEM};
+
+use crate::kernel::vfs::dentry::Dentry;
+
+use super::{MMArea, PageTable, VAddr, VRange};
+
+pub use page_fault::{handle_page_fault, PageFaultError};
+
+#[derive(Debug, Clone)]
+pub struct FileMapping {
+    file: Arc<Dentry>,
+    /// Offset in the file, aligned to 4KB boundary.
+    offset: usize,
+    /// Length of the mapping. Exceeding part will be zeroed.
+    length: usize,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct Permission {
+    pub write: bool,
+    pub execute: bool,
+}
+
+#[derive(Debug, Clone)]
+pub enum Mapping {
+    Anonymous,
+    File(FileMapping),
+}
+
+#[derive(Debug)]
+struct MMListInner {
+    areas: BTreeSet<MMArea>,
+    break_start: Option<VRange>,
+    break_pos: Option<VAddr>,
+}
+
+#[derive(Debug)]
+pub struct MMList {
+    /// # Safety
+    /// This field might be used in IRQ context, so it should be locked with `lock_irq()`.
+    inner: Mutex<MMListInner>,
+    /// Do not modify entries in the page table without acquiring the `inner` lock.
+    page_table: PageTable,
+}
+
+impl FileMapping {
+    pub fn new(file: Arc<Dentry>, offset: usize, length: usize) -> Self {
+        assert_eq!(offset & 0xfff, 0);
+        Self {
+            file,
+            offset,
+            length,
+        }
+    }
+
+    pub fn offset(&self, offset: usize) -> Self {
+        if self.length <= offset {
+            Self::new(self.file.clone(), self.offset + self.length, 0)
+        } else {
+            Self::new(
+                self.file.clone(),
+                self.offset + offset,
+                self.length - offset,
+            )
+        }
+    }
+}
+
+impl MMListInner {
+    fn overlapping_addr(&self, addr: VAddr) -> Option<&MMArea> {
+        self.areas.get(&VRange::from(addr))
+    }
+
+    fn check_overlapping_addr(&self, addr: VAddr) -> bool {
+        addr.is_user() && self.overlapping_addr(addr).is_none()
+    }
+
+    fn overlapping_range(&self, range: VRange) -> impl DoubleEndedIterator<Item = &MMArea> + '_ {
+        self.areas.range(range.into_range())
+    }
+
+    fn check_overlapping_range(&self, range: VRange) -> bool {
+        range.is_user() && self.overlapping_range(range).next().is_none()
+    }
+
+    fn find_available(&self, hint: VAddr, len: usize) -> Option<VAddr> {
+        let mut range = if hint == VAddr::NULL {
+            VRange::new(VAddr(0x1234000), VAddr(0x1234000 + len).ceil())
+        } else {
+            VRange::new(hint.floor(), (hint + len).ceil())
+        };
+        let len = range.len();
+
+        loop {
+            if !range.is_user() {
+                return None;
+            }
+
+            match self.overlapping_range(range).next_back() {
+                None => return Some(range.start()),
+                Some(area) => {
+                    range = VRange::new(area.range().end().ceil(), area.range().end().ceil() + len);
+                }
+            }
+        }
+    }
+
+    fn unmap(&mut self, page_table: &PageTable, start: VAddr, len: usize) -> KResult<()> {
+        assert_eq!(start.floor(), start);
+        let end = (start + len).ceil();
+        let range = VRange::new(start, end);
+        if !range.is_user() {
+            return Err(EINVAL);
+        }
+
+        let check_range = VRange::from(range.start())..VRange::from(range.end());
+        let mut front_remaining = None;
+        let mut back_remaining = None;
+
+        self.areas.retain(|area| {
+            if !check_range.contains(&area.range()) {
+                return true;
+            }
+            if area.range() == range.start().into() {
+                let (left, right) = area.clone().split(range.start());
+                page_table.unmap(&right.unwrap());
+
+                if let Some(left) = left {
+                    assert!(
+                        front_remaining.replace(left).is_none(),
+                        "There should be only one `front`."
+                    );
+                }
+            } else if area.range() == range.end().into() {
+                let (left, right) = area.clone().split(range.end());
+                page_table.unmap(&left.unwrap());
+
+                assert!(
+                    back_remaining
+                        .replace(right.expect("`right` should be valid"))
+                        .is_none(),
+                    "There should be only one `back`."
+                );
+            } else {
+                page_table.unmap(area);
+            }
+
+            false
+        });
+
+        if let Some(front) = front_remaining {
+            self.areas.insert(front);
+        }
+        if let Some(back) = back_remaining {
+            self.areas.insert(back);
+        }
+
+        Ok(())
+    }
+
+    fn mmap(
+        &mut self,
+        page_table: &PageTable,
+        at: VAddr,
+        len: usize,
+        mapping: Mapping,
+        permission: Permission,
+    ) -> KResult<()> {
+        assert_eq!(at.floor(), at);
+        assert_eq!(len & 0xfff, 0);
+        let range = VRange::new(at, at + len);
+
+        // We are doing a area marker insertion.
+        if len == 0 && !self.check_overlapping_addr(at) || !self.check_overlapping_range(range) {
+            return Err(EEXIST);
+        }
+
+        match &mapping {
+            Mapping::Anonymous => page_table.set_anonymous(range, permission),
+            Mapping::File(_) => page_table.set_mmapped(range, permission),
+        }
+
+        self.areas.insert(MMArea::new(range, mapping, permission));
+        Ok(())
+    }
+}
+
+impl MMList {
+    pub fn new() -> Arc<Self> {
+        Arc::new(Self {
+            inner: Mutex::new(MMListInner {
+                areas: BTreeSet::new(),
+                break_start: None,
+                break_pos: None,
+            }),
+            page_table: PageTable::new(),
+        })
+    }
+
+    pub fn new_cloned(&self) -> Arc<Self> {
+        let inner = self.inner.lock_irq();
+
+        let list = Arc::new(Self {
+            inner: Mutex::new(MMListInner {
+                areas: inner.areas.clone(),
+                break_start: inner.break_start,
+                break_pos: inner.break_pos,
+            }),
+            page_table: PageTable::new(),
+        });
+
+        // SAFETY: `self.inner` already locked with IRQ disabled.
+        {
+            let list_inner = list.inner.lock();
+
+            for area in list_inner.areas.iter() {
+                let new_iter = list.page_table.iter_user(area.range()).unwrap();
+                let old_iter = self.page_table.iter_user(area.range()).unwrap();
+
+                for (new, old) in new_iter.zip(old_iter) {
+                    new.setup_cow(old);
+                }
+            }
+        }
+
+        // We set some pages as COW, so we need to invalidate TLB.
+        self.page_table.lazy_invalidate_tlb_all();
+
+        list
+    }
+
+    /// No need to do invalidation manually, `PageTable` already does it.
+    pub fn clear_user(&self) {
+        let mut inner = self.inner.lock_irq();
+        inner.areas.retain(|area| {
+            self.page_table.unmap(area);
+            false
+        });
+        inner.break_start = None;
+        inner.break_pos = None;
+    }
+
+    pub fn switch_page_table(&self) {
+        self.page_table.switch();
+    }
+
+    /// No need to do invalidation manually, `PageTable` already does it.
+    pub fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
+        self.inner.lock_irq().unmap(&self.page_table, start, len)
+    }
+
+    pub fn mmap_hint(
+        &self,
+        hint: VAddr,
+        len: usize,
+        mapping: Mapping,
+        permission: Permission,
+    ) -> KResult<VAddr> {
+        let mut inner = self.inner.lock_irq();
+        if hint == VAddr::NULL {
+            let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
+            inner.mmap(&self.page_table, at, len, mapping, permission)?;
+            return Ok(at);
+        }
+
+        match inner.mmap(&self.page_table, hint, len, mapping.clone(), permission) {
+            Ok(()) => Ok(hint),
+            Err(EEXIST) => {
+                let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
+                inner.mmap(&self.page_table, at, len, mapping, permission)?;
+                Ok(at)
+            }
+            Err(err) => Err(err),
+        }
+    }
+
+    pub fn mmap_fixed(
+        &self,
+        at: VAddr,
+        len: usize,
+        mapping: Mapping,
+        permission: Permission,
+    ) -> KResult<VAddr> {
+        self.inner
+            .lock_irq()
+            .mmap(&self.page_table, at, len, mapping.clone(), permission)
+            .map(|_| at)
+    }
+
+    pub fn set_break(&self, pos: Option<VAddr>) -> VAddr {
+        let mut inner = self.inner.lock_irq();
+
+        // SAFETY: `set_break` is only called in syscalls, where program break should be valid.
+        assert!(inner.break_start.is_some() && inner.break_pos.is_some());
+        let break_start = inner.break_start.unwrap();
+        let current_break = inner.break_pos.unwrap();
+        let pos = match pos {
+            None => return current_break,
+            Some(pos) => pos.ceil(),
+        };
+
+        let range = VRange::new(current_break, pos);
+        if !inner.check_overlapping_range(range) {
+            return current_break;
+        }
+
+        if !inner.areas.contains(&break_start) {
+            inner.areas.insert(MMArea::new(
+                break_start,
+                Mapping::Anonymous,
+                Permission {
+                    write: true,
+                    execute: false,
+                },
+            ));
+        }
+
+        let program_break = inner
+            .areas
+            .get(&break_start)
+            .expect("Program break area should be valid");
+
+        let len = pos - current_break;
+        self.page_table.set_anonymous(
+            VRange::from(program_break.range().end()).grow(len),
+            Permission {
+                write: true,
+                execute: false,
+            },
+        );
+
+        program_break.grow(len);
+
+        inner.break_pos = Some(pos);
+        pos
+    }
+
+    /// This should be called only **once** for every thread.
+    pub fn register_break(&self, start: VAddr) {
+        let mut inner = self.inner.lock_irq();
+        assert!(inner.break_start.is_none() && inner.break_pos.is_none());
+
+        inner.break_start = Some(start.into());
+        inner.break_pos = Some(start);
+    }
+}
+
+impl Drop for MMList {
+    fn drop(&mut self) {
+        self.clear_user();
+    }
+}

+ 206 - 0
src/kernel/mem/mm_list/page_fault.rs

@@ -0,0 +1,206 @@
+use bindings::kernel::mem::paging::pfn_to_page;
+use bindings::{PA_A, PA_ANON, PA_COW, PA_MMAP, PA_P, PA_RW};
+use bitflags::bitflags;
+
+use crate::bindings::root::interrupt_stack;
+use crate::kernel::mem::paging::{Page, PageBuffer};
+use crate::kernel::mem::phys::{CachedPP, PhysPtr};
+use crate::kernel::mem::{Mapping, VRange};
+use crate::kernel::task::{ProcessList, Signal, Thread};
+use crate::prelude::*;
+
+use super::{MMList, VAddr};
+
+bitflags! {
+    pub struct PageFaultError: u64 {
+        const Present = 0x0001;
+        const Write = 0x0002;
+        const User = 0x0004;
+        const ReservedSet = 0x0008;
+        const InstructionFetch = 0x0010;
+        const ProtectionKey = 0x0020;
+        const SGX = 0x8000;
+    }
+}
+
+#[repr(C)]
+struct FixEntry {
+    start: u64,
+    length: u64,
+    jump_address: u64,
+    op_type: u64,
+}
+
+impl MMList {
+    fn handle_page_fault(
+        &self,
+        int_stack: &mut interrupt_stack,
+        addr: VAddr,
+        error: PageFaultError,
+    ) -> Result<(), Signal> {
+        let inner = self.inner.lock();
+        let area = match inner.areas.get(&VRange::from(addr)) {
+            Some(area) => area,
+            None => {
+                if error.contains(PageFaultError::User) {
+                    return Err(Signal::SIGBUS);
+                } else {
+                    try_page_fault_fix(int_stack, addr);
+                    return Ok(());
+                }
+            }
+        };
+
+        // User access permission violation, check user access permission.
+        if error.contains(PageFaultError::User | PageFaultError::Present) {
+            if error.contains(PageFaultError::Write) && !area.permission.write {
+                ProcessList::kill_current(Signal::SIGSEGV)
+            }
+
+            if error.contains(PageFaultError::InstructionFetch) && !area.permission.execute {
+                ProcessList::kill_current(Signal::SIGSEGV)
+            }
+        }
+
+        let pte = self
+            .page_table
+            .iter_user(VRange::new(addr.floor(), addr.floor() + 0x1000))
+            .unwrap()
+            .next()
+            .expect("If we can find the mapped area, we should be able to find the PTE");
+
+        let is_mapped = matches!(&area.mapping, Mapping::File(_));
+        if !is_mapped && !error.contains(PageFaultError::Present) {
+            try_page_fault_fix(int_stack, addr);
+            return Ok(());
+        }
+
+        let mut pfn = pte.pfn();
+        let mut attributes = pte.attributes();
+
+        if attributes & PA_COW as usize != 0 {
+            attributes &= !PA_COW as usize;
+            if area.permission.write {
+                attributes |= PA_RW as usize;
+            } else {
+                attributes &= !PA_RW as usize;
+            }
+
+            // TODO!!!: Change this.
+            let page = unsafe { pfn_to_page(pfn).as_mut().unwrap() };
+            if page.refcount == 1 {
+                pte.set_attributes(attributes);
+                return Ok(());
+            }
+
+            let new_page = Page::alloc_one();
+            if attributes & PA_ANON as usize != 0 {
+                new_page.zero();
+            } else {
+                new_page
+                    .as_cached()
+                    .as_mut_slice::<u8>(0x1000)
+                    .copy_from_slice(CachedPP::new(pfn).as_slice(0x1000));
+            }
+
+            attributes &= !(PA_A | PA_ANON) as usize;
+            page.refcount -= 1;
+
+            pfn = new_page.into_pfn();
+            pte.set(pfn, attributes);
+        }
+
+        // TODO: shared mapping
+        if attributes & PA_MMAP as usize != 0 {
+            attributes |= PA_P as usize;
+
+            if let Mapping::File(mapping) = &area.mapping {
+                let load_offset = addr.floor() - area.range().start();
+                if load_offset < mapping.length {
+                    // SAFETY: Since we are here, the `pfn` must refer to a valid buddy page.
+                    let page = unsafe { Page::from_pfn(pfn, 0) };
+                    let nread = mapping
+                        .file
+                        .read(
+                            &mut PageBuffer::new(page.clone()),
+                            mapping.offset + load_offset,
+                        )
+                        .map_err(|_| Signal::SIGBUS)?;
+
+                    if nread < page.len() {
+                        page.as_cached().as_mut_slice::<u8>(0x1000)[nread..].fill(0);
+                    }
+
+                    if mapping.length - load_offset < 0x1000 {
+                        let length_to_end = mapping.length - load_offset;
+                        page.as_cached().as_mut_slice::<u8>(0x1000)[length_to_end..].fill(0);
+                    }
+                }
+                // Otherwise, the page is kept zero emptied.
+
+                attributes &= !PA_MMAP as usize;
+                pte.set_attributes(attributes);
+            } else {
+                panic!("Anonymous mapping should not be PA_MMAP");
+            }
+        }
+
+        Ok(())
+    }
+}
+
+extern "C" {
+    static FIX_START: *const FixEntry;
+    static FIX_END: *const FixEntry;
+}
+
+/// Try to fix the page fault by jumping to the `error` address.
+///
+/// Panic if we can't find the `ip` in the fix list.
+fn try_page_fault_fix(int_stack: &mut interrupt_stack, addr: VAddr) {
+    let ip = int_stack.v_rip as u64;
+    // TODO: Use `op_type` to fix.
+
+    // SAFETY: `FIX_START` and `FIX_END` are defined in the linker script in `.rodata` section.
+    let entries = unsafe {
+        core::slice::from_raw_parts(
+            FIX_START,
+            (FIX_END as usize - FIX_START as usize) / size_of::<FixEntry>(),
+        )
+    };
+
+    for entry in entries.iter() {
+        if ip >= entry.start && ip < entry.start + entry.length {
+            int_stack.v_rip = entry.jump_address as usize;
+            return;
+        }
+    }
+
+    kernel_page_fault_die(addr, ip as usize)
+}
+
+fn kernel_page_fault_die(vaddr: VAddr, ip: usize) -> ! {
+    panic!(
+        "Invalid kernel mode memory access to {:#8x} while executing the instruction at {:#8x}",
+        vaddr.0, ip
+    )
+}
+
+pub fn handle_page_fault(int_stack: &mut interrupt_stack) {
+    let error = PageFaultError::from_bits_truncate(int_stack.error_code);
+    let vaddr = VAddr(arch::x86_64::vm::get_cr2());
+
+    let result = Thread::current()
+        .process
+        .mm_list
+        .handle_page_fault(int_stack, vaddr, error);
+
+    if let Err(signal) = result {
+        println_debug!(
+            "Page fault on {:#x} in user space at {:#x}",
+            vaddr.0,
+            int_stack.v_rip
+        );
+        ProcessList::kill_current(signal)
+    }
+}

+ 307 - 0
src/kernel/mem/page_table.rs

@@ -0,0 +1,307 @@
+use lazy_static::lazy_static;
+
+use crate::prelude::*;
+
+use crate::bindings::root::{EINVAL, KERNEL_PML4};
+
+use super::{
+    paging::Page,
+    phys::{CachedPP, PhysPtr as _},
+    VAddr, VRange,
+};
+use super::{MMArea, Permission};
+
+const PA_P: usize = 0x001;
+const PA_RW: usize = 0x002;
+const PA_US: usize = 0x004;
+const PA_PWT: usize = 0x008;
+const PA_PCD: usize = 0x010;
+const PA_A: usize = 0x020;
+const PA_D: usize = 0x040;
+const PA_PS: usize = 0x080;
+const PA_G: usize = 0x100;
+const PA_COW: usize = 0x200;
+const PA_MMAP: usize = 0x400;
+const PA_ANON: usize = 0x800;
+const PA_NXE: usize = 0x8000_0000_0000_0000;
+const PA_MASK: usize = 0xfff0_0000_0000_0fff;
+
+#[repr(transparent)]
+#[derive(Debug, Clone, Copy)]
+pub struct PTE(usize);
+
+#[derive(Debug)]
+pub struct PageTable {
+    page: Page,
+}
+
+pub struct PTEIterator<'lt, const KERNEL: bool> {
+    count: usize,
+    i4: u16,
+    i3: u16,
+    i2: u16,
+    i1: u16,
+    p4: CachedPP,
+    p3: CachedPP,
+    p2: CachedPP,
+    p1: CachedPP,
+
+    start: VAddr,
+    end: VAddr,
+    _phantom: core::marker::PhantomData<&'lt ()>,
+}
+
+lazy_static! {
+    static ref EMPTY_PAGE: Page = {
+        let page = Page::alloc_one();
+        page.zero();
+        page
+    };
+}
+
+impl PTE {
+    pub fn is_user(&self) -> bool {
+        self.0 & PA_US != 0
+    }
+
+    pub fn is_present(&self) -> bool {
+        self.0 & PA_P != 0
+    }
+
+    pub fn pfn(&self) -> usize {
+        self.0 & !PA_MASK
+    }
+
+    pub fn attributes(&self) -> usize {
+        self.0 & PA_MASK
+    }
+
+    pub fn set(&mut self, pfn: usize, attributes: usize) {
+        self.0 = pfn | attributes;
+    }
+
+    pub fn set_pfn(&mut self, pfn: usize) {
+        self.set(pfn, self.attributes())
+    }
+
+    pub fn set_attributes(&mut self, attributes: usize) {
+        self.set(self.pfn(), attributes)
+    }
+
+    fn parse_page_table(&mut self, kernel: bool) -> CachedPP {
+        let attributes = if kernel {
+            PA_P | PA_RW | PA_G
+        } else {
+            PA_P | PA_RW | PA_US
+        };
+
+        if self.is_present() {
+            CachedPP::new(self.pfn())
+        } else {
+            let page = Page::alloc_one();
+            let pp = page.as_cached();
+            page.zero();
+
+            self.set(page.into_pfn(), attributes);
+            pp
+        }
+    }
+
+    pub fn setup_cow(&mut self, from: &mut Self) {
+        self.set(
+            unsafe { Page::from_pfn(from.pfn(), 0) }.into_pfn(),
+            (from.attributes() & !(PA_RW | PA_A | PA_D)) | PA_COW,
+        );
+
+        from.set_attributes((from.attributes() & !PA_RW) | PA_COW);
+    }
+
+    pub fn clear(&mut self) {
+        self.set(0, 0)
+    }
+
+    /// Take ownership of the page referenced by the PTE, clear the PTE, and return the page.
+    pub fn take(&mut self) -> Page {
+        // SAFETY: Acquire the ownership of the page from the page table and then
+        // clear the PTE so no one could be able to access the page from here later on.
+        let page = unsafe { Page::take_pfn(self.pfn(), 0) };
+        self.clear();
+        page
+    }
+}
+
+impl<'lt, const KERNEL: bool> PTEIterator<'lt, KERNEL> {
+    fn new(pt: &'lt Page, start: VAddr, end: VAddr) -> KResult<Self> {
+        if start > end {
+            return Err(EINVAL);
+        }
+
+        let p4 = pt.as_cached();
+        let p3 = p4.as_mut_slice::<PTE>(512)[Self::index(4, start)].parse_page_table(KERNEL);
+        let p2 = p3.as_mut_slice::<PTE>(512)[Self::index(3, start)].parse_page_table(KERNEL);
+        let p1 = p2.as_mut_slice::<PTE>(512)[Self::index(2, start)].parse_page_table(KERNEL);
+
+        Ok(Self {
+            count: (end.0 - start.0) >> 12,
+            i4: Self::index(4, start) as u16,
+            i3: Self::index(3, start) as u16,
+            i2: Self::index(2, start) as u16,
+            i1: Self::index(1, start) as u16,
+            p4,
+            p3,
+            p2,
+            p1,
+            start,
+            end,
+            _phantom: core::marker::PhantomData,
+        })
+    }
+
+    fn offset(level: u32) -> usize {
+        12 + (level as usize - 1) * 9
+    }
+
+    fn index(level: u32, vaddr: VAddr) -> usize {
+        (vaddr.0 >> Self::offset(level)) & 0x1ff
+    }
+}
+
+impl<'lt, const KERNEL: bool> Iterator for PTEIterator<'lt, KERNEL> {
+    type Item = &'lt mut PTE;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.count != 0 {
+            self.count -= 1;
+        } else {
+            return None;
+        }
+
+        let retval = &mut self.p1.as_mut_slice::<PTE>(512)[self.i1 as usize];
+        self.i1 = (self.i1 + 1) % 512;
+        if self.i1 == 0 {
+            self.i2 = (self.i2 + 1) % 512;
+            if self.i2 == 0 {
+                self.i3 = (self.i3 + 1) % 512;
+                if self.i3 == 0 {
+                    self.i4 = (self.i4 + 1) % 512;
+                    if self.i4 == 0 {
+                        panic!("PTEIterator: out of range");
+                    }
+                }
+                self.p3 =
+                    self.p4.as_mut_slice::<PTE>(512)[self.i4 as usize].parse_page_table(KERNEL);
+            }
+            self.p2 = self.p3.as_mut_slice::<PTE>(512)[self.i3 as usize].parse_page_table(KERNEL);
+        }
+        self.p1 = self.p2.as_mut_slice::<PTE>(512)[self.i2 as usize].parse_page_table(KERNEL);
+        Some(retval)
+    }
+}
+
+impl PageTable {
+    pub fn new() -> Self {
+        let page = Page::alloc_one();
+        page.zero();
+
+        // TODO: copy only the kernel space mappings.
+        let kernel_space_page_table = CachedPP::new(KERNEL_PML4 as usize);
+
+        page.as_cached().as_mut_slice::<u64>(512)[256..]
+            .copy_from_slice(&kernel_space_page_table.as_mut_slice(512)[256..]);
+
+        Self { page }
+    }
+
+    pub fn iter_user(&self, range: VRange) -> KResult<PTEIterator<'_, false>> {
+        PTEIterator::new(&self.page, range.start().floor(), range.end().ceil())
+    }
+
+    pub fn iter_kernel(&self, range: VRange) -> KResult<PTEIterator<'_, true>> {
+        PTEIterator::new(&self.page, range.start().floor(), range.end().ceil())
+    }
+
+    pub fn switch(&self) {
+        arch::vm::switch_page_table(self.page.as_phys())
+    }
+
+    pub fn unmap(&self, area: &MMArea) {
+        let range = area.range();
+        let use_invlpg = range.len() / 4096 < 4;
+        let iter = self.iter_user(range).unwrap();
+
+        if self.page.as_phys() != arch::vm::current_page_table() {
+            for pte in iter {
+                pte.take();
+            }
+            return;
+        }
+
+        if use_invlpg {
+            for (offset_pages, pte) in iter.enumerate() {
+                pte.take();
+
+                let pfn = range.start().floor().0 + offset_pages * 4096;
+                arch::vm::invlpg(pfn);
+            }
+        } else {
+            for pte in iter {
+                pte.take();
+            }
+            arch::vm::invlpg_all();
+        }
+    }
+
+    pub fn lazy_invalidate_tlb_all(&self) {
+        if self.page.as_phys() == arch::vm::current_page_table() {
+            arch::vm::invlpg_all();
+        }
+    }
+
+    pub fn set_mmapped(&self, range: VRange, permission: Permission) {
+        // PA_RW is set during page fault handling.
+        // PA_NXE is preserved across page faults, so we set PA_NXE now.
+        let attributes = if permission.execute {
+            PA_US | PA_COW | PA_ANON | PA_MMAP
+        } else {
+            PA_US | PA_COW | PA_ANON | PA_MMAP | PA_NXE
+        };
+
+        for pte in self.iter_user(range).unwrap() {
+            pte.set(EMPTY_PAGE.clone().into_pfn(), attributes);
+        }
+    }
+
+    pub fn set_anonymous(&self, range: VRange, permission: Permission) {
+        // PA_RW is set during page fault handling.
+        // PA_NXE is preserved across page faults, so we set PA_NXE now.
+        let attributes = if permission.execute {
+            PA_P | PA_US | PA_COW | PA_ANON
+        } else {
+            PA_P | PA_US | PA_COW | PA_ANON | PA_NXE
+        };
+
+        for pte in self.iter_user(range).unwrap() {
+            pte.set(EMPTY_PAGE.clone().into_pfn(), attributes);
+        }
+    }
+}
+
+fn drop_page_table_recursive(pt: &Page, level: usize) {
+    for pte in pt
+        .as_cached()
+        .as_mut_slice::<PTE>(512)
+        .iter_mut()
+        .filter(|pte| pte.is_present() && pte.is_user())
+    {
+        let page = pte.take();
+        if level > 1 {
+            drop_page_table_recursive(&page, level - 1);
+        }
+    }
+}
+
+impl Drop for PageTable {
+    fn drop(&mut self) {
+        drop_page_table_recursive(&self.page, 4);
+    }
+}

+ 4 - 197
src/kernel/mem/paging.cc

@@ -5,10 +5,8 @@
 
 #include <kernel/async/lock.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mem/mm_list.hpp>
 #include <kernel/mem/paging.hpp>
 #include <kernel/mem/slab.hpp>
-#include <kernel/mem/vm_area.hpp>
 #include <kernel/process.hpp>
 
 using namespace types::list;
@@ -16,19 +14,6 @@ using namespace types::list;
 using namespace kernel::async;
 using namespace kernel::mem::paging;
 
-static inline void __page_fault_die(uintptr_t vaddr) {
-    kmsgf("[kernel] kernel panic: invalid memory access to %p", vaddr);
-    freeze();
-}
-
-static inline PSE __parse_pse(PSE pse, bool priv) {
-    auto attr = priv ? PA_KERNEL_PAGE_TABLE : PA_USER_PAGE_TABLE;
-    if (!(pse.attributes() & PA_P))
-        pse.set(attr, alloc_page_table());
-
-    return pse.parse();
-}
-
 static struct zone_info {
     page* next;
     std::size_t count;
@@ -96,8 +81,7 @@ static inline page* _create_zone(pfn_t pfn, unsigned order) {
 }
 
 // call with zone_lock held
-static inline void _split_zone(page* zone, unsigned order,
-                               unsigned target_order) {
+static inline void _split_zone(page* zone, unsigned order, unsigned target_order) {
     while (order > target_order) {
         pfn_t pfn = page_to_pfn(zone);
         _create_zone(buddy(pfn, order - 1), order - 1);
@@ -116,7 +100,7 @@ static inline page* _alloc_zone(unsigned order) {
         if (!zone)
             continue;
 
-        increase_refcount(zone);
+        zone->refcount++;
 
         if (i > order)
             _split_zone(zone, i, order);
@@ -213,13 +197,12 @@ pfn_t kernel::mem::paging::alloc_page_table() {
 }
 
 void kernel::mem::paging::free_pages(page* pg, unsigned order) {
+    lock_guard_irq lock{zone_lock};
     assert((pg->flags & 0xff) == order);
 
-    // TODO: atomic
     if (!(pg->flags & PAGE_BUDDY) || --pg->refcount)
         return;
 
-    lock_guard_irq lock{zone_lock};
     while (order < 52) {
         pfn_t pfn = page_to_pfn(pg);
         pfn_t buddy_pfn = buddy(pfn, order);
@@ -268,182 +251,6 @@ page* kernel::mem::paging::pfn_to_page(pfn_t pfn) {
 }
 
 void kernel::mem::paging::increase_refcount(page* pg) {
+    lock_guard_irq lock{zone_lock};
     pg->refcount++;
 }
-
-void kernel::mem::paging::handle_page_fault(unsigned long err) {
-    using namespace kernel::mem;
-    using namespace paging;
-
-    uintptr_t vaddr;
-    asm volatile("mov %%cr2, %0" : "=g"(vaddr) : :);
-    auto& mms = current_process->mms;
-
-    auto* mm_area = mms.find(vaddr);
-    if (!mm_area) [[unlikely]] {
-        // user access to address that does not exist
-        if (err & PAGE_FAULT_U)
-            kill_current(SIGSEGV);
-
-        __page_fault_die(vaddr);
-    }
-
-    // user access to a present page caused the fault
-    // check access rights
-    if (err & PAGE_FAULT_U && err & PAGE_FAULT_P) {
-        // write to read only pages
-        if (err & PAGE_FAULT_W && !(mm_area->flags & MM_WRITE))
-            kill_current(SIGSEGV);
-
-        // execute from non-executable pages
-        if (err & PAGE_FAULT_I && !(mm_area->flags & MM_EXECUTE))
-            kill_current(SIGSEGV);
-    }
-
-    auto idx = idx_all(vaddr);
-
-    auto pe = mms.get_page_table()[std::get<1>(idx)];
-    assert(pe.attributes() & PA_P);
-    pe = pe.parse()[std::get<2>(idx)];
-    assert(pe.attributes() & PA_P);
-    pe = pe.parse()[std::get<3>(idx)];
-    assert(pe.attributes() & PA_P);
-    pe = pe.parse()[std::get<4>(idx)];
-
-    bool mmapped = mm_area->flags & MM_MAPPED;
-    assert(!mmapped || mm_area->mapped_file);
-
-    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]]
-        __page_fault_die(vaddr);
-
-    pfn_t pfn = pe.pfn();
-    auto attr = pe.attributes();
-
-    page* pg = pfn_to_page(pfn);
-
-    if (attr & PA_COW) {
-        attr &= ~PA_COW;
-        if (mm_area->flags & MM_WRITE)
-            attr |= PA_RW;
-        else
-            attr &= ~PA_RW;
-
-        // if it is a dying page
-        // TODO: use atomic
-        if (pg->refcount == 1) {
-            pe.set(attr, pfn);
-            return;
-        }
-
-        // duplicate the page
-        page* new_page = alloc_page();
-        pfn_t new_pfn = page_to_pfn(new_page);
-        physaddr<void> new_page_addr{new_pfn};
-
-        if (attr & PA_ANON)
-            memset(new_page_addr, 0x00, 0x1000);
-        else
-            memcpy(new_page_addr, physaddr<void>{pfn}, 0x1000);
-
-        attr &= ~(PA_A | PA_ANON);
-        --pg->refcount;
-
-        pe.set(attr, new_pfn);
-        pfn = new_pfn;
-    }
-
-    if (attr & PA_MMAP) {
-        attr |= PA_P;
-
-        size_t offset = (vaddr & ~0xfff) - mm_area->start;
-        char* data = physaddr<char>{pfn};
-
-        int n = fs_read(mm_area->mapped_file, data, 4096,
-                        mm_area->file_offset + offset, 4096);
-
-        // TODO: send SIGBUS if offset is greater than real size
-        if (n != 4096)
-            memset(data + n, 0x00, 4096 - n);
-
-        // TODO: shared mapping
-        attr &= ~PA_MMAP;
-
-        pe.set(attr, pfn);
-    }
-}
-
-vaddr_range::vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool priv)
-    : n{start >= end ? 0 : ((end - start) >> 12)}
-    , idx4{!n ? 0 : idx_p4(start)}
-    , idx3{!n ? 0 : idx_p3(start)}
-    , idx2{!n ? 0 : idx_p2(start)}
-    , idx1{!n ? 0 : idx_p1(start)}
-    , pml4{!n ? PSE{0} : PSE{pt}}
-    , pdpt{!n ? PSE{0} : __parse_pse(pml4[idx4], priv)}
-    , pd{!n ? PSE{0} : __parse_pse(pdpt[idx3], priv)}
-    , pt{!n ? PSE{0} : __parse_pse(pd[idx2], priv)}
-    , m_start{!n ? 0 : start}
-    , m_end{!n ? 0 : end}
-    , is_privilege{!n ? false : priv} {}
-
-vaddr_range::vaddr_range(std::nullptr_t)
-    : n{}
-    , idx4{}
-    , idx3{}
-    , idx2{}
-    , idx1{}
-    , pml4{0}
-    , pdpt{0}
-    , pd{0}
-    , pt{0}
-    , m_start{}
-    , m_end{}
-    , is_privilege{} {}
-
-vaddr_range vaddr_range::begin() const noexcept {
-    return *this;
-}
-
-vaddr_range vaddr_range::end() const noexcept {
-    return vaddr_range{nullptr};
-}
-
-PSE vaddr_range::operator*() const noexcept {
-    return pt[idx1];
-}
-
-vaddr_range& vaddr_range::operator++() {
-    --n;
-
-    if ((idx1 = (idx1 + 1) % 512) != 0)
-        return *this;
-
-    do {
-        if ((idx2 = (idx2 + 1) % 512) != 0)
-            break;
-        do {
-            if ((idx3 = (idx3 + 1) % 512) != 0)
-                break;
-
-            idx4 = (idx4 + 1) % 512;
-
-            // if idx4 is 0 after update, we have an overflow
-            assert(idx4 != 0);
-
-            pdpt = __parse_pse(pml4[idx4], is_privilege);
-        } while (false);
-
-        pd = __parse_pse(pdpt[idx3], is_privilege);
-    } while (false);
-
-    pt = __parse_pse(pd[idx2], is_privilege);
-    return *this;
-}
-
-vaddr_range::operator bool() const noexcept {
-    return n;
-}
-
-bool vaddr_range::operator==(const vaddr_range& other) const noexcept {
-    return n == other.n;
-}

+ 91 - 35
src/kernel/mem/paging.rs

@@ -1,37 +1,78 @@
+use crate::bindings::root::kernel::mem::paging::{
+    alloc_page as c_alloc_page, alloc_pages as c_alloc_pages, free_pages as c_free_pages,
+    increase_refcount as c_increase_refcount, page as c_page, page_to_pfn as c_page_to_pfn,
+    pfn_to_page as c_pfn_to_page, PAGE_BUDDY,
+};
 use crate::bindings::root::EFAULT;
+use crate::io::{Buffer, FillResult};
 use crate::kernel::mem::phys;
 use core::fmt;
 
 use super::phys::PhysPtr;
 
 pub struct Page {
-    page_ptr: *mut crate::bindings::root::kernel::mem::paging::page,
+    page_ptr: *mut c_page,
     order: u32,
 }
 
 impl Page {
     pub fn alloc_one() -> Self {
-        use crate::bindings::root::kernel::mem::paging::alloc_page;
-        let page_ptr = unsafe { alloc_page() };
+        let page_ptr = unsafe { c_alloc_page() };
 
         Self { page_ptr, order: 0 }
     }
 
     pub fn alloc_many(order: u32) -> Self {
-        use crate::bindings::root::kernel::mem::paging::alloc_pages;
-        let page_ptr = unsafe { alloc_pages(order) };
+        let page_ptr = unsafe { c_alloc_pages(order) };
 
         Self { page_ptr, order }
     }
 
+    /// Get `Page` from `pfn`, acquiring the ownership of the page. `refcount` is not increased.
+    ///
+    /// # Safety
+    /// Caller must ensure that the pfn is no longer referenced by any other code.
+    pub unsafe fn take_pfn(pfn: usize, order: u32) -> Self {
+        let page_ptr = unsafe { c_pfn_to_page(pfn) };
+
+        // Only buddy pages can be used here.
+        assert!(unsafe { page_ptr.as_ref().unwrap() }.flags & PAGE_BUDDY != 0);
+
+        // Check if the order is correct.
+        assert_eq!(
+            unsafe { page_ptr.as_ref().unwrap() }.flags & 0xff,
+            order as u64
+        );
+
+        Self { page_ptr, order }
+    }
+
+    /// Get `Page` from `pfn` and increase the reference count.
+    ///
+    /// # Safety
+    /// Caller must ensure that `pfn` refers to a valid physical frame number with `refcount` > 0.
+    pub unsafe fn from_pfn(pfn: usize, order: u32) -> Self {
+        // SAFETY: `pfn` is a valid physical frame number with refcount > 0.
+        unsafe { Self::increase_refcount(pfn) };
+
+        // SAFETY: `pfn` has an increased refcount.
+        unsafe { Self::take_pfn(pfn, order) }
+    }
+
+    /// Consumes the `Page` and returns its physical frame number without dropping the
+    /// reference count that the page holds.
+    pub fn into_pfn(self) -> usize {
+        let pfn = unsafe { c_page_to_pfn(self.page_ptr) };
+        core::mem::forget(self);
+        pfn
+    }
+
     pub fn len(&self) -> usize {
         1 << (self.order + 12)
     }
 
     pub fn as_phys(&self) -> usize {
-        use crate::bindings::root::kernel::mem::paging::page_to_pfn;
-
-        unsafe { page_to_pfn(self.page_ptr) }
+        unsafe { c_page_to_pfn(self.page_ptr) }
     }
 
     pub fn as_cached(&self) -> phys::CachedPP {
@@ -46,11 +87,17 @@ impl Page {
         use phys::PhysPtr;
 
         unsafe {
-            core::ptr::write_bytes(
-                self.as_cached().as_ptr::<u8>(),
-                0,
-                self.len(),
-            );
+            core::ptr::write_bytes(self.as_cached().as_ptr::<u8>(), 0, self.len());
+        }
+    }
+
+    /// # Safety
+    /// Caller must ensure that the page is properly freed.
+    pub unsafe fn increase_refcount(pfn: usize) {
+        let page = unsafe { c_pfn_to_page(pfn) };
+
+        unsafe {
+            c_increase_refcount(page);
         }
     }
 }
@@ -58,9 +105,7 @@ impl Page {
 impl Clone for Page {
     fn clone(&self) -> Self {
         unsafe {
-            crate::bindings::root::kernel::mem::paging::increase_refcount(
-                self.page_ptr,
-            );
+            c_increase_refcount(self.page_ptr);
         }
 
         Self {
@@ -73,10 +118,7 @@ impl Clone for Page {
 impl Drop for Page {
     fn drop(&mut self) {
         unsafe {
-            crate::bindings::root::kernel::mem::paging::free_pages(
-                self.page_ptr,
-                self.order,
-            );
+            c_free_pages(self.page_ptr, self.order);
         }
     }
 }
@@ -118,20 +160,12 @@ impl PageBuffer {
     }
 
     pub fn as_slice(&self) -> &[u8] {
-        unsafe {
-            core::slice::from_raw_parts(
-                self.page.as_cached().as_ptr::<u8>(),
-                self.offset,
-            )
-        }
+        unsafe { core::slice::from_raw_parts(self.page.as_cached().as_ptr::<u8>(), self.offset) }
     }
 
     pub fn as_mut_slice(&self) -> &mut [u8] {
         unsafe {
-            core::slice::from_raw_parts_mut(
-                self.page.as_cached().as_ptr::<u8>(),
-                self.offset,
-            )
+            core::slice::from_raw_parts_mut(self.page.as_cached().as_ptr::<u8>(), self.offset)
         }
     }
 
@@ -162,6 +196,32 @@ impl core::fmt::Write for PageBuffer {
     }
 }
 
+impl Buffer for PageBuffer {
+    fn total(&self) -> usize {
+        self.page.len()
+    }
+
+    fn wrote(&self) -> usize {
+        self.len()
+    }
+
+    fn fill(&mut self, data: &[u8]) -> crate::KResult<crate::io::FillResult> {
+        if self.remaining() == 0 {
+            return Ok(FillResult::Full);
+        }
+
+        let len = core::cmp::min(data.len(), self.remaining());
+        self.available_as_slice()[..len].copy_from_slice(&data[..len]);
+        self.consume(len);
+
+        if len < data.len() {
+            Ok(FillResult::Partial(len))
+        } else {
+            Ok(FillResult::Done(len))
+        }
+    }
+}
+
 /// Copy data from a slice to a `Page`
 ///
 /// DONT USE THIS FUNCTION TO COPY DATA TO MMIO ADDRESSES
@@ -177,11 +237,7 @@ pub fn copy_to_page(src: &[u8], dst: &Page) -> Result<(), u32> {
     }
 
     unsafe {
-        core::ptr::copy_nonoverlapping(
-            src.as_ptr(),
-            dst.as_cached().as_ptr(),
-            src.len(),
-        );
+        core::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_cached().as_ptr(), src.len());
     }
 
     Ok(())

+ 2 - 2
src/kernel/mem/phys.rs

@@ -31,11 +31,11 @@ pub struct NoCachePP {
 }
 
 impl CachedPP {
-    pub fn new(addr: usize) -> Self {
+    pub const fn new(addr: usize) -> Self {
         Self { addr }
     }
 
-    pub fn offset(&self, offset: usize) -> Self {
+    pub const fn offset(&self, offset: usize) -> Self {
         Self {
             addr: self.addr + offset,
         }

+ 20 - 10
src/kernel/mem/slab.cc

@@ -4,6 +4,7 @@
 
 #include <types/list.hpp>
 
+#include <kernel/async/lock.hpp>
 #include <kernel/mem/paging.hpp>
 #include <kernel/mem/slab.hpp>
 
@@ -12,6 +13,8 @@ using namespace types::list;
 
 constexpr std::size_t SLAB_PAGE_SIZE = 0x1000; // 4K
 
+kernel::async::mutex slab_lock;
+
 std::ptrdiff_t _slab_data_start_offset(std::size_t size) {
     return (sizeof(slab_head) + size - 1) & ~(size - 1);
 }
@@ -67,6 +70,8 @@ void _slab_add_page(slab_cache* cache) {
 }
 
 void* kernel::mem::slab_alloc(slab_cache* cache) {
+    async::lock_guard_irq lock(slab_lock);
+
     slab_head* slab = cache->slabs_partial;
     if (!slab) {                 // no partial slabs, try to get an empty slab
         if (!cache->slabs_empty) // no empty slabs, create a new one
@@ -88,24 +93,29 @@ void* kernel::mem::slab_alloc(slab_cache* cache) {
 }
 
 void kernel::mem::slab_free(void* ptr) {
+    async::lock_guard_irq lock(slab_lock);
+
     slab_head* slab = (slab_head*)((uintptr_t)ptr & ~(SLAB_PAGE_SIZE - 1));
 
     *(void**)ptr = slab->free;
     slab->free = ptr;
     slab->free_count++;
 
-    if (slab->free_count == _slab_max_count(slab->obj_size)) {
-        auto* cache = slab->cache;
-        slab_head** head = nullptr;
+    auto max_count = _slab_max_count(slab->obj_size);
 
-        if (cache->slabs_full == slab) {
-            head = &cache->slabs_full;
-        } else {
-            head = &cache->slabs_partial;
-        }
+    if (max_count == 1) {
+        list_remove(&slab->cache->slabs_full, slab);
+        list_insert(&slab->cache->slabs_empty, slab);
+    }
+
+    if (slab->free_count == 1) {
+        list_remove(&slab->cache->slabs_full, slab);
+        list_insert(&slab->cache->slabs_partial, slab);
+    }
 
-        list_remove(head, slab);
-        list_insert(&cache->slabs_empty, slab);
+    if (slab->free_count == max_count) {
+        list_remove(&slab->cache->slabs_partial, slab);
+        list_insert(&slab->cache->slabs_empty, slab);
     }
 }
 

+ 168 - 0
src/kernel/mem/vrange.rs

@@ -0,0 +1,168 @@
+use core::{
+    cmp::Ordering,
+    fmt::{self, Debug, Formatter},
+    ops::{Add, RangeBounds, Sub},
+};
+
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct VAddr(pub usize);
+
+#[derive(Clone, Copy)]
+pub struct VRange {
+    start: VAddr,
+    end: VAddr,
+}
+
+const USER_SPACE_MEMORY_TOP: VAddr = VAddr(0x8000_0000_0000);
+
+impl VAddr {
+    pub const NULL: Self = Self(0);
+
+    pub fn floor(&self) -> Self {
+        VAddr(self.0 & !0xfff)
+    }
+
+    pub fn ceil(&self) -> Self {
+        VAddr((self.0 + 0xfff) & !0xfff)
+    }
+
+    pub fn is_user(&self) -> bool {
+        self.0 != 0 && self < &USER_SPACE_MEMORY_TOP
+    }
+}
+
+impl Sub for VAddr {
+    type Output = usize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+impl Add<usize> for VAddr {
+    type Output = Self;
+
+    fn add(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 + rhs)
+    }
+}
+
+impl Sub<usize> for VAddr {
+    type Output = Self;
+
+    fn sub(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 - rhs)
+    }
+}
+
+impl Debug for VAddr {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "V{:#x}", self.0)
+    }
+}
+
+impl Debug for VRange {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        write!(f, "[{:?}, {:?})", self.start, self.end)
+    }
+}
+
+impl Eq for VRange {}
+impl PartialOrd for VRange {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for VRange {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+/// Two ranges are considered equal when either one fully contains the other.
+impl Ord for VRange {
+    fn cmp(&self, other: &Self) -> Ordering {
+        if self.start == other.start {
+            return Ordering::Equal;
+        }
+
+        if self.end == other.end {
+            if self.start == self.end {
+                return Ordering::Greater;
+            }
+            if other.start == other.end {
+                return Ordering::Less;
+            }
+            return Ordering::Equal;
+        }
+
+        if self.start < other.start {
+            if other.end < self.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Less;
+            }
+        }
+
+        if other.start < self.start {
+            if self.end < other.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Greater;
+            }
+        }
+
+        unreachable!()
+    }
+}
+
+impl From<VAddr> for VRange {
+    fn from(addr: VAddr) -> Self {
+        VRange::new(addr, addr)
+    }
+}
+
+impl VRange {
+    pub fn new(start: VAddr, end: VAddr) -> Self {
+        assert!(start <= end);
+        VRange { start, end }
+    }
+
+    pub fn is_overlapped(&self, other: &Self) -> bool {
+        self == other
+    }
+
+    pub fn is_user(&self) -> bool {
+        self.start < USER_SPACE_MEMORY_TOP && self.end <= USER_SPACE_MEMORY_TOP
+    }
+
+    pub fn start(&self) -> VAddr {
+        self.start
+    }
+
+    pub fn end(&self) -> VAddr {
+        self.end
+    }
+
+    pub fn len(&self) -> usize {
+        self.end.0 - self.start.0
+    }
+
+    pub fn shrink(&self, count: usize) -> Self {
+        assert!(count <= self.len());
+        VRange::new(self.start, self.end - count)
+    }
+
+    pub fn grow(&self, count: usize) -> Self {
+        VRange::new(self.start, self.end + count)
+    }
+
+    pub fn into_range(self) -> impl RangeBounds<Self> {
+        if self.len() == 0 {
+            VRange::from(self.start())..=VRange::from(self.start())
+        } else {
+            VRange::from(self.start())..=VRange::from(self.end() - 1)
+        }
+    }
+}

この差分においてかなりの量のファイルが変更されているため、一部のファイルを表示していません