
Merge branch 'smp'

greatbridf committed 3 weeks ago
commit 79f34a447a
100 changed files with 4739 additions and 3279 deletions
1. .clang-format (+1 -0)
2. .rustfmt.toml (+3 -3)
3. CMakeLists.txt (+12 -49)
4. Cargo.lock (+45 -14)
5. Cargo.toml (+11 -0)
6. Makefile.src (+7 -1)
7. arch/Cargo.lock (+65 -0)
8. arch/Cargo.toml (+8 -0)
9. arch/percpu/Cargo.lock (+53 -0)
10. arch/percpu/Cargo.toml (+8 -0)
11. arch/percpu/macros/Cargo.lock (+47 -0)
12. arch/percpu/macros/Cargo.toml (+12 -0)
13. arch/percpu/macros/src/arch.rs (+22 -0)
14. arch/percpu/macros/src/lib.rs (+117 -0)
15. arch/percpu/src/arch.rs (+25 -0)
16. arch/percpu/src/lib.rs (+6 -0)
17. arch/src/lib.rs (+98 -0)
18. arch/x86_64/Cargo.toml (+6 -0)
19. arch/x86_64/src/gdt.rs (+92 -0)
20. arch/x86_64/src/interrupt.rs (+27 -0)
21. arch/x86_64/src/io.rs (+93 -0)
22. arch/x86_64/src/lib.rs (+70 -0)
23. arch/x86_64/src/task.rs (+172 -0)
24. doc/mem_layout.txt (+2 -2)
25. gblibc/CMakeLists.txt (+4 -1)
26. gblibc/include/errno.h (+2 -0)
27. include/kernel/async/lock.hpp (+0 -19)
28. include/kernel/async/waitlist.hpp (+0 -29)
29. include/kernel/hw/pci.hpp (+0 -6)
30. include/kernel/hw/serial.hpp (+0 -20)
31. include/kernel/hw/timer.hpp (+0 -11)
32. include/kernel/interrupt.hpp (+0 -5)
33. include/kernel/irq.hpp (+0 -11)
34. include/kernel/log.hpp (+2 -15)
35. include/kernel/mem/mm_list.hpp (+0 -112)
36. include/kernel/mem/paging.hpp (+5 -47)
37. include/kernel/mem/paging_asm.h (+1 -2)
38. include/kernel/mem/phys.hpp (+1 -19)
39. include/kernel/mem/vm_area.hpp (+0 -60)
40. include/kernel/module.hpp (+0 -37)
41. include/kernel/process.hpp (+2 -156)
42. include/kernel/signal.hpp (+0 -74)
43. include/kernel/syscall.hpp (+0 -118)
44. include/kernel/task/current.hpp (+0 -5)
45. include/kernel/task/readyqueue.hpp (+0 -16)
46. include/kernel/task/thread.hpp (+0 -76)
47. include/kernel/tty.hpp (+0 -73)
48. include/kernel/user/thread_local.hpp (+0 -21)
49. include/kernel/vfs.hpp (+0 -84)
50. include/kernel/vfs/dentry.hpp (+0 -28)
51. include/kernel/vfs/file.hpp (+0 -106)
52. include/kernel/vfs/filearr.hpp (+0 -51)
53. include/kernel/vfs/vfsfwd.hpp (+0 -25)
54. include/types/elf.hpp (+0 -293)
55. init_script.sh (+6 -5)
56. src/asm/interrupt.s (+11 -53)
57. src/boot.s (+81 -6)
58. src/dev/builtin-chardev.cc (+0 -71)
59. src/driver.rs (+20 -0)
60. src/driver/ahci/command.rs (+5 -4)
61. src/driver/ahci/control.rs (+35 -38)
62. src/driver/ahci/defs.rs (+33 -33)
63. src/driver/ahci/mod.rs (+108 -57)
64. src/driver/ahci/port.rs (+266 -58)
65. src/driver/e1000e.rs (+23 -11)
66. src/driver/serial.rs (+145 -0)
67. src/elf.rs (+370 -0)
68. src/fs/fat32.rs (+137 -148)
69. src/fs/procfs.rs (+170 -140)
70. src/fs/tmpfs.rs (+207 -243)
71. src/io.rs (+34 -51)
72. src/kernel.ld (+79 -89)
73. src/kernel.rs (+16 -0)
74. src/kernel/allocator.cc (+0 -1)
75. src/kernel/arch.rs (+5 -0)
76. src/kernel/arch/x86_64.rs (+82 -0)
77. src/kernel/arch/x86_64/init.rs (+126 -0)
78. src/kernel/arch/x86_64/interrupt.rs (+129 -0)
79. src/kernel/async/lock.cc (+7 -27)
80. src/kernel/async/waitlist.cc (+0 -57)
81. src/kernel/block.rs (+12 -10)
82. src/kernel/chardev.rs (+155 -0)
83. src/kernel/console.rs (+60 -11)
84. src/kernel/constants.rs (+39 -0)
85. src/kernel/hw/pci.cc (+2 -2)
86. src/kernel/hw/serial.cc (+0 -115)
87. src/kernel/hw/timer.cc (+0 -28)
88. src/kernel/interrupt.cpp (+0 -147)
89. src/kernel/interrupt.rs (+77 -26)
90. src/kernel/mem.rs (+10 -0)
91. src/kernel/mem/mm_area.rs (+102 -0)
92. src/kernel/mem/mm_list.cc (+13 -15)
93. src/kernel/mem/mm_list.rs (+357 -0)
94. src/kernel/mem/mm_list/page_fault.rs (+206 -0)
95. src/kernel/mem/page_table.rs (+307 -0)
96. src/kernel/mem/paging.cc (+4 -197)
97. src/kernel/mem/paging.rs (+91 -35)
98. src/kernel/mem/phys.rs (+2 -2)
99. src/kernel/mem/slab.cc (+20 -10)
100. src/kernel/mem/vrange.rs (+168 -0)

+ 1 - 0
.clang-format

@@ -6,6 +6,7 @@ AllowShortFunctionsOnASingleLine: Inline
 AllowShortIfStatementsOnASingleLine: Never
 AllowShortLoopsOnASingleLine: 'false'
 BreakConstructorInitializers: BeforeComma
+ColumnLimit: '100'
 FixNamespaceComments: 'true'
 IncludeBlocks: Regroup
 IndentWidth: '4'

+ 3 - 3
.rustfmt.toml

@@ -1,4 +1,4 @@
-max_width = 80
+max_width = 100
 hard_tabs = false
 tab_spaces = 4
 newline_style = "Auto"
@@ -10,8 +10,8 @@ struct_lit_width = 18
 struct_variant_width = 35
 array_width = 60
 chain_width = 60
-single_line_if_else_max_width = 50
-single_line_let_else_max_width = 50
+single_line_if_else_max_width = 60
+single_line_let_else_max_width = 60
 wrap_comments = false
 format_code_in_doc_comments = false
 doc_comment_code_block_width = 100

+ 12 - 49
CMakeLists.txt

@@ -38,75 +38,37 @@ set(BOOTLOADER_SOURCES src/boot.s
                        src/asm/interrupt.s
                        )
 
-set(KERNEL_MAIN_SOURCES src/dev/builtin-chardev.cc
-                        src/kinit.cpp
-                        src/kernel/async/waitlist.cc
+set(KERNEL_MAIN_SOURCES src/kinit.cpp
                         src/kernel/async/lock.cc
                         src/kernel/allocator.cc
-                        src/kernel/interrupt.cpp
                         src/kernel/process.cpp
-                        src/kernel/tty.cpp
-                        src/kernel/syscall.cpp
-                        src/kernel/syscall/fileops.cc
-                        src/kernel/syscall/infoops.cc
-                        src/kernel/syscall/mount.cc
-                        src/kernel/syscall/procops.cc
-                        src/kernel/mem/mm_list.cc
                         src/kernel/mem/paging.cc
                         src/kernel/mem/slab.cc
-                        src/kernel/module.cc
-                        src/kernel/vfs.cpp
                         src/kernel/vga.cpp
                         src/kernel/hw/acpi.cc
                         src/kernel/hw/pci.cc
-                        src/kernel/hw/serial.cc
-                        src/kernel/hw/timer.cc
-                        src/kernel/task/thread.cc
-                        src/kernel/task/readyqueue.cc
-                        src/kernel/user/thread_local.cc
-                        src/kernel/vfs/filearr.cc
-                        src/kernel/signal.cpp
                         src/net/ethernet.cc
                         src/types/crc.cc
-                        src/types/elf.cpp
                         src/types/libstdcpp.cpp
                         include/defs.hpp
-                        include/kernel/async/waitlist.hpp
                         include/kernel/async/lock.hpp
-                        include/kernel/tty.hpp
                         include/kernel/interrupt.hpp
-                        include/kernel/irq.hpp
                         include/kernel/process.hpp
-                        include/kernel/syscall.hpp
-                        include/kernel/mem/mm_list.hpp
                         include/kernel/mem/paging.hpp
                         include/kernel/mem/slab.hpp
                         include/kernel/mem/types.hpp
-                        include/kernel/mem/vm_area.hpp
-                        include/kernel/module.hpp
                         include/kernel/utsname.hpp
-                        include/kernel/vfs.hpp
-                        include/kernel/vfs/dentry.hpp
-                        include/kernel/vfs/file.hpp
-                        include/kernel/vfs/filearr.hpp
                         include/kernel/vga.hpp
-                        include/kernel/signal.hpp
                         include/kernel/task/forward.hpp
-                        include/kernel/task/thread.hpp
-                        include/kernel/task/readyqueue.hpp
                         include/kernel/hw/acpi.hpp
                         include/kernel/hw/pci.hpp
                         include/kernel/hw/port.hpp
-                        include/kernel/hw/serial.hpp
-                        include/kernel/hw/timer.hpp
                         include/kernel/input/keycodes.h
-                        include/kernel/user/thread_local.hpp
                         include/net/arp.hpp
                         include/net/ethernet.hpp
                         include/net/netdev.hpp
                         include/types/bitmap.hpp
                         include/types/buffer.hpp
-                        include/types/elf.hpp
                         include/types/list.hpp
                         include/types/types.h
                         include/types/allocator.hpp
@@ -121,6 +83,7 @@ target_include_directories(kernel.out PRIVATE ${PROJECT_SOURCE_DIR}/include)
 target_link_options(kernel.out PRIVATE
     -T "${CMAKE_SOURCE_DIR}/src/kernel.ld"
     -L "${CMAKE_BINARY_DIR}/x86_64-unknown-none/${CARGO_BUILD_TYPE}"
+    --no-check-sections
     )
 set_target_properties(kernel.out PROPERTIES LINK_DEPENDS "${CMAKE_SOURCE_DIR}/src/kernel.ld")
 set_source_files_properties(src/mbr.S PROPERTIES OBJECT_DEPENDS
@@ -141,16 +104,16 @@ add_custom_target(boot.img
     DEPENDS user_space_programs
     COMMAND dd if=mbr_hole.bin of=boot.img
     COMMAND dd if=/dev/zero of=boot.img bs=`expr 512 \\* 1024 \\* 1024` count=0 seek=1
-    COMMAND sh -c \"echo n\; echo\; echo\; echo\; echo\; echo a\; echo w\" | ${FDISK_BIN} boot.img
-    COMMAND mkfs.fat --offset=2048 -v -n SYSTEM boot.img
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/hello-world.out ::hello
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/interrupt-test.out ::int
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/stack-test.out ::stack
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/init.out ::init
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_BINARY_DIR}/user-space-program/priv-test.out ::priv
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_SOURCE_DIR}/busybox-minimal ::busybox_
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_SOURCE_DIR}/busybox ::busybox
-    COMMAND mcopy -i boot.img@@1M ${CMAKE_SOURCE_DIR}/init_script.sh ::initsh
+    COMMAND sh -c \"echo n\; echo\; echo \; echo 8192\; echo\; echo a\; echo w\" | ${FDISK_BIN} boot.img
+    COMMAND mkfs.fat --offset=8192 -v -n SYSTEM boot.img
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/hello-world.out ::hello
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/interrupt-test.out ::int
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/stack-test.out ::stack
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/init.out ::init
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_BINARY_DIR}/user-space-program/priv-test.out ::priv
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_SOURCE_DIR}/busybox-minimal ::busybox_
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_SOURCE_DIR}/busybox ::busybox
+    COMMAND mcopy -i boot.img@@4M ${CMAKE_SOURCE_DIR}/init_script.sh ::initsh
 )
 
 add_custom_command(OUTPUT run

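Note on the boot.img recipe above: the fdisk script now creates the first partition at sector 8192 instead of 2048, and 8192 sectors × 512 bytes = 4 MiB, which is why `mkfs.fat --offset` moves to 8192 and every `mcopy` offset changes from `@@1M` to `@@4M`.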
+ 45 - 14
Cargo.lock

@@ -11,6 +11,14 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "arch"
+version = "0.1.0"
+dependencies = [
+ "percpu",
+ "x86_64",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.4.0"
@@ -79,7 +87,9 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 name = "gbos-rust-part"
 version = "0.1.0"
 dependencies = [
+ "arch",
  "bindgen",
+ "bitflags",
  "itertools",
  "lazy_static",
  "spin",
@@ -111,9 +121,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.159"
+version = "0.2.164"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
+checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f"
 
 [[package]]
 name = "libloading"
@@ -163,11 +173,28 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "percpu"
+version = "0.1.0"
+dependencies = [
+ "percpu-macros",
+ "x86_64",
+]
+
+[[package]]
+name = "percpu-macros"
+version = "0.1.0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "prettyplease"
-version = "0.2.22"
+version = "0.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba"
+checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033"
 dependencies = [
  "proc-macro2",
  "syn",
@@ -175,9 +202,9 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.87"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
 dependencies = [
  "unicode-ident",
 ]
@@ -193,9 +220,9 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.11.0"
+version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
+checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -205,9 +232,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.4.8"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -249,9 +276,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.79"
+version = "2.0.89"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590"
+checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -260,9 +287,9 @@ dependencies = [
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.13"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
 [[package]]
 name = "windows-targets"
@@ -327,3 +354,7 @@ name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "x86_64"
+version = "0.1.0"

+ 11 - 0
Cargo.toml

@@ -7,16 +7,27 @@ edition = "2021"
 crate-type = ["staticlib"]
 
 [dependencies]
+arch = { path="./arch" }
+bitflags = "2.6.0"
 itertools = { version = "0.13.0", default-features = false }
 lazy_static = { version = "1.5.0", features = ["spin_no_std"] }
 spin = "0.9.8"
 
+[features]
+default = ["smp"]
+debug_syscall = []
+smp = []
+
 [build-dependencies]
 bindgen = "0.70.1"
 
 [profile.dev]
 panic = "abort"
 
+[profile.dev.package.core]
+opt-level = 2
+debug = true
+
 [profile.dev.package."*"]
 opt-level = 2
 debug = false

+ 7 - 1
Makefile.src

@@ -42,7 +42,13 @@ clean-all: clean
 
 .PHONY: debug
 debug:
-	-$(GDB_BIN) --symbols=build/kernel.out --init-eval-command 'source pretty-print.py' --init-eval-command 'set pagination off' --init-eval-command 'set output-radix 16' --init-eval-command 'set print pretty on' --init-eval-command 'target remote:1234'
+	-$(GDB_BIN) --symbols=build/kernel.out \
+		-iex 'source pretty-print.py' \
+		-iex 'set pagination off' \
+		-iex 'set output-radix 16' \
+		-iex 'set print asm-demangle on' \
+		-iex 'set print pretty on' \
+		-iex 'target remote:1234'
 	-killall $(QEMU_BIN)
 
 build/boot.vdi: build/boot.img

+ 65 - 0
arch/Cargo.lock

@@ -0,0 +1,65 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "arch"
+version = "0.1.0"
+dependencies = [
+ "percpu",
+ "x86_64",
+]
+
+[[package]]
+name = "percpu"
+version = "0.1.0"
+dependencies = [
+ "percpu-macros",
+]
+
+[[package]]
+name = "percpu-macros"
+version = "0.1.0"
+dependencies = [
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
+
+[[package]]
+name = "x86_64"
+version = "0.1.0"

+ 8 - 0
arch/Cargo.toml

@@ -0,0 +1,8 @@
+[package]
+name = "arch"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+x86_64 = { path="./x86_64" }
+percpu = { path="./percpu" }

+ 53 - 0
arch/percpu/Cargo.lock

@@ -0,0 +1,53 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "percpu"
+version = "0.1.0"
+dependencies = [
+ "percpu-macros",
+]
+
+[[package]]
+name = "percpu-macros"
+version = "0.1.0"
+dependencies = [
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"

+ 8 - 0
arch/percpu/Cargo.toml

@@ -0,0 +1,8 @@
+[package]
+name = "percpu"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+percpu-macros = { path = "macros" }
+x86_64 = { path = "../x86_64" }

+ 47 - 0
arch/percpu/macros/Cargo.lock

@@ -0,0 +1,47 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "percpu-macros"
+version = "0.1.0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.92"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"

+ 12 - 0
arch/percpu/macros/Cargo.toml

@@ -0,0 +1,12 @@
+[package]
+name = "percpu-macros"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+proc-macro = true
+
+[dependencies]
+syn = { version = "2.0", features = ["full"] }
+quote = "1.0"
+proc-macro2 = "1.0"

+ 22 - 0
arch/percpu/macros/src/arch.rs

@@ -0,0 +1,22 @@
+use proc_macro2::TokenStream;
+use quote::quote;
+use syn::{Ident, Type};
+
+/// Get the base address for percpu variables of the current thread.
+pub fn get_percpu_pointer(percpu: &Ident, ty: &Type) -> TokenStream {
+    quote! {
+        #[cfg(target_arch = "x86_64")]
+        {
+            let base: *mut #ty;
+            ::core::arch::asm!(
+                "mov %gs:0, {address}",
+                "add ${percpu_pointer}, {address}",
+                percpu_pointer = sym #percpu,
+                address = out(reg) base,
+                options(att_syntax)
+            );
+            base
+        }
+    }
+    .into()
+}

+ 117 - 0
arch/percpu/macros/src/lib.rs

@@ -0,0 +1,117 @@
+extern crate proc_macro;
+
+use proc_macro::TokenStream;
+use quote::{format_ident, quote};
+use syn::{parse_macro_input, ItemStatic};
+
+mod arch;
+
+#[proc_macro_attribute]
+pub fn define_percpu(attrs: TokenStream, item: TokenStream) -> TokenStream {
+    if !attrs.is_empty() {
+        panic!("`define_percpu` attribute does not take any arguments");
+    }
+
+    let item = parse_macro_input!(item as ItemStatic);
+    let vis = &item.vis;
+    let ident = &item.ident;
+    let ty = &item.ty;
+    let expr = &item.expr;
+
+    let is_bool = quote!(#ty).to_string().as_str() == "bool";
+    let is_integer =
+        ["u8", "u16", "u32", "u64", "usize"].contains(&quote!(#ty).to_string().as_str());
+
+    let is_atomic_like = is_bool || is_integer || quote!(#ty).to_string().contains("NonNull");
+
+    let inner_ident = format_ident!("_percpu_inner_{}", ident);
+    let access_ident = format_ident!("_access_{}", ident);
+
+    let integer_methods = if is_integer {
+        quote! {
+            pub fn add(&self, value: #ty) {
+                *unsafe { self.as_mut() } += value;
+            }
+
+            pub fn sub(&self, value: #ty) {
+                *unsafe { self.as_mut() } -= value;
+            }
+        }
+    } else {
+        quote! {}
+    };
+
+    let preempt_disable = if !is_atomic_like {
+        quote! { crate::sync::preempt::disable(); }
+    } else {
+        quote! {}
+    };
+
+    let preempt_enable = if !is_atomic_like {
+        quote! { crate::sync::preempt::enable(); }
+    } else {
+        quote! {}
+    };
+
+    let as_ptr = arch::get_percpu_pointer(&inner_ident, &ty);
+
+    quote! {
+        #[link_section = ".percpu"]
+        #[allow(non_upper_case_globals)]
+        static mut #inner_ident: #ty = #expr;
+        #[allow(non_camel_case_types)]
+        #vis struct #access_ident;
+        #vis static #ident: #access_ident = #access_ident;
+
+        impl #access_ident {
+            /// # Safety
+            /// This function is unsafe because it allows for mutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_ptr(&self) -> *mut #ty {
+                #as_ptr
+            }
+
+            pub fn get(&self) -> #ty {
+                #preempt_disable
+                let value = unsafe { self.as_ptr().read() };
+                #preempt_enable
+                value
+            }
+
+            pub fn set(&self, value: #ty) {
+                #preempt_disable
+                unsafe { self.as_ptr().write(value) }
+                #preempt_enable
+            }
+
+            pub fn swap(&self, mut value: #ty) -> #ty {
+                #preempt_disable
+                unsafe { self.as_ptr().swap(&mut value) }
+                #preempt_enable
+                value
+            }
+
+            /// # Safety
+            /// This function is unsafe because it allows for immutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_ref(&self) -> & #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                self.as_ptr().as_ref().unwrap()
+            }
+
+            /// # Safety
+            /// This function is unsafe because it allows for mutable aliasing of the percpu
+            /// variable.
+            /// Make sure that preempt is disabled when calling this function.
+            pub unsafe fn as_mut(&self) -> &mut #ty {
+                // SAFETY: This is safe because `as_ptr()` is guaranteed to be valid.
+                self.as_ptr().as_mut().unwrap()
+            }
+
+            #integer_methods
+        }
+    }
+    .into()
+}

+ 25 - 0
arch/percpu/src/arch.rs

@@ -0,0 +1,25 @@
+pub unsafe fn save_percpu_pointer(percpu_area_base: *mut ()) {
+    #[cfg(target_arch = "x86_64")]
+    x86_64::task::wrmsr(0xC0000101, percpu_area_base as u64);
+
+    #[cfg(not(target_arch = "x86_64"))]
+    compile_error!("unsupported architecture");
+}
+
+pub unsafe fn set_percpu_area_thiscpu(percpu_area_base: *mut ()) {
+    use core::arch::asm;
+
+    save_percpu_pointer(percpu_area_base);
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        asm!(
+            "movq {}, %gs:0",
+            in(reg) percpu_area_base,
+            options(att_syntax)
+        );
+    }
+
+    #[cfg(not(target_arch = "x86_64"))]
+    compile_error!("unsupported architecture");
+}
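Reading this together with the macro in arch/percpu/macros/src/arch.rs: MSR 0xC0000101 (GS.base) is pointed at the CPU's percpu area, and the movq stores the same base at %gs:0, so the first eight bytes of the area act as a self-pointer; the generated accessors then compute a variable's address as *(%gs:0) plus the offset of the percpu symbol. This is an inference from the code in this merge, not something the commit documents.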

+ 6 - 0
arch/percpu/src/lib.rs

@@ -0,0 +1,6 @@
+#![no_std]
+
+mod arch;
+
+pub use arch::set_percpu_area_thiscpu;
+pub use percpu_macros::define_percpu;

+ 98 - 0
arch/src/lib.rs

@@ -0,0 +1,98 @@
+#![no_std]
+
+pub mod vm {
+    pub fn invlpg(vaddr: usize) {
+        x86_64::vm::invlpg(vaddr)
+    }
+
+    pub fn invlpg_all() {
+        x86_64::vm::invlpg_all()
+    }
+
+    pub fn current_page_table() -> usize {
+        x86_64::vm::get_cr3()
+    }
+
+    pub fn switch_page_table(pfn: usize) {
+        x86_64::vm::set_cr3(pfn)
+    }
+}
+
+pub mod task {
+    #[inline(always)]
+    pub fn halt() {
+        x86_64::task::halt()
+    }
+
+    #[inline(always)]
+    pub fn pause() {
+        x86_64::task::pause()
+    }
+
+    #[inline(always)]
+    pub fn freeze() -> ! {
+        x86_64::task::freeze()
+    }
+
+    /// Switch to the `next` task. `IF` state is also switched.
+    ///
+    /// This function should only be used to switch between tasks that do not need SMP synchronization.
+    ///
+    /// # Arguments
+    /// * `current_task_sp` - Pointer to the stack pointer of the current task.
+    /// * `next_task_sp` - Pointer to the stack pointer of the next task.
+    #[inline(always)]
+    pub fn context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize) {
+        x86_64::task::context_switch_light(current_task_sp, next_task_sp);
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    pub use x86_64::task::{rdmsr, wrmsr};
+}
+
+pub mod interrupt {
+    #[inline(always)]
+    pub fn enable() {
+        x86_64::interrupt::enable()
+    }
+
+    #[inline(always)]
+    pub fn disable() {
+        x86_64::interrupt::disable()
+    }
+}
+
+pub mod io {
+    #[inline(always)]
+    pub fn inb(port: u16) -> u8 {
+        x86_64::io::inb(port)
+    }
+
+    #[inline(always)]
+    pub fn outb(port: u16, data: u8) {
+        x86_64::io::outb(port, data)
+    }
+
+    #[inline(always)]
+    pub fn inw(port: u16) -> u16 {
+        x86_64::io::inw(port)
+    }
+
+    #[inline(always)]
+    pub fn outw(port: u16, data: u16) {
+        x86_64::io::outw(port, data)
+    }
+
+    #[inline(always)]
+    pub fn inl(port: u16) -> u32 {
+        x86_64::io::inl(port)
+    }
+
+    #[inline(always)]
+    pub fn outl(port: u16, data: u32) {
+        x86_64::io::outl(port, data)
+    }
+}
+
+pub use percpu::{define_percpu, set_percpu_area_thiscpu};
+pub use x86_64;
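A rough caller-side sketch of the facade (assumed code, not part of the diff); 0x3f8 is the conventional COM1 base, the same value the removed include/kernel/hw/serial.hpp used for PORT_SERIAL0.

    // Illustrative only: poll the 16550 line status register through the
    // generic wrappers, then write one byte to the data register.
    fn serial_write_byte(byte: u8) {
        while arch::io::inb(0x3f8 + 5) & 0x20 == 0 {
            arch::task::pause(); // LSR bit 5 clear: transmitter still busy
        }
        arch::io::outb(0x3f8, byte);
    }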

+ 6 - 0
arch/x86_64/Cargo.toml

@@ -0,0 +1,6 @@
+[package]
+name = "x86_64"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]

+ 92 - 0
arch/x86_64/src/gdt.rs

@@ -0,0 +1,92 @@
+use core::arch::asm;
+
+use crate::task::TSS;
+
+#[repr(transparent)]
+#[derive(Debug, Clone, Copy)]
+pub struct GDTEntry(u64);
+
+pub struct GDT([GDTEntry; GDT::LEN]);
+
+impl GDTEntry {
+    const NULL: Self = Self(0);
+
+    const KERNEL_CODE64: Self = Self::new(0, 0, 0x9a, 0x2);
+    const KERNEL_DATA64: Self = Self::new(0, 0, 0x92, 0x0);
+
+    const USER_CODE64: Self = Self::new(0, 0, 0xfa, 0x2);
+    const USER_DATA64: Self = Self::new(0, 0, 0xf2, 0x0);
+
+    const USER_CODE32: Self = Self::new(0, 0xfffff, 0xfa, 0xc);
+    const USER_DATA32: Self = Self::new(0, 0xfffff, 0xf2, 0xc);
+
+    pub const fn new(base: u32, limit: u32, access: u8, flags: u8) -> Self {
+        let mut entry = 0u64;
+        entry |= (limit & 0x0000_ffff) as u64;
+        entry |= ((limit & 0x000f_0000) as u64) << 32;
+        entry |= ((base & 0x00ff_ffff) as u64) << 16;
+        entry |= ((base & 0xff00_0000) as u64) << 32;
+        entry |= (access as u64) << 40;
+        entry |= (flags as u64) << 52;
+
+        GDTEntry(entry)
+    }
+
+    pub const fn new_ldt(base: u64, limit: u32) -> [Self; 2] {
+        let first = Self::new(base as u32, limit, 0x82, 0x0);
+        let second = Self(base >> 32);
+        [first, second]
+    }
+
+    pub const fn new_tss(base: u64, limit: u32) -> [Self; 2] {
+        let first = Self::new(base as u32, limit, 0x89, 0x0);
+        let second = Self(base >> 32);
+        [first, second]
+    }
+}
+
+impl GDT {
+    const LEN: usize = 10;
+    const TLS32_INDEX: usize = 7;
+    const TSS_INDEX: usize = 8;
+
+    pub fn new() -> Self {
+        Self([
+            GDTEntry::NULL,
+            GDTEntry::KERNEL_CODE64,
+            GDTEntry::KERNEL_DATA64,
+            GDTEntry::USER_CODE64,
+            GDTEntry::USER_DATA64,
+            GDTEntry::USER_CODE32,
+            GDTEntry::USER_DATA32,
+            GDTEntry::NULL, // User TLS 32bit
+            GDTEntry::NULL, // TSS Descriptor Low
+            GDTEntry::NULL, // TSS Descriptor High
+        ])
+    }
+
+    pub fn set_tss(&mut self, base: u64) {
+        let tss = GDTEntry::new_tss(base, size_of::<TSS>() as u32 - 1);
+        self.0[Self::TSS_INDEX] = tss[0];
+        self.0[Self::TSS_INDEX + 1] = tss[1];
+    }
+
+    pub fn set_tls32(&mut self, desc: GDTEntry) {
+        self.0[Self::TLS32_INDEX] = desc;
+    }
+
+    pub unsafe fn load(&self) {
+        let len = Self::LEN * 8 - 1;
+        let descriptor: [u64; 2] = [(len as u64) << 48, self.0.as_ptr() as u64];
+        assert!(len < 0x10000, "GDT too large");
+
+        let descriptor_address = &descriptor as *const _ as usize + 6;
+        asm!(
+            "lgdt ({})",
+            "ltr %ax",
+            in(reg) descriptor_address,
+            in("ax") Self::TSS_INDEX as u16 * 8,
+            options(att_syntax)
+        );
+    }
+}
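A hedged sketch of the intended call order for the GDT and TSS types above; the function and its caller are assumptions, and in real code both objects must outlive `load()` (for example as per-CPU statics) rather than being the stack locals used here for brevity.

    use x86_64::gdt::GDT;
    use x86_64::task::TSS;

    // Illustrative only: build a TSS, point descriptors 8/9 of a fresh GDT at it,
    // then load the GDT and task register.
    unsafe fn install_cpu_tables(kernel_stack_top: u64) {
        let mut tss = TSS::new();
        tss.set_rsp0(kernel_stack_top);         // stack used on ring 3 -> ring 0 entry

        let mut gdt = GDT::new();
        gdt.set_tss(&tss as *const TSS as u64); // fills TSS_INDEX and TSS_INDEX + 1
        gdt.load();                             // lgdt + ltr with the TSS selector
    }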

+ 27 - 0
arch/x86_64/src/interrupt.rs

@@ -0,0 +1,27 @@
+use core::arch::asm;
+
+pub fn enable() {
+    unsafe {
+        asm!("sti");
+    }
+}
+
+pub fn disable() {
+    unsafe {
+        asm!("cli");
+    }
+}
+
+pub fn lidt(base: usize, limit: u16) {
+    let mut idt_descriptor = [0u16; 5];
+
+    idt_descriptor[0] = limit;
+    idt_descriptor[1] = base as u16;
+    idt_descriptor[2] = (base >> 16) as u16;
+    idt_descriptor[3] = (base >> 32) as u16;
+    idt_descriptor[4] = (base >> 48) as u16;
+
+    unsafe {
+        asm!("lidt ({})", in(reg) &idt_descriptor, options(att_syntax));
+    }
+}

+ 93 - 0
arch/x86_64/src/io.rs

@@ -0,0 +1,93 @@
+use core::arch::asm;
+
+pub fn enable_sse() {
+    unsafe {
+        asm!(
+            "mov %cr0, %rax",
+            "and $(~0xc), %rax",
+            "or $0x22, %rax",
+            "mov %rax, %cr0",
+            "mov %cr4, %rax",
+            "or $0x600, %rax",
+            "mov %rax, %cr4",
+            "fninit",
+            out("rax") _,
+            options(att_syntax, nomem, nostack)
+        )
+    }
+}
+
+pub fn inb(no: u16) -> u8 {
+    let data;
+    unsafe {
+        asm!(
+            "inb %dx, %al",
+            in("dx") no,
+            out("al") data,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+
+    data
+}
+
+pub fn inw(no: u16) -> u16 {
+    let data;
+    unsafe {
+        asm!(
+            "inw %dx, %ax",
+            in("dx") no,
+            out("ax") data,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+
+    data
+}
+
+pub fn inl(no: u16) -> u32 {
+    let data;
+    unsafe {
+        asm!(
+            "inl %dx, %eax",
+            in("dx") no,
+            out("eax") data,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+
+    data
+}
+
+pub fn outb(no: u16, data: u8) {
+    unsafe {
+        asm!(
+            "outb %al, %dx",
+            in("al") data,
+            in("dx") no,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+}
+
+pub fn outw(no: u16, data: u16) {
+    unsafe {
+        asm!(
+            "outw %ax, %dx",
+            in("ax") data,
+            in("dx") no,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+}
+
+pub fn outl(no: u16, data: u32) {
+    unsafe {
+        asm!(
+            "outl %eax, %dx",
+            in("eax") data,
+            in("dx") no,
+            options(att_syntax, nomem, nostack)
+        )
+    };
+}

+ 70 - 0
arch/x86_64/src/lib.rs

@@ -0,0 +1,70 @@
+#![no_std]
+
+pub mod vm {
+    use core::arch::asm;
+
+    #[inline(always)]
+    pub fn invlpg(vaddr: usize) {
+        unsafe {
+            asm!(
+                "invlpg ({})",
+                in(reg) vaddr,
+                options(att_syntax)
+            );
+        }
+    }
+
+    #[inline(always)]
+    pub fn invlpg_all() {
+        unsafe {
+            asm!(
+                "mov %cr3, %rax",
+                "mov %rax, %cr3",
+                out("rax") _,
+                options(att_syntax)
+            );
+        }
+    }
+
+    #[inline(always)]
+    pub fn get_cr3() -> usize {
+        let cr3: usize;
+        unsafe {
+            asm!(
+                "mov %cr3, {0}",
+                out(reg) cr3,
+                options(att_syntax)
+            );
+        }
+        cr3
+    }
+
+    #[inline(always)]
+    pub fn set_cr3(pfn: usize) {
+        unsafe {
+            asm!(
+                "mov {0}, %cr3",
+                in(reg) pfn,
+                options(att_syntax)
+            );
+        }
+    }
+
+    #[inline(always)]
+    pub fn get_cr2() -> usize {
+        let cr2: usize;
+        unsafe {
+            asm!(
+                "mov %cr2, {}",
+                out(reg) cr2,
+                options(att_syntax)
+            );
+        }
+        cr2
+    }
+}
+
+pub mod gdt;
+pub mod interrupt;
+pub mod io;
+pub mod task;

+ 172 - 0
arch/x86_64/src/task.rs

@@ -0,0 +1,172 @@
+use core::arch::{asm, global_asm};
+
+use crate::interrupt;
+
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+struct SP {
+    low: u32,
+    high: u32,
+}
+
+#[repr(C)]
+pub struct TSS {
+    _reserved1: u32,
+    rsp: [SP; 3],
+    _reserved2: u32,
+    _reserved3: u32,
+    ist: [SP; 7],
+    _reserved4: u32,
+    _reserved5: u32,
+    _reserved6: u16,
+    iomap_base: u16,
+}
+
+impl TSS {
+    pub fn new() -> Self {
+        Self {
+            _reserved1: 0,
+            rsp: [SP { low: 0, high: 0 }; 3],
+            _reserved2: 0,
+            _reserved3: 0,
+            ist: [SP { low: 0, high: 0 }; 7],
+            _reserved4: 0,
+            _reserved5: 0,
+            _reserved6: 0,
+            iomap_base: 0,
+        }
+    }
+
+    pub fn set_rsp0(&mut self, rsp: u64) {
+        self.rsp[0].low = rsp as u32;
+        self.rsp[0].high = (rsp >> 32) as u32;
+    }
+}
+
+#[inline(always)]
+pub fn halt() {
+    unsafe {
+        asm!("hlt", options(att_syntax, nostack));
+    }
+}
+
+#[inline(always)]
+pub fn pause() {
+    unsafe {
+        asm!("pause", options(att_syntax, nostack));
+    }
+}
+
+#[inline(always)]
+pub fn freeze() -> ! {
+    loop {
+        interrupt::disable();
+        halt();
+    }
+}
+
+#[inline(always)]
+pub fn rdmsr(msr: u32) -> u64 {
+    let edx: u32;
+    let eax: u32;
+
+    unsafe {
+        asm!(
+            "rdmsr",
+            in("ecx") msr,
+            out("eax") eax,
+            out("edx") edx,
+            options(att_syntax),
+        );
+    }
+
+    (edx as u64) << 32 | eax as u64
+}
+
+#[inline(always)]
+pub fn wrmsr(msr: u32, value: u64) {
+    let eax = value as u32;
+    let edx = (value >> 32) as u32;
+
+    unsafe {
+        asm!(
+            "wrmsr",
+            in("ecx") msr,
+            in("eax") eax,
+            in("edx") edx,
+            options(att_syntax),
+        );
+    }
+}
+
+global_asm!(
+    r"
+    .macro movcfi reg, offset
+        mov \reg, \offset(%rsp)
+        .cfi_rel_offset \reg, \offset
+    .endm
+
+    .macro movrst reg, offset
+        mov \offset(%rsp), \reg
+        .cfi_restore \reg
+    .endm
+
+    .globl __context_switch_light
+    .type __context_switch_light @function
+    __context_switch_light:
+    .cfi_startproc
+
+        pushf
+    .cfi_def_cfa_offset 0x10
+
+        sub $0x38, %rsp  # extra 8 bytes to align to 16 bytes
+    .cfi_def_cfa_offset 0x48
+
+        movcfi %rbx, 0x08
+        movcfi %rbp, 0x10
+        movcfi %r12, 0x18
+        movcfi %r13, 0x20
+        movcfi %r14, 0x28
+        movcfi %r15, 0x30
+
+        push (%rdi)      # save sp of previous stack frame of current
+                         # acts as saving bp
+    .cfi_def_cfa_offset 0x50
+
+        mov %rsp, (%rdi) # save sp of current stack
+        mov (%rsi), %rsp # load sp of target stack
+
+        pop (%rsi)       # load sp of previous stack frame of target
+                         # acts as restoring previous bp
+    .cfi_def_cfa_offset 0x48
+
+        pop %rax         # align to 16 bytes
+    .cfi_def_cfa_offset 0x40
+
+        mov 0x28(%rsp), %r15
+        mov 0x20(%rsp), %r14
+        mov 0x18(%rsp), %r13
+        mov 0x10(%rsp), %r12
+        mov 0x08(%rsp), %rbp
+        mov 0x00(%rsp), %rbx
+
+        add $0x30, %rsp
+    .cfi_def_cfa_offset 0x10
+
+        popf
+    .cfi_def_cfa_offset 0x08
+
+        ret
+    .cfi_endproc
+    ",
+    options(att_syntax),
+);
+
+extern "C" {
+    fn __context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize);
+}
+
+#[inline(always)]
+pub fn context_switch_light(current_task_sp: *mut usize, next_task_sp: *mut usize) {
+    unsafe { __context_switch_light(current_task_sp, next_task_sp) }
+}
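For reference, a small sketch tying `wrmsr`/`rdmsr` back to the per-CPU setup elsewhere in this merge; the constant equals the MSR written by `save_percpu_pointer` in arch/percpu/src/arch.rs, while the helper itself is invented for illustration.

    use x86_64::task::{rdmsr, wrmsr};

    const MSR_GS_BASE: u32 = 0xC000_0101; // GS.base, the MSR used for the percpu base

    // Hypothetical helper: publish a CPU's percpu base through GS.base and
    // read it back for verification.
    fn set_gs_base(percpu_base: u64) -> u64 {
        wrmsr(MSR_GS_BASE, percpu_base);
        rdmsr(MSR_GS_BASE)
    }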

+ 2 - 2
doc/mem_layout.txt

@@ -1,8 +1,8 @@
 physical memory
 
-0x0000 - 0x1000 : GDT, TSS, LDT and some early kernel data
+0x0000 - 0x1000 : GDT for kernel initialization use and some early kernel data
 0x1000 - 0x2000 : kernel stage1
-0x2000 - 0x3000 : kernel PML4
+0x2000 - 0x3000 : kernel space PML4
 0x3000 - 0x4000 : kernel PDPT for physical memory mappings
 0x4000 - 0x5000 : kernel PDPT for kernel space
 0x5000 - 0x6000 : kernel PD for kernel image

+ 4 - 1
gblibc/CMakeLists.txt

@@ -45,9 +45,12 @@ add_library(crt0_32 OBJECT
     src/crt0.s
 )
 
-target_compile_options(gblibc_32 PRIVATE "-m32")
+target_compile_options(gblibc_32 PRIVATE "-fno-pic")
 target_compile_options(gblibc_32 PRIVATE "-mcmodel=32")
+target_compile_options(gblibc_32 PRIVATE "-m32")
+target_compile_options(crt0_32 PRIVATE "-fno-pic")
 target_compile_options(crt0_32 PRIVATE "-m32")
+target_compile_options(crt0_32 PRIVATE "-mcmodel=32")
 target_link_options(gblibc_32 PRIVATE "LINKER:-melf_i386")
 target_link_options(crt0_32 PRIVATE "LINKER:-melf_i386")
 

+ 2 - 0
gblibc/include/errno.h

@@ -30,7 +30,9 @@ extern int* __errno_location(void);
 #define ESPIPE 29
 #define EROFS 30
 #define EPIPE 32
+#define ERANGE 34
 #define ELOOP 40
+#define EOVERFLOW 75
 
 #ifdef __cplusplus
 }

+ 0 - 19
include/kernel/async/lock.hpp

@@ -8,11 +8,6 @@ namespace kernel::async {
 
 using spinlock_t = unsigned long volatile;
 using lock_context_t = unsigned long;
-using preempt_count_t = std::size_t;
-
-void preempt_disable();
-void preempt_enable();
-preempt_count_t preempt_count();
 
 void init_spinlock(spinlock_t& lock);
 
@@ -31,24 +26,10 @@ class mutex {
     mutex(const mutex&) = delete;
     ~mutex();
 
-    void lock();
-    void unlock();
-
     lock_context_t lock_irq();
     void unlock_irq(lock_context_t state);
 };
 
-class lock_guard {
-   private:
-    mutex& m_mtx;
-
-   public:
-    explicit inline lock_guard(mutex& mtx) : m_mtx{mtx} { m_mtx.lock(); }
-    lock_guard(const lock_guard&) = delete;
-
-    inline ~lock_guard() { m_mtx.unlock(); }
-};
-
 class lock_guard_irq {
    private:
     mutex& m_mtx;

+ 0 - 29
include/kernel/async/waitlist.hpp

@@ -1,29 +0,0 @@
-#pragma once
-
-#include <set>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/task/forward.hpp>
-
-namespace kernel::async {
-
-class wait_list {
-   private:
-    mutex m_mtx;
-    std::set<task::thread*> m_subscribers;
-
-    wait_list(const wait_list&) = delete;
-
-   public:
-    explicit wait_list() = default;
-
-    // @return whether the wait is interrupted
-    bool wait(mutex& lck);
-
-    void subscribe();
-
-    void notify_one();
-    void notify_all();
-};
-
-} // namespace kernel::async

+ 0 - 6
include/kernel/hw/pci.hpp

@@ -9,12 +9,6 @@
 
 #include <kernel/mem/phys.hpp>
 
-namespace kernel::kinit {
-
-void init_pci();
-
-} // namespace kernel::kinit
-
 namespace kernel::hw::pci {
 
 struct PACKED device_header_base {

+ 0 - 20
include/kernel/hw/serial.hpp

@@ -1,20 +0,0 @@
-#pragma once
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define PORT_SERIAL0 (0x3f8)
-#define PORT_SERIAL1 (0x2f8)
-
-int32_t init_serial_port(port_id_t port);
-
-int32_t is_serial_has_data(port_id_t port);
-uint8_t serial_read_data(port_id_t port);
-
-int32_t is_serial_ready_for_transmition(port_id_t port);
-void serial_send_data(port_id_t port, uint8_t data);
-
-#ifdef __cplusplus
-}
-#endif

+ 0 - 11
include/kernel/hw/timer.hpp

@@ -1,11 +0,0 @@
-#pragma once
-
-#include <cstddef>
-
-namespace kernel::hw::timer {
-void init_pit(void);
-void inc_tick(void);
-
-std::size_t current_ticks(void);
-
-} // namespace kernel::hw::timer

+ 0 - 5
include/kernel/interrupt.hpp

@@ -36,8 +36,3 @@ struct interrupt_stack {
 struct mmx_registers {
     uint8_t data[512]; // TODO: list of content
 };
-
-namespace kernel::kinit {
-void init_interrupt();
-
-} // namespace kernel::kinit

+ 0 - 11
include/kernel/irq.hpp

@@ -1,11 +0,0 @@
-#pragma once
-
-#include <functional>
-
-namespace kernel::irq {
-
-using irq_handler_t = std::function<void()>;
-
-void register_handler(int irqno, irq_handler_t handler);
-
-}; // namespace kernel::irq

+ 2 - 15
include/kernel/log.hpp

@@ -1,20 +1,7 @@
 #pragma once
 
-#include <stdio.h>
-
-#include <kernel/tty.hpp>
-
-#define kmsgf(fmt, ...)                                                  \
-    if (1) {                                                             \
-        char buf[512];                                                   \
-        snprintf(buf, sizeof(buf), fmt "\n" __VA_OPT__(, ) __VA_ARGS__); \
-        if (kernel::tty::console)                                        \
-            kernel::tty::console->print(buf);                            \
-    }
-
-#define kmsg(msg)             \
-    if (kernel::tty::console) \
-    kernel::tty::console->print(msg "\n")
+#define kmsgf(fmt, ...)
+#define kmsg(msg)
 
 #ifdef NDEBUG
 #define kmsgf_debug(...)

+ 0 - 112
include/kernel/mem/mm_list.hpp

@@ -1,112 +0,0 @@
-#pragma once
-
-#include "paging.hpp"
-#include "vm_area.hpp"
-
-#include <set>
-
-#include <stdint.h>
-
-#include <kernel/vfs/dentry.hpp>
-
-namespace kernel::mem {
-
-constexpr uintptr_t KERNEL_SPACE_START = 0x8000000000000000ULL;
-constexpr uintptr_t USER_SPACE_MEMORY_TOP = 0x0000800000000000ULL;
-constexpr uintptr_t MMAP_MIN_ADDR = 0x0000000000001000ULL;
-constexpr uintptr_t STACK_MIN_ADDR = 0x0000700000000000ULL;
-
-class mm_list {
-   private:
-    struct comparator {
-        constexpr bool operator()(const vm_area& lhs,
-                                  const vm_area& rhs) const noexcept {
-            return lhs < rhs;
-        }
-        constexpr bool operator()(const vm_area& lhs,
-                                  uintptr_t rhs) const noexcept {
-            return lhs < rhs;
-        }
-        constexpr bool operator()(uintptr_t lhs,
-                                  const vm_area& rhs) const noexcept {
-            return lhs < rhs;
-        }
-    };
-
-   public:
-    using list_type = std::set<vm_area, comparator>;
-    using iterator = list_type::iterator;
-    using const_iterator = list_type::const_iterator;
-
-    struct map_args {
-        // MUSE BE aligned to 4kb boundary
-        uintptr_t vaddr;
-        // MUSE BE aligned to 4kb boundary
-        std::size_t length;
-
-        unsigned long flags;
-
-        const fs::rust_inode_handle* file_inode;
-        // MUSE BE aligned to 4kb boundary
-        std::size_t file_offset;
-    };
-
-   private:
-    list_type m_areas;
-    paging::pfn_t m_pt;
-    iterator m_brk{};
-
-   public:
-    // default constructor copies kernel_mms
-    explicit mm_list();
-    // copies kernel_mms and mirrors user space
-    explicit mm_list(const mm_list& other);
-
-    constexpr mm_list(mm_list&& v)
-        : m_areas(std::move(v.m_areas))
-        , m_pt(std::exchange(v.m_pt, 0))
-        , m_brk{std::move(v.m_brk)} {}
-
-    ~mm_list();
-
-    void switch_pd() const noexcept;
-
-    int register_brk(uintptr_t addr);
-    uintptr_t set_brk(uintptr_t addr);
-
-    void clear();
-
-    // split the memory block at the specified address
-    // return: iterator to the new block
-    iterator split(iterator area, uintptr_t at);
-
-    bool is_avail(uintptr_t addr) const;
-    bool is_avail(uintptr_t start, std::size_t length) const noexcept;
-
-    uintptr_t find_avail(uintptr_t hint, size_t length) const;
-
-    int unmap(iterator area, bool should_invalidate_tlb);
-    int unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb);
-
-    int mmap(const map_args& args);
-
-    constexpr vm_area* find(uintptr_t lp) {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &iter;
-    }
-
-    constexpr const vm_area* find(uintptr_t lp) const {
-        auto iter = m_areas.find(lp);
-        if (iter == m_areas.end())
-            return nullptr;
-        return &iter;
-    }
-
-    constexpr paging::PSE get_page_table() const noexcept {
-        return paging::PSE{m_pt};
-    }
-};
-
-} // namespace kernel::mem

+ 5 - 47
include/kernel/mem/paging.hpp

@@ -6,6 +6,7 @@
 
 #include <stdint.h>
 
+#include <kernel/interrupt.hpp>
 #include <kernel/mem/paging_asm.h>
 #include <kernel/mem/phys.hpp>
 
@@ -27,10 +28,8 @@ constexpr int idx_p1(uintptr_t vaddr) noexcept {
     return (vaddr >> 12) & 0x1ff;
 }
 
-constexpr std::tuple<int, int, int, int, int> idx_all(
-    uintptr_t vaddr) noexcept {
-    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr),
-            idx_p1(vaddr)};
+constexpr std::tuple<int, int, int, int, int> idx_all(uintptr_t vaddr) noexcept {
+    return {idx_p5(vaddr), idx_p4(vaddr), idx_p3(vaddr), idx_p2(vaddr), idx_p1(vaddr)};
 }
 
 // page frame number
@@ -46,15 +45,11 @@ constexpr psattr_t PA_USER_DATA = PA_DATA | PA_G | PA_US;
 
 constexpr psattr_t PA_PAGE_TABLE = PA_P | PA_RW;
 constexpr psattr_t PA_KERNEL_PAGE_TABLE = PA_PAGE_TABLE | PA_G;
-constexpr psattr_t PA_USER_PAGE_TABLE = PA_PAGE_TABLE | PA_US;
 
 constexpr psattr_t PA_DATA_HUGE = PA_DATA | PA_PS;
 constexpr psattr_t PA_KERNEL_DATA_HUGE = PA_DATA_HUGE | PA_G;
 constexpr psattr_t PA_USER_DATA_HUGE = PA_DATA_HUGE | PA_US;
 
-constexpr psattr_t PA_ANONYMOUS_PAGE = PA_P | PA_US | PA_COW | PA_ANON;
-constexpr psattr_t PA_MMAPPED_PAGE = PA_US | PA_COW | PA_ANON | PA_MMAP;
-
 namespace __inner {
     using pse_t = uint64_t;
 
@@ -74,9 +69,7 @@ class PSE {
 
     constexpr pfn_t pfn() const noexcept { return *m_ptrbase & ~PA_MASK; }
 
-    constexpr psattr_t attributes() const noexcept {
-        return *m_ptrbase & PA_MASK;
-    }
+    constexpr psattr_t attributes() const noexcept { return *m_ptrbase & PA_MASK; }
 
     constexpr PSE operator[](std::size_t nth) const noexcept {
         return PSE{m_ptrbase.phys() + 8 * nth};
@@ -135,41 +128,6 @@ constexpr unsigned long PAGE_FAULT_PK = 0x00000020;
 constexpr unsigned long PAGE_FAULT_SS = 0x00000040;
 constexpr unsigned long PAGE_FAULT_SGX = 0x00008000;
 
-void handle_page_fault(unsigned long err);
-
-class vaddr_range {
-    std::size_t n;
-
-    int idx4;
-    int idx3;
-    int idx2;
-    int idx1;
-
-    PSE pml4;
-    PSE pdpt;
-    PSE pd;
-    PSE pt;
-
-    uintptr_t m_start;
-    uintptr_t m_end;
-
-    bool is_privilege;
-
-   public:
-    explicit vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end,
-                         bool is_privilege = false);
-    explicit vaddr_range(std::nullptr_t);
-
-    vaddr_range begin() const noexcept;
-    vaddr_range end() const noexcept;
-
-    PSE operator*() const noexcept;
-
-    vaddr_range& operator++();
-    operator bool() const noexcept;
-
-    // compares remaining pages to iterate
-    bool operator==(const vaddr_range& other) const noexcept;
-};
+void handle_page_fault(interrupt_stack* int_stack);
 
 } // namespace kernel::mem::paging

+ 1 - 2
include/kernel/mem/paging_asm.h

@@ -1,4 +1,3 @@
-
 #define KERNEL_IMAGE_PADDR         0x400000
 #define KERNEL_STAGE1_PADDR        0x001000
 #define KERNEL_PML4                0x002000
@@ -7,10 +6,10 @@
 #define KERNEL_PD_KIMAGE           0x005000
 #define KERNEL_PD_KIMAGE           0x005000
 #define KERNEL_PT_KIMAGE           0x006000
 #define KERNEL_PT_KIMAGE           0x006000
 #define KERNEL_PD_STRUCT_PAGE_ARR  0x007000
 #define KERNEL_PD_STRUCT_PAGE_ARR  0x007000
-#define EMPTY_PAGE_PFN             0x008000
 
 
 #define KERNEL_BSS_HUGE_PAGE       0x200000
 
 
+
 #define PA_P    0x0000000000000001
 #define PA_RW   0x0000000000000002
 #define PA_US   0x0000000000000004

+ 1 - 19
include/kernel/mem/phys.hpp

@@ -13,8 +13,7 @@ namespace kernel::mem {
 
 
 template <typename T, bool Cached = true>
 class physaddr {
-    static constexpr uintptr_t PHYS_OFFSET =
-        Cached ? 0xffffff0000000000ULL : 0xffffff4000000000ULL;
+    static constexpr uintptr_t PHYS_OFFSET = Cached ? 0xffffff0000000000ULL : 0xffffff4000000000ULL;
 
 
     uintptr_t m_ptr;
 
 
@@ -33,21 +32,4 @@ class physaddr {
     constexpr uintptr_t phys() const noexcept { return m_ptr; }
 };
 
 
-//  gdt[0]:  null
-//  gdt[1]:  kernel code
-//  gdt[2]:  kernel data
-//  gdt[3]:  user code
-//  gdt[4]:  user data
-//  gdt[5]:  user code compability mode
-//  gdt[6]:  user data compability mode
-//  gdt[7]:  thread local 32bit
-//  gdt[8]:  tss descriptor low
-//  gdt[9]:  tss descriptor high
-//  gdt[10]: ldt descriptor low
-//  gdt[11]: ldt descriptor high
-//  gdt[12]: null segment(in ldt)
-//  gdt[13]: thread local 64bit(in ldt)
-// &gdt[14]: tss of 0x68 bytes from here
-constexpr physaddr<uint64_t> gdt{0x00000000 + 1 - 1};
-
 } // namespace kernel::mem
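
The `PHYS_OFFSET` reformat above keeps the two direct-map bases: one cached alias and one uncached alias of physical memory. A minimal sketch of the same idea, assuming those two base addresses and a made-up function name:

    // Sketch only: map a physical address into the kernel's direct-mapped
    // region, picking the cached or the uncached alias.
    const PHYS_MAP_CACHED: usize = 0xffff_ff00_0000_0000;
    const PHYS_MAP_UNCACHED: usize = 0xffff_ff40_0000_0000;

    fn phys_to_virt(paddr: usize, cached: bool) -> *mut u8 {
        let base = if cached { PHYS_MAP_CACHED } else { PHYS_MAP_UNCACHED };
        (base + paddr) as *mut u8
    }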

+ 0 - 60
include/kernel/mem/vm_area.hpp

@@ -1,60 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-#include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-namespace kernel::mem {
-
-constexpr unsigned long MM_WRITE = 0x00000000'00000001;
-constexpr unsigned long MM_EXECUTE = 0x00000000'00000002;
-constexpr unsigned long MM_MAPPED = 0x00000000'00000004;
-constexpr unsigned long MM_ANONYMOUS = 0x00000000'00000008;
-constexpr unsigned long MM_INTERNAL_MASK = 0xffffffff'00000000;
-constexpr unsigned long MM_BREAK = 0x80000000'00000000;
-
-struct vm_area {
-    uintptr_t start;
-    uintptr_t end;
-
-    unsigned long flags;
-
-    const fs::rust_inode_handle* mapped_file;
-    std::size_t file_offset;
-
-    constexpr bool is_avail(uintptr_t ostart, uintptr_t oend) const noexcept {
-        return (ostart >= end || oend <= start);
-    }
-
-    constexpr bool operator<(const vm_area& rhs) const noexcept {
-        return end <= rhs.start;
-    }
-    constexpr bool operator<(uintptr_t rhs) const noexcept {
-        return end <= rhs;
-    }
-    friend constexpr bool operator<(uintptr_t lhs,
-                                    const vm_area& rhs) noexcept {
-        return lhs < rhs.start;
-    }
-
-    constexpr vm_area(uintptr_t start, unsigned long flags, uintptr_t end,
-                      const fs::rust_inode_handle* mapped_file = nullptr,
-                      std::size_t offset = 0)
-        : start{start}
-        , end{end}
-        , flags{flags}
-        , mapped_file{mapped_file}
-        , file_offset{offset} {}
-
-    constexpr vm_area(uintptr_t start, unsigned long flags,
-                      const fs::rust_inode_handle* mapped_file = nullptr,
-                      std::size_t offset = 0)
-        : start{start}
-        , end{start}
-        , flags{flags}
-        , mapped_file{mapped_file}
-        , file_offset{offset} {}
-};
-
-} // namespace kernel::mem

+ 0 - 37
include/kernel/module.hpp

@@ -1,37 +0,0 @@
-#pragma once
-
-#include <memory>
-
-#include <types/types.h>
-
-#define MODULE_LOADER(name) \
-    static std::unique_ptr<kernel::kmod::kmod> __module##name##_loader()
-
-#define INTERNAL_MODULE(name, type)                                         \
-    MODULE_LOADER(name);                                                    \
-    SECTION(".kmods")                                                       \
-    __attribute__((used))                                                   \
-    std::unique_ptr<kernel::kmod::kmod> (*const __module##name##_entry)() = \
-        __module##name##_loader;                                            \
-    MODULE_LOADER(name) {                                                   \
-        return std::make_unique<type>();                                    \
-    }
-
-namespace kernel::kmod {
-
-struct kmod {
-    const char* const name;
-
-    explicit kmod(const char* name);
-
-    virtual ~kmod() = default;
-    kmod(const kmod&) = delete;
-    kmod& operator=(const kmod&) = delete;
-
-    virtual int init() = 0;
-};
-
-extern "C" std::unique_ptr<kmod> (*const KMOD_LOADERS_START[])();
-void load_internal_modules();
-
-} // namespace kernel::kmod

+ 2 - 156
include/kernel/process.hpp

@@ -1,13 +1,9 @@
 #pragma once
 
 
-#include <list>
-#include <map>
-#include <set>
-#include <tuple>
-#include <utility>
-
 #include <assert.h>
+#include <errno.h>
 #include <fcntl.h>
+#include <signal.h>
 #include <stdint.h>
 #include <sys/types.h>
 
 
@@ -16,158 +12,8 @@
 #include <types/path.hpp>
 #include <types/types.h>
 
 
-#include <kernel/async/waitlist.hpp>
 #include <kernel/interrupt.hpp>
-#include <kernel/mem/mm_list.hpp>
 #include <kernel/mem/paging.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/task/current.hpp>
-#include <kernel/task/thread.hpp>
-#include <kernel/tty.hpp>
-#include <kernel/user/thread_local.hpp>
 #include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/filearr.hpp>
-
-class process;
-
-class proclist;
-
-inline process* volatile current_process;
-inline proclist* procs;
-
-struct process_attr {
-    uint16_t system : 1;
-    uint16_t zombie : 1 = 0;
-};
-
-class process {
-   public:
-    struct wait_obj {
-        pid_t pid;
-        int code;
-    };
-
-   public:
-    kernel::mem::mm_list mms{};
-    std::set<kernel::task::thread> thds;
-    kernel::async::wait_list waitlist;
-
-    kernel::async::mutex mtx_waitprocs;
-    std::list<wait_obj> waitprocs;
-
-    process_attr attr{};
-    fs::filearray files;
-    fs::dentry_pointer cwd{};
-    mode_t umask{0022};
-
-    pid_t pid{};
-    pid_t ppid{};
-    pid_t pgid{};
-    pid_t sid{};
-
-    kernel::tty::tty* control_tty{};
-    struct fs::fs_context fs_context;
-    std::set<pid_t> children;
-
-   public:
-    process(const process&) = delete;
-    explicit process(const process& parent, pid_t pid);
-
-    // this function is used for system initialization
-    // DO NOT use this after the system is on
-    explicit process(pid_t pid, pid_t ppid);
-
-    constexpr bool is_system(void) const { return attr.system; }
-    constexpr bool is_zombie(void) const { return attr.zombie; }
-
-    void send_signal(kernel::signal_list::signo_type signal);
-};
-
-class proclist final {
-   private:
-    std::map<pid_t, process> m_procs;
-    pid_t m_nextpid = 2;
-
-    constexpr pid_t next_pid() { return m_nextpid++; }
-    process& real_emplace(pid_t pid, pid_t ppid);
-
-   public:
-    proclist();
-
-    constexpr process& copy_from(process& proc) {
-        pid_t pid = next_pid();
-        auto [iter, inserted] = m_procs.try_emplace(pid, proc, pid);
-        assert(inserted);
-
-        proc.children.insert(pid);
-        return iter->second;
-    }
-
-    constexpr void remove(pid_t pid) {
-        make_children_orphans(pid);
-
-        auto proc_iter = m_procs.find(pid);
-
-        auto ppid = proc_iter->second.ppid;
-        find(ppid).children.erase(pid);
-
-        m_procs.erase(proc_iter);
-    }
-
-    constexpr std::pair<process*, bool> try_find(pid_t pid) const {
-        auto iter = m_procs.find(pid);
-        if (iter)
-            return {(process*)&iter->second, true};
-        else
-            return {nullptr, false};
-    }
-
-    // if process doesn't exist, the behavior is undefined
-    constexpr process& find(pid_t pid) {
-        auto [ptr, found] = try_find(pid);
-        assert(found);
-        return *ptr;
-    }
-
-    constexpr void make_children_orphans(pid_t pid) {
-        auto& children = find(pid).children;
-        auto& init_children = find(1).children;
-
-        for (auto item : children) {
-            init_children.insert(item);
-            find(item).ppid = 1;
-        }
-
-        children.clear();
-    }
-
-    // the process MUST exist, or the behavior is undefined
-    void send_signal(pid_t pid, kernel::signal_list::signo_type signal) {
-        auto& proc = find(pid);
-        proc.send_signal(signal);
-    }
-    void send_signal_grp(pid_t pgid, kernel::signal_list::signo_type signal) {
-        // TODO: find processes that are in the same session quickly
-        for (auto& [pid, proc] : m_procs) {
-            if (proc.pgid != pgid)
-                continue;
-            proc.send_signal(signal);
-        }
-    }
-
-    void kill(pid_t pid, int exit_code);
-
-    constexpr auto begin() const { return m_procs.begin(); }
-    constexpr auto end() const { return m_procs.end(); }
-};
-
-void NORETURN init_scheduler(kernel::mem::paging::pfn_t kernel_stack_pfn);
-/// @return true if returned normally, false if being interrupted
-bool schedule(void);
-void NORETURN schedule_noreturn(void);
 
 
 void NORETURN freeze(void);
-void NORETURN kill_current(int signo);
-
-void check_signal(void);

+ 0 - 74
include/kernel/signal.hpp

@@ -1,74 +0,0 @@
-#pragma once
-
-#include <list>
-#include <map>
-
-#include <signal.h>
-#include <stdint.h>
-
-#include <types/cplusplus.hpp>
-#include <types/types.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/interrupt.hpp>
-
-namespace kernel {
-
-using sigmask_type = uint64_t;
-
-struct sigaction {
-    sighandler_t sa_handler;
-    unsigned long sa_flags;
-    sigrestorer_t sa_restorer;
-    sigmask_type sa_mask;
-};
-
-class signal_list {
-   public:
-    using signo_type = uint32_t;
-    using list_type = std::list<signo_type>;
-
-   private:
-    list_type m_list;
-    sigmask_type m_mask{};
-    std::map<signo_type, sigaction> m_handlers;
-    async::mutex m_mtx;
-
-   public:
-    static constexpr bool check_valid(signo_type sig) {
-        return sig >= 1 && sig <= 64;
-    }
-
-   public:
-    constexpr signal_list() = default;
-    constexpr signal_list(const signal_list& val)
-        : m_list{val.m_list}
-        , m_mask{val.m_mask}
-        , m_handlers{val.m_handlers}
-        , m_mtx{} {}
-
-    constexpr signal_list(signal_list&& val)
-        : m_list{std::move(val.m_list)}
-        , m_mask{std::move(val.m_mask)}
-        , m_handlers{std::move(val.m_handlers)}
-        , m_mtx{} {}
-
-    void on_exec();
-
-    sigmask_type get_mask() const;
-    void set_mask(sigmask_type mask);
-    void mask(sigmask_type mask);
-    void unmask(sigmask_type mask);
-
-    void set_handler(signo_type signal, const sigaction& action);
-    void get_handler(signo_type signal, sigaction& action) const;
-
-    signo_type pending_signal();
-
-    // return value: whether the thread should wake up
-    bool raise(signo_type signal);
-    void handle(interrupt_stack* context, mmx_registers* mmxregs);
-    void after_signal(signo_type signal);
-};
-
-} // namespace kernel

+ 0 - 118
include/kernel/syscall.hpp

@@ -1,118 +0,0 @@
-#pragma once
-
-#include <string>
-#include <vector>
-
-#include <bits/alltypes.h>
-#include <poll.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/uio.h>
-#include <sys/utsname.h>
-#include <time.h>
-
-#include <types/types.h>
-
-#include <kernel/interrupt.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/user/thread_local.hpp>
-
-#define SYSCALL64_ARG1(type, name) type name = (type)((data)->head.s_regs.rdi)
-#define SYSCALL64_ARG2(type, name) type name = (type)((data)->head.s_regs.rsi)
-#define SYSCALL64_ARG3(type, name) type name = (type)((data)->head.s_regs.rdx)
-#define SYSCALL64_ARG4(type, name) type name = (type)((data)->head.s_regs.r10)
-#define SYSCALL64_ARG5(type, name) type name = (type)((data)->head.s_regs.r8)
-#define SYSCALL64_ARG6(type, name) type name = (type)((data)->head.s_regs.r9)
-
-namespace kernel {
-void init_syscall_table();
-
-void handle_syscall32(int no, interrupt_stack* data, mmx_registers* mmxregs);
-void handle_syscall64(int no, interrupt_stack* data, mmx_registers* mmxregs);
-
-namespace syscall {
-    // in fileops.cc
-    ssize_t do_write(int fd, const char __user* buf, size_t n);
-    ssize_t do_read(int fd, char __user* buf, size_t n);
-    int do_close(int fd);
-    int do_dup(int old_fd);
-    int do_dup2(int old_fd, int new_fd);
-    int do_pipe(int __user* pipefd);
-    ssize_t do_getdents(int fd, char __user* buf, size_t cnt);
-    ssize_t do_getdents64(int fd, char __user* buf, size_t cnt);
-    int do_open(const char __user* path, int flags, mode_t mode);
-    int do_symlink(const char __user* target, const char __user* linkpath);
-    int do_readlink(const char __user* pathname, char __user* buf,
-                    size_t buf_size);
-    int do_ioctl(int fd, unsigned long request, uintptr_t arg3);
-    ssize_t do_readv(int fd, const iovec* iov, int iovcnt);
-    ssize_t do_writev(int fd, const iovec* iov, int iovcnt);
-    off_t do_lseek(int fd, off_t offset, int whence);
-    uintptr_t do_mmap_pgoff(uintptr_t addr, size_t len, int prot, int flags,
-                            int fd, off_t pgoffset);
-    int do_munmap(uintptr_t addr, size_t len);
-    ssize_t do_sendfile(int out_fd, int in_fd, off_t __user* offset,
-                        size_t count);
-    int do_statx(int dirfd, const char __user* path, int flags,
-                 unsigned int mask, statx __user* statxbuf);
-    int do_fcntl(int fd, int cmd, unsigned long arg);
-    int do_poll(pollfd __user* fds, nfds_t nfds, int timeout);
-    int do_mknod(const char __user* pathname, mode_t mode, dev_t dev);
-    int do_access(const char __user* pathname, int mode);
-    int do_unlink(const char __user* pathname);
-    int do_truncate(const char __user* pathname, long length);
-    int do_mkdir(const char __user* pathname, mode_t mode);
-    int do_socket(int domain, int type, int protocol);
-
-    // in procops.cc
-    int do_chdir(const char __user* path);
-    [[noreturn]] int do_exit(int status);
-    int do_waitpid(pid_t waitpid, int __user* arg1, int options);
-    pid_t do_getsid(pid_t pid);
-    pid_t do_setsid();
-    pid_t do_getpgid(pid_t pid);
-    int do_setpgid(pid_t pid, pid_t pgid);
-    int do_set_thread_area(user::user_desc __user* ptr);
-    pid_t do_set_tid_address(int __user* tidptr);
-    int do_prctl(int option, uintptr_t arg2);
-    int do_arch_prctl(int option, uintptr_t arg2);
-    pid_t do_getpid();
-    pid_t do_getppid();
-    uid_t do_getuid();
-    uid_t do_geteuid();
-    gid_t do_getgid();
-    pid_t do_gettid();
-    int do_getcwd(char __user* buf, size_t buf_size);
-    uintptr_t do_brk(uintptr_t addr);
-    int do_umask(mode_t mask);
-    int do_kill(pid_t pid, int sig);
-    int do_tkill(pid_t pid, int sig);
-    int do_rt_sigprocmask(int how, const kernel::sigmask_type __user* set,
-                          kernel::sigmask_type __user* oldset,
-                          size_t sigsetsize);
-    int do_rt_sigaction(int signum, const sigaction __user* act,
-                        sigaction __user* oldact, size_t sigsetsize);
-    int do_newuname(new_utsname __user* buf);
-
-    struct execve_retval {
-        uintptr_t ip;
-        uintptr_t sp;
-        int status;
-    };
-
-    execve_retval do_execve(const std::string& exec,
-                            const std::vector<std::string>& args,
-                            const std::vector<std::string>& envs);
-
-    // in mount.cc
-    int do_mount(const char __user* source, const char __user* target,
-                 const char __user* fstype, unsigned long flags,
-                 const void __user* _fsdata);
-
-    // in infoops.cc
-    int do_clock_gettime(clockid_t clk_id, timespec __user* tp);
-    int do_gettimeofday(timeval __user* tv, void __user* tz);
-
-} // namespace syscall
-
-} // namespace kernel

+ 0 - 5
include/kernel/task/current.hpp

@@ -1,5 +0,0 @@
-#pragma once
-
-#include <kernel/task/thread.hpp>
-
-inline kernel::task::thread* volatile current_thread;

+ 0 - 16
include/kernel/task/readyqueue.hpp

@@ -1,16 +0,0 @@
-#pragma once
-
-#include <list>
-
-#include <kernel/task/thread.hpp>
-
-namespace kernel::task::dispatcher {
-
-void enqueue(thread* thd);
-void dequeue(thread* thd);
-
-void setup_idle(thread* idle_thd);
-
-thread* next();
-
-} // namespace kernel::task::dispatcher

+ 0 - 76
include/kernel/task/thread.hpp

@@ -1,76 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <string>
-
-#include <stdint.h>
-#include <sys/types.h>
-
-#include <types/types.h>
-
-#include <kernel/mem/paging.hpp>
-#include <kernel/signal.hpp>
-#include <kernel/user/thread_local.hpp>
-
-namespace kernel::task {
-
-using tid_t = std::size_t;
-
-struct thread {
-   public:
-    using thd_attr_t = uint32_t;
-    static constexpr thd_attr_t SYSTEM = 0x01;
-    static constexpr thd_attr_t READY = 0x02;
-    static constexpr thd_attr_t STOPPED = 0x04;
-    static constexpr thd_attr_t ZOMBIE = 0x08;
-    static constexpr thd_attr_t ISLEEP = 0x10;
-    static constexpr thd_attr_t USLEEP = 0x20;
-
-   private:
-    struct kernel_stack {
-        mem::paging::pfn_t pfn;
-        uintptr_t sp;
-
-        kernel_stack();
-        kernel_stack(const kernel_stack& other);
-        kernel_stack(kernel_stack&& other);
-        ~kernel_stack();
-
-        uint64_t pushq(uint64_t val);
-        uint32_t pushl(uint32_t val);
-
-        void load_interrupt_stack() const;
-    };
-
-   public:
-    kernel_stack kstack;
-    pid_t owner;
-    thd_attr_t attr;
-    signal_list signals;
-
-    int* __user set_child_tid{};
-    int* __user clear_child_tid{};
-
-    std::string name{};
-    uint64_t tls_desc32{};
-    std::size_t elected_times{};
-
-    explicit thread(std::string name, pid_t owner);
-    thread(const thread& val, pid_t owner);
-
-    int set_thread_area(user::user_desc* ptr);
-    int load_thread_area32() const;
-
-    void set_attr(thd_attr_t new_attr);
-
-    void send_signal(signal_list::signo_type signal);
-
-    thread(thread&& val) = default;
-
-    tid_t tid() const;
-
-    bool operator<(const thread& rhs) const;
-    bool operator==(const thread& rhs) const;
-};
-
-} // namespace kernel::task

+ 0 - 73
include/kernel/tty.hpp

@@ -1,73 +0,0 @@
-#pragma once
-
-#include <string>
-
-#include <stdint.h>
-#include <sys/types.h>
-#include <termios.h>
-
-#include <types/allocator.hpp>
-#include <types/buffer.hpp>
-#include <types/cplusplus.hpp>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-
-namespace kernel::tty {
-
-class tty : public types::non_copyable {
-   public:
-    static constexpr size_t BUFFER_SIZE = 4096;
-
-   private:
-    void _real_commit_char(int c);
-    void _echo_char(int c);
-
-    int _do_erase(bool should_echo);
-
-   public:
-    explicit tty(std::string name);
-    virtual void putchar(char c) = 0;
-    void print(const char* str);
-    ssize_t read(char* buf, size_t buf_size, size_t n);
-    ssize_t write(const char* buf, size_t n);
-
-    // characters committed to buffer will be handled
-    // by the input line discipline (N_TTY)
-    void commit_char(int c);
-
-    // print character to the output
-    // characters will be handled by the output line discipline
-    void show_char(int c);
-
-    void clear_read_buf(void);
-
-    // TODO: formal poll support
-    int poll();
-
-    constexpr void set_pgrp(pid_t pgid) { fg_pgroup = pgid; }
-
-    constexpr pid_t get_pgrp(void) const { return fg_pgroup; }
-
-    termios termio;
-    std::string name;
-
-   protected:
-    async::mutex mtx_buf;
-    types::buffer buf;
-    async::wait_list waitlist;
-
-    pid_t fg_pgroup;
-};
-
-class vga_tty : public virtual tty {
-   public:
-    vga_tty();
-    virtual void putchar(char c) override;
-};
-
-inline tty* console;
-
-int register_tty(tty* tty_dev);
-
-} // namespace kernel::tty

+ 0 - 21
include/kernel/user/thread_local.hpp

@@ -1,21 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-namespace kernel::user {
-
-struct user_desc {
-    uint32_t entry_number;
-    uint32_t base_addr;
-    uint32_t limit;
-    uint32_t seg_32bit : 1;
-    uint32_t contents : 2;
-    uint32_t read_exec_only : 1;
-    uint32_t limit_in_pages : 1;
-    uint32_t seg_not_present : 1;
-    uint32_t useable : 1;
-};
-
-void load_thread_area32(uint64_t desc);
-
-} // namespace kernel::user

+ 0 - 84
include/kernel/vfs.hpp

@@ -5,12 +5,6 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 
 
-#include <types/path.hpp>
-
-#include <kernel/mem/paging.hpp>
-#include <kernel/vfs/dentry.hpp>
-#include <kernel/vfs/file.hpp>
-
 #define NODE_MAJOR(node) (((node) >> 8) & 0xFFU)
 #define NODE_MINOR(node) ((node) & 0xFFU)
 
 
@@ -20,82 +14,4 @@ constexpr dev_t make_device(uint32_t major, uint32_t minor) {
     return ((major << 8) & 0xFF00U) | (minor & 0xFFU);
 }
 
 
-// buf, buf_size, cnt
-using chrdev_read = std::function<ssize_t(char*, std::size_t, std::size_t)>;
-
-// buf, cnt
-using chrdev_write = std::function<ssize_t(const char*, std::size_t)>;
-
-struct chrdev_ops {
-    chrdev_read read;
-    chrdev_write write;
-};
-
-struct PACKED user_dirent {
-    ino_t d_ino;       // inode number
-    uint32_t d_off;    // ignored
-    uint16_t d_reclen; // length of this struct user_dirent
-    char d_name[1];    // file name with a padding zero
-    // uint8_t d_type; // file type, with offset of (d_reclen - 1)
-};
-
-struct PACKED user_dirent64 {
-    ino64_t d_ino;     // inode number
-    uint64_t d_off;    // implementation-defined field, ignored
-    uint16_t d_reclen; // length of this struct user_dirent
-    uint8_t d_type;    // file type, with offset of (d_reclen - 1)
-    char d_name[1];    // file name with a padding zero
-};
-
-struct fs_context {
-    dentry_pointer root;
-};
-
-int register_char_device(dev_t node, const chrdev_ops& ops);
-ssize_t char_device_read(dev_t node, char* buf, size_t buf_size, size_t n);
-ssize_t char_device_write(dev_t node, const char* buf, size_t n);
-
-extern "C" int fs_creat(struct dentry* at, mode_t mode);
-extern "C" int fs_mkdir(struct dentry* at, mode_t mode);
-extern "C" int fs_mknod(struct dentry* at, mode_t mode, dev_t sn);
-extern "C" int fs_unlink(struct dentry* at);
-extern "C" int fs_symlink(struct dentry* at, const char* target);
-
-extern "C" int fs_statx(const struct rust_inode_handle* inode,
-                        struct statx* stat, unsigned int mask);
-extern "C" int fs_readlink(const struct rust_inode_handle* inode, char* buf,
-                           size_t buf_size);
-extern "C" int fs_truncate(const struct rust_inode_handle* file, size_t size);
-extern "C" size_t fs_read(const struct rust_inode_handle* file, char* buf,
-                          size_t buf_size, size_t offset, size_t n);
-extern "C" size_t fs_write(const struct rust_inode_handle* file,
-                           const char* buf, size_t offset, size_t n);
-
-using readdir_callback_fn = std::function<int(const char*, size_t, ino_t)>;
-
-extern "C" ssize_t fs_readdir(const struct rust_inode_handle* file,
-                              size_t offset,
-                              const readdir_callback_fn* callback);
-
-extern "C" int fs_mount(dentry* mnt, const char* source,
-                        const char* mount_point, const char* fstype,
-                        unsigned long flags, const void* data);
-
-extern "C" mode_t r_get_inode_mode(struct rust_inode_handle* inode);
-extern "C" size_t r_get_inode_size(struct rust_inode_handle* inode);
-extern "C" bool r_dentry_is_directory(struct dentry* dentry);
-extern "C" bool r_dentry_is_invalid(struct dentry* dentry);
-
-// borrow from dentry->inode
-extern "C" struct rust_inode_handle* r_dentry_get_inode(struct dentry* dentry);
-extern "C" struct dentry* r_get_root_dentry();
-
-#define current_open(...) \
-    fs::open(current_process->fs_context, current_process->cwd, __VA_ARGS__)
-
-std::pair<dentry_pointer, int> open(const fs_context& context,
-                                    const dentry_pointer& cwd,
-                                    types::string_view path,
-                                    bool follow_symlinks = true);
-
 } // namespace fs

+ 0 - 28
include/kernel/vfs/dentry.hpp

@@ -1,28 +0,0 @@
-#pragma once
-
-#include <string>
-
-#include <bits/alltypes.h>
-
-#include <types/path.hpp>
-
-#include <kernel/async/lock.hpp>
-
-struct dentry;
-
-namespace fs {
-
-struct rust_vfs_handle {
-    void* data[2];
-};
-
-struct dentry_deleter {
-    void operator()(struct dentry* dentry) const;
-};
-
-using dentry_pointer = std::unique_ptr<struct dentry, dentry_deleter>;
-extern "C" int d_path(struct dentry* dentry, struct dentry* root,
-                      char* out_path, size_t buflen);
-dentry_pointer d_get(const dentry_pointer& dp);
-
-} // namespace fs

+ 0 - 106
include/kernel/vfs/file.hpp

@@ -1,106 +0,0 @@
-#pragma once
-
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/types.h>
-
-#include <types/buffer.hpp>
-#include <types/types.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-namespace fs {
-
-class pipe : public types::non_copyable {
-   private:
-    static constexpr size_t PIPE_SIZE = 4096;
-    static constexpr uint32_t READABLE = 1;
-    static constexpr uint32_t WRITABLE = 2;
-
-   private:
-    types::buffer buf;
-    uint32_t flags;
-    kernel::async::mutex mtx;
-
-    kernel::async::wait_list waitlist_r;
-    kernel::async::wait_list waitlist_w;
-
-   public:
-    pipe();
-
-    void close_read();
-    void close_write();
-
-    int write(const char* buf, size_t n);
-    int read(char* buf, size_t n);
-
-    constexpr bool is_readable() const { return flags & READABLE; }
-
-    constexpr bool is_writeable() const { return flags & WRITABLE; }
-};
-
-struct file {
-    struct file_flags {
-        uint32_t read : 1;
-        uint32_t write : 1;
-        uint32_t append : 1;
-    } flags{};
-
-    file(file_flags flags) : flags(flags) {}
-
-    virtual ~file() = default;
-
-    virtual ssize_t read(char* __user buf, size_t n) = 0;
-    virtual ssize_t do_write(const char* __user buf, size_t n) = 0;
-
-    virtual off_t seek(off_t n, int whence) {
-        return (void)n, (void)whence, -ESPIPE;
-    }
-
-    ssize_t write(const char* __user buf, size_t n) {
-        if (!flags.write)
-            return -EBADF;
-
-        if (flags.append) {
-            seek(0, SEEK_END);
-        }
-
-        return do_write(buf, n);
-    }
-
-    // regular files should override this method
-    virtual int getdents(char* __user buf, size_t cnt) {
-        return (void)buf, (void)cnt, -ENOTDIR;
-    }
-    virtual int getdents64(char* __user buf, size_t cnt) {
-        return (void)buf, (void)cnt, -ENOTDIR;
-    }
-};
-
-struct regular_file : public virtual file {
-    virtual ~regular_file() = default;
-    std::size_t cursor{};
-    struct rust_inode_handle* ind{};
-
-    regular_file(file_flags flags, size_t cursor, rust_inode_handle* ind);
-
-    virtual ssize_t read(char* __user buf, size_t n) override;
-    virtual ssize_t do_write(const char* __user buf, size_t n) override;
-    virtual off_t seek(off_t n, int whence) override;
-    virtual int getdents(char* __user buf, size_t cnt) override;
-    virtual int getdents64(char* __user buf, size_t cnt) override;
-};
-
-struct fifo_file : public virtual file {
-    virtual ~fifo_file() override;
-    std::shared_ptr<pipe> ppipe;
-
-    fifo_file(file_flags flags, std::shared_ptr<fs::pipe> ppipe);
-
-    virtual ssize_t read(char* __user buf, size_t n) override;
-    virtual ssize_t do_write(const char* __user buf, size_t n) override;
-};
-
-} // namespace fs

+ 0 - 51
include/kernel/vfs/filearr.hpp

@@ -1,51 +0,0 @@
-#pragma once
-
-#include "dentry.hpp"
-#include "file.hpp"
-
-#include <memory>
-
-#include <types/path.hpp>
-
-#include <kernel/vfs.hpp>
-
-namespace fs {
-
-class filearray {
-   private:
-    struct impl;
-    std::shared_ptr<impl> pimpl;
-    filearray(std::shared_ptr<impl>);
-
-   public:
-    filearray(const fs_context* ctx);
-    filearray(filearray&& other) = default;
-
-    filearray copy() const;
-    filearray share() const;
-
-    // dup old_fd to some random fd
-    int dup(int old_fd);
-
-    // dup old_fd to new_fd, close new_fd if it is already open
-    int dup(int old_fd, int new_fd, int flags);
-
-    // dup old_fd to the first available fd starting from min_fd
-    int dupfd(int fd, int min_fd, int flags);
-
-    fs::file* operator[](int i) const;
-    int set_flags(int fd, int flags);
-
-    int pipe(int (&pipefd)[2]);
-    int open(const dentry_pointer& cwd, types::string_view filepath, int flags,
-             mode_t mode);
-    int open(types::string_view filepath, int flags, mode_t mode);
-
-    int close(int fd);
-
-    // any call to member methods will be invalid after clear()
-    void clear();
-    void onexec();
-};
-
-} // namespace fs

+ 0 - 25
include/kernel/vfs/vfsfwd.hpp

@@ -1,25 +0,0 @@
-#pragma once
-
-namespace fs {
-
-// in dentry.hpp
-struct dcache;
-struct dentry;
-
-// in file.hpp
-struct file;
-struct regular_file;
-struct fifo_file;
-
-class pipe;
-
-// in filearray.hpp
-class file_array;
-
-// in inode.hpp
-struct inode;
-
-// in vfs.hpp
-class vfs;
-
-} // namespace fs

+ 0 - 293
include/types/elf.hpp

@@ -1,293 +0,0 @@
-#pragma once
-
-#include <vector>
-
-#include <stdint.h>
-
-#include <kernel/interrupt.hpp>
-#include <kernel/process.hpp>
-#include <kernel/vfs.hpp>
-#include <kernel/vfs/dentry.hpp>
-
-namespace types::elf {
-
-using elf32_addr_t = uint32_t;
-using elf32_off_t = uint32_t;
-
-using elf64_addr_t = uint64_t;
-using elf64_off_t = uint64_t;
-
-constexpr elf32_addr_t ELF32_STACK_BOTTOM = 0xbffff000;
-constexpr elf32_off_t ELF32_STACK_SIZE = 8 * 1024 * 1024;
-constexpr elf32_addr_t ELF32_STACK_TOP = ELF32_STACK_BOTTOM - ELF32_STACK_SIZE;
-
-constexpr int ELF_LOAD_FAIL_NORETURN = 0x114514;
-
-struct PACKED elf32_header {
-    // 0x7f, "ELF"
-    char magic[4];
-
-    enum : uint8_t {
-        FORMAT_32 = 1,
-        FORMAT_64 = 2,
-    } format;
-    enum : uint8_t {
-        ENDIAN_LITTLE = 1,
-        ENDIAN_BIG = 2,
-    } endian;
-    // should be 1
-    uint8_t _version1;
-    enum : uint8_t {
-        ABI_SYSTEM_V = 0x00,
-        // TODO:
-        ABI_LINUX = 0x03,
-    } abi;
-    uint8_t abi_version;
-    uint8_t _reserved[7];
-    enum : uint16_t {
-        ET_NONE = 0x00,
-        ET_REL = 0x01,
-        ET_EXEC = 0x02,
-        ET_DYN = 0x03,
-        ET_CORE = 0x04,
-        ET_LOOS = 0xfe00,
-        ET_HIOS = 0xfeff,
-        ET_LOPROC = 0xff00,
-        ET_HIPROC = 0xffff,
-    } type;
-    enum : uint16_t {
-        ARCH_NONE = 0x00,
-        ARCH_X86 = 0x03,
-        ARCH_ARM = 0x28,
-        ARCH_IA64 = 0x32,
-        ARCH_X86_64 = 0x3e,
-        ARCH_ARM64 = 0xb7,
-        ARCH_RISCV = 0xf3,
-    } arch;
-    // should be 1
-    uint32_t _version2;
-    // entry address
-    elf32_addr_t entry;
-    // program header table offset
-    elf32_off_t phoff;
-    // section header table offset
-    elf32_off_t shoff;
-    // architecture dependent flags
-    uint32_t flags;
-    // elf header size
-    uint16_t ehsize;
-    // program header table entry size
-    uint16_t phentsize;
-    // program header table entries number
-    uint16_t phnum;
-    // section header table entry size
-    uint16_t shentsize;
-    // section header table entries number
-    uint16_t shnum;
-    // section header table entry index that contains section names
-    uint16_t shstrndx;
-};
-
-struct PACKED elf32_program_header_entry {
-    enum : uint32_t {
-        PT_NULL = 0x00,
-        PT_LOAD = 0x01,
-        PT_DYNAMIC = 0x02,
-        PT_INTERP = 0x03,
-        PT_NOTE = 0x04,
-        PT_SHLIB = 0x05,
-        PT_PHDR = 0x06,
-        PT_TLS = 0x07,
-        PT_LOOS = 0x60000000,
-        PT_HIOS = 0x6fffffff,
-        PT_LIPROC = 0x70000000,
-        PT_HIPROC = 0x7fffffff,
-    } type;
-    elf32_off_t offset;
-    elf32_addr_t vaddr;
-    elf32_addr_t paddr;
-    elf32_off_t filesz;
-    elf32_off_t memsz;
-    // segment dependent
-    enum : uint32_t {
-        PF_X = 0x1,
-        PF_W = 0x2,
-        PF_R = 0x4,
-    } flags;
-    // 0 and 1 for no alignment, otherwise power of 2
-    uint32_t align;
-};
-
-struct PACKED elf32_section_header_entry {
-    elf32_off_t sh_name;
-    enum : uint32_t {
-        SHT_NULL = 0x00,
-        SHT_PROGBITS = 0x01,
-        SHT_RELA = 0x04,
-        SHT_DYNAMIC = 0x06,
-        SHT_NOTE = 0x07,
-        SHT_NOBITS = 0x08,
-        SHT_REL = 0x09,
-        SHT_DYNSYM = 0x0b,
-        SHT_INIT_ARRAY = 0x0e,
-        SHT_FINI_ARRAY = 0x0f,
-        SHT_PREINIT_ARRAY = 0x0f,
-    } sh_type;
-    enum : uint32_t {
-        SHF_WRITE = 0x01,
-        SHF_ALLOC = 0x02,
-        SHF_EXECINSTR = 0x04,
-    } sh_flags;
-    elf32_addr_t sh_addr;
-    elf32_off_t sh_offset;
-    elf32_off_t sh_size;
-    uint32_t sh_link;
-    uint32_t sh_info;
-    elf32_off_t sh_addralign;
-    elf32_off_t sh_entsize;
-};
-
-struct elf32_load_data {
-    fs::dentry_pointer exec_dent;
-    const std::vector<std::string>& argv;
-    const std::vector<std::string>& envp;
-    uintptr_t ip;
-    uintptr_t sp;
-};
-
-// TODO: environment variables
-int elf32_load(elf32_load_data& data);
-
-struct PACKED elf64_header {
-    // 0x7f, "ELF"
-    char magic[4];
-
-    enum : uint8_t {
-        FORMAT_32 = 1,
-        FORMAT_64 = 2,
-    } format;
-    enum : uint8_t {
-        ENDIAN_LITTLE = 1,
-        ENDIAN_BIG = 2,
-    } endian;
-    // should be 1
-    uint8_t _version1;
-    enum : uint8_t {
-        ABI_SYSTEM_V = 0x00,
-        // TODO:
-        ABI_LINUX = 0x03,
-    } abi;
-    uint8_t abi_version;
-    uint8_t _reserved[7];
-    enum : uint16_t {
-        ET_NONE = 0x00,
-        ET_REL = 0x01,
-        ET_EXEC = 0x02,
-        ET_DYN = 0x03,
-        ET_CORE = 0x04,
-        ET_LOOS = 0xfe00,
-        ET_HIOS = 0xfeff,
-        ET_LOPROC = 0xff00,
-        ET_HIPROC = 0xffff,
-    } type;
-    enum : uint16_t {
-        ARCH_NONE = 0x00,
-        ARCH_X86 = 0x03,
-        ARCH_ARM = 0x28,
-        ARCH_IA64 = 0x32,
-        ARCH_X86_64 = 0x3e,
-        ARCH_ARM64 = 0xb7,
-        ARCH_RISCV = 0xf3,
-    } arch;
-    // should be 1
-    uint32_t _version2;
-    // entry address
-    elf64_addr_t entry;
-    // program header table offset
-    elf64_off_t phoff;
-    // section header table offset
-    elf64_off_t shoff;
-    // architecture dependent flags
-    uint32_t flags;
-    // elf header size
-    uint16_t ehsize;
-    // program header table entry size
-    uint16_t phentsize;
-    // program header table entries number
-    uint16_t phnum;
-    // section header table entry size
-    uint16_t shentsize;
-    // section header table entries number
-    uint16_t shnum;
-    // section header table entry index that contains section names
-    uint16_t shstrndx;
-};
-
-struct PACKED elf64_program_header_entry {
-    enum : uint32_t {
-        PT_NULL = 0x00,
-        PT_LOAD = 0x01,
-        PT_DYNAMIC = 0x02,
-        PT_INTERP = 0x03,
-        PT_NOTE = 0x04,
-        PT_SHLIB = 0x05,
-        PT_PHDR = 0x06,
-        PT_TLS = 0x07,
-        PT_LOOS = 0x60000000,
-        PT_HIOS = 0x6fffffff,
-        PT_LIPROC = 0x70000000,
-        PT_HIPROC = 0x7fffffff,
-    } type;
-    // segment dependent
-    enum : uint32_t {
-        PF_X = 0x1,
-        PF_W = 0x2,
-        PF_R = 0x4,
-    } flags;
-    elf64_off_t offset;
-    elf64_addr_t vaddr;
-    elf64_addr_t paddr;
-    elf64_off_t filesz;
-    elf64_off_t memsz;
-    // 0 and 1 for no alignment, otherwise power of 2
-    uint64_t align;
-};
-
-struct PACKED elf64_section_header_entry {
-    uint32_t sh_name;
-    enum : uint32_t {
-        SHT_NULL = 0x00,
-        SHT_PROGBITS = 0x01,
-        SHT_RELA = 0x04,
-        SHT_DYNAMIC = 0x06,
-        SHT_NOTE = 0x07,
-        SHT_NOBITS = 0x08,
-        SHT_REL = 0x09,
-        SHT_DYNSYM = 0x0b,
-        SHT_INIT_ARRAY = 0x0e,
-        SHT_FINI_ARRAY = 0x0f,
-        SHT_PREINIT_ARRAY = 0x0f,
-    } sh_type;
-    enum : uint64_t {
-        SHF_WRITE = 0x01,
-        SHF_ALLOC = 0x02,
-        SHF_EXECINSTR = 0x04,
-    } sh_flags;
-    elf64_addr_t sh_addr;
-    elf64_off_t sh_offset;
-    elf64_off_t sh_size;
-    uint32_t sh_link;
-    uint32_t sh_info;
-    elf64_off_t sh_addralign;
-    elf64_off_t sh_entsize;
-};
-
-struct elf64_load_data {
-    fs::dentry_pointer exec_dent;
-    std::vector<std::string> argv;
-    std::vector<std::string> envp;
-    unsigned long ip;
-    unsigned long sp;
-};
-
-} // namespace types::elf

+ 6 - 5
init_script.sh

@@ -3,7 +3,7 @@
 BUSYBOX=/mnt/busybox
 
 
 freeze() {
-    echo "an error occurred while executing '''$@''', freezing..." > /dev/console
+    echo "an error occurred while executing '''$@''', freezing..." >&2
 
 
     while true; do
         true
@@ -25,15 +25,17 @@ do_or_freeze $BUSYBOX mknod -m 666 /dev/null c 1 3
 do_or_freeze $BUSYBOX mknod -m 666 /dev/zero c 1 5
 do_or_freeze $BUSYBOX mknod -m 666 /dev/sda b 8 0
 do_or_freeze $BUSYBOX mknod -m 666 /dev/sda1 b 8 1
+do_or_freeze $BUSYBOX mknod -m 666 /dev/ttyS0 c 4 64
+do_or_freeze $BUSYBOX mknod -m 666 /dev/ttyS1 c 4 65
 
 
-echo -n -e "deploying busybox... " > /dev/console
+echo -n -e "deploying busybox... " >&2
 
 
 do_or_freeze $BUSYBOX mkdir -p /bin
 do_or_freeze $BUSYBOX --install -s /bin
 
 
 export PATH="/bin"
 
 
-echo ok > /dev/console
+echo ok >&2
 
 
 do_or_freeze mkdir -p /etc /root /proc
 do_or_freeze mount -t procfs proc proc
@@ -57,5 +59,4 @@ alias ll="ls -l "
 alias la="ls -la "
 EOF
 
 
-exec /mnt/init /bin/sh -l \
-    < /dev/console > /dev/console 2> /dev/console
+exec /mnt/init /bin/sh -c 'exec sh -l < /dev/ttyS0 > /dev/ttyS0 2> /dev/ttyS0'

+ 11 - 53
src/asm/interrupt.s

@@ -33,8 +33,8 @@
 	.cfi_restore \reg
 .endm
 
 
-.extern after_ctx_switch
 .globl ISR_stub_restore
+.type ISR_stub_restore @function
 
 
 ISR_stub:
 	.cfi_startproc
@@ -42,6 +42,11 @@ ISR_stub:
 	.cfi_def_cfa_offset 0x18
 	.cfi_offset %rsp, 0x10
 
 
+	cmpq $0x08, 24(%rsp)
+	je 1f
+	swapgs
+
+1:
 	sub $0x78, %rsp
 	.cfi_def_cfa_offset 0x90
 
 
@@ -101,59 +106,12 @@ ISR_stub_restore:
 	add $0x88, %rsp
 	.cfi_def_cfa_offset 0x08
 
 
-	iretq
-	.cfi_endproc
-
-# parameters
-# #1: sp* current_task_sp
-# #2: sp* target_task_sp
-.globl asm_ctx_switch
-.type  asm_ctx_switch @function
-asm_ctx_switch:
-	.cfi_startproc
-    pushf
-	.cfi_def_cfa_offset 0x10
-
-	sub $0x38, %rsp  # extra 8 bytes to align to 16 bytes
-	.cfi_def_cfa_offset 0x48
-
-	movcfi %rbx, 0x08
-	movcfi %rbp, 0x10
-	movcfi %r12, 0x18
-	movcfi %r13, 0x20
-	movcfi %r14, 0x28
-	movcfi %r15, 0x30
-
-    push (%rdi) 	 # save sp of previous stack frame of current
-	                 # acts as saving bp
-	.cfi_def_cfa_offset 0x50
-
-    mov %rsp, (%rdi) # save sp of current stack
-    mov (%rsi), %rsp # load sp of target stack
+	cmpq $0x08, 8(%rsp)
+	je 1f
+	swapgs
 
 
-    pop (%rsi)       # load sp of previous stack frame of target
-	                 # acts as restoring previous bp
-	.cfi_def_cfa_offset 0x48
-
-	pop %rax         # align to 16 bytes
-	.cfi_def_cfa_offset 0x40
-
-	call after_ctx_switch
-
-	mov 0x28(%rsp), %r15
-	mov 0x20(%rsp), %r14
-	mov 0x18(%rsp), %r13
-	mov 0x10(%rsp), %r12
-	mov 0x08(%rsp), %rbp
-    mov 0x00(%rsp), %rbx
-
-	add $0x30, %rsp
-	.cfi_def_cfa_offset 0x10
-
-    popf
-	.cfi_def_cfa_offset 0x08
-
-    ret
+1:
+	iretq
 	.cfi_endproc
 
 
 .altmacro

+ 81 - 6
src/boot.s

@@ -128,8 +128,8 @@ start_32bit:
     # read kimage into memory
 	lea -16(%esp), %esp
     mov $KIMAGE_32K_COUNT, %ecx
-    mov $KERNEL_IMAGE_PADDR, 4(%esp) # destination address
-	mov $9, (%esp) # LBA
+    movl $KERNEL_IMAGE_PADDR, 4(%esp) # destination address
+	movl $9, (%esp) # LBA
 
 
 .Lread_kimage:
 	mov (%esp), %edi
@@ -139,8 +139,8 @@ start_32bit:
     call read_disk
 	mov %ebx, %ecx
 
 
-    add $0x8000, 4(%esp)
-	add $64, (%esp)
+    addl $0x8000, 4(%esp)
+	addl $64, (%esp)
 
 
     loop .Lread_kimage
 
 
@@ -293,9 +293,10 @@ fill_pxe:
 .L64bit_entry:
     jmp start_64bit
 
 
-.section .text.kinit
+.section .text
 start_64bit:
-    # set stack pointer and clear stack bottom
+    # We map the first 1GB identically to the first 1GB of physical memory,
+    # move sp to the correct position in identically mapped area of kernel space.
     mov %rsp, %rdi
     xor %rsp, %rsp
     inc %rsp
@@ -320,3 +321,77 @@ start_64bit:
     cli
     hlt
     jmp .L64bit_hlt
+
+.section .stage1.smp
+.code16
+
+.globl ap_bootstrap
+.type ap_bootstrap, @function
+ap_bootstrap:
+	ljmp $0x0, $.Lap1
+
+.Lap1:
+    # we use a shared gdt for now
+	lgdt shared_gdt_desc
+
+    # set msr
+    mov $0xc0000080, %ecx
+    rdmsr
+    or $0x901, %eax # set LME, NXE, SCE
+    wrmsr
+
+    # set cr4
+    mov %cr4, %eax
+    or $0xa0, %eax # set PAE, PGE
+    mov %eax, %cr4
+
+    # load new page table
+    mov $KERNEL_PML4, %eax
+    mov %eax, %cr3
+
+    mov %cr0, %eax
+    // SET PE, WP, PG
+    or $0x80010001, %eax
+    mov %eax, %cr0
+
+	ljmp $0x08, $.Lap_bootstrap_end
+
+.align 16
+shared_gdt_desc:
+	.8byte 0x0000000000005f
+
+.code64
+.Lap_bootstrap_end:
+    mov $0x10, %ax
+	mov %ax, %ds
+	mov %ax, %es
+	mov %ax, %ss
+
+	xor %rsp, %rsp
+	xor %rax, %rax
+	inc %rax
+1:
+	xchg %rax, BOOT_SEMAPHORE
+	cmp $0, %rax
+	je 1f
+	pause
+	jmp 1b
+
+1:
+	mov BOOT_STACK, %rsp # Acquire
+	cmp $0, %rsp
+	jne 1f
+	pause
+	jmp 1b
+
+1:
+	xor %rax, %rax
+	mov %rax, BOOT_STACK # Release
+	xchg %rax, BOOT_SEMAPHORE
+
+	xor %rbp, %rbp
+	mov %rsp, %rdi # stack area start address as the first argument
+
+	add $0x200000, %rsp # kernel stack order 9
+	push %rbp # NULL return address
+	jmp ap_entry

+ 0 - 71
src/dev/builtin-chardev.cc

@@ -1,71 +0,0 @@
-#include <kernel/module.hpp>
-#include <kernel/tty.hpp>
-#include <kernel/vfs.hpp>
-
-using namespace kernel::kmod;
-using namespace kernel::tty;
-
-static ssize_t null_read(char*, size_t, size_t) {
-    return 0;
-}
-
-static ssize_t null_write(const char*, size_t n) {
-    return n;
-}
-
-static ssize_t zero_read(char* buf, size_t buf_size, size_t n) {
-    if (n > buf_size)
-        n = buf_size;
-
-    memset(buf, 0, n);
-    return n;
-}
-
-static ssize_t zero_write(const char*, size_t n) {
-    return n;
-}
-
-// TODO: add interface to bind console device to other devices
-ssize_t console_read(char* buf, size_t buf_size, size_t n) {
-    return console->read(buf, buf_size, n);
-}
-
-ssize_t console_write(const char* buf, size_t n) {
-    size_t orig_n = n;
-    while (n--)
-        console->putchar(*(buf++));
-
-    return orig_n;
-}
-
-class builtin_chardev : public virtual kmod {
-   public:
-    builtin_chardev() : kmod("builtin-chardev") {}
-    int init() override {
-        using namespace fs;
-        // null
-        chrdev_ops null_ops{
-            .read = null_read,
-            .write = null_write,
-        };
-        register_char_device(make_device(1, 3), null_ops);
-
-        // zero
-        chrdev_ops zero_ops{
-            .read = zero_read,
-            .write = zero_write,
-        };
-        register_char_device(make_device(1, 5), zero_ops);
-
-        // console
-        chrdev_ops console_ops{
-            .read = console_read,
-            .write = console_write,
-        };
-        register_char_device(make_device(5, 1), console_ops);
-
-        return 0;
-    }
-};
-
-INTERNAL_MODULE(builtin_chardev, builtin_chardev);

+ 20 - 0
src/driver.rs

@@ -1,2 +1,22 @@
 pub mod ahci;
 pub mod e1000e;
+pub mod serial;
+
+// TODO!!!: Put it somewhere else.
+pub struct Port8 {
+    no: u16,
+}
+
+impl Port8 {
+    pub const fn new(no: u16) -> Self {
+        Self { no }
+    }
+
+    pub fn read(&self) -> u8 {
+        arch::io::inb(self.no)
+    }
+
+    pub fn write(&self, data: u8) {
+        arch::io::outb(self.no, data)
+    }
+}
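
`Port8` is a thin wrapper over the `arch::io` port accessors used above. A quick usage sketch; the port number 0x3f8 (the conventional COM1 data register) is only an example value, not something this file defines:

    // Usage sketch for Port8; 0x3f8 is an example (COM1 data register).
    const COM1_DATA: Port8 = Port8::new(0x3f8);

    fn com1_echo_byte() {
        let byte = COM1_DATA.read();
        COM1_DATA.write(byte);
    }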

+ 5 - 4
src/driver/ahci/command.rs

@@ -16,19 +16,20 @@ pub trait Command {
 }
 
 
 pub struct IdentifyCommand {
-    pages: [Page; 1],
+    page: Page,
 }
 
 
 impl IdentifyCommand {
     pub fn new() -> Self {
-        let page = Page::alloc_one();
-        Self { pages: [page] }
+        Self {
+            page: Page::alloc_one(),
+        }
     }
 }
 
 
 impl Command for IdentifyCommand {
     fn pages(&self) -> &[Page] {
-        &self.pages
+        core::slice::from_ref(&self.page)
     }
 
 
     fn lba(&self) -> u64 {
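
The change above stores a single `Page` and hands out a one-element slice view with `core::slice::from_ref`, avoiding the `[Page; 1]` array. A small standalone illustration of that standard-library helper:

    // core::slice::from_ref turns &T into a one-element &[T] with no allocation.
    fn one_element_view() {
        let value: u32 = 42;
        let view: &[u32] = core::slice::from_ref(&value);
        assert_eq!(view.len(), 1);
        assert_eq!(view[0], 42);
    }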

+ 35 - 38
src/driver/ahci/control.rs

@@ -1,9 +1,6 @@
-use crate::{
-    kernel::mem::phys::{NoCachePP, PhysPtr},
-    prelude::*,
-};
+use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
 
 
-use super::{vread, vwrite, GHC_IE};
+use super::{BitsIterator, GHC_IE};
 
 
 /// An `AdapterControl` is an HBA device Global Host Control block
 ///
@@ -12,7 +9,7 @@ use super::{vread, vwrite, GHC_IE};
 /// All reads and writes to this struct is volatile
 ///
 #[repr(C)]
-pub struct AdapterControl {
+struct AdapterControlData {
     capabilities: u32,
     global_host_control: u32,
     interrupt_status: u32,
@@ -29,50 +26,50 @@ pub struct AdapterControl {
     vendor: [u8; 96],
 }
 
 
+const CONTROL_CAP: usize = 0;
+const CONTROL_GHC: usize = 1;
+const CONTROL_IS: usize = 2;
+const CONTROL_PI: usize = 3;
+
+pub struct AdapterControl {
+    inner: *mut u32,
+}
+
+/// # Safety
+/// At the same time, exactly one instance of this struct may exist.
+unsafe impl Send for AdapterControl {}
+
 impl AdapterControl {
-    pub fn new<'lt>(addr: usize) -> &'lt mut Self {
-        NoCachePP::new(addr).as_mut()
+    pub fn new(addr: usize) -> Self {
+        Self {
+            inner: NoCachePP::new(addr).as_ptr(),
+        }
     }
 }
 
 
 impl AdapterControl {
-    pub fn enable_interrupts(&mut self) {
-        let ghc = vread(&self.global_host_control);
-        vwrite(&mut self.global_host_control, ghc | GHC_IE);
+    fn read(&self, off: usize) -> u32 {
+        unsafe { self.inner.offset(off as isize).read_volatile() }
     }
 
 
-    pub fn implemented_ports(&self) -> ImplementedPortsIter {
-        ImplementedPortsIter::new(vread(&self.ports_implemented))
+    fn write(&self, off: usize, value: u32) {
+        unsafe { self.inner.offset(off as isize).write_volatile(value) }
     }
-}
 
 
-pub struct ImplementedPortsIter {
-    ports: u32,
-    n: u32,
-}
-
-impl ImplementedPortsIter {
-    fn new(ports: u32) -> Self {
-        Self { ports, n: 0 }
+    pub fn enable_interrupts(&self) {
+        let ghc = self.read(CONTROL_GHC);
+        self.write(CONTROL_GHC, ghc | GHC_IE);
     }
-}
-
-impl Iterator for ImplementedPortsIter {
-    type Item = u32;
 
 
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.n == 32 {
-            return None;
-        }
+    pub fn implemented_ports(&self) -> BitsIterator {
+        BitsIterator::new(self.read(CONTROL_PI))
+    }
 
 
-        let have: bool = self.ports & 1 != 0;
-        self.ports >>= 1;
-        self.n += 1;
+    pub fn pending_interrupts(&self) -> BitsIterator {
+        BitsIterator::new(self.read(CONTROL_IS))
+    }
 
 
-        if have {
-            Some(self.n - 1)
-        } else {
-            self.next()
-        }
+    pub fn clear_interrupt(&self, no: u32) {
+        self.write(CONTROL_IS, 1 << no)
     }
 }
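
`AdapterControl` now keeps a raw `*mut u32` and indexes registers in 32-bit words, so `CONTROL_GHC == 1` addresses byte offset 4 (Global Host Control) and `CONTROL_PI == 3` addresses byte offset 12 (Ports Implemented). A usage sketch; `abar_base` is a hypothetical ABAR address, not something defined here:

    // Sketch: enable HBA interrupts and walk the implemented-ports bitmap.
    fn list_ports(abar_base: usize) {
        let control = AdapterControl::new(abar_base);
        control.enable_interrupts();
        for nport in control.implemented_ports() {
            // nport is the index of a set bit in the PI register
            let _ = nport;
        }
    }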

+ 33 - 33
src/driver/ahci/defs.rs

@@ -17,6 +17,33 @@ pub const PORT_CMD_FRE: u32 = 0x00000010;
 pub const PORT_CMD_FR: u32 = 0x00004000;
 pub const PORT_CMD_CR: u32 = 0x00008000;
 
 
+pub const PORT_IE_DHRE: u32 = 0x00000001;
+pub const PORT_IE_UFE: u32 = 0x00000010;
+pub const PORT_IE_INFE: u32 = 0x04000000;
+pub const PORT_IE_IFE: u32 = 0x08000000;
+pub const PORT_IE_HBDE: u32 = 0x10000000;
+pub const PORT_IE_IBFE: u32 = 0x20000000;
+pub const PORT_IE_TFEE: u32 = 0x40000000;
+
+pub const PORT_IE_DEFAULT: u32 = PORT_IE_DHRE
+    | PORT_IE_UFE
+    | PORT_IE_INFE
+    | PORT_IE_IFE
+    | PORT_IE_HBDE
+    | PORT_IE_IBFE
+    | PORT_IE_TFEE;
+
+pub const PORT_IS_DHRS: u32 = 0x00000001;
+pub const PORT_IS_UFS: u32 = 0x00000010;
+pub const PORT_IS_INFS: u32 = 0x04000000;
+pub const PORT_IS_IFS: u32 = 0x08000000;
+pub const PORT_IS_HBDS: u32 = 0x10000000;
+pub const PORT_IS_IBFS: u32 = 0x20000000;
+pub const PORT_IS_TFES: u32 = 0x40000000;
+
+pub const PORT_IS_ERROR: u32 =
+    PORT_IS_UFS | PORT_IS_INFS | PORT_IS_IFS | PORT_IS_HBDS | PORT_IS_IBFS;
+
 /// A `CommandHeader` is used to send commands to the HBA device
 ///
 /// # Access
@@ -29,47 +56,20 @@ pub struct CommandHeader {
     // [5]: ATAPI
     // [6]: Write
     // [7]: Prefetchable
-    first: u8,
+    pub first: u8,
 
 
     // [0]: Reset
     // [1]: BIST
     // [2]: Clear busy upon ok
     // [3]: Reserved
     // [4:7]: Port multiplier
-    second: u8,
-
-    prdt_length: u16,
-    bytes_transferred: u32,
-    command_table_base: u64,
-
-    _reserved: [u32; 4],
-}
-
-impl CommandHeader {
-    pub fn clear(&mut self) {
-        self.first = 0;
-        self.second = 0;
-        self.prdt_length = 0;
-        self.bytes_transferred = 0;
-        self.command_table_base = 0;
-        self._reserved = [0; 4];
-    }
-
-    pub fn setup(&mut self, cmdtable_base: u64, prdtlen: u16, write: bool) {
-        self.first = 0x05; // FIS type
-
-        if write {
-            self.first |= 0x40;
-        }
+    pub second: u8,
 
 
-        self.second = 0x04; // Clear busy upon ok
+    pub prdt_length: u16,
+    pub bytes_transferred: u32,
+    pub command_table_base: u64,
 
 
-        self.prdt_length = prdtlen;
-        self.bytes_transferred = 0;
-        self.command_table_base = cmdtable_base;
-
-        self._reserved = [0; 4];
-    }
+    pub _reserved: [u32; 4],
 }

 pub enum FisType {
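
The new `PORT_IS_*`/`PORT_IE_*` constants above combine into `PORT_IS_ERROR` and `PORT_IE_DEFAULT` by OR-ing the individual bits. A small sketch of how an interrupt-status value might be classified with them (the function is illustrative, not part of this commit):

    // Classify a port Interrupt Status value using the masks above.
    fn classify(is: u32) -> &'static str {
        if is & PORT_IS_ERROR != 0 {
            "error interrupt"
        } else if is & PORT_IS_DHRS != 0 {
            "device-to-host register FIS received"
        } else {
            "other or none"
        }
    }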

+ 108 - 57
src/driver/ahci/mod.rs

@@ -1,9 +1,13 @@
 use crate::{
-    kernel::block::{make_device, BlockDevice},
+    fs::procfs,
+    kernel::{
+        block::{make_device, BlockDevice},
+        interrupt::register_irq_handler,
+    },
     prelude::*,
 };
 
 
-use alloc::sync::Arc;
+use alloc::{format, sync::Arc};
 use bindings::{
     kernel::hw::pci::{self, pci_device},
     EIO,
@@ -17,100 +21,149 @@ mod control;
 mod defs;
 mod port;
 
 
-fn vread<T: Sized + Copy>(refval: &T) -> T {
-    unsafe { core::ptr::read_volatile(refval) }
+pub struct BitsIterator {
+    data: u32,
+    n: u32,
 }
 
 
-fn vwrite<T: Sized + Copy>(refval: &mut T, val: T) {
-    unsafe { core::ptr::write_volatile(refval, val) }
+impl BitsIterator {
+    fn new(data: u32) -> Self {
+        Self { data, n: 0 }
+    }
 }
 
 
-fn spinwait_clear(refval: &u32, mask: u32) -> KResult<()> {
-    const SPINWAIT_MAX: usize = 1000;
+impl Iterator for BitsIterator {
+    type Item = u32;
 
 
-    let mut spins = 0;
-    while vread(refval) & mask != 0 {
-        if spins == SPINWAIT_MAX {
-            return Err(EIO);
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.n == 32 {
+            return None;
         }
 
 
-        spins += 1;
-    }
-
-    Ok(())
-}
-
-fn spinwait_set(refval: &u32, mask: u32) -> KResult<()> {
-    const SPINWAIT_MAX: usize = 1000;
+        let have: bool = self.data & 1 != 0;
+        self.data >>= 1;
+        self.n += 1;
 
 
-    let mut spins = 0;
-    while vread(refval) & mask != mask {
-        if spins == SPINWAIT_MAX {
-            return Err(EIO);
+        if have {
+            Some(self.n - 1)
+        } else {
+            self.next()
         }
-
-        spins += 1;
     }
+}
+
+fn vread<T: Sized + Copy>(refval: *const T) -> T {
+    unsafe { refval.read_volatile() }
+}
 
 
-    Ok(())
+fn vwrite<T: Sized + Copy>(refval: *mut T, val: T) {
+    unsafe { refval.write_volatile(val) }
 }
 
 
-struct Device<'lt, 'port> {
+struct Device {
     control_base: usize,
-    control: &'lt mut AdapterControl,
+    control: AdapterControl,
     // TODO: impl Drop to free pci device
     pcidev: *mut pci_device,
-    ports: Vec<Option<Arc<Mutex<AdapterPort<'port>>>>>,
+    /// # Lock
+    /// Might be accessed from irq handler, use with `lock_irq()`
+    ports: Spin<[Option<Arc<AdapterPort>>; 32]>,
 }
 
 
-impl<'lt, 'port: 'static> Device<'lt, 'port> {
-    fn probe_ports(&mut self) -> KResult<()> {
-        for nport in self.control.implemented_ports() {
-            let mut port = AdapterPort::<'port>::new(self.control_base, nport);
+/// # Safety
+/// `pcidev` is never accessed from Rust code
+/// TODO!!!: place *mut pci_device in a safe wrapper
+unsafe impl Send for Device {}
+unsafe impl Sync for Device {}
 
 
+impl Device {
+    fn probe_ports(&self) -> KResult<()> {
+        for nport in self.control.implemented_ports() {
+            let port = Arc::new(AdapterPort::new(self.control_base, nport));
             if !port.status_ok() {
                 continue;
             }
 
 
-            port.init()?;
+            self.ports.lock_irq()[nport as usize] = Some(port.clone());
+            if let Err(e) = (|| -> KResult<()> {
+                port.init()?;
+
+                {
+                    let port = port.clone();
+                    let name = format!("ahci-p{}-stats", port.nport);
+                    procfs::populate_root(name.into_bytes().into(), move |buffer| {
+                        writeln!(buffer, "{:?}", port.stats.lock().as_ref()).map_err(|_| EIO)
+                    })?;
+                }
+
+                let port = BlockDevice::register_disk(
+                    make_device(8, nport * 16),
+                    2147483647, // TODO: get size from device
+                    port,
+                )?;
+
+                port.partprobe()?;
+
+                Ok(())
+            })() {
+                self.ports.lock_irq()[nport as usize] = None;
+                println_warn!("probe port {nport} failed with {e}");
+            }
+        }
+
+        Ok(())
+    }
+
+    fn handle_interrupt(&self) {
+        // Safety
+        // `self.ports` is accessed inside irq handler
+        let ports = self.ports.lock();
+        for nport in self.control.pending_interrupts() {
+            if let None = ports[nport as usize] {
+                println_warn!("port {nport} not found");
+                continue;
+            }
+
+            let port = ports[nport as usize].as_ref().unwrap();
+            let status = vread(port.interrupt_status());
 
 
-            let port = Arc::new(Mutex::new(port));
+            if status & PORT_IS_ERROR != 0 {
+                println_warn!("port {nport} SATA error");
+                continue;
+            }
 
 
-            self.ports[nport as usize] = Some(port.clone());
+            debug_assert!(status & PORT_IS_DHRS != 0);
+            vwrite(port.interrupt_status(), PORT_IS_DHRS);
 
 
-            let port = BlockDevice::register_disk(
-                make_device(8, nport * 16),
-                2147483647, // TODO: get size from device
-                port,
-            )?;
+            self.control.clear_interrupt(nport);
 
 
-            port.partprobe()?;
+            port.handle_interrupt();
         }
         }
-
-        Ok(())
     }
     }
 }
 }
 
 
-impl<'lt: 'static, 'port: 'static> Device<'lt, 'port> {
-    pub fn new(pcidev: *mut pci_device) -> KResult<Self> {
+impl Device {
+    pub fn new(pcidev: *mut pci_device) -> KResult<Arc<Self>> {
         let base = unsafe { *(*pcidev).header_type0() }.bars[PCI_REG_ABAR];
         let base = unsafe { *(*pcidev).header_type0() }.bars[PCI_REG_ABAR];
+        let irqno = unsafe { *(*pcidev).header_type0() }.interrupt_line;
 
 
         // use MMIO
         // use MMIO
         if base & 0xf != 0 {
         if base & 0xf != 0 {
             return Err(EIO);
             return Err(EIO);
         }
         }
 
 
-        let mut ports = Vec::with_capacity(32);
-        ports.resize_with(32, || None);
-
-        let mut device = Device {
+        let device = Arc::new(Device {
             control_base: base as usize,
             control_base: base as usize,
             control: AdapterControl::new(base as usize),
             control: AdapterControl::new(base as usize),
             pcidev,
             pcidev,
-            ports,
-        };
+            ports: Spin::new([const { None }; 32]),
+        });
 
 
         device.control.enable_interrupts();
         device.control.enable_interrupts();
+
+        let device_irq = device.clone();
+        register_irq_handler(irqno as i32, move || device_irq.handle_interrupt())?;
+
         device.probe_ports()?;
         device.probe_ports()?;
 
 
         Ok(device)
         Ok(device)
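The probe path above registers the interrupt handler by cloning the `Arc<Device>` into a `'static` closure. A minimal userspace sketch of that ownership pattern; `register_handler` below is a stand-in with an invented signature, not the kernel's `register_irq_handler`:

```rust
use std::sync::Arc;

// Stand-in for an IRQ registry: it just stores one boxed handler and
// invokes it, enough to show why the closure must own an `Arc`.
fn register_handler(_irqno: i32, handler: Box<dyn Fn() + Send + Sync>) {
    // A real registry would keep this around; here we fire it once.
    handler();
}

struct Device {
    name: &'static str,
}

impl Device {
    fn handle_interrupt(&self) {
        println!("{}: interrupt handled", self.name);
    }
}

fn main() {
    let device = Arc::new(Device { name: "ahci" });

    // Clone the Arc into the closure so the handler keeps the device alive
    // and satisfies the 'static bound, as the AHCI probe path does.
    let device_irq = device.clone();
    register_handler(14, Box::new(move || device_irq.handle_interrupt()));
}
```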
@@ -123,15 +176,13 @@ unsafe extern "C" fn probe_device(pcidev: *mut pci_device) -> i32 {
             // TODO!!!: save device to pci_device
             // TODO!!!: save device to pci_device
             Box::leak(Box::new(device));
             Box::leak(Box::new(device));
             0
             0
-        },
+        }
         Err(e) => -(e as i32),
         Err(e) => -(e as i32),
     }
     }
 }
 }
 
 
 pub fn register_ahci_driver() {
 pub fn register_ahci_driver() {
-    let ret = unsafe {
-        pci::register_driver_r(VENDOR_INTEL, DEVICE_AHCI, Some(probe_device))
-    };
+    let ret = unsafe { pci::register_driver_r(VENDOR_INTEL, DEVICE_AHCI, Some(probe_device)) };
 
 
     assert_eq!(ret, 0);
     assert_eq!(ret, 0);
 }
 }

+ 266 - 58
src/driver/ahci/port.rs

@@ -1,4 +1,5 @@
-use bindings::EINVAL;
+use alloc::collections::vec_deque::VecDeque;
+use bindings::{EINVAL, EIO};
 
 
 use crate::prelude::*;
 use crate::prelude::*;
 
 
@@ -6,14 +7,29 @@ use crate::kernel::block::{BlockDeviceRequest, BlockRequestQueue};
 use crate::kernel::mem::paging::Page;
 use crate::kernel::mem::paging::Page;
 
 
 use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
 use crate::kernel::mem::phys::{NoCachePP, PhysPtr};
+use crate::sync::UCondVar;
 
 
 use super::command::{Command, IdentifyCommand, ReadLBACommand};
 use super::command::{Command, IdentifyCommand, ReadLBACommand};
 use super::{
 use super::{
-    spinwait_clear, vread, vwrite, CommandHeader, PRDTEntry, ReceivedFis,
-    ATA_DEV_BSY, ATA_DEV_DRQ, FISH2D, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE,
-    PORT_CMD_ST,
+    vread, vwrite, CommandHeader, PRDTEntry, FISH2D, PORT_CMD_CR, PORT_CMD_FR, PORT_CMD_FRE,
+    PORT_CMD_ST, PORT_IE_DEFAULT,
 };
 };
 
 
+fn spinwait_clear(refval: *const u32, mask: u32) -> KResult<()> {
+    const SPINWAIT_MAX: usize = 1000;
+
+    let mut spins = 0;
+    while vread(refval) & mask != 0 {
+        if spins == SPINWAIT_MAX {
+            return Err(EIO);
+        }
+
+        spins += 1;
+    }
+
+    Ok(())
+}
+
 /// An `AdapterPort` is an HBA device in AHCI mode.
 /// An `AdapterPort` is an HBA device in AHCI mode.
 ///
 ///
 /// # Access
 /// # Access
@@ -49,92 +65,289 @@ pub struct AdapterPortData {
     vendor: [u32; 4],
     vendor: [u32; 4],
 }
 }
 
 
-pub struct AdapterPort<'lt> {
-    nport: u32,
-    data: &'lt mut AdapterPortData,
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum SlotState {
+    Idle,
+    Working,
+    Finished,
+    Error,
+}
+
+struct CommandSlotInner {
+    state: SlotState,
+    /// # Usage
+    /// `cmdheader` might be accessed from the IRQ handler, so when waiting
+    /// for a command to finish, take the enclosing lock with `lock_irq()`.
+    cmdheader: *mut CommandHeader,
+}
+
+/// # Safety
+/// This is safe because the `cmdheader` is not shared between threads
+unsafe impl Send for CommandSlotInner {}
+
+impl CommandSlotInner {
+    pub fn setup(&mut self, cmdtable_base: u64, prdtlen: u16, write: bool) {
+        let cmdheader = unsafe { self.cmdheader.as_mut().unwrap() };
+        cmdheader.first = 0x05; // FIS type
+
+        if write {
+            cmdheader.first |= 0x40;
+        }
+
+        cmdheader.second = 0x00;
+
+        cmdheader.prdt_length = prdtlen;
+        cmdheader.bytes_transferred = 0;
+        cmdheader.command_table_base = cmdtable_base;
+
+        cmdheader._reserved = [0; 4];
+    }
+}
+
+struct CommandSlot {
+    inner: Spin<CommandSlotInner>,
+    cv: UCondVar,
+}
+
+impl CommandSlot {
+    fn new(cmdheader: *mut CommandHeader) -> Self {
+        Self {
+            inner: Spin::new(CommandSlotInner {
+                state: SlotState::Idle,
+                cmdheader,
+            }),
+            cv: UCondVar::new(),
+        }
+    }
+}
+
+struct FreeList {
+    free: VecDeque<u32>,
+    working: VecDeque<u32>,
+}
+
+impl FreeList {
+    fn new() -> Self {
+        Self {
+            free: (0..32).collect(),
+            working: VecDeque::new(),
+        }
+    }
+}
+
+#[derive(Default, Debug)]
+pub struct AdapterPortStats {
+    /// Number of commands sent
+    cmd_sent: u64,
+
+    /// Number of transmission errors
+    cmd_error: u64,
+
+    /// Number of interrupts fired
+    int_fired: u64,
+}
+
+pub struct AdapterPort {
+    pub nport: u32,
+    regs: *mut (),
     page: Page,
     page: Page,
-    cmdheaders: &'lt mut [CommandHeader; 32],
-    recv_fis: &'lt mut ReceivedFis,
+    slots: [CommandSlot; 32],
+    free_list: Spin<FreeList>,
+    free_list_cv: UCondVar,
+
+    /// Statistics for this port
+    pub stats: Spin<AdapterPortStats>,
 }
 }
 
 
-impl<'lt> AdapterPort<'lt> {
+/// # Safety
+/// This is safe because the `AdapterPort` is only accessed by one thread at a time
+unsafe impl Send for AdapterPort {}
+unsafe impl Sync for AdapterPort {}
+
+impl AdapterPort {
     pub fn new(base: usize, nport: u32) -> Self {
     pub fn new(base: usize, nport: u32) -> Self {
         let page = Page::alloc_one();
         let page = Page::alloc_one();
+        let cmdheaders_start = page.as_cached().as_ptr::<CommandHeader>();
+
         Self {
         Self {
             nport,
             nport,
-            data: NoCachePP::new(base + 0x100 + 0x80 * nport as usize).as_mut(),
-            cmdheaders: page.as_cached().as_mut(),
-            recv_fis: page.as_cached().offset(0x400).as_mut(),
+            regs: NoCachePP::new(base + 0x100 + 0x80 * nport as usize).as_ptr(),
+            slots: core::array::from_fn(|index| {
+                CommandSlot::new(unsafe { cmdheaders_start.offset(index as isize) })
+            }),
+            free_list: Spin::new(FreeList::new()),
+            free_list_cv: UCondVar::new(),
             page,
             page,
+            stats: Spin::default(),
         }
         }
     }
     }
 }
 }
 
 
-impl<'lt> AdapterPort<'lt> {
+impl AdapterPort {
+    fn command_list_base(&self) -> *mut u64 {
+        unsafe { self.regs.byte_offset(0x00).cast() }
+    }
+
+    fn fis_base(&self) -> *mut u64 {
+        unsafe { self.regs.byte_offset(0x08).cast() }
+    }
+
+    fn sata_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x28).cast() }
+    }
+
+    fn command_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x18).cast() }
+    }
+
+    fn command_issue(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x38).cast() }
+    }
+
+    pub fn interrupt_status(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x10).cast() }
+    }
+
+    pub fn interrupt_enable(&self) -> *mut u32 {
+        unsafe { self.regs.byte_offset(0x14).cast() }
+    }
+
     pub fn status_ok(&self) -> bool {
     pub fn status_ok(&self) -> bool {
-        self.data.sata_status & 0xf == 0x3
+        vread(self.sata_status()) & 0xf == 0x3
+    }
+
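The register accessors above all follow the same recipe: keep one untyped per-port base pointer, derive each register pointer by byte offset, cast to the register's width, and always go through the volatile `vread`/`vwrite` helpers. A self-contained sketch of that pattern; the 0x28 offset mirrors the SATA-status accessor above, while the in-memory backing buffer is purely illustrative (real code points at mapped device memory):

```rust
// Volatile helpers, as in the driver.
fn vread<T: Copy>(reg: *const T) -> T {
    unsafe { reg.read_volatile() }
}

fn vwrite<T: Copy>(reg: *mut T, val: T) {
    unsafe { reg.write_volatile(val) }
}

struct Regs {
    base: *mut (),
}

impl Regs {
    fn sata_status(&self) -> *mut u32 {
        // Register at byte offset 0x28 from the port base.
        unsafe { self.base.byte_offset(0x28).cast() }
    }
}

fn main() {
    // Stand-in for mapped device memory; u32 backing keeps accesses aligned.
    let mut backing = vec![0u32; 0x80 / 4];
    let regs = Regs {
        base: backing.as_mut_ptr().cast(),
    };

    vwrite(regs.sata_status(), 0x0000_0003);
    assert_eq!(vread(regs.sata_status()) & 0xf, 0x3); // "device present, PHY up"
}
```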
+    fn get_free_slot(&self) -> u32 {
+        let mut free_list = self.free_list.lock_irq();
+
+        loop {
+            match free_list.free.pop_front() {
+                Some(slot) => break slot,
+                None => self.free_list_cv.wait(&mut free_list),
+            };
+        }
+    }
+
+    fn save_working(&self, slot: u32) {
+        self.free_list.lock().working.push_back(slot);
+    }
+
+    fn release_free_slot(&self, slot: u32) {
+        self.free_list.lock().free.push_back(slot);
+        self.free_list_cv.notify_one();
+    }
+
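For context, the `get_free_slot` / `release_free_slot` pair above is a classic bounded free-list guarded by a lock and a condition variable: take a slot index, block when none are free, and wake a waiter when one is returned. A userspace sketch of the same pattern with `std` primitives standing in for `Spin` and `UCondVar`:

```rust
use std::collections::VecDeque;
use std::sync::{Condvar, Mutex};

struct SlotPool {
    free: Mutex<VecDeque<u32>>,
    cv: Condvar,
}

impl SlotPool {
    fn new(nslots: u32) -> Self {
        Self {
            free: Mutex::new((0..nslots).collect()),
            cv: Condvar::new(),
        }
    }

    fn get_free_slot(&self) -> u32 {
        let mut free = self.free.lock().unwrap();
        loop {
            match free.pop_front() {
                Some(slot) => return slot,
                // Sleep until someone releases a slot, then retry.
                None => free = self.cv.wait(free).unwrap(),
            }
        }
    }

    fn release_free_slot(&self, slot: u32) {
        self.free.lock().unwrap().push_back(slot);
        self.cv.notify_one();
    }
}

fn main() {
    let pool = SlotPool::new(2);
    let a = pool.get_free_slot();
    let _b = pool.get_free_slot();
    pool.release_free_slot(a);
    assert_eq!(pool.get_free_slot(), a);
}
```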
+    pub fn handle_interrupt(&self) {
+        let ci = vread(self.command_issue());
+
+        // no need to use `lock_irq()` inside interrupt handler
+        let mut free_list = self.free_list.lock();
+
+        free_list.working.retain(|&n| {
+            if ci & (1 << n) != 0 {
+                return true;
+            }
+
+            let slot = &self.slots[n as usize];
+
+            // TODO: check error
+            let mut slot_inner = slot.inner.lock();
+            debug_assert_eq!(slot_inner.state, SlotState::Working);
+            slot_inner.state = SlotState::Finished;
+            slot.cv.notify_all();
+            self.stats.lock().int_fired += 1;
+
+            false
+        });
     }
     }
 
 
-    fn stop_command(&mut self) -> KResult<()> {
-        let cmd_status = vread(&self.data.command_status);
+    fn stop_command(&self) -> KResult<()> {
         vwrite(
         vwrite(
-            &mut self.data.command_status,
-            cmd_status & !(PORT_CMD_ST | PORT_CMD_FRE),
+            self.command_status(),
+            vread(self.command_status()) & !(PORT_CMD_ST | PORT_CMD_FRE),
         );
         );
 
 
-        spinwait_clear(&self.data.command_status, PORT_CMD_CR | PORT_CMD_FR)
+        spinwait_clear(self.command_status(), PORT_CMD_CR | PORT_CMD_FR)
     }
     }
 
 
-    fn start_command(&mut self) -> KResult<()> {
-        spinwait_clear(&self.data.command_status, PORT_CMD_CR)?;
+    fn start_command(&self) -> KResult<()> {
+        spinwait_clear(self.command_status(), PORT_CMD_CR)?;
 
 
-        let cmd_status = vread(&self.data.command_status);
+        let cmd_status = vread(self.command_status());
         vwrite(
         vwrite(
-            &mut self.data.command_status,
+            self.command_status(),
             cmd_status | PORT_CMD_ST | PORT_CMD_FRE,
             cmd_status | PORT_CMD_ST | PORT_CMD_FRE,
         );
         );
 
 
         Ok(())
         Ok(())
     }
     }
 
 
-    fn send_command(&mut self, cmd: &impl Command) -> KResult<()> {
-        let pages = cmd.pages();
-
-        // TODO: get an available command slot
-        let cmdslot = 0;
+    /// # Might Sleep
+    /// This function **might sleep**, so call it in a preemptible context
+    fn send_command(&self, cmd: &impl Command) -> KResult<()> {
+        might_sleep!();
 
 
+        let pages = cmd.pages();
         let cmdtable_page = Page::alloc_one();
         let cmdtable_page = Page::alloc_one();
-        self.cmdheaders[cmdslot].clear();
-        self.cmdheaders[cmdslot].setup(
-            cmdtable_page.as_phys() as u64,
-            pages.len() as u16,
-            cmd.write(),
-        );
 
 
         let command_fis: &mut FISH2D = cmdtable_page.as_cached().as_mut();
         let command_fis: &mut FISH2D = cmdtable_page.as_cached().as_mut();
         command_fis.setup(cmd.cmd(), cmd.lba(), cmd.count());
         command_fis.setup(cmd.cmd(), cmd.lba(), cmd.count());
 
 
-        let prdt: &mut [PRDTEntry; 248] =
-            cmdtable_page.as_cached().offset(0x80).as_mut();
+        let prdt: &mut [PRDTEntry; 248] = cmdtable_page.as_cached().offset(0x80).as_mut();
 
 
         for (idx, page) in pages.iter().enumerate() {
         for (idx, page) in pages.iter().enumerate() {
             prdt[idx].setup(page);
             prdt[idx].setup(page);
         }
         }
 
 
-        // clear received fis?
+        let slot_index = self.get_free_slot() as usize;
+        let slot_object = &self.slots[slot_index];
 
 
-        // wait until port is not busy
-        spinwait_clear(&self.data.task_file_data, ATA_DEV_BSY | ATA_DEV_DRQ)?;
+        let mut slot = slot_object.inner.lock_irq();
 
 
-        vwrite(&mut self.data.command_issue, 1 << cmdslot);
-        spinwait_clear(&self.data.command_issue, 1 << cmdslot)?;
+        slot.setup(
+            cmdtable_page.as_phys() as u64,
+            pages.len() as u16,
+            cmd.write(),
+        );
+        slot.state = SlotState::Working;
+
+        // should we clear received fis here?
+        debug_assert!(vread(self.command_issue()) & (1 << slot_index) == 0);
+        vwrite(self.command_issue(), 1 << slot_index);
+
+        if spinwait_clear(self.command_issue(), 1 << slot_index).is_err() {
+            let mut saved = false;
+            while slot.state == SlotState::Working {
+                if !saved {
+                    saved = true;
+                    self.save_working(slot_index as u32);
+                }
+                slot_object.cv.wait(&mut slot);
+            }
+        } else {
+            // TODO: check error
+            slot.state = SlotState::Finished;
+        }
 
 
-        // TODO: check and wait interrupt
+        let state = slot.state;
+        slot.state = SlotState::Idle;
 
 
-        Ok(())
+        debug_assert_ne!(state, SlotState::Working);
+        self.release_free_slot(slot_index as u32);
+
+        match state {
+            SlotState::Finished => {
+                self.stats.lock().cmd_sent += 1;
+                Ok(())
+            }
+            SlotState::Error => {
+                self.stats.lock().cmd_error += 1;
+                Err(EIO)
+            }
+            _ => panic!("Invalid slot state"),
+        }
     }
     }
 
 
-    fn identify(&mut self) -> KResult<()> {
+    fn identify(&self) -> KResult<()> {
         let cmd = IdentifyCommand::new();
         let cmd = IdentifyCommand::new();
 
 
         // TODO: check returned data
         // TODO: check returned data
@@ -143,43 +356,38 @@ impl<'lt> AdapterPort<'lt> {
         Ok(())
         Ok(())
     }
     }
 
 
-    pub fn init(&mut self) -> KResult<()> {
+    pub fn init(&self) -> KResult<()> {
         self.stop_command()?;
         self.stop_command()?;
 
 
-        // TODO: use interrupt
-        // this is the PxIE register, setting bits here will make
-        //      it generate corresponding interrupts in PxIS
-        //
-        // port->interrupt_enable = 1;
+        vwrite(self.interrupt_enable(), PORT_IE_DEFAULT);
 
 
-        vwrite(&mut self.data.command_list_base, self.page.as_phys() as u64);
-        vwrite(&mut self.data.fis_base, self.page.as_phys() as u64 + 0x400);
+        vwrite(self.command_list_base(), self.page.as_phys() as u64);
+        vwrite(self.fis_base(), self.page.as_phys() as u64 + 0x400);
 
 
         self.start_command()?;
         self.start_command()?;
 
 
         match self.identify() {
         match self.identify() {
             Err(err) => {
             Err(err) => {
                 self.stop_command()?;
                 self.stop_command()?;
-                return Err(err);
+                Err(err)
             }
             }
             Ok(_) => Ok(()),
             Ok(_) => Ok(()),
         }
         }
     }
     }
 }
 }
 
 
-impl<'lt> BlockRequestQueue for AdapterPort<'lt> {
+impl BlockRequestQueue for AdapterPort {
     fn max_request_pages(&self) -> u64 {
     fn max_request_pages(&self) -> u64 {
         1024
         1024
     }
     }
 
 
-    fn submit(&mut self, req: BlockDeviceRequest) -> KResult<()> {
+    fn submit(&self, req: BlockDeviceRequest) -> KResult<()> {
         // TODO: check disk size limit using newtype
         // TODO: check disk size limit using newtype
         if req.count > 65535 {
         if req.count > 65535 {
             return Err(EINVAL);
             return Err(EINVAL);
         }
         }
 
 
-        let command =
-            ReadLBACommand::new(req.buffer, req.sector, req.count as u16)?;
+        let command = ReadLBACommand::new(req.buffer, req.sector, req.count as u16)?;
 
 
         self.send_command(&command)
         self.send_command(&command)
     }
     }

+ 23 - 11
src/driver/e1000e.rs

@@ -1,3 +1,5 @@
+use crate::prelude::*;
+
 use crate::bindings::root::kernel::hw::pci;
 use crate::bindings::root::kernel::hw::pci;
 use crate::kernel::interrupt::register_irq_handler;
 use crate::kernel::interrupt::register_irq_handler;
 use crate::kernel::mem::paging::copy_to_page;
 use crate::kernel::mem::paging::copy_to_page;
@@ -56,6 +58,23 @@ fn test(val: u32, bit: u32) -> bool {
     (val & bit) == bit
     (val & bit) == bit
 }
 }
 
 
+struct PrintableBytes<'a>(&'a [u8]);
+
+impl core::fmt::Debug for PrintableBytes<'_> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "PrintableBytes {{")?;
+        for chunk in self.0.chunks(16) {
+            for &byte in chunk {
+                write!(f, "{byte} ")?;
+            }
+            write!(f, "\n")?;
+        }
+        write!(f, "}}")?;
+
+        Ok(())
+    }
+}
+
 impl netdev::Netdev for E1000eDev {
 impl netdev::Netdev for E1000eDev {
     fn mac(&self) -> netdev::Mac {
     fn mac(&self) -> netdev::Mac {
         self.mac
         self.mac
@@ -151,17 +170,10 @@ impl netdev::Netdev for E1000eDev {
                 )
                 )
             };
             };
 
 
-            use crate::{dont_check, print, println};
-            dont_check!(println!("==== e1000e: received packet ===="));
-
-            for i in 0..len {
-                if i % 16 == 0 {
-                    dont_check!(println!());
-                }
-                dont_check!(print!("{:02x} ", data[i]));
-            }
-
-            dont_check!(println!("\n\n====  e1000e: end of packet  ===="));
+            println_debug!(
+                "e1000e: received {len} bytes, {:?}",
+                PrintableBytes(data)
+            );
             self.rx_tail = Some(next_tail);
             self.rx_tail = Some(next_tail);
         }
         }
 
 

+ 145 - 0
src/driver/serial.rs

@@ -0,0 +1,145 @@
+use alloc::{format, sync::Arc};
+use bindings::EIO;
+
+use crate::{
+    kernel::{
+        block::make_device, interrupt::register_irq_handler, CharDevice, CharDeviceType, Console,
+        Terminal, TerminalDevice,
+    },
+    prelude::*,
+};
+
+use super::Port8;
+
+struct Serial {
+    id: u32,
+    name: Arc<str>,
+
+    terminal: Option<Arc<Terminal>>,
+
+    tx_rx: Port8,
+    int_ena: Port8,
+    int_ident: Port8,
+    line_control: Port8,
+    modem_control: Port8,
+    line_status: Port8,
+    modem_status: Port8,
+    scratch: Port8,
+}
+
+impl Serial {
+    const COM0_BASE: u16 = 0x3f8;
+    const COM1_BASE: u16 = 0x2f8;
+
+    const COM0_IRQ: u8 = 4;
+    const COM1_IRQ: u8 = 3;
+
+    fn enable_interrupts(&self) {
+        // Enable interrupt #0: Received data available
+        self.int_ena.write(0x01);
+    }
+
+    pub fn new(id: u32, base_port: u16) -> KResult<Self> {
+        let port = Self {
+            id,
+            name: Arc::from(format!("ttyS{id}")),
+            terminal: None,
+            tx_rx: Port8::new(base_port),
+            int_ena: Port8::new(base_port + 1),
+            int_ident: Port8::new(base_port + 2),
+            line_control: Port8::new(base_port + 3),
+            modem_control: Port8::new(base_port + 4),
+            line_status: Port8::new(base_port + 5),
+            modem_status: Port8::new(base_port + 6),
+            scratch: Port8::new(base_port + 7),
+        };
+
+        port.int_ena.write(0x00); // Disable all interrupts
+        port.line_control.write(0x80); // Enable DLAB (set baud rate divisor)
+        port.tx_rx.write(0x00); // Set divisor to 0 (lo byte) 115200 baud rate
+        port.int_ena.write(0x00); //              0 (hi byte)
+        port.line_control.write(0x03); // 8 bits, no parity, one stop bit
+        port.int_ident.write(0xc7); // Enable FIFO, clear them, with 14-byte threshold
+        port.modem_control.write(0x0b); // IRQs enabled, RTS/DSR set
+        port.modem_control.write(0x1e); // Set in loopback mode, test the serial chip
+        port.tx_rx.write(0x19); // Test serial chip (send byte 0x19 and check if serial returns
+                                // same byte)
+        if port.tx_rx.read() != 0x19 {
+            return Err(EIO);
+        }
+
+        port.modem_control.write(0x0f); // Return to normal operation mode
+        Ok(port)
+    }
+
+    fn irq_handler(&self) {
+        let terminal = self.terminal.as_ref();
+        while self.line_status.read() & 0x01 != 0 {
+            let ch = self.tx_rx.read();
+
+            if let Some(terminal) = terminal {
+                terminal.commit_char(ch);
+            }
+        }
+    }
+
+    fn register_char_device(port: Self) -> KResult<()> {
+        let mut port = Arc::new(port);
+        let terminal = Terminal::new(port.clone());
+
+        // TODO!!!!!!: This is unsafe; we should find a way to avoid it.
+        //             Under SMP, publishing `terminal` must be made atomic.
+        unsafe { Arc::get_mut_unchecked(&mut port) }.terminal = Some(terminal.clone());
+
+        {
+            let port = port.clone();
+            let irq_no = match port.id {
+                0 => Serial::COM0_IRQ,
+                1 => Serial::COM1_IRQ,
+                _ => unreachable!(),
+            };
+
+            register_irq_handler(irq_no as i32, move || {
+                port.irq_handler();
+            })?;
+        }
+        port.enable_interrupts();
+        dont_check!(Console::register_terminal(&terminal));
+
+        CharDevice::register(
+            make_device(4, 64 + port.id),
+            port.name.clone(),
+            CharDeviceType::Terminal(terminal),
+        )?;
+
+        Ok(())
+    }
+}
+
+impl TerminalDevice for Serial {
+    fn putchar(&self, ch: u8) {
+        loop {
+            // Poll the line status register until the transmit holding
+            // register is empty (bit 5), then send the byte.
+            let status = self.line_status.read();
+            if status & 0x20 != 0 {
+                self.tx_rx.write(ch);
+                return;
+            }
+        }
+    }
+}
+
+pub fn init() -> KResult<()> {
+    let com0 = Serial::new(0, Serial::COM0_BASE);
+    let com1 = Serial::new(1, Serial::COM1_BASE);
+
+    if let Ok(port) = com0 {
+        Serial::register_char_device(port)?;
+    }
+
+    if let Ok(port) = com1 {
+        Serial::register_char_device(port)?;
+    }
+
+    Ok(())
+}

+ 370 - 0
src/elf.rs

@@ -0,0 +1,370 @@
+use alloc::{ffi::CString, sync::Arc};
+use bitflags::bitflags;
+
+use crate::{
+    io::{RawBuffer, UninitBuffer},
+    kernel::{
+        constants::ENOEXEC,
+        mem::{FileMapping, MMList, Mapping, Permission, VAddr},
+        task::Thread,
+        user::{dataflow::CheckedUserPointer, UserPointerMut},
+        vfs::dentry::Dentry,
+    },
+    prelude::*,
+};
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfFormat {
+    Elf32 = 1,
+    Elf64 = 2,
+}
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfEndian {
+    Little = 1,
+    Big = 2,
+}
+
+#[repr(u8)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfABI {
+    // SystemV = 0,
+    Linux = 3,
+}
+
+#[repr(u16)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfType {
+    Relocatable = 1,
+    Executable = 2,
+    Dynamic = 3,
+    Core = 4,
+}
+
+#[repr(u16)]
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum ElfArch {
+    X86 = 0x03,
+    Arm = 0x28,
+    IA64 = 0x32,
+    X86_64 = 0x3e,
+    AArch64 = 0xb7,
+    RiscV = 0xf3,
+}
+
+bitflags! {
+    #[derive(Default, Clone, Copy)]
+    pub struct Elf32PhFlags: u32 {
+        const Exec = 1;
+        const Write = 2;
+        const Read = 4;
+    }
+
+    #[derive(Default, Clone, Copy)]
+    pub struct Elf32ShFlags: u32 {
+        const Write = 1;
+        const Alloc = 2;
+        const Exec = 4;
+        const MaskProc = 0xf0000000;
+    }
+}
+
+#[derive(Default, Clone, Copy, PartialEq, Eq)]
+pub enum Elf32PhType {
+    #[default]
+    Null = 0,
+    Load = 1,
+    Dynamic = 2,
+    Interp = 3,
+    Note = 4,
+    Shlib = 5,
+    Phdr = 6,
+    Tls = 7,
+    Loos = 0x60000000,
+    Hios = 0x6fffffff,
+    Loproc = 0x70000000,
+    Hiproc = 0x7fffffff,
+}
+
+#[derive(Default, Clone, Copy, PartialEq, Eq)]
+pub enum Elf32ShType {
+    #[default]
+    Null = 0,
+    ProgBits = 1,
+    SymTab = 2,
+    StrTab = 3,
+    Rela = 4,
+    Hash = 5,
+    Dynamic = 6,
+    Note = 7,
+    NoBits = 8,
+    Rel = 9,
+    Shlib = 10,
+    DynSym = 11,
+    InitArray = 14,
+    FiniArray = 15,
+    PreInitArray = 16,
+    Group = 17,
+    SymTabShndx = 18,
+    Loos = 0x60000000,
+    Hios = 0x6fffffff,
+    Loproc = 0x70000000,
+    Hiproc = 0x7fffffff,
+}
+
+#[repr(C, packed)]
+#[derive(Clone, Copy)]
+pub struct Elf32Header {
+    /// ELF magic number: 0x7f, "ELF"
+    pub magic: [u8; 4],
+    pub format: ElfFormat,
+    pub endian: ElfEndian,
+    /// ELF version, should be 1
+    pub version: u8,
+    pub abi: ElfABI,
+    pub abi_version: u8,
+    padding: [u8; 7],
+    pub elf_type: ElfType,
+    pub arch: ElfArch,
+    /// ELF version, should be 1
+    pub version2: u32,
+    pub entry: u32,
+    pub ph_offset: u32,
+    pub sh_offset: u32,
+    pub flags: u32,
+    pub eh_size: u16,
+    pub ph_entry_size: u16,
+    pub ph_entry_count: u16,
+    pub sh_entry_size: u16,
+    pub sh_entry_count: u16,
+    pub sh_str_index: u16,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+pub struct Elf32PhEntry {
+    pub ph_type: Elf32PhType,
+    pub offset: u32,
+    pub vaddr: u32,
+    pub paddr: u32,
+    pub file_size: u32,
+    pub mem_size: u32,
+    pub flags: Elf32PhFlags,
+    /// `0` and `1` for no alignment, otherwise power of `2`
+    pub align: u32,
+}
+
+#[repr(C)]
+#[derive(Default, Clone, Copy)]
+pub struct Elf32ShEntry {
+    pub name_offset: u32,
+    pub sh_type: Elf32ShType,
+    pub flags: Elf32ShFlags,
+    pub addr: u32,
+    pub offset: u32,
+    pub size: u32,
+    pub link: u32,
+    pub info: u32,
+    pub addr_align: u32,
+    pub entry_size: u32,
+}
+
+pub struct ParsedElf32 {
+    entry: u32,
+    file: Arc<Dentry>,
+    phents: Vec<Elf32PhEntry>,
+    shents: Vec<Elf32ShEntry>,
+}
+
+const ELF_MAGIC: [u8; 4] = *b"\x7fELF";
+
+impl Elf32Header {
+    fn check_valid(&self) -> bool {
+        self.magic == ELF_MAGIC
+            && self.version == 1
+            && self.version2 == 1
+            && self.eh_size as usize == size_of::<Elf32Header>()
+            && self.ph_entry_size as usize == size_of::<Elf32PhEntry>()
+            && self.sh_entry_size as usize == size_of::<Elf32ShEntry>()
+    }
+}
+
+impl ParsedElf32 {
+    pub fn parse(file: Arc<Dentry>) -> KResult<Self> {
+        let mut header = UninitBuffer::<Elf32Header>::new();
+        file.read(&mut header, 0)?;
+
+        let header = header.assume_init().ok_or(ENOEXEC)?;
+        if !header.check_valid() {
+            return Err(ENOEXEC);
+        }
+
+        // TODO: Use `UninitBuffer` for `phents` and `shents`.
+        let mut phents = vec![Elf32PhEntry::default(); header.ph_entry_count as usize];
+        let nread = file.read(
+            &mut RawBuffer::new_from_slice(phents.as_mut_slice()),
+            header.ph_offset as usize,
+        )?;
+        if nread != header.ph_entry_count as usize * size_of::<Elf32PhEntry>() {
+            return Err(ENOEXEC);
+        }
+
+        let mut shents = vec![Elf32ShEntry::default(); header.sh_entry_count as usize];
+        let nread = file.read(
+            &mut RawBuffer::new_from_slice(shents.as_mut_slice()),
+            header.sh_offset as usize,
+        )?;
+        if nread != header.sh_entry_count as usize * size_of::<Elf32ShEntry>() {
+            return Err(ENOEXEC);
+        }
+
+        Ok(Self {
+            entry: header.entry,
+            file,
+            phents,
+            shents,
+        })
+    }
+
+    /// Load the ELF file into memory and build the initial user stack.
+    ///
+    /// We clear the user address space and then map the program headers into
+    /// memory, so there is no way to roll back once loading has started:
+    /// a failure from this point on leaves the process unusable.
+    ///
+    /// # Return
+    /// `(entry_ip, sp)`
+    pub fn load(
+        self,
+        mm_list: &MMList,
+        args: Vec<CString>,
+        envs: Vec<CString>,
+    ) -> KResult<(VAddr, VAddr)> {
+        mm_list.clear_user();
+
+        let mut data_segment_end = VAddr(0);
+        for phent in self
+            .phents
+            .into_iter()
+            .filter(|ent| ent.ph_type == Elf32PhType::Load)
+        {
+            let vaddr_start = VAddr(phent.vaddr as usize);
+            let vmem_vaddr_end = vaddr_start + phent.mem_size as usize;
+            let load_vaddr_end = vaddr_start + phent.file_size as usize;
+
+            let vaddr = vaddr_start.floor();
+            let vmem_len = vmem_vaddr_end.ceil() - vaddr;
+            let file_len = load_vaddr_end.ceil() - vaddr;
+            let file_offset = phent.offset as usize & !0xfff;
+
+            let permission = Permission {
+                write: phent.flags.contains(Elf32PhFlags::Write),
+                execute: phent.flags.contains(Elf32PhFlags::Exec),
+            };
+
+            if file_len != 0 {
+                let real_file_length = load_vaddr_end - vaddr;
+                mm_list.mmap_fixed(
+                    vaddr,
+                    file_len,
+                    Mapping::File(FileMapping::new(
+                        self.file.clone(),
+                        file_offset,
+                        real_file_length,
+                    )),
+                    permission,
+                )?;
+            }
+
+            if vmem_len > file_len {
+                mm_list.mmap_fixed(
+                    vaddr + file_len,
+                    vmem_len - file_len,
+                    Mapping::Anonymous,
+                    permission,
+                )?;
+            }
+
+            if vaddr + vmem_len > data_segment_end {
+                data_segment_end = vaddr + vmem_len;
+            }
+        }
+
+        mm_list.register_break(data_segment_end + 0x10000);
+
+        // Map stack area
+        mm_list.mmap_fixed(
+            VAddr(0xc0000000 - 0x800000), // Stack bottom is at 0xc0000000
+            0x800000,                     // 8MB stack size
+            Mapping::Anonymous,
+            Permission {
+                write: true,
+                execute: false,
+            },
+        )?;
+
+        // TODO!!!!!: A temporary workaround.
+        mm_list.switch_page_table();
+
+        let mut sp = 0xc0000000u32;
+        let arg_addrs = args
+            .into_iter()
+            .map(|arg| push_string(&mut sp, arg))
+            .collect::<Vec<_>>();
+
+        let env_addrs = envs
+            .into_iter()
+            .map(|env| push_string(&mut sp, env))
+            .collect::<Vec<_>>();
+
+        let longs = 2 // Null auxiliary vector entry
+            + env_addrs.len() + 1 // Envs + null
+            + arg_addrs.len() + 1 // Args + null
+            + 1; // argc
+
+        sp -= longs as u32 * 4;
+        sp &= !0xf; // Align to 16 bytes
+
+        let mut cursor = (0..longs)
+            .map(|idx| UserPointerMut::<u32>::new_vaddr(sp as usize + size_of::<u32>() * idx));
+
+        // argc
+        cursor.next().unwrap()?.write(arg_addrs.len() as u32)?;
+
+        // args
+        for arg_addr in arg_addrs.into_iter() {
+            cursor.next().unwrap()?.write(arg_addr)?;
+        }
+        cursor.next().unwrap()?.write(0)?; // null
+
+        // envs
+        for env_addr in env_addrs.into_iter() {
+            cursor.next().unwrap()?.write(env_addr)?;
+        }
+        cursor.next().unwrap()?.write(0)?; // null
+
+        // Null auxiliary vector
+        cursor.next().unwrap()?.write(0)?; // AT_NULL
+        cursor.next().unwrap()?.write(0)?; // AT_NULL
+
+        // TODO!!!!!: A temporary workaround.
+        Thread::current().process.mm_list.switch_page_table();
+
+        assert!(cursor.next().is_none());
+        Ok((VAddr(self.entry as usize), VAddr(sp as usize)))
+    }
+}
+
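The stack set-up in `load` above can be hard to follow from the pointer arithmetic alone. Below is a host-side sketch of the word layout it produces, using a `Vec<u32>` in place of `UserPointerMut`; addresses are made up and the 16-byte alignment of `sp` is omitted:

```rust
// Model of the initial-stack words written by `load`:
// [argc][argv...][NULL][envp...][NULL][AT_NULL, AT_NULL].
fn build_stack_words(arg_addrs: &[u32], env_addrs: &[u32]) -> Vec<u32> {
    let mut words = Vec::new();
    words.push(arg_addrs.len() as u32); // argc
    words.extend_from_slice(arg_addrs); // argv pointers
    words.push(0);                      // argv NULL terminator
    words.extend_from_slice(env_addrs); // envp pointers
    words.push(0);                      // envp NULL terminator
    words.push(0);                      // AT_NULL (type)
    words.push(0);                      // AT_NULL (value)
    words
}

fn main() {
    let words = build_stack_words(&[0xbfff_ff00, 0xbfff_ff10], &[0xbfff_ff20]);
    // 1 (argc) + 2 args + 1 + 1 env + 1 + 2 aux = 8 u32 words,
    // matching the `longs` computation in `load`.
    assert_eq!(words.len(), 8);
    assert_eq!(words[0], 2); // argc
}
```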
+fn push_string(sp: &mut u32, string: CString) -> u32 {
+    let data = string.as_bytes_with_nul();
+    let new_sp = (*sp - data.len() as u32) & !0x3; // Align to 4 bytes
+
+    CheckedUserPointer::new(new_sp as *const u8, data.len())
+        .unwrap()
+        .write(data.as_ptr() as _, data.len())
+        .unwrap();
+
+    *sp = new_sp;
+    new_sp
+}
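A quick worked example of the alignment arithmetic in `push_string` above (the concrete addresses are illustrative):

```rust
// Subtract the string length (including NUL) from sp, then round down
// to a 4-byte boundary, exactly as `push_string` does.
fn push_len(sp: u32, len_with_nul: u32) -> u32 {
    (sp - len_with_nul) & !0x3
}

fn main() {
    // "hello\0" is 6 bytes: 0xc000_0000 - 6 = 0xbfff_fffa, aligned down to ...f8.
    assert_eq!(push_len(0xc000_0000, 6), 0xbfff_fff8);
}
```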

+ 137 - 148
src/fs/fat32.rs

@@ -1,4 +1,10 @@
-use alloc::{sync::Arc, vec::Vec};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+
+use alloc::{
+    collections::btree_map::BTreeMap,
+    sync::{Arc, Weak},
+    vec::Vec,
+};
 use bindings::{EINVAL, EIO, S_IFDIR, S_IFREG};
 use bindings::{EINVAL, EIO, S_IFDIR, S_IFREG};
 
 
 use itertools::Itertools;
 use itertools::Itertools;
@@ -10,10 +16,10 @@ use crate::{
         mem::{paging::Page, phys::PhysPtr},
         mem::{paging::Page, phys::PhysPtr},
         vfs::{
         vfs::{
             dentry::Dentry,
             dentry::Dentry,
-            inode::{Ino, Inode, InodeCache, InodeOps},
+            inode::{define_struct_inode, Ino, Inode, InodeData},
             mount::{register_filesystem, Mount, MountCreator},
             mount::{register_filesystem, Mount, MountCreator},
             vfs::Vfs,
             vfs::Vfs,
-            DevId, ReadDirCallback,
+            DevId,
         },
         },
     },
     },
     prelude::*,
     prelude::*,
@@ -131,19 +137,35 @@ struct Bootsector {
     mbr_signature: u16,
     mbr_signature: u16,
 }
 }
 
 
+impl_any!(FatFs);
 /// # Lock order
 /// # Lock order
-/// 1. FatFs
 /// 2. FatTable
 /// 2. FatTable
 /// 3. Inodes
 /// 3. Inodes
 ///
 ///
 struct FatFs {
 struct FatFs {
-    device: Arc<BlockDevice>,
-    icache: Mutex<InodeCache<FatFs>>,
     sectors_per_cluster: u8,
     sectors_per_cluster: u8,
     rootdir_cluster: ClusterNo,
     rootdir_cluster: ClusterNo,
     data_start: u64,
     data_start: u64,
-    fat: Mutex<Vec<ClusterNo>>,
-    volume_label: String,
+    volume_label: [u8; 11],
+
+    device: Arc<BlockDevice>,
+    fat: RwSemaphore<Vec<ClusterNo>>,
+    weak: Weak<FatFs>,
+    icache: BTreeMap<Ino, FatInode>,
+}
+
+impl Vfs for FatFs {
+    fn io_blksize(&self) -> usize {
+        4096
+    }
+
+    fn fs_devid(&self) -> DevId {
+        self.device.devid()
+    }
+
+    fn is_read_only(&self) -> bool {
+        true
+    }
 }
 }
 
 
 impl FatFs {
 impl FatFs {
@@ -151,8 +173,7 @@ impl FatFs {
         let cluster = cluster - 2;
         let cluster = cluster - 2;
 
 
         let rq = BlockDeviceRequest {
         let rq = BlockDeviceRequest {
-            sector: self.data_start as u64
-                + cluster as u64 * self.sectors_per_cluster as u64,
+            sector: self.data_start as u64 + cluster as u64 * self.sectors_per_cluster as u64,
             count: self.sectors_per_cluster as u64,
             count: self.sectors_per_cluster as u64,
             buffer: core::slice::from_ref(buf),
             buffer: core::slice::from_ref(buf),
         };
         };
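The request above maps a FAT cluster number to an absolute sector: clusters are numbered from 2, so after the `cluster - 2` adjustment the sector is `data_start + cluster * sectors_per_cluster`. A worked example with made-up geometry:

```rust
// Worked example of the FAT data-area mapping used in the read path above.
fn cluster_to_sector(data_start: u64, sectors_per_cluster: u8, cluster: u32) -> u64 {
    data_start + (cluster as u64 - 2) * sectors_per_cluster as u64
}

fn main() {
    // e.g. reserved sectors + FAT copies end at sector 2050, 8 sectors per cluster
    assert_eq!(cluster_to_sector(2050, 8, 2), 2050); // first data cluster
    assert_eq!(cluster_to_sector(2050, 8, 3), 2058);
}
```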
@@ -160,57 +181,34 @@ impl FatFs {
 
 
         Ok(())
         Ok(())
     }
     }
-}
-
-impl InodeCache<FatFs> {
-    fn get_or_alloc(
-        &mut self,
-        ino: Ino,
-        is_directory: bool,
-        size: u64,
-    ) -> KResult<Arc<Inode>> {
-        self.get(ino).map(|inode| Ok(inode)).unwrap_or_else(|| {
-            let nlink;
-            let mut mode = 0o777;
-
-            let ops: Box<dyn InodeOps>;
-
-            if is_directory {
-                nlink = 2;
-                mode |= S_IFDIR;
-                ops = Box::new(DirOps);
-            } else {
-                nlink = 1;
-                mode |= S_IFREG;
-                ops = Box::new(FileOps);
-            }
-
-            let mut inode = self.alloc(ino, ops);
-            let inode_mut = unsafe { Arc::get_mut_unchecked(&mut inode) };
-            let inode_idata = inode_mut.idata.get_mut();
 
 
-            inode_idata.mode = mode;
-            inode_idata.nlink = nlink;
-            inode_idata.size = size;
-
-            self.submit(&inode)?;
-
-            Ok(inode)
-        })
+    fn get_or_alloc_inode(&self, ino: Ino, is_directory: bool, size: u32) -> Arc<dyn Inode> {
+        self.icache
+            .get(&ino)
+            .cloned()
+            .map(FatInode::unwrap)
+            .unwrap_or_else(|| {
+                if is_directory {
+                    DirInode::new(ino, self.weak.clone(), size)
+                } else {
+                    FileInode::new(ino, self.weak.clone(), size)
+                }
+            })
     }
     }
 }
 }
 
 
 impl FatFs {
 impl FatFs {
-    pub fn create(device: DevId) -> KResult<(Arc<Self>, Arc<Inode>)> {
+    pub fn create(device: DevId) -> KResult<(Arc<Self>, Arc<dyn Inode>)> {
         let device = BlockDevice::get(device)?;
         let device = BlockDevice::get(device)?;
-        let mut fatfs_arc = Arc::new_cyclic(|weak| Self {
+        let mut fatfs_arc = Arc::new_cyclic(|weak: &Weak<FatFs>| Self {
             device,
             device,
-            icache: Mutex::new(InodeCache::new(weak.clone())),
             sectors_per_cluster: 0,
             sectors_per_cluster: 0,
             rootdir_cluster: 0,
             rootdir_cluster: 0,
             data_start: 0,
             data_start: 0,
-            fat: Mutex::new(Vec::new()),
-            volume_label: String::new(),
+            fat: RwSemaphore::new(Vec::new()),
+            weak: weak.clone(),
+            icache: BTreeMap::new(),
+            volume_label: [0; 11],
         });
         });
 
 
         let fatfs = unsafe { Arc::get_mut_unchecked(&mut fatfs_arc) };
         let fatfs = unsafe { Arc::get_mut_unchecked(&mut fatfs_arc) };
@@ -221,13 +219,13 @@ impl FatFs {
 
 
         fatfs.sectors_per_cluster = info.sectors_per_cluster;
         fatfs.sectors_per_cluster = info.sectors_per_cluster;
         fatfs.rootdir_cluster = info.root_cluster;
         fatfs.rootdir_cluster = info.root_cluster;
-        fatfs.data_start = info.reserved_sectors as u64
-            + info.fat_copies as u64 * info.sectors_per_fat as u64;
+        fatfs.data_start =
+            info.reserved_sectors as u64 + info.fat_copies as u64 * info.sectors_per_fat as u64;
 
 
         let fat = fatfs.fat.get_mut();
         let fat = fatfs.fat.get_mut();
+
         fat.resize(
         fat.resize(
-            512 * info.sectors_per_fat as usize
-                / core::mem::size_of::<ClusterNo>(),
+            512 * info.sectors_per_fat as usize / core::mem::size_of::<ClusterNo>(),
             0,
             0,
         );
         );
 
 
@@ -242,51 +240,21 @@ impl FatFs {
             return Err(EIO);
             return Err(EIO);
         }
         }
 
 
-        fatfs.volume_label = String::from(
-            str::from_utf8(&info.volume_label)
-                .map_err(|_| EINVAL)?
-                .trim_end_matches(char::from(' ')),
-        );
-
-        let root_dir_cluster_count =
-            ClusterIterator::new(&fat, fatfs.rootdir_cluster).count();
-
-        let root_inode = {
-            let icache = fatfs.icache.get_mut();
-
-            let mut inode =
-                icache.alloc(info.root_cluster as Ino, Box::new(DirOps));
-            let inode_mut = unsafe { Arc::get_mut_unchecked(&mut inode) };
-            let inode_idata = inode_mut.idata.get_mut();
-
-            inode_idata.mode = S_IFDIR | 0o777;
-            inode_idata.nlink = 2;
-            inode_idata.size = root_dir_cluster_count as u64
-                * info.sectors_per_cluster as u64
-                * 512;
+        info.volume_label
+            .iter()
+            .take_while(|&&c| c != ' ' as u8)
+            .take(11)
+            .enumerate()
+            .for_each(|(idx, c)| fatfs.volume_label[idx] = *c);
 
 
-            icache.submit(&inode)?;
-            inode
-        };
+        let root_dir_cluster_count = ClusterIterator::new(fat, fatfs.rootdir_cluster).count();
+        let root_dir_size = root_dir_cluster_count as u32 * info.sectors_per_cluster as u32 * 512;
+        let root_inode = DirInode::new(info.root_cluster as Ino, fatfs.weak.clone(), root_dir_size);
 
 
         Ok((fatfs_arc, root_inode))
         Ok((fatfs_arc, root_inode))
     }
     }
 }
 }
 
 
-impl Vfs for FatFs {
-    fn io_blksize(&self) -> usize {
-        4096
-    }
-
-    fn fs_devid(&self) -> DevId {
-        self.device.devid()
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
-
 struct ClusterIterator<'fat> {
 struct ClusterIterator<'fat> {
     fat: &'fat [ClusterNo],
     fat: &'fat [ClusterNo],
     cur: ClusterNo,
     cur: ClusterNo,
@@ -371,24 +339,47 @@ impl<'fat> Iterator for ClusterIterator<'fat> {
     }
     }
 }
 }
 
 
-struct FileOps;
-impl InodeOps for FileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+#[derive(Clone)]
+enum FatInode {
+    File(Arc<FileInode>),
+    Dir(Arc<DirInode>),
+}
+
+impl FatInode {
+    fn unwrap(self) -> Arc<dyn Inode> {
+        match self {
+            FatInode::File(inode) => inode,
+            FatInode::Dir(inode) => inode,
+        }
     }
     }
+}
 
 
-    fn read(
-        &self,
-        inode: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
-        let vfs = inode.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+define_struct_inode! {
+    struct FileInode;
+}
+
+impl FileInode {
+    fn new(ino: Ino, weak: Weak<FatFs>, size: u32) -> Arc<Self> {
+        let inode = Arc::new(Self {
+            idata: InodeData::new(ino, weak),
+        });
+
+        // Safety: We are initializing the inode
+        inode.nlink.store(1, Ordering::Relaxed);
+        inode.mode.store(S_IFREG | 0o777, Ordering::Relaxed);
+        inode.size.store(size as u64, Ordering::Relaxed);
+
+        inode
+    }
+}
+
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
 
-        let iter = ClusterIterator::new(&fat, inode.ino as ClusterNo)
-            .read(vfs, offset);
+        let iter = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).read(vfs, offset);
 
 
         for data in iter {
         for data in iter {
             if buffer.fill(data?)?.should_stop() {
             if buffer.fill(data?)?.should_stop() {
@@ -400,23 +391,32 @@ impl InodeOps for FileOps {
     }
     }
 }
 }
 
 
-struct DirOps;
-impl InodeOps for DirOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+define_struct_inode! {
+    struct DirInode;
+}
+
+impl DirInode {
+    fn new(ino: Ino, weak: Weak<FatFs>, size: u32) -> Arc<Self> {
+        let inode = Arc::new(Self {
+            idata: InodeData::new(ino, weak),
+        });
+
+        // Safety: We are initializing the inode
+        inode.nlink.store(2, Ordering::Relaxed);
+        inode.mode.store(S_IFDIR | 0o777, Ordering::Relaxed);
+        inode.size.store(size as u64, Ordering::Relaxed);
+
+        inode
     }
     }
+}
 
 
-    fn lookup(
-        &self,
-        dir: &Inode,
-        dentry: &Arc<Dentry>,
-    ) -> KResult<Option<Arc<Inode>>> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+impl Inode for DirInode {
+    fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
 
-        let mut entries =
-            ClusterIterator::new(&fat, dir.ino as ClusterNo).dirs(vfs, 0);
+        let mut entries = ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).dirs(vfs, 0);
 
 
         let entry = entries.find_map(|entry| {
         let entry = entries.find_map(|entry| {
             if entry.is_err() {
             if entry.is_err() {
@@ -438,28 +438,27 @@ impl InodeOps for DirOps {
             Some(Ok(entry)) => {
             Some(Ok(entry)) => {
                 let ino = entry.ino();
                 let ino = entry.ino();
 
 
-                Ok(Some(vfs.icache.lock().get_or_alloc(
+                Ok(Some(vfs.get_or_alloc_inode(
                     ino,
                     ino,
                     entry.is_directory(),
                     entry.is_directory(),
-                    entry.size as u64,
-                )?))
+                    entry.size,
+                )))
             }
             }
         }
         }
     }
     }
 
 
-    fn readdir<'cb, 'r: 'cb>(
-        &'r self,
-        dir: &'r Inode,
+    fn do_readdir(
+        &self,
         offset: usize,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
     ) -> KResult<usize> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<FatFs>().ok_or(EINVAL)?;
-        let fat = vfs.fat.lock();
+        let vfs = self.vfs.upgrade().ok_or(EIO)?;
+        let vfs = vfs.as_any().downcast_ref::<FatFs>().unwrap();
+        let fat = vfs.fat.lock_shared();
 
 
         const ENTRY_SIZE: usize = core::mem::size_of::<FatDirectoryEntry>();
         const ENTRY_SIZE: usize = core::mem::size_of::<FatDirectoryEntry>();
         let cluster_iter =
         let cluster_iter =
-            ClusterIterator::new(&fat, dir.ino as ClusterNo).dirs(vfs, offset);
+            ClusterIterator::new(fat.as_ref(), self.ino as ClusterNo).dirs(vfs, offset);
 
 
         let mut nread = 0;
         let mut nread = 0;
         for entry in cluster_iter {
         for entry in cluster_iter {
@@ -473,13 +472,9 @@ impl InodeOps for DirOps {
             let ino = entry.ino();
             let ino = entry.ino();
             let name = entry.filename();
             let name = entry.filename();
 
 
-            vfs.icache.lock().get_or_alloc(
-                ino,
-                entry.is_directory(),
-                entry.size as u64,
-            )?;
+            vfs.get_or_alloc_inode(ino, entry.is_directory(), entry.size);
 
 
-            if callback(name.as_ref(), ino).is_err() {
+            if callback(name.as_ref(), ino)?.is_break() {
                 break;
                 break;
             }
             }
 
 
@@ -493,13 +488,7 @@ impl InodeOps for DirOps {
 struct FatMountCreator;
 struct FatMountCreator;
 
 
 impl MountCreator for FatMountCreator {
 impl MountCreator for FatMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        _flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, _flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let (fatfs, root_inode) = FatFs::create(make_device(8, 1))?;
         let (fatfs, root_inode) = FatFs::create(make_device(8, 1))?;
 
 
         Mount::new(mp, fatfs, root_inode)
         Mount::new(mp, fatfs, root_inode)
@@ -507,5 +496,5 @@ impl MountCreator for FatMountCreator {
 }
 }
 
 
 pub fn init() {
 pub fn init() {
-    register_filesystem("fat32", Box::new(FatMountCreator)).unwrap();
+    register_filesystem("fat32", Arc::new(FatMountCreator)).unwrap();
 }
 }

+ 170 - 140
src/fs/procfs.rs

@@ -1,7 +1,11 @@
-use core::sync::atomic::Ordering;
-
-use alloc::sync::{Arc, Weak};
+use alloc::{
+    collections::btree_map::BTreeMap,
+    sync::{Arc, Weak},
+};
 use bindings::{EACCES, ENOTDIR, S_IFDIR, S_IFREG};
 use bindings::{EACCES, ENOTDIR, S_IFDIR, S_IFREG};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use itertools::Itertools;
+use lazy_static::lazy_static;
 
 
 use crate::{
 use crate::{
     io::Buffer,
     io::Buffer,
@@ -9,13 +13,14 @@ use crate::{
         mem::paging::{Page, PageBuffer},
         mem::paging::{Page, PageBuffer},
         vfs::{
         vfs::{
             dentry::Dentry,
             dentry::Dentry,
-            inode::{AtomicIno, Inode, InodeCache, InodeData, InodeOps},
+            inode::{define_struct_inode, AtomicIno, Ino, Inode, InodeData},
             mount::{dump_mounts, register_filesystem, Mount, MountCreator},
             mount::{dump_mounts, register_filesystem, Mount, MountCreator},
             vfs::Vfs,
             vfs::Vfs,
-            DevId, ReadDirCallback,
+            DevId,
         },
         },
     },
     },
     prelude::*,
     prelude::*,
+    sync::Locked,
 };
 };
 
 
 fn split_len_offset(data: &[u8], len: usize, offset: usize) -> Option<&[u8]> {
 fn split_len_offset(data: &[u8], len: usize, offset: usize) -> Option<&[u8]> {
@@ -24,8 +29,6 @@ fn split_len_offset(data: &[u8], len: usize, offset: usize) -> Option<&[u8]> {
     real_data.split_at_checked(offset).map(|(_, data)| data)
     real_data.split_at_checked(offset).map(|(_, data)| data)
 }
 }
 
 
-pub struct ProcFsNode(Arc<Inode>);
-
 pub trait ProcFsFile: Send + Sync {
 pub trait ProcFsFile: Send + Sync {
     fn can_read(&self) -> bool {
     fn can_read(&self) -> bool {
         false
         false
@@ -44,21 +47,57 @@ pub trait ProcFsFile: Send + Sync {
     }
     }
 }
 }
 
 
-struct ProcFsFileOps {
-    file: Box<dyn ProcFsFile>,
+pub enum ProcFsNode {
+    File(Arc<FileInode>),
+    Dir(Arc<DirInode>),
 }
 }
 
 
-impl InodeOps for ProcFsFileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl ProcFsNode {
+    fn unwrap(&self) -> Arc<dyn Inode> {
+        match self {
+            ProcFsNode::File(inode) => inode.clone(),
+            ProcFsNode::Dir(inode) => inode.clone(),
+        }
     }
     }
 
 
-    fn read(
-        &self,
-        _: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
+    fn ino(&self) -> Ino {
+        match self {
+            ProcFsNode::File(inode) => inode.ino,
+            ProcFsNode::Dir(inode) => inode.ino,
+        }
+    }
+}
+
+define_struct_inode! {
+    pub struct FileInode {
+        file: Box<dyn ProcFsFile>,
+    }
+}
+
+impl FileInode {
+    pub fn new(ino: Ino, vfs: Weak<ProcFs>, file: Box<dyn ProcFsFile>) -> Arc<Self> {
+        let mut mode = S_IFREG;
+        if file.can_read() {
+            mode |= 0o444;
+        }
+        if file.can_write() {
+            mode |= 0o200;
+        }
+
+        let inode = Self {
+            idata: InodeData::new(ino, vfs),
+            file,
+        };
+
+        inode.idata.mode.store(mode, Ordering::Relaxed);
+        inode.idata.nlink.store(1, Ordering::Relaxed);
+
+        Arc::new(inode)
+    }
+}
+
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
         if !self.file.can_read() {
         if !self.file.can_read() {
             return Err(EACCES);
             return Err(EACCES);
         }
         }
@@ -75,47 +114,56 @@ impl InodeOps for ProcFsFileOps {
     }
     }
 }
 }
 
 
-struct ProcFsDirectory {
-    entries: Mutex<Vec<(Arc<[u8]>, ProcFsNode)>>,
+define_struct_inode! {
+    struct DirInode {
+        entries: Locked<Vec<(Arc<[u8]>, ProcFsNode)>, ()>,
+    }
 }
 }
 
 
-impl InodeOps for ProcFsDirectory {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl DirInode {
+    pub fn new(ino: Ino, vfs: Weak<ProcFs>) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, entries).write(Locked::new(vec![], rwsem));
+            addr_of_mut_field!(inode, mode).write((S_IFDIR | 0o755).into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
     }
+}
 
 
-    fn lookup(
-        &self,
-        _: &Inode,
-        dentry: &Arc<Dentry>,
-    ) -> KResult<Option<Arc<Inode>>> {
-        Ok(self.entries.lock().iter().find_map(|(name, node)| {
-            name.as_ref()
-                .eq(dentry.name().as_ref())
-                .then(|| node.0.clone())
-        }))
+impl Inode for DirInode {
+    fn lookup(&self, dentry: &Arc<Dentry>) -> KResult<Option<Arc<dyn Inode>>> {
+        let lock = self.rwsem.lock_shared();
+        Ok(self
+            .entries
+            .access(lock.as_ref())
+            .iter()
+            .find_map(|(name, node)| {
+                name.as_ref()
+                    .eq(dentry.name().as_ref())
+                    .then(|| node.unwrap())
+            }))
     }
     }
 
 
-    fn readdir<'cb, 'r: 'cb>(
+    fn do_readdir(
         &self,
         &self,
-        _: &Inode,
         offset: usize,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
     ) -> KResult<usize> {
-        Ok(self
-            .entries
-            .lock()
+        let lock = self.rwsem.lock_shared();
+        self.entries
+            .access(lock.as_ref())
             .iter()
             .iter()
             .skip(offset)
             .skip(offset)
-            .take_while(|(name, ProcFsNode(inode))| {
-                callback(name, inode.ino).is_ok()
-            })
-            .count())
+            .map(|(name, node)| callback(name.as_ref(), node.ino()))
+            .take_while(|result| result.map_or(true, |flow| flow.is_continue()))
+            .take_while_inclusive(|result| result.is_ok())
+            .fold_ok(0, |acc, _| acc + 1)
     }
     }
 }
 }
 
 
+impl_any!(ProcFs);
 pub struct ProcFs {
 pub struct ProcFs {
-    root_node: Arc<Inode>,
+    root_node: Arc<DirInode>,
     next_ino: AtomicIno,
     next_ino: AtomicIno,
 }
 }
 
 
@@ -128,38 +176,37 @@ impl Vfs for ProcFs {
         10
         10
     }
     }
 
 
-    fn as_any(&self) -> &dyn Any {
-        self
+    fn is_read_only(&self) -> bool {
+        false
     }
     }
 }
 }
 
 
-static mut GLOBAL_PROCFS: Option<Arc<ProcFs>> = None;
-static mut ICACHE: Option<InodeCache<ProcFs>> = None;
+lazy_static! {
+    static ref ICACHE: Spin<BTreeMap<Ino, ProcFsNode>> = Spin::new(BTreeMap::new());
+    static ref GLOBAL_PROCFS: Arc<ProcFs> = {
+        let fs: Arc<ProcFs> = Arc::new_cyclic(|weak: &Weak<ProcFs>| ProcFs {
+            root_node: DirInode::new(0, weak.clone()),
+            next_ino: AtomicIno::new(1),
+        });
 
 
-fn get_icache() -> &'static InodeCache<ProcFs> {
-    unsafe { ICACHE.as_ref().unwrap() }
+        fs
+    };
 }
 }
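Both `FatFs::create` and the `GLOBAL_PROCFS` initializer above rely on `Arc::new_cyclic`, so the filesystem can hand out a `Weak` back-reference to itself (for example, to the inodes it creates) without forming a strong reference cycle. A minimal sketch of the pattern with illustrative types:

```rust
use std::sync::{Arc, Weak};

struct Fs {
    weak: Weak<Fs>,
    name: &'static str,
}

impl Fs {
    fn new() -> Arc<Self> {
        // The closure receives the Weak before the Arc exists, letting the
        // value store a handle to itself.
        Arc::new_cyclic(|weak| Fs {
            weak: weak.clone(),
            name: "examplefs",
        })
    }

    fn self_arc(&self) -> Arc<Self> {
        // Upgrading always succeeds while a strong Arc is alive.
        self.weak.upgrade().expect("fs still alive")
    }
}

fn main() {
    let fs = Fs::new();
    assert_eq!(Arc::strong_count(&fs), 1); // the weak self-reference adds no strong count
    assert_eq!(fs.self_arc().name, "examplefs");
}
```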
 
 
 struct ProcFsMountCreator;
 struct ProcFsMountCreator;
 
 
 impl ProcFsMountCreator {
 impl ProcFsMountCreator {
     pub fn get() -> Arc<ProcFs> {
     pub fn get() -> Arc<ProcFs> {
-        unsafe { GLOBAL_PROCFS.as_ref().cloned().unwrap() }
+        GLOBAL_PROCFS.clone()
     }
     }
 
 
     pub fn get_weak() -> Weak<ProcFs> {
     pub fn get_weak() -> Weak<ProcFs> {
-        unsafe { GLOBAL_PROCFS.as_ref().map(Arc::downgrade).unwrap() }
+        Arc::downgrade(&GLOBAL_PROCFS)
     }
     }
 }
 }
 
 
 impl MountCreator for ProcFsMountCreator {
 impl MountCreator for ProcFsMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        _flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, _flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let vfs = ProcFsMountCreator::get();
         let vfs = ProcFsMountCreator::get();
         let root_inode = vfs.root_node.clone();
         let root_inode = vfs.root_node.clone();
         Mount::new(mp, vfs, root_inode)
         Mount::new(mp, vfs, root_inode)
@@ -170,77 +217,55 @@ pub fn root() -> ProcFsNode {
     let vfs = ProcFsMountCreator::get();
     let vfs = ProcFsMountCreator::get();
     let root = vfs.root_node.clone();
     let root = vfs.root_node.clone();
 
 
-    ProcFsNode(root)
+    ProcFsNode::Dir(root)
 }
 }
 
 
 pub fn creat(
 pub fn creat(
     parent: &ProcFsNode,
     parent: &ProcFsNode,
-    name: &Arc<[u8]>,
+    name: Arc<[u8]>,
     file: Box<dyn ProcFsFile>,
     file: Box<dyn ProcFsFile>,
 ) -> KResult<ProcFsNode> {
 ) -> KResult<ProcFsNode> {
-    let mut mode = S_IFREG;
-    if file.can_read() {
-        mode |= 0o444;
-    }
-    if file.can_write() {
-        mode |= 0o200;
-    }
-
-    let dir = parent
-        .0
-        .ops
-        .as_any()
-        .downcast_ref::<ProcFsDirectory>()
-        .ok_or(ENOTDIR)?;
+    let parent = match parent {
+        ProcFsNode::File(_) => return Err(ENOTDIR),
+        ProcFsNode::Dir(parent) => parent,
+    };
 
 
     let fs = ProcFsMountCreator::get();
     let fs = ProcFsMountCreator::get();
-    let ino = fs.next_ino.fetch_add(1, Ordering::SeqCst);
+    let ino = fs.next_ino.fetch_add(1, Ordering::Relaxed);
 
 
-    let inode = get_icache().alloc(ino, Box::new(ProcFsFileOps { file }));
+    let inode = FileInode::new(ino, Arc::downgrade(&fs), file);
 
 
-    inode.idata.lock().mode = mode;
-    inode.idata.lock().nlink = 1;
-
-    dir.entries
-        .lock()
-        .push((name.clone(), ProcFsNode(inode.clone())));
+    {
+        let mut lock = parent.idata.rwsem.lock();
+        parent
+            .entries
+            .access_mut(lock.as_mut())
+            .push((name, ProcFsNode::File(inode.clone())));
+    }
 
 
-    Ok(ProcFsNode(inode))
+    Ok(ProcFsNode::File(inode))
 }
 }
 
 
 pub fn mkdir(parent: &ProcFsNode, name: &[u8]) -> KResult<ProcFsNode> {
 pub fn mkdir(parent: &ProcFsNode, name: &[u8]) -> KResult<ProcFsNode> {
-    let dir = parent
-        .0
-        .ops
-        .as_any()
-        .downcast_ref::<ProcFsDirectory>()
-        .ok_or(ENOTDIR)?;
-
-    let ino = ProcFsMountCreator::get()
-        .next_ino
-        .fetch_add(1, Ordering::SeqCst);
-
-    let inode = get_icache().alloc(
-        ino,
-        Box::new(ProcFsDirectory {
-            entries: Mutex::new(vec![]),
-        }),
-    );
+    let parent = match parent {
+        ProcFsNode::File(_) => return Err(ENOTDIR),
+        ProcFsNode::Dir(parent) => parent,
+    };
 
 
-    {
-        let mut idata = inode.idata.lock();
-        idata.nlink = 2;
-        idata.mode = S_IFDIR | 0o755;
-    }
+    let fs = ProcFsMountCreator::get();
+    let ino = fs.next_ino.fetch_add(1, Ordering::Relaxed);
 
 
-    dir.entries
-        .lock()
-        .push((Arc::from(name), ProcFsNode(inode.clone())));
+    let inode = DirInode::new(ino, Arc::downgrade(&fs));
 
 
-    Ok(ProcFsNode(inode))
+    parent
+        .entries
+        .access_mut(inode.rwsem.lock().as_mut())
+        .push((Arc::from(name), ProcFsNode::Dir(inode.clone())));
+
+    Ok(ProcFsNode::Dir(inode))
 }
 }
 
 
-struct DumpMountsFile {}
+struct DumpMountsFile;
 impl ProcFsFile for DumpMountsFile {
 impl ProcFsFile for DumpMountsFile {
     fn can_read(&self) -> bool {
     fn can_read(&self) -> bool {
         true
         true
@@ -254,43 +279,48 @@ impl ProcFsFile for DumpMountsFile {
 }
 }
 
 
 pub fn init() {
 pub fn init() {
-    let dir = ProcFsDirectory {
-        entries: Mutex::new(vec![]),
-    };
-
-    let fs: Arc<ProcFs> = Arc::new_cyclic(|weak: &Weak<ProcFs>| {
-        let root_node = Arc::new(Inode {
-            ino: 0,
-            vfs: weak.clone(),
-            idata: Mutex::new(InodeData::default()),
-            ops: Box::new(dir),
-        });
+    register_filesystem("procfs", Arc::new(ProcFsMountCreator)).unwrap();
 
 
-        ProcFs {
-            root_node,
-            next_ino: AtomicIno::new(1),
-        }
-    });
+    creat(
+        &root(),
+        Arc::from(b"mounts".as_slice()),
+        Box::new(DumpMountsFile),
+    )
+    .unwrap();
+}
 
 
-    {
-        let mut indata = fs.root_node.idata.lock();
-        indata.mode = S_IFDIR | 0o755;
-        indata.nlink = 1;
-    };
+pub struct GenericProcFsFile<ReadFn>
+where
+    ReadFn: Send + Sync + Fn(&mut PageBuffer) -> KResult<()>,
+{
+    read_fn: Option<ReadFn>,
+}
 
 
-    unsafe {
-        GLOBAL_PROCFS = Some(fs);
-        ICACHE = Some(InodeCache::new(ProcFsMountCreator::get_weak()));
-    };
+impl<ReadFn> ProcFsFile for GenericProcFsFile<ReadFn>
+where
+    ReadFn: Send + Sync + Fn(&mut PageBuffer) -> KResult<()>,
+{
+    fn can_read(&self) -> bool {
+        self.read_fn.is_some()
+    }
 
 
-    register_filesystem("procfs", Box::new(ProcFsMountCreator)).unwrap();
+    fn read(&self, buffer: &mut PageBuffer) -> KResult<usize> {
+        self.read_fn.as_ref().ok_or(EACCES)?(buffer).map(|_| buffer.len())
+    }
+}
 
 
+pub fn populate_root<F>(name: Arc<[u8]>, read_fn: F) -> KResult<()>
+where
+    F: Send + Sync + Fn(&mut PageBuffer) -> KResult<()> + 'static,
+{
     let root = root();
     let root = root();
 
 
     creat(
     creat(
         &root,
         &root,
-        &Arc::from(b"mounts".as_slice()),
-        Box::new(DumpMountsFile {}),
+        name,
+        Box::new(GenericProcFsFile {
+            read_fn: Some(read_fn),
+        }),
     )
     )
-    .unwrap();
+    .map(|_| ())
 }
 }
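
Note on the readdir change above: `do_readdir` now drives a `FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>` callback and returns how many entries were actually consumed, using itertools adapters. The standalone sketch below mirrors only that counting logic in user space; it assumes a recent `itertools` (which the diff already imports), and `Ino`/`KResult` here are simplified stand-ins rather than the kernel's definitions.

    use itertools::Itertools;
    use std::ops::ControlFlow;

    type Ino = u64;
    type KResult<T> = Result<T, u32>;

    // Count how many entries the callback fully consumed, stopping early when it
    // returns Break and propagating the first error instead of swallowing it.
    fn count_visited(
        entries: &[(&str, Ino)],
        offset: usize,
        callback: &mut dyn FnMut(&str, Ino) -> KResult<ControlFlow<(), ()>>,
    ) -> KResult<usize> {
        entries
            .iter()
            .skip(offset)
            .map(|&(name, ino)| callback(name, ino))
            // Stop once the callback asks to break out...
            .take_while(|result| result.as_ref().map_or(true, |flow| flow.is_continue()))
            // ...but still yield a trailing Err so it is not dropped silently.
            .take_while_inclusive(|result| result.is_ok())
            // Count the Ok items, or bubble the error up.
            .fold_ok(0, |acc, _| acc + 1)
    }

    fn main() {
        let entries = [("mounts", 1), ("meminfo", 2), ("uptime", 3)];

        // A reader whose buffer fills up after two entries answers Break.
        let mut taken = 0;
        let visited = count_visited(&entries, 0, &mut |_name, _ino| {
            if taken == 2 {
                return Ok(ControlFlow::Break(()));
            }
            taken += 1;
            Ok(ControlFlow::Continue(()))
        });
        assert_eq!(visited, Ok(2));
    }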

+ 207 - 243
src/fs/tmpfs.rs

@@ -1,383 +1,347 @@
-use core::sync::atomic::Ordering;
+use alloc::sync::{Arc, Weak};
+use bindings::{EINVAL, EIO, EISDIR, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFREG};
+use core::{ops::ControlFlow, sync::atomic::Ordering};
+use itertools::Itertools;
 
 
 use crate::{
 use crate::{
     io::Buffer,
     io::Buffer,
     kernel::vfs::{
     kernel::vfs::{
-        dentry::Dentry,
-        inode::{AtomicIno, Ino, Inode, InodeCache, InodeOps, Mode},
+        dentry::{dcache, Dentry},
+        inode::{define_struct_inode, AtomicIno, Ino, Inode, Mode, WriteOffset},
         mount::{register_filesystem, Mount, MountCreator, MS_RDONLY},
         mount::{register_filesystem, Mount, MountCreator, MS_RDONLY},
         s_isblk, s_ischr,
         s_isblk, s_ischr,
         vfs::Vfs,
         vfs::Vfs,
-        DevId, ReadDirCallback,
+        DevId,
     },
     },
     prelude::*,
     prelude::*,
+    sync::Locked,
 };
 };
 
 
-use alloc::sync::Arc;
-
-use bindings::{
-    EINVAL, EIO, EISDIR, EROFS, S_IFBLK, S_IFCHR, S_IFDIR, S_IFLNK, S_IFREG,
-};
-
-struct FileOps {
-    data: Mutex<Vec<u8>>,
+fn acquire(vfs: &Weak<dyn Vfs>) -> KResult<Arc<dyn Vfs>> {
+    vfs.upgrade().ok_or(EIO)
 }
 }
 
 
-struct NodeOps {
-    devid: DevId,
+fn astmp(vfs: &Arc<dyn Vfs>) -> &TmpFs {
+    vfs.as_any()
+        .downcast_ref::<TmpFs>()
+        .expect("corrupted tmpfs data structure")
 }
 }
 
 
-impl NodeOps {
-    fn new(devid: DevId) -> Self {
-        Self { devid }
+define_struct_inode! {
+    struct NodeInode {
+        devid: DevId,
     }
     }
 }
 }
 
 
-impl InodeOps for NodeOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl NodeInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode, devid: DevId) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, _| unsafe {
+            addr_of_mut_field!(inode, devid).write(devid);
+
+            addr_of_mut_field!(inode, mode).write(mode.into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
     }
+}
 
 
-    fn devid(&self, _: &Inode) -> KResult<DevId> {
+impl Inode for NodeInode {
+    fn devid(&self) -> KResult<DevId> {
         Ok(self.devid)
         Ok(self.devid)
     }
     }
 }
 }
 
 
-struct DirectoryOps {
-    entries: Mutex<Vec<(Arc<[u8]>, Ino)>>,
+define_struct_inode! {
+    struct DirectoryInode {
+        entries: Locked<Vec<(Arc<[u8]>, Ino)>, ()>,
+    }
 }
 }
 
 
-impl DirectoryOps {
-    fn new() -> Self {
-        Self {
-            entries: Mutex::new(vec![]),
-        }
+impl DirectoryInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, entries)
+                .write(Locked::new(vec![(Arc::from(b".".as_slice()), ino)], rwsem));
+
+            addr_of_mut_field!(inode, size).write(1.into());
+            addr_of_mut_field!(inode, mode).write((S_IFDIR | (mode & 0o777)).into());
+            addr_of_mut_field!(inode, nlink).write(1.into()); // link from `.` to itself
+        })
     }
     }
 
 
-    /// Locks the `inode.idata`
-    fn link(&self, dir: &Inode, file: &Inode, name: Arc<[u8]>) -> KResult<()> {
-        dir.idata.lock().size += 1;
-        self.entries.lock().push((name, file.ino));
+    fn link(&self, name: Arc<[u8]>, file: &dyn Inode, dlock: &mut ()) {
+        // SAFETY: Only `unlink` will do something based on `nlink` count
+        //         No need to synchronize here
+        file.nlink.fetch_add(1, Ordering::Relaxed);
 
 
-        file.idata.lock().nlink += 1;
+        // SAFETY: `rwsem` has done the synchronization
+        self.size.fetch_add(1, Ordering::Relaxed);
 
 
-        Ok(())
+        self.entries.access_mut(dlock).push((name, file.ino));
     }
     }
 }
 }
 
 
-impl InodeOps for DirectoryOps {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn readdir<'cb, 'r: 'cb>(
+impl Inode for DirectoryInode {
+    fn do_readdir(
         &self,
         &self,
-        _: &Inode,
         offset: usize,
         offset: usize,
-        callback: &ReadDirCallback<'cb>,
+        callback: &mut dyn FnMut(&[u8], Ino) -> KResult<ControlFlow<(), ()>>,
     ) -> KResult<usize> {
     ) -> KResult<usize> {
-        Ok(self
-            .entries
-            .lock()
+        let lock = self.rwsem.lock_shared();
+        self.entries
+            .access(lock.as_ref())
             .iter()
             .iter()
             .skip(offset)
             .skip(offset)
-            .take_while(|(name, ino)| callback(name, *ino).is_ok())
-            .count())
+            .map(|(name, ino)| callback(&name, *ino))
+            .take_while(|result| result.map_or(true, |flow| flow.is_continue()))
+            .take_while_inclusive(|result| result.is_ok())
+            .fold_ok(0, |acc, _| acc + 1)
     }
     }
 
 
-    fn creat(&self, dir: &Inode, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn creat(&self, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut rwsem = self.rwsem.lock();
 
 
         let ino = vfs.assign_ino();
         let ino = vfs.assign_ino();
-        let file = vfs.icache.lock().alloc_file(ino, mode)?;
+        let file = FileInode::new(ino, self.vfs.clone(), mode);
 
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_reg(file)
         at.save_reg(file)
     }
     }
 
 
-    fn mknod(
-        &self,
-        dir: &Inode,
-        at: &Arc<Dentry>,
-        mode: Mode,
-        dev: DevId,
-    ) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
-
-        if vfs.readonly {
-            return Err(EROFS);
-        }
-
+    fn mknod(&self, at: &Dentry, mode: Mode, dev: DevId) -> KResult<()> {
         if !s_ischr(mode) && !s_isblk(mode) {
         if !s_ischr(mode) && !s_isblk(mode) {
             return Err(EINVAL);
             return Err(EINVAL);
         }
         }
 
 
-        let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-        let file = icache.alloc(ino, Box::new(NodeOps::new(dev)));
-        file.idata.lock().mode = mode & (0o777 | S_IFBLK | S_IFCHR);
-        icache.submit(&file)?;
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
+
+        let mut rwsem = self.rwsem.lock();
 
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        let ino = vfs.assign_ino();
+        let file = NodeInode::new(
+            ino,
+            self.vfs.clone(),
+            mode & (0o777 | S_IFBLK | S_IFCHR),
+            dev,
+        );
+
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_reg(file)
         at.save_reg(file)
     }
     }
 
 
-    fn symlink(
-        &self,
-        dir: &Inode,
-        at: &Arc<Dentry>,
-        target: &[u8],
-    ) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
-
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+    fn symlink(&self, at: &Arc<Dentry>, target: &[u8]) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
 
-        let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-
-        let target_len = target.len() as u64;
+        let mut rwsem = self.rwsem.lock();
 
 
-        let file =
-            icache.alloc(ino, Box::new(SymlinkOps::new(Arc::from(target))));
-        {
-            let mut idata = file.idata.lock();
-            idata.mode = S_IFLNK | 0o777;
-            idata.size = target_len;
-        }
-        icache.submit(&file)?;
+        let ino = vfs.assign_ino();
+        let file = SymlinkInode::new(ino, self.vfs.clone(), target.into());
 
 
-        self.link(dir, file.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), file.as_ref(), rwsem.as_mut());
         at.save_symlink(file)
         at.save_symlink(file)
     }
     }
 
 
-    fn mkdir(&self, dir: &Inode, at: &Arc<Dentry>, mode: Mode) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn mkdir(&self, at: &Dentry, mode: Mode) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut rwsem = self.rwsem.lock();
 
 
         let ino = vfs.assign_ino();
         let ino = vfs.assign_ino();
-        let mut icache = vfs.icache.lock();
-
-        let mut newdir_ops = DirectoryOps::new();
-        let entries = newdir_ops.entries.get_mut();
-        entries.push((Arc::from(b".".as_slice()), ino));
-        entries.push((Arc::from(b"..".as_slice()), dir.ino));
-
-        let newdir = icache.alloc(ino, Box::new(newdir_ops));
-        {
-            let mut newdir_idata = newdir.idata.lock();
-            newdir_idata.mode = S_IFDIR | (mode & 0o777);
-            newdir_idata.nlink = 1;
-            newdir_idata.size = 2;
-        }
-
-        icache.submit(&newdir)?;
-        dir.idata.lock().nlink += 1; // link from `newdir` to `dir`, (or parent)
+        let newdir = DirectoryInode::new(ino, self.vfs.clone(), mode);
 
 
-        self.link(dir, newdir.as_ref(), at.name().clone())?;
+        self.link(at.name().clone(), newdir.as_ref(), rwsem.as_mut());
         at.save_dir(newdir)
         at.save_dir(newdir)
     }
     }
 
 
-    fn unlink(&self, dir: &Inode, at: &Arc<Dentry>) -> KResult<()> {
-        let vfs = dir.vfs.upgrade().ok_or(EIO)?;
-        let vfs = vfs.as_any().downcast_ref::<TmpFs>().unwrap();
+    fn unlink(&self, at: &Arc<Dentry>) -> KResult<()> {
+        let vfs = acquire(&self.vfs)?;
+        let vfs = astmp(&vfs);
 
 
-        if vfs.readonly {
-            return Err(EROFS);
-        }
+        let mut dlock = self.rwsem.lock();
 
 
         let file = at.get_inode()?;
         let file = at.get_inode()?;
+        let _flock = file.rwsem.lock();
 
 
-        let mut file_idata = file.idata.lock();
-
-        if file_idata.mode & S_IFDIR != 0 {
+        // SAFETY: `flock` has done the synchronization
+        if file.mode.load(Ordering::Relaxed) & S_IFDIR != 0 {
             return Err(EISDIR);
             return Err(EISDIR);
         }
         }
 
 
-        let mut self_idata = dir.idata.lock();
-        let mut entries = self.entries.lock();
-
-        let idx = entries
-            .iter()
-            .position(|(_, ino)| *ino == file.ino)
-            .expect("file not found in directory");
+        let entries = self.entries.access_mut(dlock.as_mut());
+        entries.retain(|(_, ino)| *ino != file.ino);
+
+        assert_eq!(
+            entries.len() as u64,
+            // SAFETY: `dlock` has done the synchronization
+            self.size.fetch_sub(1, Ordering::Relaxed) - 1
+        );
+
+        // SAFETY: `flock` has done the synchronization
+        let file_nlink = file.nlink.fetch_sub(1, Ordering::Relaxed) - 1;
+
+        if file_nlink == 0 {
+            // Remove the file inode from the inode cache
+            // The last reference to the inode is held by some dentry
+            // and will be released when the dentry is released
+            //
+            // TODO: Should we use some inode cache in tmpfs?
+            //
+            // vfs.icache.lock().retain(|ino, _| *ino != file.ino);
+        }
 
 
-        self_idata.size -= 1;
-        file_idata.nlink -= 1;
-        entries.remove(idx);
+        // Postpone the invalidation of the dentry and inode until the
+        // last reference to the dentry is released
+        //
+        // But we can remove it from the dentry cache immediately
+        // so later lookup will fail with ENOENT
+        dcache::d_remove(at);
 
 
-        at.invalidate()
+        Ok(())
     }
     }
 }
 }
 
 
-struct SymlinkOps {
-    target: Arc<[u8]>,
-}
-
-impl SymlinkOps {
-    fn new(target: Arc<[u8]>) -> Self {
-        Self { target }
+define_struct_inode! {
+    struct SymlinkInode {
+        target: Arc<[u8]>,
     }
     }
 }
 }
 
 
-impl InodeOps for SymlinkOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl SymlinkInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, target: Arc<[u8]>) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, _| unsafe {
+            let len = target.len();
+            addr_of_mut_field!(inode, target).write(target);
+
+            addr_of_mut_field!(inode, mode).write((S_IFLNK | 0o777).into());
+            addr_of_mut_field!(inode, size).write((len as u64).into());
+        })
     }
     }
+}
 
 
-    fn readlink(&self, _: &Inode, buffer: &mut dyn Buffer) -> KResult<usize> {
+impl Inode for SymlinkInode {
+    fn readlink(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
         buffer
         buffer
             .fill(self.target.as_ref())
             .fill(self.target.as_ref())
             .map(|result| result.allow_partial())
             .map(|result| result.allow_partial())
     }
     }
 }
 }
 
 
-impl FileOps {
-    fn new() -> Self {
-        Self {
-            data: Mutex::new(vec![]),
-        }
+define_struct_inode! {
+    struct FileInode {
+        filedata: Locked<Vec<u8>, ()>,
     }
     }
 }
 }
 
 
-impl InodeOps for FileOps {
-    fn as_any(&self) -> &dyn Any {
-        self
+impl FileInode {
+    fn new(ino: Ino, vfs: Weak<dyn Vfs>, mode: Mode) -> Arc<Self> {
+        Self::new_locked(ino, vfs, |inode, rwsem| unsafe {
+            addr_of_mut_field!(inode, filedata).write(Locked::new(vec![], rwsem));
+
+            addr_of_mut_field!(inode, mode).write((S_IFREG | (mode & 0o777)).into());
+            addr_of_mut_field!(inode, nlink).write(1.into());
+        })
     }
     }
+}
 
 
-    fn read(
-        &self,
-        _: &Inode,
-        buffer: &mut dyn Buffer,
-        offset: usize,
-    ) -> KResult<usize> {
-        let data = self.data.lock();
-        let data = data.split_at_checked(offset).ok_or(EINVAL)?.1;
+impl Inode for FileInode {
+    fn read(&self, buffer: &mut dyn Buffer, offset: usize) -> KResult<usize> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let lock = self.rwsem.lock_shared();
 
 
-        buffer.fill(data).map(|result| result.allow_partial())
+        match self.filedata.access(lock.as_ref()).split_at_checked(offset) {
+            Some((_, data)) => buffer.fill(data).map(|result| result.allow_partial()),
+            None => Ok(0),
+        }
     }
     }
 
 
-    fn write(
-        &self,
-        inode: &Inode,
-        buffer: &[u8],
-        offset: usize,
-    ) -> KResult<usize> {
-        let mut idata = inode.idata.lock();
-        let mut data = self.data.lock();
+    fn write(&self, buffer: &[u8], offset: WriteOffset) -> KResult<usize> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let mut lock = self.rwsem.lock();
+        let filedata = self.filedata.access_mut(lock.as_mut());
+
+        let offset = match offset {
+            WriteOffset::Position(offset) => offset,
+            // SAFETY: `lock` has done the synchronization
+            WriteOffset::End(end) => {
+                let size = self.size.load(Ordering::Relaxed) as usize;
+                *end = size + buffer.len();
+
+                size
+            }
+        };
 
 
-        if data.len() < offset + buffer.len() {
-            data.resize(offset + buffer.len(), 0);
+        if filedata.len() < offset + buffer.len() {
+            filedata.resize(offset + buffer.len(), 0);
         }
         }
 
 
-        data[offset..offset + buffer.len()].copy_from_slice(&buffer);
-        idata.size = data.len() as u64;
+        filedata[offset..offset + buffer.len()].copy_from_slice(&buffer);
+
+        // SAFETY: `lock` has done the synchronization
+        self.size.store(filedata.len() as u64, Ordering::Relaxed);
 
 
         Ok(buffer.len())
         Ok(buffer.len())
     }
     }
 
 
-    fn truncate(&self, inode: &Inode, length: usize) -> KResult<()> {
-        let mut idata = inode.idata.lock();
+    fn truncate(&self, length: usize) -> KResult<()> {
+        // TODO: We don't need that strong guarantee, find some way to avoid locks
+        let mut lock = self.rwsem.lock();
+        let filedata = self.filedata.access_mut(lock.as_mut());
 
 
-        idata.size = length as u64;
-        self.data.lock().resize(length, 0);
+        // SAFETY: `lock` has done the synchronization
+        self.size.store(length as u64, Ordering::Relaxed);
+        filedata.resize(length, 0);
 
 
         Ok(())
         Ok(())
     }
     }
 }
 }
 
 
-/// # Lock order
-/// `vfs` -> `icache` -> `idata` -> `*ops`.`*data`
+impl_any!(TmpFs);
 struct TmpFs {
 struct TmpFs {
-    icache: Mutex<InodeCache<TmpFs>>,
     next_ino: AtomicIno,
     next_ino: AtomicIno,
     readonly: bool,
     readonly: bool,
 }
 }
 
 
-impl InodeCache<TmpFs> {
-    fn alloc_file(&mut self, ino: Ino, mode: Mode) -> KResult<Arc<Inode>> {
-        let file = self.alloc(ino, Box::new(FileOps::new()));
-        file.idata.lock().mode = S_IFREG | (mode & 0o777);
+impl Vfs for TmpFs {
+    fn io_blksize(&self) -> usize {
+        4096
+    }
 
 
-        self.submit(&file)?;
+    fn fs_devid(&self) -> DevId {
+        2
+    }
 
 
-        Ok(file)
+    fn is_read_only(&self) -> bool {
+        self.readonly
     }
     }
 }
 }
 
 
 impl TmpFs {
 impl TmpFs {
     fn assign_ino(&self) -> Ino {
     fn assign_ino(&self) -> Ino {
-        self.next_ino.fetch_add(1, Ordering::SeqCst)
+        self.next_ino.fetch_add(1, Ordering::AcqRel)
     }
     }
 
 
-    pub fn create(readonly: bool) -> KResult<(Arc<TmpFs>, Arc<Inode>)> {
-        let tmpfs = Arc::new_cyclic(|weak| Self {
-            icache: Mutex::new(InodeCache::new(weak.clone())),
+    pub fn create(readonly: bool) -> KResult<(Arc<dyn Vfs>, Arc<dyn Inode>)> {
+        let tmpfs = Arc::new(Self {
             next_ino: AtomicIno::new(1),
             next_ino: AtomicIno::new(1),
             readonly,
             readonly,
         });
         });
 
 
-        let mut dir = DirectoryOps::new();
-        let entries = dir.entries.get_mut();
-        entries.push((Arc::from(b".".as_slice()), 0));
-        entries.push((Arc::from(b"..".as_slice()), 0));
-
-        let root_dir = {
-            let mut icache = tmpfs.icache.lock();
-            let root_dir = icache.alloc(0, Box::new(dir));
-            {
-                let mut idata = root_dir.idata.lock();
-
-                idata.mode = S_IFDIR | 0o755;
-                idata.nlink = 2;
-                idata.size = 2;
-            }
-
-            icache.submit(&root_dir)?;
-
-            root_dir
-        };
+        let weak = Arc::downgrade(&tmpfs);
+        let root_dir = DirectoryInode::new(0, weak, 0o755);
 
 
         Ok((tmpfs, root_dir))
         Ok((tmpfs, root_dir))
     }
     }
 }
 }
 
 
-impl Vfs for TmpFs {
-    fn io_blksize(&self) -> usize {
-        4096
-    }
-
-    fn fs_devid(&self) -> DevId {
-        2
-    }
-
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-}
-
 struct TmpFsMountCreator;
 struct TmpFsMountCreator;
 
 
 impl MountCreator for TmpFsMountCreator {
 impl MountCreator for TmpFsMountCreator {
-    fn create_mount(
-        &self,
-        _source: &str,
-        flags: u64,
-        _data: &[u8],
-        mp: &Arc<Dentry>,
-    ) -> KResult<Mount> {
+    fn create_mount(&self, _source: &str, flags: u64, mp: &Arc<Dentry>) -> KResult<Mount> {
         let (fs, root_inode) = TmpFs::create(flags & MS_RDONLY != 0)?;
         let (fs, root_inode) = TmpFs::create(flags & MS_RDONLY != 0)?;
 
 
         Mount::new(mp, fs, root_inode)
         Mount::new(mp, fs, root_inode)
@@ -385,5 +349,5 @@ impl MountCreator for TmpFsMountCreator {
 }
 }
 
 
 pub fn init() {
 pub fn init() {
-    register_filesystem("tmpfs", Box::new(TmpFsMountCreator)).unwrap();
+    register_filesystem("tmpfs", Arc::new(TmpFsMountCreator)).unwrap();
 }
 }
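
The `WriteOffset` handling in `FileInode::write` above is the core of the tmpfs append path: a `Position` write lands at a fixed offset, while an `End` write starts at the current size and reports the new end back through a mutable reference. Below is a minimal user-space sketch of that logic, with a plain `Vec<u8>` standing in for the locked `filedata` and a local `WriteOffset` mirroring the kernel enum.

    // The kernel's WriteOffset is mirrored as a local enum; locking and inode
    // size bookkeeping are stripped away.
    enum WriteOffset<'a> {
        Position(usize),
        // For appends, the new end-of-file position is reported back to the caller.
        End(&'a mut usize),
    }

    fn write_at(filedata: &mut Vec<u8>, buffer: &[u8], offset: WriteOffset) -> usize {
        let offset = match offset {
            WriteOffset::Position(offset) => offset,
            WriteOffset::End(end) => {
                let size = filedata.len();
                *end = size + buffer.len();
                size
            }
        };

        // Grow the backing storage if the write extends past the current end.
        if filedata.len() < offset + buffer.len() {
            filedata.resize(offset + buffer.len(), 0);
        }

        filedata[offset..offset + buffer.len()].copy_from_slice(buffer);
        buffer.len()
    }

    fn main() {
        let mut data = Vec::new();
        let mut end = 0;

        write_at(&mut data, b"hello", WriteOffset::Position(0));
        write_at(&mut data, b", tmpfs", WriteOffset::End(&mut end));

        assert_eq!(&data[..], b"hello, tmpfs");
        assert_eq!(end, data.len());
    }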

+ 34 - 51
src/io.rs

@@ -2,8 +2,9 @@ use bindings::EFAULT;
 
 
 use crate::prelude::*;
 use crate::prelude::*;
 
 
-use core::{ffi::c_char, fmt::Write, mem::MaybeUninit};
+use core::{fmt::Write, mem::MaybeUninit};
 
 
+#[must_use]
 pub enum FillResult {
 pub enum FillResult {
     Done(usize),
     Done(usize),
     Partial(usize),
     Partial(usize),
@@ -33,7 +34,27 @@ impl FillResult {
 pub trait Buffer {
 pub trait Buffer {
     fn total(&self) -> usize;
     fn total(&self) -> usize;
     fn wrote(&self) -> usize;
     fn wrote(&self) -> usize;
+
+    #[must_use]
     fn fill(&mut self, data: &[u8]) -> KResult<FillResult>;
     fn fill(&mut self, data: &[u8]) -> KResult<FillResult>;
+
+    fn available(&self) -> usize {
+        self.total() - self.wrote()
+    }
+}
+
+pub trait BufferFill<T: Copy> {
+    fn copy(&mut self, object: &T) -> KResult<FillResult>;
+}
+
+impl<T: Copy, B: Buffer + ?Sized> BufferFill<T> for B {
+    fn copy(&mut self, object: &T) -> KResult<FillResult> {
+        let ptr = object as *const T as *const u8;
+        let len = core::mem::size_of::<T>();
+
+        // SAFETY: `object` is a valid object.
+        self.fill(unsafe { core::slice::from_raw_parts(ptr, len) })
+    }
 }
 }
 
 
 pub struct UninitBuffer<'lt, T: Copy + Sized> {
 pub struct UninitBuffer<'lt, T: Copy + Sized> {
@@ -49,10 +70,7 @@ impl<'lt, T: Copy + Sized> UninitBuffer<'lt, T> {
         Self {
         Self {
             data,
             data,
             buffer: RawBuffer::new_from_slice(unsafe {
             buffer: RawBuffer::new_from_slice(unsafe {
-                core::slice::from_raw_parts_mut(
-                    ptr as *mut u8,
-                    core::mem::size_of::<T>(),
-                )
+                core::slice::from_raw_parts_mut(ptr as *mut u8, core::mem::size_of::<T>())
             }),
             }),
         }
         }
     }
     }
@@ -64,6 +82,14 @@ impl<'lt, T: Copy + Sized> UninitBuffer<'lt, T> {
 
 
         Ok(unsafe { self.data.assume_init_ref() })
         Ok(unsafe { self.data.assume_init_ref() })
     }
     }
+
+    pub fn assume_init(self) -> Option<T> {
+        if self.buffer.filled() {
+            Some(unsafe { *self.data.assume_init() })
+        } else {
+            None
+        }
+    }
 }
 }
 
 
 impl<'lt, T: Copy + Sized> Buffer for UninitBuffer<'lt, T> {
 impl<'lt, T: Copy + Sized> Buffer for UninitBuffer<'lt, T> {
@@ -106,9 +132,9 @@ impl<'lt> RawBuffer<'lt> {
         }
         }
     }
     }
 
 
-    pub fn new_from_raw(buf: &'lt mut *mut u8, tot: usize) -> Self {
+    pub fn new_from_raw(buf: *mut u8, tot: usize) -> Self {
         Self {
         Self {
-            buf: *buf,
+            buf,
             tot,
             tot,
             cur: 0,
             cur: 0,
             _phantom: core::marker::PhantomData,
             _phantom: core::marker::PhantomData,
@@ -136,11 +162,7 @@ impl<'lt> RawBuffer<'lt> {
             n if n == 0 => Ok(FillResult::Full),
             n if n == 0 => Ok(FillResult::Full),
             n if n < data.len() => {
             n if n < data.len() => {
                 unsafe {
                 unsafe {
-                    core::ptr::copy_nonoverlapping(
-                        data.as_ptr(),
-                        self.buf.add(self.count()),
-                        n,
-                    );
+                    core::ptr::copy_nonoverlapping(data.as_ptr(), self.buf.add(self.count()), n);
                 }
                 }
                 self.cur += n;
                 self.cur += n;
                 Ok(FillResult::Partial(n))
                 Ok(FillResult::Partial(n))
@@ -227,42 +249,3 @@ impl Write for RawBuffer<'_> {
         }
         }
     }
     }
 }
 }
-
-pub fn get_str_from_cstr<'a>(cstr: *const c_char) -> KResult<&'a str> {
-    if cstr.is_null() {
-        return Err(EFAULT);
-    }
-
-    let cstr = unsafe { core::ffi::CStr::from_ptr::<'a>(cstr) };
-    cstr.to_str().map_err(|_| EFAULT)
-}
-
-/// Copy data from src to dst, starting from offset, and copy at most count bytes.
-///
-/// # Return
-///
-/// The number of bytes copied.
-pub fn copy_offset_count(
-    src: &[u8],
-    dst: &mut [u8],
-    offset: usize,
-    count: usize,
-) -> usize {
-    if offset >= src.len() {
-        return 0;
-    }
-
-    let count = {
-        let count = count.min(dst.len());
-
-        if offset + count > src.len() {
-            src.len() - offset
-        } else {
-            count
-        }
-    };
-
-    dst[..count].copy_from_slice(&src[offset..offset + count]);
-
-    count
-}
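
The new `BufferFill` blanket impl above fills a buffer from any `Copy` value by viewing it as raw bytes. A self-contained sketch of the same idea follows; `SliceBuffer` is a hypothetical stand-in for the kernel's `Buffer` implementors, and a padding-free `#[repr(C)]` type is assumed so no uninitialized padding bytes are read.

    struct SliceBuffer<'a> {
        data: &'a mut [u8],
        wrote: usize,
    }

    impl<'a> SliceBuffer<'a> {
        fn fill(&mut self, bytes: &[u8]) -> usize {
            let n = bytes.len().min(self.data.len() - self.wrote);
            self.data[self.wrote..self.wrote + n].copy_from_slice(&bytes[..n]);
            self.wrote += n;
            n
        }

        // The blanket helper: view any Copy value as raw bytes and fill with them.
        fn copy<T: Copy>(&mut self, object: &T) -> usize {
            let ptr = object as *const T as *const u8;
            let len = core::mem::size_of::<T>();
            // SAFETY: `object` is a live, initialized value and we read exactly
            // `size_of::<T>()` bytes from it.
            self.fill(unsafe { core::slice::from_raw_parts(ptr, len) })
        }
    }

    fn main() {
        // A padding-free record, similar in spirit to a dirent header.
        #[repr(C)]
        #[derive(Clone, Copy)]
        #[allow(dead_code)]
        struct Record {
            ino: u64,
            off: u64,
        }

        let mut storage = [0u8; 64];
        let mut buffer = SliceBuffer { data: &mut storage, wrote: 0 };

        let wrote = buffer.copy(&Record { ino: 2, off: 16 });
        assert_eq!(wrote, 16);
        assert_eq!(&storage[..8], &2u64.to_ne_bytes());
    }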

+ 79 - 89
src/kernel.ld

@@ -2,12 +2,13 @@ OUTPUT_FORMAT(elf64-x86-64)
 
 
 MEMORY
 MEMORY
 {
 {
-    MBR    (wx) : org = 0x0e00, l = 512
-    STAGE1 (wx) : org = 0x1000, l = 4K
-    PHYMEM (w)  : org = 0xffffff0000000000, len = 512 * 1024M
-    PARRAY (w)  : org = 0xffffff8000000000, len = 128 * 1024M
-    KBSS   (w)  : org = 0xffffffffc0200000, len = 2M
-    KIMAGE (wx) : org = 0xffffffffffc00000, len = 2M
+    MBR           (wx) : org = 0x0e00, l = 512
+    STAGE1        (wx) : org = 0x1000, l = 4K
+    PHYMEM        (w)  : org = 0xffffff0000000000, len = 512 * 1024M
+    PARRAY        (w)  : org = 0xffffff8000000000, len = 128 * 1024M
+    KBSS          (w)  : org = 0xffffffffc0200000, len = 2M
+    KIMAGE        (wx) : org = 0xffffffffffc00000, len = 2M
+    KPERCPU       (w)  : org = 0x0000000000000000, len = 128K
 }
 }
 
 
 SECTIONS
 SECTIONS
@@ -26,53 +27,16 @@ SECTIONS
 
 
     .stage1 : AT(LOADADDR(.mbr) + SIZEOF(.mbr))
     .stage1 : AT(LOADADDR(.mbr) + SIZEOF(.mbr))
     {
     {
-        *(.stage1)
-        . = ALIGN(0x1000);
-    } > STAGE1
-
-    .kinit :
-        AT(LOADADDR(.stage1) + SIZEOF(.stage1))
-    {
-        KIMAGE_START = .;
-        KINIT_START = .;
-
-        *(.text.kinit)
-
-        . = ALIGN(16);
-        *(.rodata.kinit)
-
-        KINIT_START_ADDR = .;
-        QUAD(ABSOLUTE(KINIT_START));
-
-        KINIT_END_ADDR = .;
-        QUAD(ABSOLUTE(KINIT_END));
-
-        KINIT_PAGES = .;
-        QUAD((KINIT_END - KINIT_START) / 0x1000);
-
-        KIMAGE_PAGES_VALUE = .;
-        QUAD((KIMAGE_END - KIMAGE_START) / 0x1000);
-
-        . = ALIGN(16);
-        start_ctors = .;
-        KEEP(*(.init_array));
-        KEEP(*(SORT_BY_INIT_PRIORITY(.init_array*)));
-        KEEP(*(.ctors));
-        KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
-        end_ctors = .;
+        KEEP(*(.stage1.smp));
 
 
         . = ALIGN(16);
         . = ALIGN(16);
-        *(.data.kinit)
-
-        . = ALIGN(16);
-        *(.bss.kinit)
+        *(.stage1)
 
 
         . = ALIGN(0x1000);
         . = ALIGN(0x1000);
-        KINIT_END = .;
-    } > KIMAGE
+    } > STAGE1
 
 
     .text :
     .text :
-        AT(LOADADDR(.kinit) + SIZEOF(.kinit))
+        AT(LOADADDR(.stage1) + SIZEOF(.stage1))
     {
     {
         TEXT_START = .;
         TEXT_START = .;
         *(.text)
         *(.text)
@@ -82,6 +46,8 @@ SECTIONS
         TEXT_END = .;
         TEXT_END = .;
     } > KIMAGE
     } > KIMAGE
 
 
+    TEXT_PAGES = (TEXT_END - TEXT_START) / 0x1000;
+
     .rodata :
     .rodata :
         AT(LOADADDR(.text) + SIZEOF(.text))
         AT(LOADADDR(.text) + SIZEOF(.text))
     {
     {
@@ -90,30 +56,37 @@ SECTIONS
         *(.rodata*)
         *(.rodata*)
 
 
         . = ALIGN(16);
         . = ALIGN(16);
-        KMOD_LOADERS_START = .;
-
-        KEEP(*(.kmods));
-        QUAD(0);
+        start_ctors = .;
+        KEEP(*(.init_array));
+        KEEP(*(SORT_BY_INIT_PRIORITY(.init_array*)));
+        KEEP(*(.ctors));
+        KEEP(*(SORT_BY_INIT_PRIORITY(.ctors*)));
+        end_ctors = .;
 
 
         . = ALIGN(16);
         . = ALIGN(16);
-        late_init_start = .;
-        KEEP(*(.late_init));
-        QUAD(0);
-        late_init_end = .;
+        _fix_start = .;
+        KEEP(*(.fix));
+        _fix_end = .;
 
 
         . = ALIGN(16);
         . = ALIGN(16);
-
         BSS_ADDR = .;
         BSS_ADDR = .;
         QUAD(ABSOLUTE(BSS_START));
         QUAD(ABSOLUTE(BSS_START));
         BSS_LENGTH = .;
         BSS_LENGTH = .;
         QUAD(BSS_END - BSS_START);
         QUAD(BSS_END - BSS_START);
+        FIX_START = .;
+        QUAD(ABSOLUTE(_fix_start));
+        FIX_END = .;
+        QUAD(ABSOLUTE(_fix_end));
+        PERCPU_PAGES = .;
+        QUAD(_PERCPU_PAGES);
 
 
         . = ALIGN(0x1000);
         . = ALIGN(0x1000);
         RODATA_END = .;
         RODATA_END = .;
     } > KIMAGE
     } > KIMAGE
 
 
-    .data :
-        AT(LOADADDR(.rodata) + SIZEOF(.rodata))
+    RODATA_PAGES = (RODATA_END - RODATA_START) / 0x1000;
+
+    .data : AT(LOADADDR(.rodata) + SIZEOF(.rodata))
     {
     {
         DATA_START = .;
         DATA_START = .;
         *(.data)
         *(.data)
@@ -122,13 +95,30 @@ SECTIONS
         *(.got)
         *(.got)
         *(.got.plt)
         *(.got.plt)
 
 
+        . = . + 4;
         . = ALIGN(0x1000) - 4;
         . = ALIGN(0x1000) - 4;
         LONG(KERNEL_MAGIC);
         LONG(KERNEL_MAGIC);
-
         DATA_END = .;
         DATA_END = .;
-        KIMAGE_END = .;
     } > KIMAGE
     } > KIMAGE
 
 
+    DATA_PAGES = (DATA_END - DATA_START) / 0x1000;
+
+    _PERCPU_DATA_START = .;
+    .percpu 0 : AT(LOADADDR(.data) + SIZEOF(.data))
+    {
+        PERCPU_START = .;
+        QUAD(0); /* Reserved for x86 percpu pointer */
+        QUAD(0);
+
+        *(.percpu .percpu*)
+
+        . = ALIGN(0x1000);
+        PERCPU_END = .;
+    } > KPERCPU
+    _PERCPU_LENGTH = PERCPU_END - PERCPU_START;
+
+    _PERCPU_PAGES = _PERCPU_LENGTH / 0x1000;
+
     .bss :
     .bss :
     {
     {
         BSS_START = .;
         BSS_START = .;
@@ -139,56 +129,56 @@ SECTIONS
         BSS_END = .;
         BSS_END = .;
     } > KBSS
     } > KBSS
 
 
-    KIMAGE_PAGES = (KIMAGE_END - KIMAGE_START) / 0x1000;
+    KIMAGE_PAGES = TEXT_PAGES + RODATA_PAGES + _PERCPU_PAGES + DATA_PAGES;
     BSS_PAGES = (BSS_END - BSS_START) / 0x1000;
     BSS_PAGES = (BSS_END - BSS_START) / 0x1000;
     KERNEL_MAGIC = 0x01145140;
     KERNEL_MAGIC = 0x01145140;
 
 
-    KIMAGE_32K_COUNT = ((KIMAGE_END - KIMAGE_START) + 32 * 1024 - 1) / (32 * 1024);
+    KIMAGE_32K_COUNT = (KIMAGE_PAGES * 0x1000 + 32 * 1024 - 1) / (32 * 1024);
 
 
     .eh_frame :
     .eh_frame :
-        AT(LOADADDR(.data) + SIZEOF(.data))
+        AT(LOADADDR(.percpu) + SIZEOF(.percpu))
     {
     {
         KEEP(*(.eh_frame*))
         KEEP(*(.eh_frame*))
         . = ALIGN(0x1000);
         . = ALIGN(0x1000);
     } > KIMAGE
     } > KIMAGE
 
 
     /* Stabs debugging sections.  */
     /* Stabs debugging sections.  */
-    .stab          0 : { *(.stab) }
-    .stabstr       0 : { *(.stabstr) }
-    .stab.excl     0 : { *(.stab.excl) }
-    .stab.exclstr  0 : { *(.stab.exclstr) }
-    .stab.index    0 : { *(.stab.index) }
-    .stab.indexstr 0 : { *(.stab.indexstr) }
-    .comment       0 : { *(.comment) }
+    .stab          0 : { KEEP(*(.stab)); }
+    .stabstr       0 : { KEEP(*(.stabstr)); }
+    .stab.excl     0 : { KEEP(*(.stab.excl)); }
+    .stab.exclstr  0 : { KEEP(*(.stab.exclstr)); }
+    .stab.index    0 : { KEEP(*(.stab.index)); }
+    .stab.indexstr 0 : { KEEP(*(.stab.indexstr)); }
+    .comment       0 : { KEEP(*(.comment)); }
     /* DWARF debug sections.
     /* DWARF debug sections.
        Symbols in the DWARF debugging sections are relative to the beginning
        Symbols in the DWARF debugging sections are relative to the beginning
        of the section so we begin them at 0.  */
        of the section so we begin them at 0.  */
     /* DWARF 1 */
     /* DWARF 1 */
-    .debug          0 : { *(.debug) }
-    .line           0 : { *(.line) }
+    .debug          0 : { KEEP(*(.debug)); }
+    .line           0 : { KEEP(*(.line)); }
     /* GNU DWARF 1 extensions */
     /* GNU DWARF 1 extensions */
-    .debug_srcinfo  0 : { *(.debug_srcinfo) }
-    .debug_sfnames  0 : { *(.debug_sfnames) }
+    .debug_srcinfo  0 : { KEEP(*(.debug_srcinfo)); }
+    .debug_sfnames  0 : { KEEP(*(.debug_sfnames)); }
     /* DWARF 1.1 and DWARF 2 */
     /* DWARF 1.1 and DWARF 2 */
-    .debug_aranges  0 : { *(.debug_aranges) }
-    .debug_pubnames 0 : { *(.debug_pubnames) }
+    .debug_aranges  0 : { KEEP(*(.debug_aranges)); }
+    .debug_pubnames 0 : { KEEP(*(.debug_pubnames)); }
     /* DWARF 2 */
     /* DWARF 2 */
-    .debug_info     0 : { *(.debug_info) }
-    .debug_abbrev   0 : { *(.debug_abbrev) }
-    .debug_line     0 : { *(.debug_line) }
-    .debug_frame    0 : { *(.debug_frame) }
-    .debug_str      0 : { *(.debug_str) }
-    .debug_loc      0 : { *(.debug_loc) }
-    .debug_macinfo  0 : { *(.debug_macinfo) }
+    .debug_info     0 : { KEEP(*(.debug_info)); }
+    .debug_abbrev   0 : { KEEP(*(.debug_abbrev)); }
+    .debug_line     0 : { KEEP(*(.debug_line)); }
+    .debug_frame    0 : { KEEP(*(.debug_frame)); }
+    .debug_str      0 : { KEEP(*(.debug_str)); }
+    .debug_loc      0 : { KEEP(*(.debug_loc)); }
+    .debug_macinfo  0 : { KEEP(*(.debug_macinfo)); }
     /* SGI/MIPS DWARF 2 extensions */
     /* SGI/MIPS DWARF 2 extensions */
-    .debug_weaknames 0 : { *(.debug_weaknames) }
-    .debug_funcnames 0 : { *(.debug_funcnames) }
-    .debug_typenames 0 : { *(.debug_typenames) }
-    .debug_varnames  0 : { *(.debug_varnames) }
+    .debug_weaknames 0 : { KEEP(*(.debug_weaknames)); }
+    .debug_funcnames 0 : { KEEP(*(.debug_funcnames)); }
+    .debug_typenames 0 : { KEEP(*(.debug_typenames)); }
+    .debug_varnames  0 : { KEEP(*(.debug_varnames)); }
 
 
     /* DWARF Other */
     /* DWARF Other */
-    .debug_ranges  0 : { *(.debug_ranges) }
-    .debug_line_str 0 : { *(.debug_line_str) }
+    .debug_ranges  0 : { KEEP(*(.debug_ranges)); }
+    .debug_line_str 0 : { KEEP(*(.debug_line_str)); }
     /* Rust stuff */
     /* Rust stuff */
 
 
     /DISCARD/ :
     /DISCARD/ :
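
The new `KPERCPU` region and the `.percpu` output section are linked at address 0, so per-CPU variables are effectively offsets into a template that each CPU copies for itself at bring-up (the two leading `QUAD(0)` slots are reserved for the x86 per-CPU pointer). The rough user-space model below illustrates only that idea; the sizes and the chosen offset are illustrative, not the kernel's actual layout.

    // Per-CPU variables are linked as offsets into a zero-based template; every CPU
    // resolves an offset against its own private copy of that template.
    struct PerCpuArea {
        data: Vec<u8>,
    }

    impl PerCpuArea {
        fn new(template: &[u8]) -> Self {
            Self { data: template.to_vec() }
        }

        fn read_u64(&self, offset: usize) -> u64 {
            u64::from_ne_bytes(self.data[offset..offset + 8].try_into().unwrap())
        }

        fn write_u64(&mut self, offset: usize, value: u64) {
            self.data[offset..offset + 8].copy_from_slice(&value.to_ne_bytes());
        }
    }

    fn main() {
        // Stand-in for the `.percpu` template between PERCPU_START and PERCPU_END,
        // with one u64 variable placed after the two reserved quads.
        let template = vec![0u8; 4096];
        let var_offset = 16;

        let mut cpus: Vec<PerCpuArea> = (0..4).map(|_| PerCpuArea::new(&template)).collect();

        // A store through one CPU's copy must not be visible to the others.
        cpus[0].write_u64(var_offset, 42);
        assert_eq!(cpus[0].read_u64(var_offset), 42);
        assert!(cpus[1..].iter().all(|cpu| cpu.read_u64(var_offset) == 0));
    }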

+ 16 - 0
src/kernel.rs

@@ -1,5 +1,21 @@
+pub mod arch;
 pub mod block;
 pub mod block;
 pub mod console;
 pub mod console;
+pub mod constants;
 pub mod interrupt;
 pub mod interrupt;
 pub mod mem;
 pub mod mem;
+pub mod syscall;
+pub mod task;
+pub mod timer;
+pub mod user;
 pub mod vfs;
 pub mod vfs;
+
+#[cfg(feature = "smp")]
+pub mod smp;
+
+mod chardev;
+mod terminal;
+
+pub use chardev::{CharDevice, CharDeviceType, VirtualCharDevice};
+pub use console::Console;
+pub use terminal::{Terminal, TerminalDevice};

+ 0 - 1
src/kernel/allocator.cc

@@ -218,7 +218,6 @@ static constexpr int __cache_index(std::size_t size) {
     return -1;
     return -1;
 }
 }
 
 
-SECTION(".text.kinit")
 void kernel::kinit::init_allocator() {
 void kernel::kinit::init_allocator() {
     mem::init_slab_cache(caches + 0, 32);
     mem::init_slab_cache(caches + 0, 32);
     mem::init_slab_cache(caches + 1, 64);
     mem::init_slab_cache(caches + 1, 64);

+ 5 - 0
src/kernel/arch.rs

@@ -0,0 +1,5 @@
+#[cfg(target_arch = "x86_64")]
+pub mod x86_64;
+
+#[cfg(target_arch = "x86_64")]
+pub use x86_64::*;

+ 82 - 0
src/kernel/arch/x86_64.rs

@@ -0,0 +1,82 @@
+pub mod init;
+pub mod interrupt;
+
+use arch::x86_64::{gdt::GDT, task::TSS};
+
+// TODO!!!: This can be stored in the percpu area.
+//          But we need to implement a guard that ensures that preemption is disabled
+//          while we are accessing the percpu variables.
+#[arch::define_percpu]
+static GDT_OBJECT: Option<GDT> = None;
+
+#[arch::define_percpu]
+static TSS_OBJECT: Option<TSS> = None;
+
+pub mod user {
+    use crate::sync::preempt;
+    use arch::x86_64::gdt::GDTEntry;
+
+    pub struct InterruptStack(pub u64);
+
+    #[derive(Debug, Clone)]
+    pub enum TLS {
+        /// TODO: This is not used yet.
+        #[allow(dead_code)]
+        TLS64(u64),
+        TLS32 {
+            base: u64,
+            desc: GDTEntry,
+        },
+    }
+
+    impl TLS {
+        /// # Return
+        /// Returns the TLS descriptor and the index of the TLS segment.
+        pub fn new32(base: u32, limit: u32, is_limit_in_pages: bool) -> (Self, u32) {
+            let flags = if is_limit_in_pages { 0xc } else { 0x4 };
+
+            (
+                TLS::TLS32 {
+                    base: base as u64,
+                    desc: GDTEntry::new(base, limit, 0xf2, flags),
+                },
+                7,
+            )
+        }
+
+        pub fn load(&self) {
+            match self {
+                TLS::TLS64(base) => {
+                    const IA32_KERNEL_GS_BASE: u32 = 0xc0000102;
+                    arch::x86_64::task::wrmsr(IA32_KERNEL_GS_BASE, *base);
+                }
+                TLS::TLS32 { base, desc } => {
+                    preempt::disable();
+                    let gdt = unsafe {
+                        super::GDT_OBJECT
+                            .as_mut()
+                            .as_mut()
+                            .expect("GDT should be valid")
+                    };
+                    gdt.set_tls32(*desc);
+                    preempt::enable();
+
+                    const IA32_KERNEL_GS_BASE: u32 = 0xc0000102;
+                    arch::x86_64::task::wrmsr(IA32_KERNEL_GS_BASE, *base);
+                }
+            }
+        }
+    }
+
+    pub fn load_interrupt_stack(stack: InterruptStack) {
+        preempt::disable();
+        let tss = unsafe {
+            super::TSS_OBJECT
+                .as_mut()
+                .as_mut()
+                .expect("TSS should be valid")
+        };
+        tss.set_rsp0(stack.0);
+        preempt::enable();
+    }
+}
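
For the 32-bit TLS path above, `TLS::new32` builds `GDTEntry::new(base, limit, 0xf2, flags)` with `flags` of 0x4 or 0xc depending on whether the limit is in pages. The sketch below shows how such a descriptor packs into the standard x86 8-byte GDT format; the actual `GDTEntry` inside the `arch` crate may structure this differently, so treat the helper as illustrative.

    // Packs base/limit/access/flags into the classic 8-byte descriptor layout.
    fn pack_descriptor(base: u32, limit: u32, access: u8, flags: u8) -> u64 {
        let base = base as u64;
        let limit = limit as u64;

        (limit & 0xffff)                      // limit[15:0]
            | ((base & 0x00ff_ffff) << 16)    // base[23:0]
            | ((access as u64) << 40)         // access byte (0xf2: present, DPL3, writable data)
            | ((limit & 0x000f_0000) << 32)   // limit[19:16]
            | (((flags as u64) & 0xf) << 52)  // flags (0x4 = 32-bit, 0xc = 32-bit + 4K granularity)
            | ((base & 0xff00_0000) << 32)    // base[31:24]
    }

    fn main() {
        // A ring-3 writable data segment at base 0x1000 with a page-granular limit,
        // roughly what the 32-bit TLS path above asks for.
        let desc = pack_descriptor(0x1000, 0xfffff, 0xf2, 0xc);
        assert_eq!(desc, 0x00cf_f200_1000_ffff);
    }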

+ 126 - 0
src/kernel/arch/x86_64/init.rs

@@ -0,0 +1,126 @@
+use super::{interrupt::setup_idt, GDT_OBJECT, TSS_OBJECT};
+use crate::{
+    kernel::{
+        arch::interrupt::APIC_BASE,
+        mem::{paging::Page, phys::PhysPtr as _},
+        smp,
+        task::{ProcessList, Scheduler, Thread},
+    },
+    println_debug, println_info,
+    sync::preempt,
+};
+use alloc::{format, sync::Arc};
+use arch::{
+    interrupt,
+    task::pause,
+    x86_64::{gdt::GDT, task::TSS},
+};
+use core::sync::atomic::{AtomicU32, AtomicUsize, Ordering};
+
+unsafe fn init_gdt_tss_thiscpu() {
+    preempt::disable();
+    let gdt_ref = unsafe { GDT_OBJECT.as_mut() };
+    let tss_ref = unsafe { TSS_OBJECT.as_mut() };
+    *gdt_ref = Some(GDT::new());
+    *tss_ref = Some(TSS::new());
+
+    if let Some(gdt) = gdt_ref.as_mut() {
+        if let Some(tss) = tss_ref.as_mut() {
+            gdt.set_tss(tss as *mut _ as u64);
+        } else {
+            panic!("TSS is not initialized");
+        }
+
+        unsafe { gdt.load() };
+    } else {
+        panic!("GDT is not initialized");
+    }
+
+    preempt::enable();
+}
+
+/// Initialization routine for all CPUs.
+pub unsafe fn init_cpu() {
+    arch::x86_64::io::enable_sse();
+
+    let area = smp::alloc_percpu_area();
+    smp::set_percpu_area(area);
+    init_gdt_tss_thiscpu();
+
+    setup_idt();
+
+    APIC_BASE.spurious().write(0x1ff);
+    APIC_BASE.task_priority().write(0);
+    APIC_BASE.timer_divide().write(0x3); // Divide by 16
+    APIC_BASE.timer_register().write(0x20040);
+
+    // TODO: Get the bus frequency from...?
+    let freq = 800;
+    let count = freq * 1_000_000 / 16 / 100;
+    APIC_BASE.timer_initial_count().write(count as u32);
+
+    let cpu = CPU_COUNT.fetch_add(1, Ordering::Relaxed);
+    if cpu != 0 {
+        // Application processor
+        println_debug!("AP{} started", cpu);
+    }
+}
+
+#[no_mangle]
+pub static BOOT_SEMAPHORE: AtomicU32 = AtomicU32::new(0);
+#[no_mangle]
+pub static BOOT_STACK: AtomicUsize = AtomicUsize::new(0);
+
+pub static CPU_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+#[no_mangle]
+pub unsafe extern "C" fn ap_entry(stack_start: u64) {
+    init_cpu();
+
+    let idle_process = ProcessList::get()
+        .try_find_process(0)
+        .expect("Idle process must exist");
+
+    let idle_thread_name = format!("[kernel idle#AP{}]", 0);
+    let idle_thread = Thread::new_for_init(Arc::from(idle_thread_name.as_bytes()), &idle_process);
+    ProcessList::get().add_thread(&idle_thread);
+    Scheduler::set_idle(idle_thread.clone());
+    Scheduler::set_current(idle_thread);
+
+    preempt::disable();
+    interrupt::enable();
+
+    // TODO!!!!!: Free the stack after having switched to idle task.
+    arch::task::context_switch_light(
+        stack_start as *mut _, // We will never come back
+        unsafe { Scheduler::idle_task().get_sp_ptr() },
+    );
+    arch::task::freeze()
+}
+
+pub unsafe fn bootstrap_cpus() {
+    let icr = APIC_BASE.interrupt_command();
+
+    icr.write(0xc4500);
+    while icr.read() & 0x1000 != 0 {
+        pause();
+    }
+
+    icr.write(0xc4601);
+    while icr.read() & 0x1000 != 0 {
+        pause();
+    }
+
+    while CPU_COUNT.load(Ordering::Acquire) != 4 {
+        if BOOT_STACK.load(Ordering::Acquire) == 0 {
+            let page = Page::alloc_many(9);
+            let stack_start = page.as_cached().as_ptr::<()>() as usize;
+            core::mem::forget(page);
+
+            BOOT_STACK.store(stack_start, Ordering::Release);
+        }
+        pause();
+    }
+
+    println_info!("Processors startup finished");
+}
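
`bootstrap_cpus` and `ap_entry` above hand each application processor a boot stack through `BOOT_STACK` and wait on `CPU_COUNT`. The user-space model below reproduces only that publish/claim handshake, with threads standing in for the INIT/SIPI sequence; the atomic names mirror the diff, but the claim-by-swap step and the counting are simplifications, not the kernel's exact protocol.

    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::thread;

    static BOOT_STACK: AtomicUsize = AtomicUsize::new(0);
    static CPU_COUNT: AtomicUsize = AtomicUsize::new(1); // the BSP counts itself here

    fn main() {
        let aps = 3usize;

        let handles: Vec<_> = (0..aps)
            .map(|_| {
                thread::spawn(|| {
                    // Claim a published stack; swapping it back to zero lets the
                    // BSP publish the next one.
                    let _stack = loop {
                        let stack = BOOT_STACK.swap(0, Ordering::AcqRel);
                        if stack != 0 {
                            break stack;
                        }
                        std::hint::spin_loop();
                    };
                    CPU_COUNT.fetch_add(1, Ordering::Release);
                })
            })
            .collect();

        // BSP side: keep handing out "stacks" until every AP has checked in.
        let mut next_stack = 0x1000usize;
        while CPU_COUNT.load(Ordering::Acquire) != 1 + aps {
            if BOOT_STACK.load(Ordering::Acquire) == 0 {
                BOOT_STACK.store(next_stack, Ordering::Release);
                next_stack += 0x1000;
            }
            std::hint::spin_loop();
        }

        for handle in handles {
            handle.join().unwrap();
        }
        assert_eq!(CPU_COUNT.load(Ordering::Relaxed), 1 + aps);
    }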

+ 129 - 0
src/kernel/arch/x86_64/interrupt.rs

@@ -0,0 +1,129 @@
+use crate::kernel::mem::phys::{CachedPP, PhysPtr as _};
+use arch::task::rdmsr;
+use lazy_static::lazy_static;
+
+extern "C" {
+    static ISR_START_ADDR: usize;
+}
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+struct IDTEntry {
+    offset_low: u16,
+    selector: u16,
+
+    interrupt_stack: u8,
+    attributes: u8,
+
+    offset_mid: u16,
+    offset_high: u32,
+    reserved: u32,
+}
+
+impl IDTEntry {
+    const fn new(offset: usize, selector: u16, attributes: u8) -> Self {
+        Self {
+            offset_low: offset as u16,
+            selector,
+            interrupt_stack: 0,
+            attributes,
+            offset_mid: (offset >> 16) as u16,
+            offset_high: (offset >> 32) as u32,
+            reserved: 0,
+        }
+    }
+
+    const fn null() -> Self {
+        Self {
+            offset_low: 0,
+            selector: 0,
+            interrupt_stack: 0,
+            attributes: 0,
+            offset_mid: 0,
+            offset_high: 0,
+            reserved: 0,
+        }
+    }
+}
+
+pub struct APICReg(*mut u32);
+pub struct APICRegs {
+    base: CachedPP,
+}
+
+impl APICReg {
+    fn new(pointer: *mut u32) -> Self {
+        Self(pointer)
+    }
+
+    pub fn read(&self) -> u32 {
+        unsafe { self.0.read_volatile() }
+    }
+
+    pub fn write(&self, value: u32) {
+        unsafe { self.0.write_volatile(value) }
+    }
+}
+
+impl APICRegs {
+    pub fn spurious(&self) -> APICReg {
+        APICReg::new(self.base.offset(0xf0).as_ptr())
+    }
+
+    pub fn task_priority(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x80).as_ptr())
+    }
+
+    pub fn end_of_interrupt(&self) {
+        APICReg::new(self.base.offset(0xb0).as_ptr()).write(0)
+    }
+
+    pub fn interrupt_command(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x300).as_ptr())
+    }
+
+    pub fn timer_register(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x320).as_ptr())
+    }
+
+    pub fn timer_initial_count(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x380).as_ptr())
+    }
+
+    pub fn timer_current_count(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x390).as_ptr())
+    }
+
+    pub fn timer_divide(&self) -> APICReg {
+        APICReg::new(self.base.offset(0x3e0).as_ptr())
+    }
+}
+
+lazy_static! {
+    static ref IDT: [IDTEntry; 256] = core::array::from_fn(|idx| match idx {
+        0..0x80 => IDTEntry::new(unsafe { ISR_START_ADDR } + 8 * idx, 0x08, 0x8e),
+        0x80 => IDTEntry::new(unsafe { ISR_START_ADDR } + 8 * idx, 0x08, 0xee),
+        _ => IDTEntry::null(),
+    });
+    pub static ref APIC_BASE: APICRegs = {
+        let apic_base = rdmsr(0x1b);
+        assert_eq!(apic_base & 0x800, 0x800, "LAPIC not enabled");
+        assert_eq!(apic_base & 0x100, 0x100, "Is not bootstrap processor");
+
+        let apic_base = apic_base & !0xfff;
+        APICRegs {
+            base: CachedPP::new(apic_base as usize),
+        }
+    };
+}
+
+pub fn setup_idt() {
+    arch::x86_64::interrupt::lidt(
+        IDT.as_ptr() as usize,
+        (size_of::<IDTEntry>() * 256 - 1) as u16,
+    );
+}
+
+pub fn end_of_interrupt() {
+    APIC_BASE.end_of_interrupt()
+}
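
The `IDTEntry::new` constructor above splits a 64-bit handler address into 16/16/32-bit chunks. Below is a standalone round-trip check of that packing on a 64-bit host, using a local copy of the struct; the real entries also carry the selector and gate attributes, which are not re-derived here.

    #[repr(C)]
    #[derive(Clone, Copy)]
    #[allow(dead_code)]
    struct IdtEntry {
        offset_low: u16,
        selector: u16,
        interrupt_stack: u8,
        attributes: u8,
        offset_mid: u16,
        offset_high: u32,
        reserved: u32,
    }

    impl IdtEntry {
        const fn new(offset: usize, selector: u16, attributes: u8) -> Self {
            Self {
                offset_low: offset as u16,
                selector,
                interrupt_stack: 0,
                attributes,
                offset_mid: (offset >> 16) as u16,
                offset_high: (offset >> 32) as u32,
                reserved: 0,
            }
        }

        // Reassemble the handler address from the three offset fields.
        fn offset(&self) -> usize {
            self.offset_low as usize
                | (self.offset_mid as usize) << 16
                | (self.offset_high as usize) << 32
        }
    }

    fn main() {
        // 0x08 = kernel code selector, 0x8e = present, DPL0, 64-bit interrupt gate.
        let handler = 0xffff_ffff_ffc0_1230usize;
        let entry = IdtEntry::new(handler, 0x08, 0x8e);

        assert_eq!(entry.offset(), handler);
        assert_eq!(core::mem::size_of::<IdtEntry>(), 16);
    }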

+ 7 - 27
src/kernel/async/lock.cc

@@ -1,5 +1,4 @@
 #include <assert.h>
 #include <assert.h>
-#include <stdint.h>
 
 
 #include <kernel/async/lock.hpp>
 #include <kernel/async/lock.hpp>
 
 
@@ -49,31 +48,20 @@ static inline void _restore_interrupt_state(lock_context_t context) {
         :);
         :);
 }
 }
 
 
-// TODO: mark as _per_cpu
-static inline preempt_count_t& _preempt_count() {
-    static preempt_count_t _preempt_count;
-    assert(!(_preempt_count & 0x80000000));
-    return _preempt_count;
-}
+extern "C" void r_preempt_disable();
+extern "C" void r_preempt_enable();
+extern "C" unsigned long r_preempt_count();
 
 
 void preempt_disable() {
 void preempt_disable() {
-    ++_preempt_count();
+    r_preempt_disable();
 }
 }
 
 
 void preempt_enable() {
 void preempt_enable() {
-    --_preempt_count();
-}
-
-extern "C" void r_preempt_disable() {
-    ++_preempt_count();
-}
-
-extern "C" void r_preempt_enable() {
-    --_preempt_count();
+    r_preempt_enable();
 }
 }
 
 
-preempt_count_t preempt_count() {
-    return _preempt_count();
+unsigned long preempt_count() {
+    return r_preempt_count();
 }
 }
 
 
 void spin_lock(spinlock_t& lock) {
 void spin_lock(spinlock_t& lock) {
@@ -105,14 +93,6 @@ mutex::~mutex() {
     assert(m_lock == 0);
     assert(m_lock == 0);
 }
 }
 
 
-void mutex::lock() {
-    spin_lock(m_lock);
-}
-
-void mutex::unlock() {
-    spin_unlock(m_lock);
-}
-
 lock_context_t mutex::lock_irq() {
 lock_context_t mutex::lock_irq() {
     return spin_lock_irqsave(m_lock);
     return spin_lock_irqsave(m_lock);
 }
 }
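
The C++ shim above now forwards `preempt_disable`/`preempt_enable`/`preempt_count` to Rust through the `r_preempt_*` symbols. Below is a hedged sketch of what such a Rust side could look like; the real implementation keeps the counter in a per-CPU variable, so the thread-local here is only a stand-in, and the `unsigned long` return type is assumed to match `usize` on x86_64.

    use std::cell::Cell;

    thread_local! {
        // Stand-in for a per-CPU counter; one per thread in this model.
        static PREEMPT_COUNT: Cell<usize> = Cell::new(0);
    }

    #[no_mangle]
    pub extern "C" fn r_preempt_disable() {
        PREEMPT_COUNT.with(|count| count.set(count.get() + 1));
    }

    #[no_mangle]
    pub extern "C" fn r_preempt_enable() {
        PREEMPT_COUNT.with(|count| {
            assert!(count.get() > 0, "unbalanced preempt_enable");
            count.set(count.get() - 1);
        });
    }

    #[no_mangle]
    pub extern "C" fn r_preempt_count() -> usize {
        PREEMPT_COUNT.with(|count| count.get())
    }

    fn main() {
        r_preempt_disable();
        r_preempt_disable();
        assert_eq!(r_preempt_count(), 2);

        r_preempt_enable();
        r_preempt_enable();
        assert_eq!(r_preempt_count(), 0);
    }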

+ 0 - 57
src/kernel/async/waitlist.cc

@@ -1,57 +0,0 @@
-#include <assert.h>
-
-#include <kernel/async/lock.hpp>
-#include <kernel/async/waitlist.hpp>
-#include <kernel/process.hpp>
-#include <kernel/task/thread.hpp>
-
-using namespace kernel::async;
-
-bool wait_list::wait(mutex& lock) {
-    this->subscribe();
-
-    auto* curthd = current_thread;
-    curthd->set_attr(kernel::task::thread::ISLEEP);
-
-    lock.unlock();
-    bool has_signals = schedule();
-    lock.lock();
-
-    m_subscribers.erase(curthd);
-    return !has_signals;
-}
-
-void wait_list::subscribe() {
-    lock_guard lck(m_mtx);
-
-    auto* thd = current_thread;
-
-    bool inserted;
-    std::tie(std::ignore, inserted) = m_subscribers.insert(thd);
-
-    assert(inserted);
-}
-
-void wait_list::notify_one() {
-    lock_guard lck(m_mtx);
-
-    if (m_subscribers.empty())
-        return;
-
-    auto iter = m_subscribers.begin();
-    (*iter)->set_attr(kernel::task::thread::READY);
-
-    m_subscribers.erase(iter);
-}
-
-void wait_list::notify_all() {
-    lock_guard lck(m_mtx);
-
-    if (m_subscribers.empty())
-        return;
-
-    for (auto thd : m_subscribers)
-        thd->set_attr(kernel::task::thread::READY);
-
-    m_subscribers.clear();
-}

+ 12 - 10
src/kernel/block.rs

@@ -11,7 +11,7 @@ use alloc::{
 };
 };
 use bindings::{EEXIST, EINVAL, EIO, ENOENT};
 use bindings::{EEXIST, EINVAL, EIO, ENOENT};
 
 
-use crate::KResult;
+use lazy_static::lazy_static;
 
 
 use super::{
 use super::{
     mem::{paging::Page, phys::PhysPtr},
     mem::{paging::Page, phys::PhysPtr},
@@ -27,18 +27,18 @@ pub trait BlockRequestQueue: Send + Sync {
     ///
     ///
     fn max_request_pages(&self) -> u64;
     fn max_request_pages(&self) -> u64;
 
 
-    fn submit(&mut self, req: BlockDeviceRequest) -> KResult<()>;
+    fn submit(&self, req: BlockDeviceRequest) -> KResult<()>;
 }
 }
 
 
 struct BlockDeviceDisk {
 struct BlockDeviceDisk {
-    queue: Arc<Mutex<dyn BlockRequestQueue>>,
+    queue: Arc<dyn BlockRequestQueue>,
 }
 }
 
 
 struct BlockDevicePartition {
 struct BlockDevicePartition {
     disk_dev: DevId,
     disk_dev: DevId,
     offset: u64,
     offset: u64,
 
 
-    queue: Arc<Mutex<dyn BlockRequestQueue>>,
+    queue: Arc<dyn BlockRequestQueue>,
 }
 }
 
 
 enum BlockDeviceType {
 enum BlockDeviceType {
@@ -74,8 +74,10 @@ impl Ord for BlockDevice {
     }
     }
 }
 }
 
 
-static BLOCK_DEVICE_LIST: Mutex<BTreeMap<DevId, Arc<BlockDevice>>> =
-    Mutex::new(BTreeMap::new());
+lazy_static! {
+    static ref BLOCK_DEVICE_LIST: Spin<BTreeMap<DevId, Arc<BlockDevice>>> =
+        Spin::new(BTreeMap::new());
+}
 
 
 #[derive(Debug, Clone, Copy)]
 #[derive(Debug, Clone, Copy)]
 #[repr(C)]
 #[repr(C)]
@@ -100,9 +102,9 @@ impl BlockDevice {
     pub fn register_disk(
     pub fn register_disk(
         devid: DevId,
         devid: DevId,
         size: u64,
         size: u64,
-        queue: Arc<Mutex<dyn BlockRequestQueue>>,
+        queue: Arc<dyn BlockRequestQueue>,
     ) -> KResult<Arc<Self>> {
     ) -> KResult<Arc<Self>> {
-        let max_pages = queue.lock().max_request_pages();
+        let max_pages = queue.max_request_pages();
         let device = Arc::new(Self {
         let device = Arc::new(Self {
             devid,
             devid,
             size,
             size,
@@ -199,10 +201,10 @@ impl BlockDevice {
         }
         }
 
 
         match self.dev_type {
         match self.dev_type {
-            BlockDeviceType::Disk(ref disk) => disk.queue.lock().submit(req),
+            BlockDeviceType::Disk(ref disk) => disk.queue.submit(req),
             BlockDeviceType::Partition(ref part) => {
             BlockDeviceType::Partition(ref part) => {
                 req.sector += part.offset;
                 req.sector += part.offset;
-                part.queue.lock().submit(req)
+                part.queue.submit(req)
             }
             }
         }
         }
     }
     }
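
Two patterns from the block-device change above: the registry becomes a lazily initialized map behind a lock, and queues are shared as `Arc<dyn BlockRequestQueue>` so `submit` only needs `&self`. The user-space sketch below shows both, with `std::sync::Mutex` standing in for the kernel's `Spin` and a trivial queue type invented for the example.

    use lazy_static::lazy_static;
    use std::collections::BTreeMap;
    use std::sync::{Arc, Mutex};

    type DevId = u32;

    trait BlockRequestQueue: Send + Sync {
        fn submit(&self, sector: u64) -> Result<(), u32>;
    }

    struct LoopbackQueue;
    impl BlockRequestQueue for LoopbackQueue {
        fn submit(&self, _sector: u64) -> Result<(), u32> {
            Ok(())
        }
    }

    lazy_static! {
        static ref QUEUES: Mutex<BTreeMap<DevId, Arc<dyn BlockRequestQueue>>> =
            Mutex::new(BTreeMap::new());
    }

    fn main() {
        QUEUES.lock().unwrap().insert(8, Arc::new(LoopbackQueue));

        // Submission only needs a shared reference, so no outer lock is held
        // around the queue while the request is being processed.
        let queue = QUEUES.lock().unwrap().get(&8).cloned().unwrap();
        assert!(queue.submit(0).is_ok());
    }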

+ 155 - 0
src/kernel/chardev.rs

@@ -0,0 +1,155 @@
+use alloc::{
+    boxed::Box,
+    collections::btree_map::{BTreeMap, Entry},
+    sync::Arc,
+};
+use bindings::{EEXIST, EIO};
+
+use crate::{io::Buffer, kernel::console::CONSOLE, prelude::*};
+
+use super::{
+    block::make_device,
+    task::Thread,
+    terminal::Terminal,
+    vfs::{
+        file::{File, TerminalFile},
+        DevId,
+    },
+};
+
+use lazy_static::lazy_static;
+
+pub trait VirtualCharDevice: Send + Sync {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize>;
+    fn write(&self, data: &[u8]) -> KResult<usize>;
+}
+
+pub enum CharDeviceType {
+    Terminal(Arc<Terminal>),
+    Virtual(Box<dyn VirtualCharDevice>),
+}
+
+pub struct CharDevice {
+    name: Arc<str>,
+    device: CharDeviceType,
+}
+
+lazy_static! {
+    pub static ref CHAR_DEVICES: Spin<BTreeMap<DevId, Arc<CharDevice>>> =
+        Spin::new(BTreeMap::new());
+}
+
+impl CharDevice {
+    pub fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        match &self.device {
+            CharDeviceType::Terminal(terminal) => terminal.read(buffer),
+            CharDeviceType::Virtual(device) => device.read(buffer),
+        }
+    }
+
+    pub fn write(&self, data: &[u8]) -> KResult<usize> {
+        match &self.device {
+            CharDeviceType::Virtual(device) => device.write(data),
+            CharDeviceType::Terminal(terminal) => {
+                for &ch in data.iter() {
+                    terminal.show_char(ch);
+                }
+                Ok(data.len())
+            }
+        }
+    }
+
+    pub fn get(devid: DevId) -> Option<Arc<CharDevice>> {
+        CHAR_DEVICES.lock().get(&devid).cloned()
+    }
+
+    pub fn register(devid: DevId, name: Arc<str>, device: CharDeviceType) -> KResult<()> {
+        match CHAR_DEVICES.lock().entry(devid) {
+            Entry::Vacant(entry) => {
+                entry.insert(Arc::new(CharDevice { name, device }));
+                Ok(())
+            }
+            Entry::Occupied(_) => Err(EEXIST),
+        }
+    }
+
+    pub fn open(self: &Arc<Self>) -> KResult<Arc<File>> {
+        Ok(match &self.device {
+            CharDeviceType::Terminal(terminal) => {
+                // We only set the control terminal if the process is the session leader.
+                if Thread::current().process.sid() == Thread::current().process.pid {
+                    let session = Thread::current().process.session();
+                    // Silently fail if we can't set the control terminal.
+                    dont_check!(session.set_control_terminal(&terminal, false));
+                }
+
+                TerminalFile::new(terminal.clone())
+            }
+            CharDeviceType::Virtual(_) => Arc::new(File::CharDev(self.clone())),
+        })
+    }
+}
+
+struct NullDevice;
+impl VirtualCharDevice for NullDevice {
+    fn read(&self, _buffer: &mut dyn Buffer) -> KResult<usize> {
+        Ok(0)
+    }
+
+    fn write(&self, _data: &[u8]) -> KResult<usize> {
+        Ok(_data.len())
+    }
+}
+
+struct ZeroDevice;
+impl VirtualCharDevice for ZeroDevice {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        // TODO: Copy from empty page.
+        while let false = buffer.fill(&[0; 16])?.should_stop() {}
+        Ok(buffer.wrote())
+    }
+
+    fn write(&self, _data: &[u8]) -> KResult<usize> {
+        Ok(_data.len())
+    }
+}
+
+struct ConsoleDevice;
+impl VirtualCharDevice for ConsoleDevice {
+    fn read(&self, buffer: &mut dyn Buffer) -> KResult<usize> {
+        let console_terminal = CONSOLE.lock_irq().get_terminal().ok_or(EIO)?;
+        console_terminal.read(buffer)
+    }
+
+    fn write(&self, data: &[u8]) -> KResult<usize> {
+        let console_terminal = CONSOLE.lock_irq().get_terminal().ok_or(EIO)?;
+        for &ch in data.iter() {
+            console_terminal.show_char(ch);
+        }
+        Ok(data.len())
+    }
+}
+
+impl CharDevice {
+    pub fn init() -> KResult<()> {
+        Self::register(
+            make_device(1, 3),
+            Arc::from("null"),
+            CharDeviceType::Virtual(Box::new(NullDevice)),
+        )?;
+
+        Self::register(
+            make_device(1, 5),
+            Arc::from("zero"),
+            CharDeviceType::Virtual(Box::new(ZeroDevice)),
+        )?;
+
+        Self::register(
+            make_device(5, 1),
+            Arc::from("console"),
+            CharDeviceType::Virtual(Box::new(ConsoleDevice)),
+        )?;
+
+        Ok(())
+    }
+}
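The same registration path works for additional virtual devices; a hedged usage sketch with a made-up name and device number (240, 0), neither of which appears in this commit:

struct DiscardDevice;

impl VirtualCharDevice for DiscardDevice {
    fn read(&self, _buffer: &mut dyn Buffer) -> KResult<usize> {
        Ok(0) // always report end of file
    }

    fn write(&self, data: &[u8]) -> KResult<usize> {
        Ok(data.len()) // swallow the data, report success
    }
}

fn register_discard() -> KResult<()> {
    CharDevice::register(
        make_device(240, 0), // illustrative device number
        Arc::from("discard"),
        CharDeviceType::Virtual(Box::new(DiscardDevice)),
    )
}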

+ 60 - 11
src/kernel/console.rs

@@ -1,16 +1,34 @@
 use crate::prelude::*;
 
-pub struct Console {}
+use alloc::sync::Arc;
+use bindings::EEXIST;
+use lazy_static::lazy_static;
+
+pub struct Console {
+    terminal: Option<Arc<Terminal>>,
+}
+
+impl Console {
+    pub fn get_terminal(&self) -> Option<Arc<Terminal>> {
+        self.terminal.clone()
+    }
+
+    pub fn register_terminal(terminal: &Arc<Terminal>) -> KResult<()> {
+        let mut console = CONSOLE.lock_irq();
+        if console.terminal.is_some() {
+            return Err(EEXIST);
+        }
+
+        console.terminal = Some(terminal.clone());
+        Ok(())
+    }
+}
 
 impl Write for Console {
     fn write_str(&mut self, s: &str) -> core::fmt::Result {
-        use crate::bindings::root::kernel::tty::console as _console;
-
-        if let Some(console) = unsafe { _console.as_mut() } {
+        if let Some(console) = &self.terminal {
             for &ch in s.as_bytes() {
-                unsafe {
-                    console.show_char(ch as i32);
-                }
+                console.show_char(ch)
             }
         }
 
@@ -19,11 +37,13 @@ impl Write for Console {
 }
 
 #[doc(hidden)]
-pub fn _print(args: core::fmt::Arguments) -> core::fmt::Result {
-    CONSOLE.lock().write_fmt(args)
+pub fn _print(args: core::fmt::Arguments) {
+    dont_check!(CONSOLE.lock_irq().write_fmt(args))
 }
 
-pub static CONSOLE: spin::Mutex<Console> = spin::Mutex::new(Console {});
+lazy_static! {
+    pub static ref CONSOLE: Spin<Console> = Spin::new(Console { terminal: None });
+}
 
 macro_rules! print {
     ($($arg:tt)*) => {
@@ -40,4 +60,33 @@ macro_rules! println {
     };
 }
 
-pub(crate) use {print, println};
+macro_rules! println_warn {
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel: warn] {}", format_args!($($arg)*))
+    };
+}
+
+macro_rules! println_debug {
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel:debug] {}", format_args!($($arg)*))
+    };
+}
+
+macro_rules! println_info {
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel: info] {}", format_args!($($arg)*))
+    };
+}
+
+macro_rules! println_fatal {
+    () => {
+        $crate::println!("[kernel:fatal] ")
+    };
+    ($($arg:tt)*) => {
+        $crate::println!("[kernel:fatal] {}", format_args!($($arg)*))
+    };
+}
+
+use super::terminal::Terminal;
+
+pub(crate) use {print, println, println_debug, println_fatal, println_info, println_warn};
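The new macros accept the same format arguments as `println!`; an illustrative call site (the function and its arguments are made up):

fn report_cpu_online(cpu: usize, ok: bool) {
    println_info!("CPU {} online", cpu);
    if !ok {
        println_warn!("CPU {} failed to start, continuing without it", cpu);
    }
}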

+ 39 - 0
src/kernel/constants.rs

@@ -0,0 +1,39 @@
+use bitflags::bitflags;
+
+pub const TCGETS: u32 = 0x5401;
+pub const TCSETS: u32 = 0x5402;
+pub const TIOCGPGRP: u32 = 0x540f;
+pub const TIOCSPGRP: u32 = 0x5410;
+pub const TIOCGWINSZ: u32 = 0x5413;
+
+pub const PR_SET_NAME: u32 = 15;
+pub const PR_GET_NAME: u32 = 16;
+
+pub const SIG_BLOCK: u32 = 0;
+pub const SIG_UNBLOCK: u32 = 1;
+pub const SIG_SETMASK: u32 = 2;
+
+pub const SA_SIGINFO: u32 = 4;
+
+pub const CLOCK_REALTIME: u32 = 0;
+pub const CLOCK_MONOTONIC: u32 = 1;
+
+pub const ENXIO: u32 = 6;
+pub const ENOEXEC: u32 = 8;
+
+bitflags! {
+    #[derive(Debug, Clone, Copy)]
+    pub struct UserMmapFlags: u32 {
+        const MAP_SHARED = 0x01;
+        const MAP_PRIVATE = 0x02;
+        const MAP_FIXED = 0x10;
+        const MAP_ANONYMOUS = 0x20;
+    }
+
+    #[derive(Debug, Clone, Copy)]
+    pub struct UserMmapProtocol: u32 {
+        const PROT_READ = 0x01;
+        const PROT_WRITE = 0x02;
+        const PROT_EXEC = 0x04;
+    }
+}
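A sketch of how a syscall layer might decode raw `mmap` arguments with these bitflags; the function name and parameters are illustrative, not taken from this commit:

fn decode_mmap_args(raw_flags: u32, raw_prot: u32) -> Option<(UserMmapFlags, UserMmapProtocol)> {
    // `from_bits` returns None when an unknown bit is set, so bogus
    // arguments are rejected instead of being silently truncated.
    let flags = UserMmapFlags::from_bits(raw_flags)?;
    let prot = UserMmapProtocol::from_bits(raw_prot)?;
    Some((flags, prot))
}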

+ 2 - 2
src/kernel/hw/pci.cc

@@ -87,11 +87,11 @@ int register_driver_r(uint16_t vendor, uint16_t device,
 
 namespace kernel::kinit {
 
-SECTION(".text.kinit")
-void init_pci() {
+extern "C" void init_pci() {
     using namespace hw::acpi;
     using namespace hw::pci;
 
+    assert(parse_acpi_tables() == 0);
     auto* mcfg = (MCFG*)get_table("MCFG");
     assert(mcfg);
 

+ 0 - 115
src/kernel/hw/serial.cc

@@ -1,115 +0,0 @@
-#include <errno.h>
-#include <stdio.h>
-
-#include <kernel/hw/port.hpp>
-#include <kernel/irq.hpp>
-#include <kernel/log.hpp>
-#include <kernel/module.hpp>
-#include <kernel/tty.hpp>
-
-using namespace kernel::tty;
-using namespace kernel::hw;
-using namespace kernel::irq;
-using namespace kernel::kmod;
-
-constexpr int PORT0 = 0x3f8;
-constexpr int PORT1 = 0x2f8;
-
-using port_group = const p8[6];
-
-constexpr p8 port0[] = {
-    p8{PORT0 + 0}, p8{PORT0 + 1}, p8{PORT0 + 2},
-    p8{PORT0 + 3}, p8{PORT0 + 4}, p8{PORT0 + 5},
-};
-
-constexpr p8 port1[] = {
-    p8{PORT1 + 0}, p8{PORT1 + 1}, p8{PORT1 + 2},
-    p8{PORT1 + 3}, p8{PORT1 + 4}, p8{PORT1 + 5},
-};
-
-static void _serial0_receive_data_interrupt() {
-    while (*port0[5] & 1)
-        console->commit_char(*port0[0]);
-}
-
-static void _serial1_receive_data_interrupt() {
-    while (*port1[5] & 1)
-        console->commit_char(*port1[0]);
-}
-
-static inline int _init_port(port_group ports) {
-    // taken from osdev.org
-
-    ports[1] = 0x00; // Disable all interrupts
-    ports[3] = 0x80; // Enable DLAB (set baud rate divisor)
-    // TODO: set baud rate
-    ports[0] = 0x00; // Set divisor to 0 -3- (lo byte) 115200 -38400- baud
-    ports[1] = 0x00; //                  (hi byte)
-    ports[3] = 0x03; // 8 bits, no parity, one stop bit
-    ports[2] = 0xC7; // Enable FIFO, clear them, with 14-byte threshold
-    // TODO: IRQ disabled
-    ports[4] = 0x0B; // IRQs enabled, RTS/DSR set
-    ports[4] = 0x1E; // Set in loopback mode, test the serial chip
-    ports[0] = 0xAE; // Test serial chip (send byte 0xAE and check if serial
-                     // returns same byte)
-
-    // Check if serial is faulty (i.e: not same byte as sent)
-    if (*ports[0] != 0xAE)
-        return -EIO;
-
-    // If serial is not faulty set it in normal operation mode
-    // (not-loopback with IRQs enabled and OUT#1 and OUT#2 bits enabled)
-    ports[4] = 0x0F;
-
-    ports[1] = 0x01; // Enable interrupts #0: Received Data Available
-
-    return 0;
-}
-
-class serial_tty : public virtual tty {
-    const p8* ports;
-
-   public:
-    serial_tty(port_group ports, int id) : tty{"ttyS"}, ports(ports) {
-        name += '0' + id;
-    }
-
-    virtual void putchar(char c) override {
-        while (true) {
-            auto status = *ports[5];
-            if (status & 0x1)
-                this->commit_char(*ports[0]);
-            if (status & 0x20)
-                break;
-        }
-
-        ports[0] = c;
-    }
-};
-
-class serial_module : public virtual kmod {
-   public:
-    serial_module() : kmod("serial-tty") {}
-
-    virtual int init() override {
-        if (int ret = _init_port(port0); ret == 0) {
-            auto* dev = new serial_tty(port0, 0);
-            register_handler(4, _serial0_receive_data_interrupt);
-
-            if (int ret = register_tty(dev); ret != 0)
-                kmsg("[serial] cannot register ttyS0");
-        }
-
-        if (int ret = _init_port(port1); ret == 0) {
-            auto* dev = new serial_tty(port1, 0);
-            register_handler(3, _serial1_receive_data_interrupt);
-
-            if (int ret = register_tty(dev); ret != 0)
-                kmsg("[serial] cannot register ttyS1");
-        }
-
-        return 0;
-    }
-};
-
-INTERNAL_MODULE(serial, serial_module);

+ 0 - 28
src/kernel/hw/timer.cc

@@ -1,28 +0,0 @@
-#include <types/types.h>
-
-#include <kernel/hw/port.hpp>
-#include <kernel/hw/timer.hpp>
-
-constexpr kernel::hw::p8 port_control(0x43);
-constexpr kernel::hw::p8 port_count(0x40);
-
-static std::size_t _current_ticks = 0;
-
-SECTION(".text.kinit")
-void kernel::hw::timer::init_pit(void) {
-    // set interval
-    port_control = 0x34;
-
-    // send interval number
-    // 0x2e9a = 11930 = 100Hz
-    port_count = 0x9a;
-    port_count = 0x2e;
-}
-
-void kernel::hw::timer::inc_tick(void) {
-    ++_current_ticks;
-}
-
-size_t kernel::hw::timer::current_ticks(void) {
-    return _current_ticks;
-}

+ 0 - 147
src/kernel/interrupt.cpp

@@ -1,147 +0,0 @@
-#include <list>
-#include <vector>
-
-#include <assert.h>
-#include <stdint.h>
-#include <stdio.h>
-
-#include <types/types.h>
-
-#include <kernel/hw/port.hpp>
-#include <kernel/hw/timer.hpp>
-#include <kernel/interrupt.hpp>
-#include <kernel/irq.hpp>
-#include <kernel/log.hpp>
-#include <kernel/mem/paging.hpp>
-#include <kernel/process.hpp>
-#include <kernel/syscall.hpp>
-#include <kernel/vfs.hpp>
-
-#define KERNEL_INTERRUPT_GATE_TYPE (0x8e)
-#define USER_INTERRUPT_GATE_TYPE (0xee)
-
-constexpr kernel::hw::p8 port_pic1_command{0x20};
-constexpr kernel::hw::p8 port_pic1_data{0x21};
-constexpr kernel::hw::p8 port_pic2_command{0xa0};
-constexpr kernel::hw::p8 port_pic2_data{0xa1};
-
-struct IDT_entry {
-    uint16_t offset_low;
-    uint16_t segment;
-
-    uint8_t IST;
-    uint8_t attributes;
-
-    uint16_t offset_mid;
-    uint32_t offset_high;
-    uint32_t reserved;
-};
-
-static struct IDT_entry IDT[256];
-
-extern "C" uintptr_t ISR_START_ADDR;
-
-SECTION(".text.kinit")
-static inline void set_idt_entry(IDT_entry (&idt)[256], int n, uintptr_t offset,
-                                 uint16_t selector, uint8_t type) {
-    idt[n].offset_low = offset & 0xffff;
-    idt[n].segment = selector;
-    idt[n].IST = 0;
-    idt[n].attributes = type;
-    idt[n].offset_mid = (offset >> 16) & 0xffff;
-    idt[n].offset_high = (offset >> 32) & 0xffffffff;
-    idt[n].reserved = 0;
-}
-
-using kernel::irq::irq_handler_t;
-static std::vector<std::list<irq_handler_t>> s_irq_handlers;
-
-SECTION(".text.kinit")
-void kernel::kinit::init_interrupt() {
-    for (int i = 0; i < 0x30; ++i)
-        set_idt_entry(IDT, i, ISR_START_ADDR + 8 * i, 0x08,
-                      KERNEL_INTERRUPT_GATE_TYPE);
-    set_idt_entry(IDT, 0x80, ISR_START_ADDR + 8 * 0x80, 0x08,
-                  USER_INTERRUPT_GATE_TYPE);
-
-    uint64_t idt_descriptor[2];
-    idt_descriptor[0] = (sizeof(IDT_entry) * 256) << 48;
-    idt_descriptor[1] = (uintptr_t)IDT;
-
-    // initialize PIC
-    asm volatile("lidt (%0)" : : "r"((uintptr_t)idt_descriptor + 6) :);
-    s_irq_handlers.resize(16);
-
-    // TODO: move this to timer driver
-    kernel::irq::register_handler(0, []() {
-        kernel::hw::timer::inc_tick();
-        schedule();
-    });
-
-    port_pic1_command = 0x11; // edge trigger mode
-    port_pic1_data = 0x20;    // start from int 0x20
-    port_pic1_data = 0x04;    // PIC1 is connected to IRQ2 (1 << 2)
-    port_pic1_data = 0x01;    // no buffer mode
-
-    port_pic2_command = 0x11; // edge trigger mode
-    port_pic2_data = 0x28;    // start from int 0x28
-    port_pic2_data = 0x02;    // connected to IRQ2
-    port_pic2_data = 0x01;    // no buffer mode
-
-    // allow all the interrupts
-    port_pic1_data = 0x00;
-    port_pic2_data = 0x00;
-}
-
-void kernel::irq::register_handler(int irqno, irq_handler_t handler) {
-    s_irq_handlers[irqno].emplace_back(std::move(handler));
-}
-
-static inline void fault_handler(interrupt_stack* context, mmx_registers*) {
-    switch (context->int_no) {
-        case 6:
-        case 8: {
-            assert(false);
-            if (!current_process->attr.system)
-                kill_current(SIGSEGV); // noreturn
-        } break;
-        case 13: {
-            if (!current_process->attr.system)
-                kill_current(SIGILL); // noreturn
-        } break;
-        case 14: {
-            kernel::mem::paging::handle_page_fault(context->error_code);
-            return;
-        } break;
-    }
-
-    // fault can not be resolved
-    freeze();
-}
-
-extern "C" void irq_handler_rust(int irqno);
-
-static inline void irq_handler(interrupt_stack* context, mmx_registers*) {
-    int irqno = context->int_no - 0x20;
-
-    constexpr uint8_t PIC_EOI = 0x20;
-
-    for (const auto& handler : s_irq_handlers[irqno])
-        handler();
-
-    irq_handler_rust(irqno);
-
-    port_pic1_command = PIC_EOI;
-    if (irqno >= 8)
-        port_pic2_command = PIC_EOI;
-}
-
-extern "C" void interrupt_handler(interrupt_stack* context,
-                                  mmx_registers* mmxregs) {
-    if (context->int_no < 0x20) // interrupt is a fault
-        fault_handler(context, mmxregs);
-    else if (context->int_no == 0x80) // syscall by int 0x80
-        kernel::handle_syscall32(context->regs.rax, context, mmxregs);
-    else
-        irq_handler(context, mmxregs);
-}

+ 77 - 26
src/kernel/interrupt.rs

@@ -1,44 +1,95 @@
-use alloc::boxed::Box;
-use alloc::vec;
-use alloc::vec::Vec;
+use alloc::sync::Arc;
 
-use crate::bindings::root::EINVAL;
+use lazy_static::lazy_static;
 
-static mut IRQ_HANDLERS: spin::Mutex<[Option<Vec<Box<dyn Fn()>>>; 16]> =
-    spin::Mutex::new([const { None }; 16]);
+use crate::bindings::root::{interrupt_stack, mmx_registers, EINVAL};
+use crate::{driver::Port8, prelude::*};
+
+use super::mem::handle_page_fault;
+use super::syscall::handle_syscall32;
+use super::task::{ProcessList, Signal};
+use super::timer::timer_interrupt;
+
+const PIC1_COMMAND: Port8 = Port8::new(0x20);
+const PIC1_DATA: Port8 = Port8::new(0x21);
+const PIC2_COMMAND: Port8 = Port8::new(0xA0);
+const PIC2_DATA: Port8 = Port8::new(0xA1);
+
+lazy_static! {
+    static ref IRQ_HANDLERS: Spin<[Option<Arc<dyn Fn() + Send + Sync>>; 16]> =
+        Spin::new([const { None }; 16]);
+}
+
+fn irq_handler(irqno: usize) {
+    assert!(irqno < 16);
+
+    let handler = IRQ_HANDLERS.lock()[irqno as usize].as_ref().cloned();
+    if let Some(handler) = handler {
+        handler();
+    }
+
+    PIC1_COMMAND.write(0x20); // EOI
+    if irqno >= 8 {
+        PIC2_COMMAND.write(0x20); // EOI
+    }
+}
+
+fn fault_handler(int_stack: &mut interrupt_stack) {
+    match int_stack.int_no {
+        // Page fault
+        14 => handle_page_fault(int_stack),
+        13 if int_stack.ss == 0 => ProcessList::kill_current(Signal::SIGILL),
+        6 | 8 if int_stack.ss == 0 => ProcessList::kill_current(Signal::SIGSEGV),
+        _ => panic!("Unhandled fault: {}", int_stack.int_no),
+    }
+}
 
 #[no_mangle]
-pub extern "C" fn irq_handler_rust(irqno: core::ffi::c_int) {
-    assert!(irqno >= 0 && irqno < 16);
-
-    let handlers = unsafe { IRQ_HANDLERS.lock() };
-
-    match handlers[irqno as usize] {
-        Some(ref handlers) => {
-            for handler in handlers {
-                handler();
-            }
-        }
-        None => {}
+pub extern "C" fn interrupt_handler(int_stack: *mut interrupt_stack, mmxregs: *mut mmx_registers) {
+    let int_stack = unsafe { &mut *int_stack };
+    let mmxregs = unsafe { &mut *mmxregs };
+
+    match int_stack.int_no {
+        // Fault
+        0..0x20 => fault_handler(int_stack),
+        // Syscall
+        0x80 => handle_syscall32(int_stack.regs.rax as usize, int_stack, mmxregs),
+        // Timer
+        0x40 => timer_interrupt(),
+        // IRQ
+        no => irq_handler(no as usize - 0x20),
     }
 }
 
 pub fn register_irq_handler<F>(irqno: i32, handler: F) -> Result<(), u32>
 where
-    F: Fn() + 'static,
+    F: Fn() + Send + Sync + 'static,
 {
     if irqno < 0 || irqno >= 16 {
         return Err(EINVAL);
     }
 
-    let mut handlers = unsafe { IRQ_HANDLERS.lock() };
+    let old = IRQ_HANDLERS.lock_irq()[irqno as usize].replace(Arc::new(handler));
+    assert!(old.is_none(), "IRQ handler already registered");
+    Ok(())
+}
 
-    match handlers[irqno as usize] {
-        Some(ref mut handlers) => handlers.push(Box::new(handler)),
-        None => {
-            handlers[irqno as usize].replace(vec![Box::new(handler)]);
-        }
-    }
+pub fn init() -> KResult<()> {
+    // TODO: Move this to `arch`
+    // Initialize PIC
+    PIC1_COMMAND.write(0x11); // edge trigger mode
+    PIC1_DATA.write(0x20); // IRQ 0-7 offset
+    PIC1_DATA.write(0x04); // cascade with slave PIC
+    PIC1_DATA.write(0x01); // no buffer mode
+
+    PIC2_COMMAND.write(0x11); // edge trigger mode
+    PIC2_DATA.write(0x28); // IRQ 8-15 offset
+    PIC2_DATA.write(0x02); // cascade with master PIC
+    PIC2_DATA.write(0x01); // no buffer mode
+
+    // Allow all IRQs
+    PIC1_DATA.write(0x0);
+    PIC2_DATA.write(0x0);
 
     Ok(())
 }
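Driver code hooks an IRQ by handing over a `Send + Sync` closure; a sketch in which IRQ 4 stands in for the COM1 line used by the serial driver:

fn hook_serial_irq() -> Result<(), u32> {
    register_irq_handler(4, || {
        // Runs in interrupt context. The handler table lock has already been
        // released by the time the closure is called, so taking other locks
        // here is fine; still, keep the work short and defer the rest.
    })
}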

+ 10 - 0
src/kernel/mem.rs

@@ -1,2 +1,12 @@
 pub mod paging;
 pub mod phys;
+
+mod mm_area;
+mod mm_list;
+mod page_table;
+mod vrange;
+
+pub(self) use mm_area::MMArea;
+pub use mm_list::{handle_page_fault, FileMapping, MMList, Mapping, PageFaultError, Permission};
+pub(self) use page_table::{PageTable, PTE};
+pub use vrange::{VAddr, VRange};

+ 102 - 0
src/kernel/mem/mm_area.rs

@@ -0,0 +1,102 @@
+use core::{borrow::Borrow, cell::UnsafeCell, cmp::Ordering};
+
+use super::{Mapping, Permission, VAddr, VRange};
+
+#[derive(Debug)]
+pub struct MMArea {
+    range: UnsafeCell<VRange>,
+    pub(super) mapping: Mapping,
+    pub(super) permission: Permission,
+}
+
+impl Clone for MMArea {
+    fn clone(&self) -> Self {
+        Self {
+            range: UnsafeCell::new(self.range()),
+            mapping: self.mapping.clone(),
+            permission: self.permission,
+        }
+    }
+}
+
+impl MMArea {
+    pub fn new(range: VRange, mapping: Mapping, permission: Permission) -> Self {
+        Self {
+            range: range.into(),
+            mapping,
+            permission,
+        }
+    }
+
+    fn range_borrow(&self) -> &VRange {
+        // SAFETY: The only way we get a reference to `MMArea` object is through `MMListInner`.
+        // And `MMListInner` is locked with IRQ disabled.
+        unsafe { self.range.get().as_ref().unwrap() }
+    }
+
+    pub fn range(&self) -> VRange {
+        *self.range_borrow()
+    }
+
+    pub fn len(&self) -> usize {
+        self.range_borrow().len()
+    }
+
+    /// # Safety
+    /// This function should be called only when we can guarantee that the range
+    /// won't overlap with any other range in some scope.
+    pub fn grow(&self, count: usize) {
+        let range = unsafe { self.range.get().as_mut().unwrap() };
+        range.clone_from(&self.range_borrow().grow(count));
+    }
+
+    pub fn split(mut self, at: VAddr) -> (Option<Self>, Option<Self>) {
+        assert_eq!(at.floor(), at);
+
+        match self.range_borrow().cmp(&VRange::from(at)) {
+            Ordering::Less => (Some(self), None),
+            Ordering::Greater => (None, Some(self)),
+            Ordering::Equal => {
+                let diff = at - self.range_borrow().start();
+                if diff == 0 {
+                    return (None, Some(self));
+                }
+
+                let right = Self {
+                    range: VRange::new(at, self.range_borrow().end()).into(),
+                    permission: self.permission,
+                    mapping: match &self.mapping {
+                        Mapping::Anonymous => Mapping::Anonymous,
+                        Mapping::File(mapping) => Mapping::File(mapping.offset(diff)),
+                    },
+                };
+
+                self.range.get_mut().shrink(diff);
+                (Some(self), Some(right))
+            }
+        }
+    }
+}
+
+impl Eq for MMArea {}
+impl PartialEq for MMArea {
+    fn eq(&self, other: &Self) -> bool {
+        self.range_borrow().eq(other.range_borrow())
+    }
+}
+impl PartialOrd for MMArea {
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        self.range_borrow().partial_cmp(other.range_borrow())
+    }
+}
+impl Ord for MMArea {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.range_borrow().cmp(other.range_borrow())
+    }
+}
+
+impl Borrow<VRange> for MMArea {
+    fn borrow(&self) -> &VRange {
+        self.range_borrow()
+    }
+}

+ 13 - 15
src/kernel/mem/mm_list.cc

@@ -1,4 +1,5 @@
 #include <assert.h>
+#include <errno.h>
 #include <stdint.h>
 
 #include <kernel/mem/mm_list.hpp>
@@ -16,8 +17,8 @@ static inline void __invalidate_all_tlb() {
         : "rax", "memory");
 }
 
-static inline void __dealloc_page_table_all(paging::pfn_t pt, int depth,
-                                            int from, int to) {
+static inline void __dealloc_page_table_all(paging::pfn_t pt, int depth, int from, int to) {
     using namespace paging;
 
     if (depth > 1) {
@@ -43,7 +43,8 @@ static inline void __dealloc_page_table(paging::pfn_t pt) {
 }
 
 mm_list::mm_list() : m_pt{paging::alloc_page_table()}, m_brk{m_areas.end()} {
-    memcpy(physaddr<void>{m_pt}, paging::KERNEL_PAGE_TABLE_PHYS_ADDR, 0x1000);
+    // copy only kernel space
+    memcpy(physaddr<void>{m_pt + 0x800}, physaddr<void>{KERNEL_PML4 + 0x800}, 0x800);
 }
 
 mm_list::mm_list(const mm_list& other) : mm_list{} {
@@ -138,8 +139,7 @@ int mm_list::register_brk(uintptr_t addr) {
         return -ENOMEM;
 
     bool inserted;
-    std::tie(m_brk, inserted) =
-        m_areas.emplace(addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
+    std::tie(m_brk, inserted) = m_areas.emplace(addr, MM_ANONYMOUS | MM_WRITE | MM_BREAK);
 
     assert(inserted);
     return 0;
@@ -186,8 +186,7 @@ mm_list::iterator mm_list::split(iterator area, uintptr_t addr) {
     auto new_end = area->end;
     area->end = addr;
 
-    auto [iter, inserted] = m_areas.emplace(addr, area->flags, new_end,
-                                            area->mapped_file, new_file_offset);
+    auto [iter, inserted] =
+        m_areas.emplace(addr, area->flags, new_end, d_get(area->mapped_file), new_file_offset);
 
     assert(inserted);
     return iter;
@@ -217,8 +217,7 @@ int mm_list::unmap(iterator area, bool should_invalidate_tlb) {
     return 0;
 }
 
-int mm_list::unmap(uintptr_t start, std::size_t length,
-                   bool should_invalidate_tlb) {
+int mm_list::unmap(uintptr_t start, std::size_t length, bool should_invalidate_tlb) {
     // standard says that addr and len MUST be
     // page-aligned or the call is invalid
     if (start & 0xfff)
@@ -279,7 +278,7 @@ int mm_list::unmap(uintptr_t start, std::size_t length,
 int mm_list::mmap(const map_args& args) {
     auto& vaddr = args.vaddr;
     auto& length = args.length;
-    auto& finode = args.file_inode;
+    auto& file = args.file;
     auto& foff = args.file_offset;
     auto& flags = args.flags;
 
@@ -298,10 +297,10 @@ int mm_list::mmap(const map_args& args) {
         attributes |= PA_NXE;
 
     if (flags & MM_MAPPED) {
-        assert(finode);
+        assert(file);
 
-        auto [area, inserted] = m_areas.emplace(
-            vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, finode, foff);
+        auto [area, inserted] =
+            m_areas.emplace(vaddr, flags & ~MM_INTERNAL_MASK, vaddr + length, d_get(file), foff);
         assert(inserted);
 
         attributes |= PA_MMAPPED_PAGE;
@@ -310,8 +309,7 @@ int mm_list::mmap(const map_args& args) {
     } else if (flags & MM_ANONYMOUS) {
         // private mapping of zero-filled pages
        // TODO: shared mapping
-        auto [area, inserted] =
-            m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
+        auto [area, inserted] = m_areas.emplace(vaddr, (flags & ~MM_INTERNAL_MASK), vaddr + length);
         assert(inserted);
 
         attributes |= PA_ANONYMOUS_PAGE;

+ 357 - 0
src/kernel/mem/mm_list.rs

@@ -0,0 +1,357 @@
+mod page_fault;
+
+use crate::prelude::*;
+
+use alloc::{collections::btree_set::BTreeSet, sync::Arc};
+use bindings::{EEXIST, EINVAL, ENOMEM};
+
+use crate::kernel::vfs::dentry::Dentry;
+
+use super::{MMArea, PageTable, VAddr, VRange};
+
+pub use page_fault::{handle_page_fault, PageFaultError};
+
+#[derive(Debug, Clone)]
+pub struct FileMapping {
+    file: Arc<Dentry>,
+    /// Offset in the file, aligned to 4KB boundary.
+    offset: usize,
+    /// Length of the mapping. Exceeding part will be zeroed.
+    length: usize,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct Permission {
+    pub write: bool,
+    pub execute: bool,
+}
+
+#[derive(Debug, Clone)]
+pub enum Mapping {
+    Anonymous,
+    File(FileMapping),
+}
+
+#[derive(Debug)]
+struct MMListInner {
+    areas: BTreeSet<MMArea>,
+    break_start: Option<VRange>,
+    break_pos: Option<VAddr>,
+}
+
+#[derive(Debug)]
+pub struct MMList {
+    /// # Safety
+    /// This field might be used in IRQ context, so it should be locked with `lock_irq()`.
+    inner: Mutex<MMListInner>,
+    /// Do not modify entries in the page table without acquiring the `inner` lock.
+    page_table: PageTable,
+}
+
+impl FileMapping {
+    pub fn new(file: Arc<Dentry>, offset: usize, length: usize) -> Self {
+        assert_eq!(offset & 0xfff, 0);
+        Self {
+            file,
+            offset,
+            length,
+        }
+    }
+
+    pub fn offset(&self, offset: usize) -> Self {
+        if self.length <= offset {
+            Self::new(self.file.clone(), self.offset + self.length, 0)
+        } else {
+            Self::new(
+                self.file.clone(),
+                self.offset + offset,
+                self.length - offset,
+            )
+        }
+    }
+}
+
+impl MMListInner {
+    fn overlapping_addr(&self, addr: VAddr) -> Option<&MMArea> {
+        self.areas.get(&VRange::from(addr))
+    }
+
+    fn check_overlapping_addr(&self, addr: VAddr) -> bool {
+        addr.is_user() && self.overlapping_addr(addr).is_none()
+    }
+
+    fn overlapping_range(&self, range: VRange) -> impl DoubleEndedIterator<Item = &MMArea> + '_ {
+        self.areas.range(range.into_range())
+    }
+
+    fn check_overlapping_range(&self, range: VRange) -> bool {
+        range.is_user() && self.overlapping_range(range).next().is_none()
+    }
+
+    fn find_available(&self, hint: VAddr, len: usize) -> Option<VAddr> {
+        let mut range = if hint == VAddr::NULL {
+            VRange::new(VAddr(0x1234000), VAddr(0x1234000 + len).ceil())
+        } else {
+            VRange::new(hint.floor(), (hint + len).ceil())
+        };
+        let len = range.len();
+
+        loop {
+            if !range.is_user() {
+                return None;
+            }
+
+            match self.overlapping_range(range).next_back() {
+                None => return Some(range.start()),
+                Some(area) => {
+                    range = VRange::new(area.range().end().ceil(), area.range().end().ceil() + len);
+                }
+            }
+        }
+    }
+
+    fn unmap(&mut self, page_table: &PageTable, start: VAddr, len: usize) -> KResult<()> {
+        assert_eq!(start.floor(), start);
+        let end = (start + len).ceil();
+        let range = VRange::new(start, end);
+        if !range.is_user() {
+            return Err(EINVAL);
+        }
+
+        let check_range = VRange::from(range.start())..VRange::from(range.end());
+        let mut front_remaining = None;
+        let mut back_remaining = None;
+
+        self.areas.retain(|area| {
+            if !check_range.contains(&area.range()) {
+                return true;
+            }
+            if area.range() == range.start().into() {
+                let (left, right) = area.clone().split(range.start());
+                page_table.unmap(&right.unwrap());
+
+                if let Some(left) = left {
+                    assert!(
+                        front_remaining.replace(left).is_none(),
+                        "There should be only one `front`."
+                    );
+                }
+            } else if area.range() == range.end().into() {
+                let (left, right) = area.clone().split(range.end());
+                page_table.unmap(&left.unwrap());
+
+                assert!(
+                    back_remaining
+                        .replace(right.expect("`right` should be valid"))
+                        .is_none(),
+                    "There should be only one `back`."
+                );
+            } else {
+                page_table.unmap(area);
+            }
+
+            false
+        });
+
+        if let Some(front) = front_remaining {
+            self.areas.insert(front);
+        }
+        if let Some(back) = back_remaining {
+            self.areas.insert(back);
+        }
+
+        Ok(())
+    }
+
+    fn mmap(
+        &mut self,
+        page_table: &PageTable,
+        at: VAddr,
+        len: usize,
+        mapping: Mapping,
+        permission: Permission,
+    ) -> KResult<()> {
+        assert_eq!(at.floor(), at);
+        assert_eq!(len & 0xfff, 0);
+        let range = VRange::new(at, at + len);
+
+        // We are doing an area marker insertion.
+        if len == 0 && !self.check_overlapping_addr(at) || !self.check_overlapping_range(range) {
+            return Err(EEXIST);
+        }
+
+        match &mapping {
+            Mapping::Anonymous => page_table.set_anonymous(range, permission),
+            Mapping::File(_) => page_table.set_mmapped(range, permission),
+        }
+
+        self.areas.insert(MMArea::new(range, mapping, permission));
+        Ok(())
+    }
+}
+
+impl MMList {
+    pub fn new() -> Arc<Self> {
+        Arc::new(Self {
+            inner: Mutex::new(MMListInner {
+                areas: BTreeSet::new(),
+                break_start: None,
+                break_pos: None,
+            }),
+            page_table: PageTable::new(),
+        })
+    }
+
+    pub fn new_cloned(&self) -> Arc<Self> {
+        let inner = self.inner.lock_irq();
+
+        let list = Arc::new(Self {
+            inner: Mutex::new(MMListInner {
+                areas: inner.areas.clone(),
+                break_start: inner.break_start,
+                break_pos: inner.break_pos,
+            }),
+            page_table: PageTable::new(),
+        });
+
+        // SAFETY: `self.inner` already locked with IRQ disabled.
+        {
+            let list_inner = list.inner.lock();
+
+            for area in list_inner.areas.iter() {
+                let new_iter = list.page_table.iter_user(area.range()).unwrap();
+                let old_iter = self.page_table.iter_user(area.range()).unwrap();
+
+                for (new, old) in new_iter.zip(old_iter) {
+                    new.setup_cow(old);
+                }
+            }
+        }
+
+        // We set some pages as COW, so we need to invalidate TLB.
+        self.page_table.lazy_invalidate_tlb_all();
+
+        list
+    }
+
+    /// No need to do invalidation manually, `PageTable` already does it.
+    pub fn clear_user(&self) {
+        let mut inner = self.inner.lock_irq();
+        inner.areas.retain(|area| {
+            self.page_table.unmap(area);
+            false
+        });
+        inner.break_start = None;
+        inner.break_pos = None;
+    }
+
+    pub fn switch_page_table(&self) {
+        self.page_table.switch();
+    }
+
+    /// No need to do invalidation manually, `PageTable` already does it.
+    pub fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
+        self.inner.lock_irq().unmap(&self.page_table, start, len)
+    }
+
+    pub fn mmap_hint(
+        &self,
+        hint: VAddr,
+        len: usize,
+        mapping: Mapping,
+        permission: Permission,
+    ) -> KResult<VAddr> {
+        let mut inner = self.inner.lock_irq();
+        if hint == VAddr::NULL {
+            let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
+            inner.mmap(&self.page_table, at, len, mapping, permission)?;
+            return Ok(at);
+        }
+
+        match inner.mmap(&self.page_table, hint, len, mapping.clone(), permission) {
+            Ok(()) => Ok(hint),
+            Err(EEXIST) => {
+                let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
+                inner.mmap(&self.page_table, at, len, mapping, permission)?;
+                Ok(at)
+            }
+            Err(err) => Err(err),
+        }
+    }
+
+    pub fn mmap_fixed(
+        &self,
+        at: VAddr,
+        len: usize,
+        mapping: Mapping,
+        permission: Permission,
+    ) -> KResult<VAddr> {
+        self.inner
+            .lock_irq()
+            .mmap(&self.page_table, at, len, mapping.clone(), permission)
+            .map(|_| at)
+    }
+
+    pub fn set_break(&self, pos: Option<VAddr>) -> VAddr {
+        let mut inner = self.inner.lock_irq();
+
+        // SAFETY: `set_break` is only called in syscalls, where program break should be valid.
+        assert!(inner.break_start.is_some() && inner.break_pos.is_some());
+        let break_start = inner.break_start.unwrap();
+        let current_break = inner.break_pos.unwrap();
+        let pos = match pos {
+            None => return current_break,
+            Some(pos) => pos.ceil(),
+        };
+
+        let range = VRange::new(current_break, pos);
+        if !inner.check_overlapping_range(range) {
+            return current_break;
+        }
+
+        if !inner.areas.contains(&break_start) {
+            inner.areas.insert(MMArea::new(
+                break_start,
+                Mapping::Anonymous,
+                Permission {
+                    write: true,
+                    execute: false,
+                },
+            ));
+        }
+
+        let program_break = inner
+            .areas
+            .get(&break_start)
+            .expect("Program break area should be valid");
+
+        let len = pos - current_break;
+        self.page_table.set_anonymous(
+            VRange::from(program_break.range().end()).grow(len),
+            Permission {
+                write: true,
+                execute: false,
+            },
+        );
+
+        program_break.grow(len);
+
+        inner.break_pos = Some(pos);
+        pos
+    }
+
+    /// This should be called only **once** for every thread.
+    pub fn register_break(&self, start: VAddr) {
+        let mut inner = self.inner.lock_irq();
+        assert!(inner.break_start.is_none() && inner.break_pos.is_none());
+
+        inner.break_start = Some(start.into());
+        inner.break_pos = Some(start);
+    }
+}
+
+impl Drop for MMList {
+    fn drop(&mut self) {
+        self.clear_user();
+    }
+}
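Callers interact with `MMList` roughly as below; the hint address and length are illustrative only:

fn map_scratch(mm_list: &MMList) -> KResult<()> {
    let addr = mm_list.mmap_hint(
        VAddr(0x4000_0000), // hint; VAddr::NULL lets the kernel pick an address
        0x2000,             // two pages, must be page-aligned
        Mapping::Anonymous,
        Permission { write: true, execute: false },
    )?;

    // ... use the mapping ...

    mm_list.unmap(addr, 0x2000)
}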

+ 206 - 0
src/kernel/mem/mm_list/page_fault.rs

@@ -0,0 +1,206 @@
+use bindings::kernel::mem::paging::pfn_to_page;
+use bindings::{PA_A, PA_ANON, PA_COW, PA_MMAP, PA_P, PA_RW};
+use bitflags::bitflags;
+
+use crate::bindings::root::interrupt_stack;
+use crate::kernel::mem::paging::{Page, PageBuffer};
+use crate::kernel::mem::phys::{CachedPP, PhysPtr};
+use crate::kernel::mem::{Mapping, VRange};
+use crate::kernel::task::{ProcessList, Signal, Thread};
+use crate::prelude::*;
+
+use super::{MMList, VAddr};
+
+bitflags! {
+    pub struct PageFaultError: u64 {
+        const Present = 0x0001;
+        const Write = 0x0002;
+        const User = 0x0004;
+        const ReservedSet = 0x0008;
+        const InstructionFetch = 0x0010;
+        const ProtectionKey = 0x0020;
+        const SGX = 0x8000;
+    }
+}
+
+#[repr(C)]
+struct FixEntry {
+    start: u64,
+    length: u64,
+    jump_address: u64,
+    op_type: u64,
+}
+
+impl MMList {
+    fn handle_page_fault(
+        &self,
+        int_stack: &mut interrupt_stack,
+        addr: VAddr,
+        error: PageFaultError,
+    ) -> Result<(), Signal> {
+        let inner = self.inner.lock();
+        let area = match inner.areas.get(&VRange::from(addr)) {
+            Some(area) => area,
+            None => {
+                if error.contains(PageFaultError::User) {
+                    return Err(Signal::SIGBUS);
+                } else {
+                    try_page_fault_fix(int_stack, addr);
+                    return Ok(());
+                }
+            }
+        };
+
+        // User access permission violation, check user access permission.
+        if error.contains(PageFaultError::User | PageFaultError::Present) {
+            if error.contains(PageFaultError::Write) && !area.permission.write {
+                ProcessList::kill_current(Signal::SIGSEGV)
+            }
+
+            if error.contains(PageFaultError::InstructionFetch) && !area.permission.execute {
+                ProcessList::kill_current(Signal::SIGSEGV)
+            }
+        }
+
+        let pte = self
+            .page_table
+            .iter_user(VRange::new(addr.floor(), addr.floor() + 0x1000))
+            .unwrap()
+            .next()
+            .expect("If we can find the mapped area, we should be able to find the PTE");
+
+        let is_mapped = matches!(&area.mapping, Mapping::File(_));
+        if !is_mapped && !error.contains(PageFaultError::Present) {
+            try_page_fault_fix(int_stack, addr);
+            return Ok(());
+        }
+
+        let mut pfn = pte.pfn();
+        let mut attributes = pte.attributes();
+
+        if attributes & PA_COW as usize != 0 {
+            attributes &= !PA_COW as usize;
+            if area.permission.write {
+                attributes |= PA_RW as usize;
+            } else {
+                attributes &= !PA_RW as usize;
+            }
+
+            // TODO!!!: Change this.
+            let page = unsafe { pfn_to_page(pfn).as_mut().unwrap() };
+            if page.refcount == 1 {
+                pte.set_attributes(attributes);
+                return Ok(());
+            }
+
+            let new_page = Page::alloc_one();
+            if attributes & PA_ANON as usize != 0 {
+                new_page.zero();
+            } else {
+                new_page
+                    .as_cached()
+                    .as_mut_slice::<u8>(0x1000)
+                    .copy_from_slice(CachedPP::new(pfn).as_slice(0x1000));
+            }
+
+            attributes &= !(PA_A | PA_ANON) as usize;
+            page.refcount -= 1;
+
+            pfn = new_page.into_pfn();
+            pte.set(pfn, attributes);
+        }
+
+        // TODO: shared mapping
+        if attributes & PA_MMAP as usize != 0 {
+            attributes |= PA_P as usize;
+
+            if let Mapping::File(mapping) = &area.mapping {
+                let load_offset = addr.floor() - area.range().start();
+                if load_offset < mapping.length {
+                    // SAFETY: Since we are here, the `pfn` must refer to a valid buddy page.
+                    let page = unsafe { Page::from_pfn(pfn, 0) };
+                    let nread = mapping
+                        .file
+                        .read(
+                            &mut PageBuffer::new(page.clone()),
+                            mapping.offset + load_offset,
+                        )
+                        .map_err(|_| Signal::SIGBUS)?;
+
+                    if nread < page.len() {
+                        page.as_cached().as_mut_slice::<u8>(0x1000)[nread..].fill(0);
+                    }
+
+                    if mapping.length - load_offset < 0x1000 {
+                        let length_to_end = mapping.length - load_offset;
+                        page.as_cached().as_mut_slice::<u8>(0x1000)[length_to_end..].fill(0);
+                    }
+                }
+                // Otherwise, the page is left zero-filled.
+
+                attributes &= !PA_MMAP as usize;
+                pte.set_attributes(attributes);
+            } else {
+                panic!("Anonymous mapping should not be PA_MMAP");
+            }
+        }
+
+        Ok(())
+    }
+}
+
+extern "C" {
+    static FIX_START: *const FixEntry;
+    static FIX_END: *const FixEntry;
+}
+
+/// Try to fix the page fault by jumping to the fix entry's `jump_address`.
+///
+/// Panic if we can't find the `ip` in the fix list.
+fn try_page_fault_fix(int_stack: &mut interrupt_stack, addr: VAddr) {
+    let ip = int_stack.v_rip as u64;
+    // TODO: Use `op_type` to fix.
+
+    // SAFETY: `FIX_START` and `FIX_END` are defined in the linker script in `.rodata` section.
+    let entries = unsafe {
+        core::slice::from_raw_parts(
+            FIX_START,
+            (FIX_END as usize - FIX_START as usize) / size_of::<FixEntry>(),
+        )
+    };
+
+    for entry in entries.iter() {
+        if ip >= entry.start && ip < entry.start + entry.length {
+            int_stack.v_rip = entry.jump_address as usize;
+            return;
+        }
+    }
+
+    kernel_page_fault_die(addr, ip as usize)
+}
+
+fn kernel_page_fault_die(vaddr: VAddr, ip: usize) -> ! {
+    panic!(
+        "Invalid kernel mode memory access to {:#8x} while executing the instruction at {:#8x}",
+        vaddr.0, ip
+    )
+}
+
+pub fn handle_page_fault(int_stack: &mut interrupt_stack) {
+    let error = PageFaultError::from_bits_truncate(int_stack.error_code);
+    let vaddr = VAddr(arch::x86_64::vm::get_cr2());
+
+    let result = Thread::current()
+        .process
+        .mm_list
+        .handle_page_fault(int_stack, vaddr, error);
+
+    if let Err(signal) = result {
+        println_debug!(
+            "Page fault on {:#x} in user space at {:#x}",
+            vaddr.0,
+            int_stack.v_rip
+        );
+        ProcessList::kill_current(signal)
+    }
+}

+ 307 - 0
src/kernel/mem/page_table.rs

@@ -0,0 +1,307 @@
+use lazy_static::lazy_static;
+
+use crate::prelude::*;
+
+use crate::bindings::root::{EINVAL, KERNEL_PML4};
+
+use super::{
+    paging::Page,
+    phys::{CachedPP, PhysPtr as _},
+    VAddr, VRange,
+};
+use super::{MMArea, Permission};
+
+const PA_P: usize = 0x001;
+const PA_RW: usize = 0x002;
+const PA_US: usize = 0x004;
+const PA_PWT: usize = 0x008;
+const PA_PCD: usize = 0x010;
+const PA_A: usize = 0x020;
+const PA_D: usize = 0x040;
+const PA_PS: usize = 0x080;
+const PA_G: usize = 0x100;
+const PA_COW: usize = 0x200;
+const PA_MMAP: usize = 0x400;
+const PA_ANON: usize = 0x800;
+const PA_NXE: usize = 0x8000_0000_0000_0000;
+const PA_MASK: usize = 0xfff0_0000_0000_0fff;
+
+#[repr(transparent)]
+#[derive(Debug, Clone, Copy)]
+pub struct PTE(usize);
+
+#[derive(Debug)]
+pub struct PageTable {
+    page: Page,
+}
+
+pub struct PTEIterator<'lt, const KERNEL: bool> {
+    count: usize,
+    i4: u16,
+    i3: u16,
+    i2: u16,
+    i1: u16,
+    p4: CachedPP,
+    p3: CachedPP,
+    p2: CachedPP,
+    p1: CachedPP,
+
+    start: VAddr,
+    end: VAddr,
+    _phantom: core::marker::PhantomData<&'lt ()>,
+}
+
+lazy_static! {
+    static ref EMPTY_PAGE: Page = {
+        let page = Page::alloc_one();
+        page.zero();
+        page
+    };
+}
+
+impl PTE {
+    pub fn is_user(&self) -> bool {
+        self.0 & PA_US != 0
+    }
+
+    pub fn is_present(&self) -> bool {
+        self.0 & PA_P != 0
+    }
+
+    pub fn pfn(&self) -> usize {
+        self.0 & !PA_MASK
+    }
+
+    pub fn attributes(&self) -> usize {
+        self.0 & PA_MASK
+    }
+
+    pub fn set(&mut self, pfn: usize, attributes: usize) {
+        self.0 = pfn | attributes;
+    }
+
+    pub fn set_pfn(&mut self, pfn: usize) {
+        self.set(pfn, self.attributes())
+    }
+
+    pub fn set_attributes(&mut self, attributes: usize) {
+        self.set(self.pfn(), attributes)
+    }
+
+    fn parse_page_table(&mut self, kernel: bool) -> CachedPP {
+        let attributes = if kernel {
+            PA_P | PA_RW | PA_G
+        } else {
+            PA_P | PA_RW | PA_US
+        };
+
+        if self.is_present() {
+            CachedPP::new(self.pfn())
+        } else {
+            let page = Page::alloc_one();
+            let pp = page.as_cached();
+            page.zero();
+
+            self.set(page.into_pfn(), attributes);
+            pp
+        }
+    }
+
+    pub fn setup_cow(&mut self, from: &mut Self) {
+        self.set(
+            unsafe { Page::from_pfn(from.pfn(), 0) }.into_pfn(),
+            (from.attributes() & !(PA_RW | PA_A | PA_D)) | PA_COW,
+        );
+
+        from.set_attributes((from.attributes() & !PA_RW) | PA_COW);
+    }
+
+    pub fn clear(&mut self) {
+        self.set(0, 0)
+    }
+
+    /// Take the ownership of the page from the PTE, clear the PTE and return the page.
+    pub fn take(&mut self) -> Page {
+        // SAFETY: Acquire the ownership of the page from the page table and then
+        // clear the PTE so no one could be able to access the page from here later on.
+        let page = unsafe { Page::take_pfn(self.pfn(), 0) };
+        self.clear();
+        page
+    }
+}
+
+impl<'lt, const KERNEL: bool> PTEIterator<'lt, KERNEL> {
+    fn new(pt: &'lt Page, start: VAddr, end: VAddr) -> KResult<Self> {
+        if start > end {
+            return Err(EINVAL);
+        }
+
+        let p4 = pt.as_cached();
+        let p3 = p4.as_mut_slice::<PTE>(512)[Self::index(4, start)].parse_page_table(KERNEL);
+        let p2 = p3.as_mut_slice::<PTE>(512)[Self::index(3, start)].parse_page_table(KERNEL);
+        let p1 = p2.as_mut_slice::<PTE>(512)[Self::index(2, start)].parse_page_table(KERNEL);
+
+        Ok(Self {
+            count: (end.0 - start.0) >> 12,
+            i4: Self::index(4, start) as u16,
+            i3: Self::index(3, start) as u16,
+            i2: Self::index(2, start) as u16,
+            i1: Self::index(1, start) as u16,
+            p4,
+            p3,
+            p2,
+            p1,
+            start,
+            end,
+            _phantom: core::marker::PhantomData,
+        })
+    }
+
+    fn offset(level: u32) -> usize {
+        12 + (level as usize - 1) * 9
+    }
+
+    fn index(level: u32, vaddr: VAddr) -> usize {
+        (vaddr.0 >> Self::offset(level)) & 0x1ff
+    }
+}
+
+impl<'lt, const KERNEL: bool> Iterator for PTEIterator<'lt, KERNEL> {
+    type Item = &'lt mut PTE;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.count != 0 {
+            self.count -= 1;
+        } else {
+            return None;
+        }
+
+        let retval = &mut self.p1.as_mut_slice::<PTE>(512)[self.i1 as usize];
+        self.i1 = (self.i1 + 1) % 512;
+        if self.i1 == 0 {
+            self.i2 = (self.i2 + 1) % 512;
+            if self.i2 == 0 {
+                self.i3 = (self.i3 + 1) % 512;
+                if self.i3 == 0 {
+                    self.i4 = (self.i4 + 1) % 512;
+                    if self.i4 == 0 {
+                        panic!("PTEIterator: out of range");
+                    }
+                }
+                self.p3 =
+                    self.p4.as_mut_slice::<PTE>(512)[self.i4 as usize].parse_page_table(KERNEL);
+            }
+            self.p2 = self.p3.as_mut_slice::<PTE>(512)[self.i3 as usize].parse_page_table(KERNEL);
+        }
+        self.p1 = self.p2.as_mut_slice::<PTE>(512)[self.i2 as usize].parse_page_table(KERNEL);
+        Some(retval)
+    }
+}
+
+impl PageTable {
+    pub fn new() -> Self {
+        let page = Page::alloc_one();
+        page.zero();
+
+        // TODO: copy only the kernel space mappings.
+        let kernel_space_page_table = CachedPP::new(KERNEL_PML4 as usize);
+
+        page.as_cached().as_mut_slice::<u64>(512)[256..]
+            .copy_from_slice(&kernel_space_page_table.as_mut_slice(512)[256..]);
+
+        Self { page }
+    }
+
+    pub fn iter_user(&self, range: VRange) -> KResult<PTEIterator<'_, false>> {
+        PTEIterator::new(&self.page, range.start().floor(), range.end().ceil())
+    }
+
+    pub fn iter_kernel(&self, range: VRange) -> KResult<PTEIterator<'_, true>> {
+        PTEIterator::new(&self.page, range.start().floor(), range.end().ceil())
+    }
+
+    pub fn switch(&self) {
+        arch::vm::switch_page_table(self.page.as_phys())
+    }
+
+    pub fn unmap(&self, area: &MMArea) {
+        let range = area.range();
+        let use_invlpg = range.len() / 4096 < 4;
+        let iter = self.iter_user(range).unwrap();
+
+        if self.page.as_phys() != arch::vm::current_page_table() {
+            for pte in iter {
+                pte.take();
+            }
+            return;
+        }
+
+        if use_invlpg {
+            for (offset_pages, pte) in iter.enumerate() {
+                pte.take();
+
+                let pfn = range.start().floor().0 + offset_pages * 4096;
+                arch::vm::invlpg(pfn);
+            }
+        } else {
+            for pte in iter {
+                pte.take();
+            }
+            arch::vm::invlpg_all();
+        }
+    }
+
+    pub fn lazy_invalidate_tlb_all(&self) {
+        if self.page.as_phys() == arch::vm::current_page_table() {
+            arch::vm::invlpg_all();
+        }
+    }
+
+    pub fn set_mmapped(&self, range: VRange, permission: Permission) {
+        // PA_RW is set during page fault handling.
+        // PA_NXE is preserved across page faults, so we set PA_NXE now.
+        let attributes = if permission.execute {
+            PA_US | PA_COW | PA_ANON | PA_MMAP
+        } else {
+            PA_US | PA_COW | PA_ANON | PA_MMAP | PA_NXE
+        };
+
+        for pte in self.iter_user(range).unwrap() {
+            pte.set(EMPTY_PAGE.clone().into_pfn(), attributes);
+        }
+    }
+
+    pub fn set_anonymous(&self, range: VRange, permission: Permission) {
+        // PA_RW is set during page fault handling.
+        // PA_NXE is preserved across page faults, so we set PA_NXE now.
+        let attributes = if permission.execute {
+            PA_P | PA_US | PA_COW | PA_ANON
+        } else {
+            PA_P | PA_US | PA_COW | PA_ANON | PA_NXE
+        };
+
+        for pte in self.iter_user(range).unwrap() {
+            pte.set(EMPTY_PAGE.clone().into_pfn(), attributes);
+        }
+    }
+}
+
+fn drop_page_table_recursive(pt: &Page, level: usize) {
+    for pte in pt
+        .as_cached()
+        .as_mut_slice::<PTE>(512)
+        .iter_mut()
+        .filter(|pte| pte.is_present() && pte.is_user())
+    {
+        let page = pte.take();
+        if level > 1 {
+            drop_page_table_recursive(&page, level - 1);
+        }
+    }
+}
+
+impl Drop for PageTable {
+    fn drop(&mut self) {
+        drop_page_table_recursive(&self.page, 4);
+    }
+}

+ 4 - 197
src/kernel/mem/paging.cc

@@ -5,10 +5,8 @@
 
 
 #include <kernel/async/lock.hpp>
 #include <kernel/async/lock.hpp>
 #include <kernel/log.hpp>
 #include <kernel/log.hpp>
-#include <kernel/mem/mm_list.hpp>
 #include <kernel/mem/paging.hpp>
 #include <kernel/mem/paging.hpp>
 #include <kernel/mem/slab.hpp>
 #include <kernel/mem/slab.hpp>
-#include <kernel/mem/vm_area.hpp>
 #include <kernel/process.hpp>
 #include <kernel/process.hpp>
 
 
 using namespace types::list;
 using namespace types::list;
@@ -16,19 +14,6 @@ using namespace types::list;
 using namespace kernel::async;
 using namespace kernel::async;
 using namespace kernel::mem::paging;
 using namespace kernel::mem::paging;
 
 
-static inline void __page_fault_die(uintptr_t vaddr) {
-    kmsgf("[kernel] kernel panic: invalid memory access to %p", vaddr);
-    freeze();
-}
-
-static inline PSE __parse_pse(PSE pse, bool priv) {
-    auto attr = priv ? PA_KERNEL_PAGE_TABLE : PA_USER_PAGE_TABLE;
-    if (!(pse.attributes() & PA_P))
-        pse.set(attr, alloc_page_table());
-
-    return pse.parse();
-}
-
 static struct zone_info {
 static struct zone_info {
     page* next;
     page* next;
     std::size_t count;
     std::size_t count;
@@ -96,8 +81,7 @@ static inline page* _create_zone(pfn_t pfn, unsigned order) {
 }
 }
 
 
 // call with zone_lock held
 // call with zone_lock held
-static inline void _split_zone(page* zone, unsigned order,
-                               unsigned target_order) {
+static inline void _split_zone(page* zone, unsigned order, unsigned target_order) {
     while (order > target_order) {
     while (order > target_order) {
         pfn_t pfn = page_to_pfn(zone);
         pfn_t pfn = page_to_pfn(zone);
         _create_zone(buddy(pfn, order - 1), order - 1);
         _create_zone(buddy(pfn, order - 1), order - 1);
@@ -116,7 +100,7 @@ static inline page* _alloc_zone(unsigned order) {
         if (!zone)
         if (!zone)
             continue;
             continue;
 
 
-        increase_refcount(zone);
+        zone->refcount++;
 
 
         if (i > order)
         if (i > order)
             _split_zone(zone, i, order);
             _split_zone(zone, i, order);
@@ -213,13 +197,12 @@ pfn_t kernel::mem::paging::alloc_page_table() {
 }
 
 void kernel::mem::paging::free_pages(page* pg, unsigned order) {
+    lock_guard_irq lock{zone_lock};
     assert((pg->flags & 0xff) == order);
 
-    // TODO: atomic
     if (!(pg->flags & PAGE_BUDDY) || --pg->refcount)
         return;
 
-    lock_guard_irq lock{zone_lock};
     while (order < 52) {
         pfn_t pfn = page_to_pfn(pg);
         pfn_t buddy_pfn = buddy(pfn, order);
@@ -268,182 +251,6 @@ page* kernel::mem::paging::pfn_to_page(pfn_t pfn) {
 }
 
 void kernel::mem::paging::increase_refcount(page* pg) {
+    lock_guard_irq lock{zone_lock};
     pg->refcount++;
 }
-
-void kernel::mem::paging::handle_page_fault(unsigned long err) {
-    using namespace kernel::mem;
-    using namespace paging;
-
-    uintptr_t vaddr;
-    asm volatile("mov %%cr2, %0" : "=g"(vaddr) : :);
-    auto& mms = current_process->mms;
-
-    auto* mm_area = mms.find(vaddr);
-    if (!mm_area) [[unlikely]] {
-        // user access to address that does not exist
-        if (err & PAGE_FAULT_U)
-            kill_current(SIGSEGV);
-
-        __page_fault_die(vaddr);
-    }
-
-    // user access to a present page caused the fault
-    // check access rights
-    if (err & PAGE_FAULT_U && err & PAGE_FAULT_P) {
-        // write to read only pages
-        if (err & PAGE_FAULT_W && !(mm_area->flags & MM_WRITE))
-            kill_current(SIGSEGV);
-
-        // execute from non-executable pages
-        if (err & PAGE_FAULT_I && !(mm_area->flags & MM_EXECUTE))
-            kill_current(SIGSEGV);
-    }
-
-    auto idx = idx_all(vaddr);
-
-    auto pe = mms.get_page_table()[std::get<1>(idx)];
-    assert(pe.attributes() & PA_P);
-    pe = pe.parse()[std::get<2>(idx)];
-    assert(pe.attributes() & PA_P);
-    pe = pe.parse()[std::get<3>(idx)];
-    assert(pe.attributes() & PA_P);
-    pe = pe.parse()[std::get<4>(idx)];
-
-    bool mmapped = mm_area->flags & MM_MAPPED;
-    assert(!mmapped || mm_area->mapped_file);
-
-    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]]
-        __page_fault_die(vaddr);
-
-    pfn_t pfn = pe.pfn();
-    auto attr = pe.attributes();
-
-    page* pg = pfn_to_page(pfn);
-
-    if (attr & PA_COW) {
-        attr &= ~PA_COW;
-        if (mm_area->flags & MM_WRITE)
-            attr |= PA_RW;
-        else
-            attr &= ~PA_RW;
-
-        // if it is a dying page
-        // TODO: use atomic
-        if (pg->refcount == 1) {
-            pe.set(attr, pfn);
-            return;
-        }
-
-        // duplicate the page
-        page* new_page = alloc_page();
-        pfn_t new_pfn = page_to_pfn(new_page);
-        physaddr<void> new_page_addr{new_pfn};
-
-        if (attr & PA_ANON)
-            memset(new_page_addr, 0x00, 0x1000);
-        else
-            memcpy(new_page_addr, physaddr<void>{pfn}, 0x1000);
-
-        attr &= ~(PA_A | PA_ANON);
-        --pg->refcount;
-
-        pe.set(attr, new_pfn);
-        pfn = new_pfn;
-    }
-
-    if (attr & PA_MMAP) {
-        attr |= PA_P;
-
-        size_t offset = (vaddr & ~0xfff) - mm_area->start;
-        char* data = physaddr<char>{pfn};
-
-        int n = fs_read(mm_area->mapped_file, data, 4096,
-                        mm_area->file_offset + offset, 4096);
-
-        // TODO: send SIGBUS if offset is greater than real size
-        if (n != 4096)
-            memset(data + n, 0x00, 4096 - n);
-
-        // TODO: shared mapping
-        attr &= ~PA_MMAP;
-
-        pe.set(attr, pfn);
-    }
-}
-
-vaddr_range::vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool priv)
-    : n{start >= end ? 0 : ((end - start) >> 12)}
-    , idx4{!n ? 0 : idx_p4(start)}
-    , idx3{!n ? 0 : idx_p3(start)}
-    , idx2{!n ? 0 : idx_p2(start)}
-    , idx1{!n ? 0 : idx_p1(start)}
-    , pml4{!n ? PSE{0} : PSE{pt}}
-    , pdpt{!n ? PSE{0} : __parse_pse(pml4[idx4], priv)}
-    , pd{!n ? PSE{0} : __parse_pse(pdpt[idx3], priv)}
-    , pt{!n ? PSE{0} : __parse_pse(pd[idx2], priv)}
-    , m_start{!n ? 0 : start}
-    , m_end{!n ? 0 : end}
-    , is_privilege{!n ? false : priv} {}
-
-vaddr_range::vaddr_range(std::nullptr_t)
-    : n{}
-    , idx4{}
-    , idx3{}
-    , idx2{}
-    , idx1{}
-    , pml4{0}
-    , pdpt{0}
-    , pd{0}
-    , pt{0}
-    , m_start{}
-    , m_end{}
-    , is_privilege{} {}
-
-vaddr_range vaddr_range::begin() const noexcept {
-    return *this;
-}
-
-vaddr_range vaddr_range::end() const noexcept {
-    return vaddr_range{nullptr};
-}
-
-PSE vaddr_range::operator*() const noexcept {
-    return pt[idx1];
-}
-
-vaddr_range& vaddr_range::operator++() {
-    --n;
-
-    if ((idx1 = (idx1 + 1) % 512) != 0)
-        return *this;
-
-    do {
-        if ((idx2 = (idx2 + 1) % 512) != 0)
-            break;
-        do {
-            if ((idx3 = (idx3 + 1) % 512) != 0)
-                break;
-
-            idx4 = (idx4 + 1) % 512;
-
-            // if idx4 is 0 after update, we have an overflow
-            assert(idx4 != 0);
-
-            pdpt = __parse_pse(pml4[idx4], is_privilege);
-        } while (false);
-
-        pd = __parse_pse(pdpt[idx3], is_privilege);
-    } while (false);
-
-    pt = __parse_pse(pd[idx2], is_privilege);
-    return *this;
-}
-
-vaddr_range::operator bool() const noexcept {
-    return n;
-}
-
-bool vaddr_range::operator==(const vaddr_range& other) const noexcept {
-    return n == other.n;
-}
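
The block removed above was the C++ page-fault path: it broke copy-on-write mappings (reusing the frame when its refcount was 1, otherwise copying it, or zero-filling for PA_ANON) and demand-loaded PA_MMAP pages with fs_read, zero-filling any short read. Together with zone_lock now covering the refcount updates in free_pages and increase_refcount, this clears the way for fault handling to move out of paging.cc. A minimal Rust sketch of just the COW attribute flip the old code performed, with hypothetical PA_* bit positions and a plain u64 standing in for the PTE attributes (not the kernel's types):

    // Hypothetical bit positions, for illustration only.
    const PA_RW: u64 = 1 << 1;
    const PA_COW: u64 = 1 << 58;

    /// Attributes a PTE should carry once a COW fault is resolved: the COW marker
    /// is cleared and write access is granted only if the mapping itself is writable.
    fn attrs_after_cow_break(attrs: u64, area_writable: bool) -> u64 {
        let attrs = attrs & !PA_COW;
        if area_writable {
            attrs | PA_RW
        } else {
            attrs & !PA_RW
        }
    }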

+ 91 - 35
src/kernel/mem/paging.rs

@@ -1,37 +1,78 @@
+use crate::bindings::root::kernel::mem::paging::{
+    alloc_page as c_alloc_page, alloc_pages as c_alloc_pages, free_pages as c_free_pages,
+    increase_refcount as c_increase_refcount, page as c_page, page_to_pfn as c_page_to_pfn,
+    pfn_to_page as c_pfn_to_page, PAGE_BUDDY,
+};
 use crate::bindings::root::EFAULT;
+use crate::io::{Buffer, FillResult};
 use crate::kernel::mem::phys;
 use core::fmt;
 
 use super::phys::PhysPtr;
 
 pub struct Page {
-    page_ptr: *mut crate::bindings::root::kernel::mem::paging::page,
+    page_ptr: *mut c_page,
     order: u32,
 }
 
 impl Page {
     pub fn alloc_one() -> Self {
-        use crate::bindings::root::kernel::mem::paging::alloc_page;
-        let page_ptr = unsafe { alloc_page() };
+        let page_ptr = unsafe { c_alloc_page() };
 
         Self { page_ptr, order: 0 }
     }
 
     pub fn alloc_many(order: u32) -> Self {
-        use crate::bindings::root::kernel::mem::paging::alloc_pages;
-        let page_ptr = unsafe { alloc_pages(order) };
+        let page_ptr = unsafe { c_alloc_pages(order) };
 
         Self { page_ptr, order }
     }
 
+    /// Get `Page` from `pfn`, acquiring the ownership of the page. `refcount` is not increased.
+    ///
+    /// # Safety
+    /// Caller must ensure that the pfn is no longer referenced by any other code.
+    pub unsafe fn take_pfn(pfn: usize, order: u32) -> Self {
+        let page_ptr = unsafe { c_pfn_to_page(pfn) };
+
+        // Only buddy pages can be used here.
+        assert!(unsafe { page_ptr.as_ref().unwrap() }.flags & PAGE_BUDDY != 0);
+
+        // Check if the order is correct.
+        assert_eq!(
+            unsafe { page_ptr.as_ref().unwrap() }.flags & 0xff,
+            order as u64
+        );
+
+        Self { page_ptr, order }
+    }
+
+    /// Get `Page` from `pfn` and increase the reference count.
+    ///
+    /// # Safety
+    /// Caller must ensure that `pfn` refers to a valid physical frame number with `refcount` > 0.
+    pub unsafe fn from_pfn(pfn: usize, order: u32) -> Self {
+        // SAFETY: `pfn` is a valid physical frame number with refcount > 0.
+        unsafe { Self::increase_refcount(pfn) };
+
+        // SAFETY: `pfn` has an increased refcount.
+        unsafe { Self::take_pfn(pfn, order) }
+    }
+
+    /// Consumes the `Page` and returns the physical frame number without dropping the reference
+    /// count the page holds.
+    pub fn into_pfn(self) -> usize {
+        let pfn = unsafe { c_page_to_pfn(self.page_ptr) };
+        core::mem::forget(self);
+        pfn
+    }
+
     pub fn len(&self) -> usize {
         1 << (self.order + 12)
     }
 
     pub fn as_phys(&self) -> usize {
-        use crate::bindings::root::kernel::mem::paging::page_to_pfn;
-
-        unsafe { page_to_pfn(self.page_ptr) }
+        unsafe { c_page_to_pfn(self.page_ptr) }
     }
 
     pub fn as_cached(&self) -> phys::CachedPP {
@@ -46,11 +87,17 @@ impl Page {
         use phys::PhysPtr;
 
         unsafe {
-            core::ptr::write_bytes(
-                self.as_cached().as_ptr::<u8>(),
-                0,
-                self.len(),
-            );
+            core::ptr::write_bytes(self.as_cached().as_ptr::<u8>(), 0, self.len());
+        }
+    }
+
+    /// # Safety
+    /// Caller must ensure that the page is properly freed.
+    pub unsafe fn increase_refcount(pfn: usize) {
+        let page = unsafe { c_pfn_to_page(pfn) };
+
+        unsafe {
+            c_increase_refcount(page);
         }
     }
 }
@@ -58,9 +105,7 @@ impl Page {
 impl Clone for Page {
     fn clone(&self) -> Self {
         unsafe {
-            crate::bindings::root::kernel::mem::paging::increase_refcount(
-                self.page_ptr,
-            );
+            c_increase_refcount(self.page_ptr);
         }
 
         Self {
@@ -73,10 +118,7 @@ impl Clone for Page {
 impl Drop for Page {
     fn drop(&mut self) {
         unsafe {
-            crate::bindings::root::kernel::mem::paging::free_pages(
-                self.page_ptr,
-                self.order,
-            );
+            c_free_pages(self.page_ptr, self.order);
         }
     }
 }
@@ -118,20 +160,12 @@ impl PageBuffer {
     }
 
     pub fn as_slice(&self) -> &[u8] {
-        unsafe {
-            core::slice::from_raw_parts(
-                self.page.as_cached().as_ptr::<u8>(),
-                self.offset,
-            )
-        }
+        unsafe { core::slice::from_raw_parts(self.page.as_cached().as_ptr::<u8>(), self.offset) }
     }
 
     pub fn as_mut_slice(&self) -> &mut [u8] {
         unsafe {
-            core::slice::from_raw_parts_mut(
-                self.page.as_cached().as_ptr::<u8>(),
-                self.offset,
-            )
+            core::slice::from_raw_parts_mut(self.page.as_cached().as_ptr::<u8>(), self.offset)
         }
     }
 
@@ -162,6 +196,32 @@ impl core::fmt::Write for PageBuffer {
     }
 }
 
+impl Buffer for PageBuffer {
+    fn total(&self) -> usize {
+        self.page.len()
+    }
+
+    fn wrote(&self) -> usize {
+        self.len()
+    }
+
+    fn fill(&mut self, data: &[u8]) -> crate::KResult<crate::io::FillResult> {
+        if self.remaining() == 0 {
+            return Ok(FillResult::Full);
+        }
+
+        let len = core::cmp::min(data.len(), self.remaining());
+        self.available_as_slice()[..len].copy_from_slice(&data[..len]);
+        self.consume(len);
+
+        if len < data.len() {
+            Ok(FillResult::Partial(len))
+        } else {
+            Ok(FillResult::Done(len))
+        }
+    }
+}
+
 /// Copy data from a slice to a `Page`
 ///
 /// DONT USE THIS FUNCTION TO COPY DATA TO MMIO ADDRESSES
@@ -177,11 +237,7 @@ pub fn copy_to_page(src: &[u8], dst: &Page) -> Result<(), u32> {
     }
 
     unsafe {
-        core::ptr::copy_nonoverlapping(
-            src.as_ptr(),
-            dst.as_cached().as_ptr(),
-            src.len(),
-        );
+        core::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_cached().as_ptr(), src.len());
     }
 
     Ok(())
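
Clone and Drop make Page a refcounted handle over the C buddy allocator (increase_refcount and free_pages, which per the paging.cc hunks above now take zone_lock internally), while take_pfn, from_pfn and into_pfn convert between owned handles and the raw pfns stored in page tables without leaking or double-counting references. A small sketch of the intended round trip, assuming only the API in this file:

    // Sketch only: round-tripping a pfn through the owned handle, e.g. when a
    // reference is parked inside a page table entry. The comments track refcounts.
    fn stash_and_restore() {
        let page = Page::alloc_one();                     // refcount == 1
        let pfn = page.clone().into_pfn();                // refcount == 2; one ref now lives as a raw pfn
        let restored = unsafe { Page::take_pfn(pfn, 0) }; // reclaims that ref, no extra increment
        drop(restored);                                   // refcount == 1
        drop(page);                                       // refcount == 0, frame returns to the buddy allocator
    }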

+ 2 - 2
src/kernel/mem/phys.rs

@@ -31,11 +31,11 @@ pub struct NoCachePP {
 }
 
 impl CachedPP {
-    pub fn new(addr: usize) -> Self {
+    pub const fn new(addr: usize) -> Self {
         Self { addr }
     }
 
-    pub fn offset(&self, offset: usize) -> Self {
+    pub const fn offset(&self, offset: usize) -> Self {
         Self {
             addr: self.addr + offset,
         }
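
Making new and offset const fn lets CachedPP values be built in const and static initializers rather than only at runtime. A minimal sketch with hypothetical physical addresses:

    // Sketch only; the addresses are hypothetical.
    const LOW_MEM_PP: CachedPP = CachedPP::new(0x0);
    const LOW_MEM_SECOND_PAGE: CachedPP = LOW_MEM_PP.offset(0x1000);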

+ 20 - 10
src/kernel/mem/slab.cc

@@ -4,6 +4,7 @@
 
 #include <types/list.hpp>
 
+#include <kernel/async/lock.hpp>
 #include <kernel/mem/paging.hpp>
 #include <kernel/mem/slab.hpp>
 
@@ -12,6 +13,8 @@ using namespace types::list;
 
 constexpr std::size_t SLAB_PAGE_SIZE = 0x1000; // 4K
 
+kernel::async::mutex slab_lock;
+
 std::ptrdiff_t _slab_data_start_offset(std::size_t size) {
     return (sizeof(slab_head) + size - 1) & ~(size - 1);
 }
@@ -67,6 +70,8 @@ void _slab_add_page(slab_cache* cache) {
 }
 
 void* kernel::mem::slab_alloc(slab_cache* cache) {
+    async::lock_guard_irq lock(slab_lock);
+
     slab_head* slab = cache->slabs_partial;
     if (!slab) {                 // no partial slabs, try to get an empty slab
         if (!cache->slabs_empty) // no empty slabs, create a new one
@@ -88,24 +93,29 @@ void* kernel::mem::slab_alloc(slab_cache* cache) {
 }
 
 void kernel::mem::slab_free(void* ptr) {
+    async::lock_guard_irq lock(slab_lock);
+
     slab_head* slab = (slab_head*)((uintptr_t)ptr & ~(SLAB_PAGE_SIZE - 1));
 
     *(void**)ptr = slab->free;
     slab->free = ptr;
     slab->free_count++;
 
-    if (slab->free_count == _slab_max_count(slab->obj_size)) {
-        auto* cache = slab->cache;
-        slab_head** head = nullptr;
+    auto max_count = _slab_max_count(slab->obj_size);
 
-        if (cache->slabs_full == slab) {
-            head = &cache->slabs_full;
-        } else {
-            head = &cache->slabs_partial;
-        }
+    if (max_count == 1) {
+        list_remove(&slab->cache->slabs_full, slab);
+        list_insert(&slab->cache->slabs_empty, slab);
+    }
+
+    if (slab->free_count == 1) {
+        list_remove(&slab->cache->slabs_full, slab);
+        list_insert(&slab->cache->slabs_partial, slab);
+    }
 
-        list_remove(head, slab);
-        list_insert(&cache->slabs_empty, slab);
+    if (slab->free_count == max_count) {
+        list_remove(&slab->cache->slabs_partial, slab);
+        list_insert(&slab->cache->slabs_empty, slab);
     }
 }
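
slab_free now handles three transitions instead of one: single-object slabs (max_count == 1) jump straight from the full list to the empty list, a slab that just regained its first free object moves from full to partial, and a fully freed slab moves from partial to empty. The old code only acted when a slab became completely free and guessed the source list by checking whether the slab was the head of slabs_full, so partially freed slabs could sit on the wrong list. The classification as a standalone Rust sketch (illustrative types, not the kernel's API):

    // Which list a slab belongs on, given how many of its objects are free.
    #[derive(Debug, PartialEq)]
    enum SlabList {
        Full,
        Partial,
        Empty,
    }

    fn list_for(free_count: usize, max_count: usize) -> SlabList {
        match free_count {
            0 => SlabList::Full,                    // nothing free: stays on the full list
            n if n == max_count => SlabList::Empty, // everything free: empty list
            _ => SlabList::Partial,                 // otherwise: partial list
        }
    }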
 
 

+ 168 - 0
src/kernel/mem/vrange.rs

@@ -0,0 +1,168 @@
+use core::{
+    cmp::Ordering,
+    fmt::{self, Debug, Formatter},
+    ops::{Add, RangeBounds, Sub},
+};
+
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct VAddr(pub usize);
+
+#[derive(Clone, Copy)]
+pub struct VRange {
+    start: VAddr,
+    end: VAddr,
+}
+
+const USER_SPACE_MEMORY_TOP: VAddr = VAddr(0x8000_0000_0000);
+
+impl VAddr {
+    pub const NULL: Self = Self(0);
+
+    pub fn floor(&self) -> Self {
+        VAddr(self.0 & !0xfff)
+    }
+
+    pub fn ceil(&self) -> Self {
+        VAddr((self.0 + 0xfff) & !0xfff)
+    }
+
+    pub fn is_user(&self) -> bool {
+        self.0 != 0 && self < &USER_SPACE_MEMORY_TOP
+    }
+}
+
+impl Sub for VAddr {
+    type Output = usize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.0 - rhs.0
+    }
+}
+
+impl Add<usize> for VAddr {
+    type Output = Self;
+
+    fn add(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 + rhs)
+    }
+}
+
+impl Sub<usize> for VAddr {
+    type Output = Self;
+
+    fn sub(self, rhs: usize) -> Self::Output {
+        VAddr(self.0 - rhs)
+    }
+}
+
+impl Debug for VAddr {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "V{:#x}", self.0)
+    }
+}
+
+impl Debug for VRange {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        write!(f, "[{:?}, {:?})", self.start, self.end)
+    }
+}
+
+impl Eq for VRange {}
+impl PartialOrd for VRange {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for VRange {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+/// Any two ranges that have one of them containing the other are considered equal.
+impl Ord for VRange {
+    fn cmp(&self, other: &Self) -> Ordering {
+        if self.start == other.start {
+            return Ordering::Equal;
+        }
+
+        if self.end == other.end {
+            if self.start == self.end {
+                return Ordering::Greater;
+            }
+            if other.start == other.end {
+                return Ordering::Less;
+            }
+            return Ordering::Equal;
+        }
+
+        if self.start < other.start {
+            if other.end < self.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Less;
+            }
+        }
+
+        if other.start < self.start {
+            if self.end < other.end {
+                return Ordering::Equal;
+            } else {
+                return Ordering::Greater;
+            }
+        }
+
+        unreachable!()
+    }
+}
+
+impl From<VAddr> for VRange {
+    fn from(addr: VAddr) -> Self {
+        VRange::new(addr, addr)
+    }
+}
+
+impl VRange {
+    pub fn new(start: VAddr, end: VAddr) -> Self {
+        assert!(start <= end);
+        VRange { start, end }
+    }
+
+    pub fn is_overlapped(&self, other: &Self) -> bool {
+        self == other
+    }
+
+    pub fn is_user(&self) -> bool {
+        self.start < USER_SPACE_MEMORY_TOP && self.end <= USER_SPACE_MEMORY_TOP
+    }
+
+    pub fn start(&self) -> VAddr {
+        self.start
+    }
+
+    pub fn end(&self) -> VAddr {
+        self.end
+    }
+
+    pub fn len(&self) -> usize {
+        self.end.0 - self.start.0
+    }
+
+    pub fn shrink(&self, count: usize) -> Self {
+        assert!(count <= self.len());
+        VRange::new(self.start, self.end - count)
+    }
+
+    pub fn grow(&self, count: usize) -> Self {
+        VRange::new(self.start, self.end + count)
+    }
+
+    pub fn into_range(self) -> impl RangeBounds<Self> {
+        if self.len() == 0 {
+            VRange::from(self.start())..=VRange::from(self.start())
+        } else {
+            VRange::from(self.start())..=VRange::from(self.end() - 1)
+        }
+    }
+}
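
Because Ord reports Equal for any two ranges where one contains the other, and From<VAddr> builds a zero-length point range, an ordered container of non-overlapping VRanges can be searched by a single address with an ordinary equality lookup. A sketch, assuming the VAddr/VRange types above and that alloc's BTreeSet is available in the kernel:

    // Sketch only: find the mapping that covers `addr`, given non-overlapping ranges.
    use alloc::collections::BTreeSet;

    fn find_covering(areas: &BTreeSet<VRange>, addr: VAddr) -> Option<&VRange> {
        // A zero-length point range compares Equal to any stored range containing it,
        // so `get` returns the covering mapping if one exists.
        areas.get(&VRange::from(addr))
    }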

Some files were not shown because too many files changed in this diff