paging.cc

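// Physical page management: a buddy allocator over the global page array,
// the page fault handler (copy-on-write and file-backed mappings), and the
// vaddr_range page-table iterator.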
#include <assert.h>
#include <string.h>

#include <types/list.hpp>

#include <kernel/async/lock.hpp>
#include <kernel/log.hpp>
#include <kernel/mem/mm_list.hpp>
#include <kernel/mem/paging.hpp>
#include <kernel/mem/slab.hpp>
#include <kernel/mem/vm_area.hpp>
#include <kernel/process.hpp>
#include <kernel/procfs.hpp>

using namespace types::list;
using namespace kernel::async;
using namespace kernel::mem::paging;
static inline void __page_fault_die(uintptr_t vaddr) {
    kmsgf("[kernel] kernel panic: invalid memory access to %p", vaddr);
    freeze();
}
// Parse a page table entry, allocating the next-level table on demand if
// it is not yet present.
static inline PSE __parse_pse(PSE pse, bool priv) {
    auto attr = priv ? PA_KERNEL_PAGE_TABLE : PA_USER_PAGE_TABLE;
    if (!(pse.attributes() & PA_P))
        pse.set(attr, alloc_page_table());

    return pse.parse();
}
// Buddy allocator free lists, one per order. The low byte of a page's
// flags field stores the order of the zone it heads.
static struct zone_info {
    page* next;
    std::size_t count;
} zones[52];

static mutex zone_lock;
// Index of the most significant set bit, i.e. floor(log2(x)).
// E.g. _msb(20) == 4.
constexpr unsigned _msb(std::size_t x) {
    unsigned n = 0;
    while (x >>= 1)
        n++;

    return n;
}

// `pfn` is a byte-granular physical address; bit (order + 12) selects
// between a zone and its buddy.
constexpr pfn_t buddy(pfn_t pfn, unsigned order) {
    return pfn ^ (1UL << (order + 12));
}

constexpr pfn_t parent(pfn_t pfn, unsigned order) {
    return pfn & ~(1UL << (order + 12));
}
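// Worked example: for the order-0 zone at 0x3000, buddy(0x3000, 0) flips
// bit 12 and yields 0x2000, while parent(0x3000, 0) clears it, giving the
// base 0x2000 of the merged order-1 zone. One level up, buddy(0x2000, 1)
// flips bit 13 and yields 0x0000.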
// call with zone_lock held
static inline void _zone_list_insert(unsigned order, page* zone) {
    assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
    assert((zone->flags & 0xff) == 0);
    zone->flags |= order;

    zones[order].count++;
    list_insert(&zones[order].next, zone);
}

// call with zone_lock held
static inline void _zone_list_remove(unsigned order, page* zone) {
    assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
    assert(zones[order].count > 0 && (zone->flags & 0xff) == order);
    zone->flags &= ~0xff;

    zones[order].count--;
    list_remove(&zones[order].next, zone);
}

// call with zone_lock held
static inline page* _zone_list_get(unsigned order) {
    if (zones[order].count == 0)
        return nullptr;

    zones[order].count--;
    auto* pg = list_get(&zones[order].next);
    assert((pg->flags & 0xff) == order);
    return pg;
}
// `order` is the log2 of the zone size in pages
// call with zone_lock held
static inline page* _create_zone(pfn_t pfn, unsigned order) {
    page* zone = pfn_to_page(pfn);
    assert(zone->flags & PAGE_PRESENT);

    zone->flags |= PAGE_BUDDY;
    _zone_list_insert(order, zone);
    return zone;
}
// Split `zone` down from `order` to `target_order`, returning the upper
// halves to the free lists along the way.
// call with zone_lock held
static inline void _split_zone(page* zone, unsigned order,
                               unsigned target_order) {
    while (order > target_order) {
        pfn_t pfn = page_to_pfn(zone);
        _create_zone(buddy(pfn, order - 1), order - 1);

        order--;
    }

    zone->flags &= ~0xff;
    zone->flags |= target_order;
}
// call with zone_lock held
static inline page* _alloc_zone(unsigned order) {
    for (unsigned i = order; i < 52; ++i) {
        auto zone = _zone_list_get(i);
        if (!zone)
            continue;

        increase_refcount(zone);

        if (i > order)
            _split_zone(zone, i, order);

        assert(zone->flags & PAGE_PRESENT && zone->flags & PAGE_BUDDY);
        return zone;
    }

    return nullptr;
}
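// Illustrative walk: a request for order 2 when only an order-4 zone is on
// the free lists pops that zone, splits off its order-3 and order-2 upper
// buddies back onto the free lists, and returns the remaining order-2 head.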
void kernel::mem::paging::create_zone(uintptr_t start, uintptr_t end) {
    start += (4096 - 1);
    start >>= 12;
    end >>= 12;

    if (start >= end)
        return;

    lock_guard_irq lock{zone_lock};

    // Phase 1: walk `start` up to the largest power-of-two boundary below
    // `end`, emitting one zone for each set bit along the way.
    unsigned long low = start;
    for (unsigned i = 0; i < _msb(end); ++i, low >>= 1) {
        if (!(low & 1))
            continue;

        _create_zone(low << (12 + i), i);
        low++;
    }

    // Phase 2: cover the remainder with the largest zones that still fit.
    low = 1UL << _msb(end);
    while (low < end) {
        unsigned order = _msb(end - low);
        _create_zone(low << 12, order);
        low |= (1UL << order);
    }
}
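// Worked example: a range covering pages [3, 20) decomposes into an order-0
// zone at page 3, an order-2 zone at page 4 and an order-3 zone at page 8
// (phase 1), followed by an order-2 zone at page 16 (phase 2).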
void kernel::mem::paging::mark_present(uintptr_t start, uintptr_t end) {
    start >>= 12;

    end += (4096 - 1);
    end >>= 12;

    while (start < end)
        PAGE_ARRAY[start++].flags |= PAGE_PRESENT;
}
page* kernel::mem::paging::alloc_pages(unsigned order) {
    lock_guard_irq lock{zone_lock};
    auto* zone = _alloc_zone(order);

    // out of memory is not recoverable here
    if (!zone)
        freeze();

    return zone;
}

page* kernel::mem::paging::alloc_page() {
    return alloc_pages(0);
}

pfn_t kernel::mem::paging::alloc_page_table() {
    page* zone = alloc_page();
    pfn_t pfn = page_to_pfn(zone);

    // a fresh page table must be zeroed so every entry starts non-present
    memset(physaddr<void>{pfn}, 0x00, 0x1000);

    return pfn;
}
void kernel::mem::paging::free_pages(page* pg, unsigned order) {
    assert((pg->flags & 0xff) == order);

    // TODO: atomic
    if (!(pg->flags & PAGE_BUDDY) || --pg->refcount)
        return;

    lock_guard_irq lock{zone_lock};

    // coalesce with free buddies of the same order as far as possible
    while (order < 52) {
        pfn_t pfn = page_to_pfn(pg);
        pfn_t buddy_pfn = buddy(pfn, order);
        page* buddy_page = pfn_to_page(buddy_pfn);

        if (!(buddy_page->flags & PAGE_BUDDY))
            break;
        if ((buddy_page->flags & 0xff) != order)
            break;
        if (buddy_page->refcount)
            break;

        _zone_list_remove(order, buddy_page);

        // keep the lower of the two pages as the head of the merged zone
        if (buddy_page < pg)
            std::swap(buddy_page, pg);

        buddy_page->flags &= ~(PAGE_BUDDY | 0xff);
        order++;
    }

    pg->flags &= ~0xff;
    _zone_list_insert(order, pg);
}
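// Illustrative walk: freeing the order-0 zone at page 3 while page 2 is a
// free order-0 zone merges them into an order-1 zone at page 2; if pages
// [0, 2) are also free as an order-1 zone, the merge continues into an
// order-2 zone at page 0.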
void kernel::mem::paging::free_page(page* page) {
    return free_pages(page, 0);
}

void kernel::mem::paging::free_pages(pfn_t pfn, unsigned order) {
    return free_pages(pfn_to_page(pfn), order);
}

void kernel::mem::paging::free_page(pfn_t pfn) {
    return free_page(pfn_to_page(pfn));
}

// pfn values are byte-granular: page index * 0x1000
pfn_t kernel::mem::paging::page_to_pfn(page* _page) {
    return (pfn_t)(_page - PAGE_ARRAY) * 0x1000;
}

page* kernel::mem::paging::pfn_to_page(pfn_t pfn) {
    return PAGE_ARRAY + pfn / 0x1000;
}

void kernel::mem::paging::increase_refcount(page* pg) {
    pg->refcount++;
}
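// Page fault handling, in order:
//   1. resolve the faulting address from %cr2 to an mm_area; no area on a
//      user-mode fault means SIGSEGV, no area on a kernel fault is a panic;
//   2. check access rights against the area's MM_WRITE / MM_EXECUTE flags;
//   3. walk the page table down to the faulting PTE;
//   4. break copy-on-write (PA_COW) by reusing or duplicating the page;
//   5. populate file-backed mappings (PA_MMAP) by reading from the file.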
void kernel::mem::paging::handle_page_fault(unsigned long err) {
    using namespace kernel::mem;
    using namespace paging;

    uintptr_t vaddr;
    // the faulting address is reported in %cr2; it must land in a register
    asm volatile("mov %%cr2, %0" : "=r"(vaddr) : :);

    auto& mms = current_process->mms;
    auto* mm_area = mms.find(vaddr);
    if (!mm_area) [[unlikely]] {
        // user access to an address that does not exist
        if (err & PAGE_FAULT_U)
            kill_current(SIGSEGV);

        __page_fault_die(vaddr);
    }

    // user access to a present page caused the fault:
    // check access rights
    if ((err & PAGE_FAULT_U) && (err & PAGE_FAULT_P)) {
        // write to read-only pages
        if ((err & PAGE_FAULT_W) && !(mm_area->flags & MM_WRITE))
            kill_current(SIGSEGV);

        // execute from non-executable pages
        if ((err & PAGE_FAULT_I) && !(mm_area->flags & MM_EXECUTE))
            kill_current(SIGSEGV);
    }

    // walk the four-level page table down to the faulting PTE
    auto idx = idx_all(vaddr);

    auto pe = mms.get_page_table()[std::get<1>(idx)];
    assert(pe.attributes() & PA_P);
    pe = pe.parse()[std::get<2>(idx)];
    assert(pe.attributes() & PA_P);
    pe = pe.parse()[std::get<3>(idx)];
    assert(pe.attributes() & PA_P);
    pe = pe.parse()[std::get<4>(idx)];

    bool mmapped = mm_area->flags & MM_MAPPED;
    assert(!mmapped || mm_area->mapped_file);

    if (!(err & PAGE_FAULT_P) && !mmapped) [[unlikely]]
        __page_fault_die(vaddr);

    pfn_t pfn = pe.pfn();
    auto attr = pe.attributes();

    page* pg = pfn_to_page(pfn);

    if (attr & PA_COW) {
        attr &= ~PA_COW;
        if (mm_area->flags & MM_WRITE)
            attr |= PA_RW;
        else
            attr &= ~PA_RW;

        // if we are the only user of the page, flip the attributes in place
        // TODO: use atomic
        if (pg->refcount == 1) {
            pe.set(attr, pfn);
            return;
        }

        // duplicate the page
        page* new_page = alloc_page();
        pfn_t new_pfn = page_to_pfn(new_page);
        physaddr<void> new_page_addr{new_pfn};

        if (attr & PA_ANON)
            memset(new_page_addr, 0x00, 0x1000);
        else
            memcpy(new_page_addr, physaddr<void>{pfn}, 0x1000);

        attr &= ~(PA_A | PA_ANON);

        --pg->refcount;

        pe.set(attr, new_pfn);
        pfn = new_pfn;
    }

    if (attr & PA_MMAP) {
        attr |= PA_P;

        size_t offset = (vaddr & ~0xfff) - mm_area->start;
        char* data = physaddr<char>{pfn};

        int n = fs::read(mm_area->mapped_file, data, 4096,
                         mm_area->file_offset + offset, 4096);

        // TODO: send SIGBUS if offset is greater than the real size
        if (n != 4096)
            memset(data + n, 0x00, 4096 - n);

        // TODO: shared mapping
        attr &= ~PA_MMAP;

        pe.set(attr, pfn);
    }
}
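// vaddr_range iterates over the PTEs covering [start, end), allocating
// intermediate page tables on demand via __parse_pse whenever the walk
// crosses a PD, PDPT or PML4 boundary.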
vaddr_range::vaddr_range(pfn_t pt, uintptr_t start, uintptr_t end, bool priv)
    : n{start >= end ? 0 : ((end - start) >> 12)}
    , idx4{!n ? 0 : idx_p4(start)}
    , idx3{!n ? 0 : idx_p3(start)}
    , idx2{!n ? 0 : idx_p2(start)}
    , idx1{!n ? 0 : idx_p1(start)}
    , pml4{!n ? PSE{0} : PSE{pt}}
    , pdpt{!n ? PSE{0} : __parse_pse(pml4[idx4], priv)}
    , pd{!n ? PSE{0} : __parse_pse(pdpt[idx3], priv)}
    , pt{!n ? PSE{0} : __parse_pse(pd[idx2], priv)}
    , m_start{!n ? 0 : start}
    , m_end{!n ? 0 : end}
    , is_privilege{!n ? false : priv} {}
vaddr_range::vaddr_range(std::nullptr_t)
    : n{}
    , idx4{}
    , idx3{}
    , idx2{}
    , idx1{}
    , pml4{0}
    , pdpt{0}
    , pd{0}
    , pt{0}
    , m_start{}
    , m_end{}
    , is_privilege{} {}
vaddr_range vaddr_range::begin() const noexcept {
    return *this;
}

vaddr_range vaddr_range::end() const noexcept {
    return vaddr_range{nullptr};
}

PSE vaddr_range::operator*() const noexcept {
    return pt[idx1];
}

vaddr_range& vaddr_range::operator++() {
    --n;

    // advance idx1, propagating the carry up through idx2, idx3 and idx4
    if ((idx1 = (idx1 + 1) % 512) != 0)
        return *this;

    do {
        if ((idx2 = (idx2 + 1) % 512) != 0)
            break;

        do {
            if ((idx3 = (idx3 + 1) % 512) != 0)
                break;

            idx4 = (idx4 + 1) % 512;

            // if idx4 is 0 after the update, we have an overflow
            assert(idx4 != 0);

            pdpt = __parse_pse(pml4[idx4], is_privilege);
        } while (false);

        pd = __parse_pse(pdpt[idx3], is_privilege);
    } while (false);

    pt = __parse_pse(pd[idx2], is_privilege);
    return *this;
}

vaddr_range::operator bool() const noexcept {
    return n;
}

bool vaddr_range::operator==(const vaddr_range& other) const noexcept {
    return n == other.n;
}
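// A minimal usage sketch (hypothetical call site; `pml4_pfn`, `area` and
// `PA_USER_PAGE` are assumptions, not names defined in this file):
//
//     for (auto pse : vaddr_range{pml4_pfn, area->start, area->end, false})
//         pse.set(PA_USER_PAGE, page_to_pfn(alloc_page()));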
extern "C" isize real_dump_buddy(const zone_info* zones, u8* buf,
                                 usize buf_size);

static isize _dump_buddy(u8* buf, usize buf_size) {
    return real_dump_buddy(zones, buf, buf_size);
}

// expose the free list statistics as /proc/buddyinfo
static void _init_procfs_files() {
    auto* root = kernel::procfs::root();
    kernel::procfs::create(root, "buddyinfo", _dump_buddy, nullptr);
}

__attribute__((used))
SECTION(".late_init") void (*const _paging_late_init)() = _init_procfs_files;