// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec_handover.c - kexec handover metadata processing
 * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
 * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
 * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
 */

#define pr_fmt(fmt) "KHO: " fmt

#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
#include <linux/kexec.h>
#include <linux/kexec_handover.h>
#include <linux/libfdt.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/page-isolation.h>

#include <asm/early_ioremap.h>

/*
 * KHO is tightly coupled with mm init and needs access to some of mm
 * internal APIs.
 */
#include "../mm/internal.h"
#include "kexec_internal.h"

#define KHO_FDT_COMPATIBLE "kho-v1"
#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
#define PROP_SUB_FDT "fdt"

static bool kho_enable __ro_after_init;

bool kho_is_enabled(void)
{
	return kho_enable;
}
EXPORT_SYMBOL_GPL(kho_is_enabled);

static int __init kho_parse_enable(char *p)
{
	return kstrtobool(p, &kho_enable);
}
early_param("kho", kho_parse_enable);

/*
 * Keep track of memory that is to be preserved across KHO.
 *
 * The serializing side uses two levels of xarrays to manage chunks of
 * per-order 512 byte bitmaps. For instance, if PAGE_SIZE = 4096, the entire
 * 1G order of a 1TB system would fit inside a single 512 byte bitmap. For
 * order 0 allocations each bitmap will cover 16M of address space. Thus, for
 * 16G of memory at most 512K of bitmap memory will be needed for order 0.
 *
 * This approach is fully incremental: as serialization progresses, folios
 * can continue to be added to the tracker. The final step, immediately prior
 * to kexec, serializes the xarray information into a linked list for the
 * successor kernel to parse.
 */

#define PRESERVE_BITS (512 * 8)

struct kho_mem_phys_bits {
	DECLARE_BITMAP(preserve, PRESERVE_BITS);
};

struct kho_mem_phys {
	/*
	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit in a
	 * bitmap represents one block of the order this structure tracks.
	 */
	struct xarray phys_bits;
};
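
/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096): one
 * struct kho_mem_phys_bits holds PRESERVE_BITS == 4096 bits, so at
 * order N it spans 4096 << (N + PAGE_SHIFT) bytes of physical address
 * space: 16M at order 0, 32M at order 1, and so on.  For a pfn
 * preserved at order N, the bitmap lives at index
 * (pfn >> N) / PRESERVE_BITS in phys_bits and the bit within it is
 * (pfn >> N) % PRESERVE_BITS, matching __kho_preserve_order() below.
 */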

struct kho_mem_track {
	/* Points to kho_mem_phys, each order gets its own bitmap tree */
	struct xarray orders;
};

struct khoser_mem_chunk;

struct kho_serialization {
	struct page *fdt;
	struct list_head fdt_list;
	struct dentry *sub_fdt_dir;
	struct kho_mem_track track;
	/* First chunk of serialized preserved memory map */
	struct khoser_mem_chunk *preserved_mem_map;
};

static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
	void *elm, *res;

	elm = xa_load(xa, index);
	if (elm)
		return elm;

	elm = kzalloc(sz, GFP_KERNEL);
	if (!elm)
		return ERR_PTR(-ENOMEM);

	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
	if (xa_is_err(res))
		res = ERR_PTR(xa_err(res));

	if (res) {
		kfree(elm);
		return res;
	}

	return elm;
}

static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
			     unsigned long end_pfn)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
		const unsigned long pfn_high = pfn >> order;

		physxa = xa_load(&track->orders, order);
		if (!physxa) {
			/* nothing preserved at this order, skip the block */
			pfn += 1 << order;
			continue;
		}

		bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
		if (!bits) {
			pfn += 1 << order;
			continue;
		}

		clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);

		pfn += 1 << order;
	}
}

static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
				unsigned int order)
{
	struct kho_mem_phys_bits *bits;
	struct kho_mem_phys *physxa;
	const unsigned long pfn_high = pfn >> order;

	might_sleep();

	physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa));
	if (IS_ERR(physxa))
		return PTR_ERR(physxa);

	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
				sizeof(*bits));
	if (IS_ERR(bits))
		return PTR_ERR(bits);

	set_bit(pfn_high % PRESERVE_BITS, bits->preserve);

	return 0;
}

/* almost like free_reserved_page(), just don't free the page */
static void kho_restore_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	adjust_managed_page_count(page, 1);
}

/**
 * kho_restore_folio - recreates the folio from the preserved memory.
 * @phys: physical address of the folio.
 *
 * Return: pointer to the struct folio on success, NULL on failure.
 */
struct folio *kho_restore_folio(phys_addr_t phys)
{
	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
	unsigned long order;

	if (!page)
		return NULL;

	order = page->private;
	if (order) {
		if (order > MAX_PAGE_ORDER)
			return NULL;

		prep_compound_page(page, order);
	} else {
		kho_restore_page(page);
	}

	return page_folio(page);
}
EXPORT_SYMBOL_GPL(kho_restore_folio);

/* Serialize and deserialize struct kho_mem_phys across kexec
 *
 * Record all the bitmaps in a linked list of pages for the next kernel to
 * process. Each chunk holds bitmaps of the same order and each block of
 * bitmaps starts at a given physical address. This allows the bitmaps to be
 * sparse. The xarray is used to store them in a tree while building up the
 * data structure, but the KHO successor kernel only needs to process them
 * once in order.
 *
 * All of this memory is normal kmalloc() memory and is not marked for
 * preservation. The successor kernel will remain isolated to the scratch
 * space until it completes processing this list. Once processed all the
 * memory storing these ranges will be marked as free.
 */
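
/*
 * Layout of one serialized chunk page (illustrative, assuming a 64-bit
 * kernel with 4K pages):
 *
 *	+------------------------------+  offset 0
 *	| struct khoser_mem_chunk_hdr  |  next chunk, order, num_elms
 *	+------------------------------+
 *	| khoser_mem_bitmap_ptr[0]     |  { phys_start, bitmap pointer }
 *	| khoser_mem_bitmap_ptr[1]     |
 *	| ...                          |
 *	| khoser_mem_bitmap_ptr[N - 1] |  N == KHOSER_BITMAP_SIZE
 *	+------------------------------+  offset PAGE_SIZE
 *
 * phys_start is the physical address covered by the first bit of the
 * referenced bitmap, i.e. (xarray index * PRESERVE_BITS) shifted left
 * by (order + PAGE_SHIFT), as computed in kho_mem_serialize() below.
 */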

struct khoser_mem_bitmap_ptr {
	phys_addr_t phys_start;
	DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
};

struct khoser_mem_chunk_hdr {
	DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
	unsigned int order;
	unsigned int num_elms;
};

#define KHOSER_BITMAP_SIZE \
	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
	 sizeof(struct khoser_mem_bitmap_ptr))

struct khoser_mem_chunk {
	struct khoser_mem_chunk_hdr hdr;
	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
};

static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);

static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
					  unsigned long order)
{
	struct khoser_mem_chunk *chunk;

	chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!chunk)
		return NULL;
	chunk->hdr.order = order;
	if (cur_chunk)
		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
	return chunk;
}

static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
{
	struct khoser_mem_chunk *chunk = first_chunk;

	while (chunk) {
		struct khoser_mem_chunk *tmp = chunk;

		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
		kfree(tmp);
	}
}

static int kho_mem_serialize(struct kho_serialization *ser)
{
	struct khoser_mem_chunk *first_chunk = NULL;
	struct khoser_mem_chunk *chunk = NULL;
	struct kho_mem_phys *physxa;
	unsigned long order;

	xa_for_each(&ser->track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		chunk = new_chunk(chunk, order);
		if (!chunk)
			goto err_free;

		if (!first_chunk)
			first_chunk = chunk;

		xa_for_each(&physxa->phys_bits, phys, bits) {
			struct khoser_mem_bitmap_ptr *elm;

			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
				chunk = new_chunk(chunk, order);
				if (!chunk)
					goto err_free;
			}

			elm = &chunk->bitmaps[chunk->hdr.num_elms];
			chunk->hdr.num_elms++;
			elm->phys_start = (phys * PRESERVE_BITS)
					  << (order + PAGE_SHIFT);
			KHOSER_STORE_PTR(elm->bitmap, bits);
		}
	}

	ser->preserved_mem_map = first_chunk;

	return 0;

err_free:
	kho_mem_ser_free(first_chunk);
	return -ENOMEM;
}

static void deserialize_bitmap(unsigned int order,
			       struct khoser_mem_bitmap_ptr *elm)
{
	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
	unsigned long bit;

	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
		int sz = 1 << (order + PAGE_SHIFT);
		phys_addr_t phys =
			elm->phys_start + (bit << (order + PAGE_SHIFT));
		struct page *page = phys_to_page(phys);

		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);
		page->private = order;
	}
}

static void __init kho_mem_deserialize(const void *fdt)
{
	struct khoser_mem_chunk *chunk;
	const phys_addr_t *mem;
	int len;

	mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);

	if (!mem || len != sizeof(*mem)) {
		pr_err("failed to get preserved memory bitmaps\n");
		return;
	}

	chunk = *mem ? phys_to_virt(*mem) : NULL;
	while (chunk) {
		unsigned int i;

		for (i = 0; i != chunk->hdr.num_elms; i++)
			deserialize_bitmap(chunk->hdr.order,
					   &chunk->bitmaps[i]);
		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
	}
}
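
/*
 * Illustrative example for deserialize_bitmap() above, assuming 4K
 * pages: in a chunk of order 2, a set bit b in a bitmap whose
 * phys_start is 0x40000000 reserves 16K (PAGE_SIZE << 2) at
 * phys = 0x40000000 + (b << (2 + PAGE_SHIFT)) and stores the order in
 * page->private so that kho_restore_folio() can rebuild the folio
 * there after kexec.
 */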

/*
 * With KHO enabled, memory can become fragmented because KHO regions may
 * be anywhere in physical address space. The scratch regions give us safe
 * zones that we will never see KHO allocations from. This is where we can
 * later safely load our new kexec images, and then use the scratch area for
 * early allocations that happen before the page allocator is initialized.
 */
static struct kho_scratch *kho_scratch;
static unsigned int kho_scratch_cnt;

/*
 * The scratch areas are scaled by default as a percentage of memory
 * allocated from memblock. A user can override the scale with the command
 * line parameter:
 *
 * kho_scratch=N%
 *
 * It is also possible to explicitly define sizes for the lowmem, global and
 * per-node scratch areas:
 *
 * kho_scratch=l[KMG],n[KMG],m[KMG]
 *
 * The explicit size definition takes precedence over the scale definition.
 */
static unsigned int scratch_scale __initdata = 200;
static phys_addr_t scratch_size_global __initdata;
static phys_addr_t scratch_size_pernode __initdata;
static phys_addr_t scratch_size_lowmem __initdata;

static int __init kho_parse_scratch_size(char *p)
{
	size_t len;
	unsigned long sizes[3];
	int i;

	if (!p)
		return -EINVAL;

	len = strlen(p);
	if (!len)
		return -EINVAL;

	/* parse nn% */
	if (p[len - 1] == '%') {
		/* unsigned int max is 4,294,967,295, 10 chars */
		char s_scale[11] = {};
		int ret = 0;

		if (len > ARRAY_SIZE(s_scale))
			return -EINVAL;

		memcpy(s_scale, p, len - 1);
		ret = kstrtouint(s_scale, 10, &scratch_scale);
		if (!ret)
			pr_notice("scratch scale is %d%%\n", scratch_scale);
		return ret;
	}

	/* parse ll[KMG],mm[KMG],nn[KMG] */
	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
		char *endp = p;

		if (i > 0) {
			if (*p != ',')
				return -EINVAL;
			p += 1;
		}

		sizes[i] = memparse(p, &endp);
		if (!sizes[i] || endp == p)
			return -EINVAL;
		p = endp;
	}

	scratch_size_lowmem = sizes[0];
	scratch_size_global = sizes[1];
	scratch_size_pernode = sizes[2];
	scratch_scale = 0;

	pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lluMiB\n",
		  (u64)(scratch_size_lowmem >> 20),
		  (u64)(scratch_size_global >> 20),
		  (u64)(scratch_size_pernode >> 20));

	return 0;
}
early_param("kho_scratch", kho_parse_scratch_size);

static void __init scratch_size_update(void)
{
	phys_addr_t size;

	if (!scratch_scale)
		return;

	size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100;
	scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);

	size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
					   NUMA_NO_NODE);
	size = size * scratch_scale / 100 - scratch_size_lowmem;
	scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}

static phys_addr_t __init scratch_size_node(int nid)
{
	phys_addr_t size;

	if (scratch_scale) {
		size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
						   nid);
		size = size * scratch_scale / 100;
	} else {
		size = scratch_size_pernode;
	}

	return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
}
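
/*
 * Sizing example (illustrative): with the default scratch_scale of 200,
 * say 256M of memblock-reserved kernel memory below
 * ARCH_LOW_ADDRESS_LIMIT results in a 512M lowmem scratch area (rounded
 * up to CMA_MIN_ALIGNMENT_BYTES).  Booting with
 * "kho_scratch=256M,1G,128M" instead disables scaling and requests 256M
 * of lowmem, 1G of global and 128M of per-node scratch.
 */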

/**
 * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
 *
 * With KHO we can preserve arbitrary pages in the system. To ensure we still
 * have a large contiguous region of memory when we search the physical
 * address space for target memory, let's make sure we always have a large
 * CMA region active. This CMA region will only be used for movable pages,
 * which are not a problem for us during KHO because we can just move them
 * somewhere else.
 */
static void __init kho_reserve_scratch(void)
{
	phys_addr_t addr, size;
	int nid, i = 0;

	if (!kho_enable)
		return;

	scratch_size_update();

	/* FIXME: deal with node hot-plug/remove */
	kho_scratch_cnt = num_online_nodes() + 2;
	size = kho_scratch_cnt * sizeof(*kho_scratch);
	kho_scratch = memblock_alloc(size, PAGE_SIZE);
	if (!kho_scratch)
		goto err_disable_kho;

	/*
	 * reserve scratch area in low memory for lowmem allocations in the
	 * next kernel
	 */
	size = scratch_size_lowmem;
	addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
					 ARCH_LOW_ADDRESS_LIMIT);
	if (!addr)
		goto err_free_scratch_desc;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	/* reserve large contiguous area for allocations without nid */
	size = scratch_size_global;
	addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
	if (!addr)
		goto err_free_scratch_areas;

	kho_scratch[i].addr = addr;
	kho_scratch[i].size = size;
	i++;

	for_each_online_node(nid) {
		size = scratch_size_node(nid);
		addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
						0, MEMBLOCK_ALLOC_ACCESSIBLE,
						nid, true);
		if (!addr)
			goto err_free_scratch_areas;

		kho_scratch[i].addr = addr;
		kho_scratch[i].size = size;
		i++;
	}

	return;

err_free_scratch_areas:
	for (i--; i >= 0; i--)
		memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
err_free_scratch_desc:
	memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
	kho_enable = false;
}

struct fdt_debugfs {
	struct list_head list;
	struct debugfs_blob_wrapper wrapper;
	struct dentry *file;
};

static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
			       const char *name, const void *fdt)
{
	struct fdt_debugfs *f;
	struct dentry *file;

	f = kmalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return -ENOMEM;

	f->wrapper.data = (void *)fdt;
	f->wrapper.size = fdt_totalsize(fdt);

	file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
	if (IS_ERR(file)) {
		kfree(f);
		return PTR_ERR(file);
	}

	f->file = file;
	list_add(&f->list, list);

	return 0;
}
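
/*
 * Typical producer-side usage (illustrative sketch, not part of this file;
 * the "foo" driver, its folio and its sub FDT are hypothetical):
 *
 *	static int foo_kho_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		struct kho_serialization *ser = data;
 *		int err;
 *
 *		if (action != KEXEC_KHO_FINALIZE)
 *			return NOTIFY_DONE;
 *
 *		err = kho_preserve_folio(foo_folio);
 *		if (!err)
 *			err = kho_add_subtree(ser, "foo", foo_fdt);
 *
 *		return err ? notifier_from_errno(err) : NOTIFY_OK;
 *	}
 *
 * The callback is registered with register_kho_notifier() and removed again
 * with unregister_kho_notifier(); see kho_finalize() below for the point at
 * which it is invoked.
 */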

/**
 * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
 * @ser: serialization control object passed by KHO notifiers.
 * @name: name of the sub tree.
 * @fdt: the sub tree blob.
 *
 * Creates a new child node named @name in the KHO root FDT and records
 * the physical address of @fdt. The pages of @fdt must also be preserved
 * by KHO for the new kernel to retrieve it after kexec.
 *
 * A debugfs blob entry is also created at
 * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
 *
 * Return: 0 on success, error code on failure
 */
int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
{
	int err = 0;
	u64 phys = (u64)virt_to_phys(fdt);
	void *root = page_to_virt(ser->fdt);

	err |= fdt_begin_node(root, name);
	err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
	err |= fdt_end_node(root);

	if (err)
		return err;

	return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
}
EXPORT_SYMBOL_GPL(kho_add_subtree);

struct kho_out {
	struct blocking_notifier_head chain_head;

	struct dentry *dir;

	struct mutex lock; /* protects KHO FDT finalization */

	struct kho_serialization ser;
	bool finalized;
};

static struct kho_out kho_out = {
	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
	.lock = __MUTEX_INITIALIZER(kho_out.lock),
	.ser = {
		.fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
		.track = {
			.orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
		},
	},
	.finalized = false,
};

int register_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_kho_notifier);

int unregister_kho_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_kho_notifier);

/**
 * kho_preserve_folio - preserve a folio across kexec.
 * @folio: folio to preserve.
 *
 * Instructs KHO to preserve the whole folio across kexec. The order
 * will be preserved as well.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_folio(struct folio *folio)
{
	const unsigned long pfn = folio_pfn(folio);
	const unsigned int order = folio_order(folio);
	struct kho_mem_track *track = &kho_out.ser.track;

	if (kho_out.finalized)
		return -EBUSY;

	return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
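
/*
 * kho_preserve_phys() below splits a contiguous range into the largest
 * naturally aligned blocks it can, picking
 * min(count_trailing_zeros(pfn), ilog2(remaining pages)) at each step.
 * Illustrative example: a 6-page range starting at pfn 2 is recorded as
 * one order-1 block at pfn 2 followed by one order-2 block at pfn 4.
 */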

/**
 * kho_preserve_phys - preserve a physically contiguous range across kexec.
 * @phys: physical address of the range.
 * @size: size of the range.
 *
 * Instructs KHO to preserve the memory range from @phys to @phys + @size
 * across kexec.
 *
 * Return: 0 on success, error code on failure
 */
int kho_preserve_phys(phys_addr_t phys, size_t size)
{
	unsigned long pfn = PHYS_PFN(phys);
	unsigned long failed_pfn = 0;
	const unsigned long start_pfn = pfn;
	const unsigned long end_pfn = PHYS_PFN(phys + size);
	int err = 0;
	struct kho_mem_track *track = &kho_out.ser.track;

	if (kho_out.finalized)
		return -EBUSY;

	if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
		return -EINVAL;

	while (pfn < end_pfn) {
		const unsigned int order =
			min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));

		err = __kho_preserve_order(track, pfn, order);
		if (err) {
			failed_pfn = pfn;
			break;
		}

		pfn += 1 << order;
	}

	if (err)
		__kho_unpreserve(track, start_pfn, failed_pfn);

	return err;
}
EXPORT_SYMBOL_GPL(kho_preserve_phys);

/* Handling for debugfs/kho/out */

static struct dentry *debugfs_root;

static int kho_out_update_debugfs_fdt(void)
{
	int err = 0;
	struct fdt_debugfs *ff, *tmp;

	if (kho_out.finalized) {
		err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
					  "fdt", page_to_virt(kho_out.ser.fdt));
	} else {
		list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
			debugfs_remove(ff->file);
			list_del(&ff->list);
			kfree(ff);
		}
	}

	return err;
}

static int kho_abort(void)
{
	int err;
	unsigned long order;
	struct kho_mem_phys *physxa;

	xa_for_each(&kho_out.ser.track.orders, order, physxa) {
		struct kho_mem_phys_bits *bits;
		unsigned long phys;

		xa_for_each(&physxa->phys_bits, phys, bits)
			kfree(bits);

		xa_destroy(&physxa->phys_bits);
		kfree(physxa);
	}
	xa_destroy(&kho_out.ser.track.orders);

	if (kho_out.ser.preserved_mem_map) {
		kho_mem_ser_free(kho_out.ser.preserved_mem_map);
		kho_out.ser.preserved_mem_map = NULL;
	}

	err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
					   NULL);
	err = notifier_to_errno(err);

	if (err)
		pr_err("Failed to abort KHO finalization: %d\n", err);

	return err;
}
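
/*
 * Shape of the root KHO FDT that kho_finalize() below produces, in DTS-like
 * notation (illustrative; "example-subsys" stands in for a hypothetical
 * kho_add_subtree() caller):
 *
 *	/ {
 *		compatible = "kho-v1";
 *		preserved-memory-map = <phys of first khoser_mem_chunk>;
 *
 *		example-subsys {
 *			fdt = <phys of that caller's sub FDT>;
 *		};
 *	};
 */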

static int kho_finalize(void)
{
	int err = 0;
	u64 *preserved_mem_map;
	void *fdt = page_to_virt(kho_out.ser.fdt);

	err |= fdt_create(fdt, PAGE_SIZE);
	err |= fdt_finish_reservemap(fdt);
	err |= fdt_begin_node(fdt, "");
	err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
	/*
	 * Reserve the preserved-memory-map property in the root FDT, so
	 * that all property definitions will precede subnodes created by
	 * KHO callers.
	 */
	err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
					sizeof(*preserved_mem_map),
					(void **)&preserved_mem_map);
	if (err)
		goto abort;

	err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
	if (err)
		goto abort;

	err = blocking_notifier_call_chain(&kho_out.chain_head,
					   KEXEC_KHO_FINALIZE, &kho_out.ser);
	err = notifier_to_errno(err);
	if (err)
		goto abort;

	err = kho_mem_serialize(&kho_out.ser);
	if (err)
		goto abort;

	*preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);

	err |= fdt_end_node(fdt);
	err |= fdt_finish(fdt);

abort:
	if (err) {
		pr_err("Failed to convert KHO state tree: %d\n", err);
		kho_abort();
	}

	return err;
}

static int kho_out_finalize_get(void *data, u64 *val)
{
	mutex_lock(&kho_out.lock);
	*val = kho_out.finalized;
	mutex_unlock(&kho_out.lock);

	return 0;
}

static int kho_out_finalize_set(void *data, u64 _val)
{
	int ret = 0;
	bool val = !!_val;

	mutex_lock(&kho_out.lock);

	if (val == kho_out.finalized) {
		if (kho_out.finalized)
			ret = -EEXIST;
		else
			ret = -ENOENT;
		goto unlock;
	}

	if (val)
		ret = kho_finalize();
	else
		ret = kho_abort();

	if (ret)
		goto unlock;

	kho_out.finalized = val;
	ret = kho_out_update_debugfs_fdt();

unlock:
	mutex_unlock(&kho_out.lock);
	return ret;
}

DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
			 kho_out_finalize_set, "%llu\n");

static int scratch_phys_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].addr);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_phys);

static int scratch_len_show(struct seq_file *m, void *v)
{
	for (int i = 0; i < kho_scratch_cnt; i++)
		seq_printf(m, "0x%llx\n", kho_scratch[i].size);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(scratch_len);

static __init int kho_out_debugfs_init(void)
{
	struct dentry *dir, *f, *sub_fdt_dir;

	dir = debugfs_create_dir("out", debugfs_root);
	if (IS_ERR(dir))
		return -ENOMEM;

	sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
	if (IS_ERR(sub_fdt_dir))
		goto err_rmdir;

	f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
				&scratch_phys_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("scratch_len", 0400, dir, NULL,
				&scratch_len_fops);
	if (IS_ERR(f))
		goto err_rmdir;

	f = debugfs_create_file("finalize", 0600, dir, NULL,
				&fops_kho_out_finalize);
	if (IS_ERR(f))
		goto err_rmdir;

	kho_out.dir = dir;
	kho_out.ser.sub_fdt_dir = sub_fdt_dir;
	return 0;

err_rmdir:
	debugfs_remove_recursive(dir);
	return -ENOENT;
}

struct kho_in {
	struct dentry *dir;
	phys_addr_t fdt_phys;
	phys_addr_t scratch_phys;
	struct list_head fdt_list;
};

static struct kho_in kho_in = {
	.fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
};

static const void *kho_get_fdt(void)
{
	return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
}
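
/*
 * Typical consumer-side usage in the new kernel (illustrative sketch;
 * "foo" and payload_phys refer to the hypothetical producer above):
 *
 *	phys_addr_t fdt_phys;
 *	struct folio *folio;
 *
 *	if (!kho_retrieve_subtree("foo", &fdt_phys)) {
 *		const void *fdt = phys_to_virt(fdt_phys);
 *
 *		// parse payload_phys out of the sub FDT, then ...
 *		folio = kho_restore_folio(payload_phys);
 *	}
 */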

/**
 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
 * @name: the name of the sub FDT passed to kho_add_subtree().
 * @phys: if found, the physical address of the sub FDT is stored in @phys.
 *
 * Retrieve a preserved sub FDT named @name and store its physical
 * address in @phys.
 *
 * Return: 0 on success, error code on failure
 */
int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
{
	const void *fdt = kho_get_fdt();
	const u64 *val;
	int offset, len;

	if (!fdt)
		return -ENOENT;

	if (!phys)
		return -EINVAL;

	offset = fdt_subnode_offset(fdt, 0, name);
	if (offset < 0)
		return -ENOENT;

	val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
	if (!val || len != sizeof(*val))
		return -EINVAL;

	*phys = (phys_addr_t)*val;

	return 0;
}
EXPORT_SYMBOL_GPL(kho_retrieve_subtree);

/* Handling for debugfs/kho/in */

static __init int kho_in_debugfs_init(const void *fdt)
{
	struct dentry *sub_fdt_dir;
	int err, child;

	kho_in.dir = debugfs_create_dir("in", debugfs_root);
	if (IS_ERR(kho_in.dir))
		return PTR_ERR(kho_in.dir);

	sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
	if (IS_ERR(sub_fdt_dir)) {
		err = PTR_ERR(sub_fdt_dir);
		goto err_rmdir;
	}

	err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
	if (err)
		goto err_rmdir;

	fdt_for_each_subnode(child, fdt, 0) {
		int len = 0;
		const char *name = fdt_get_name(fdt, child, NULL);
		const u64 *fdt_phys;

		fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
		if (!fdt_phys)
			continue;
		if (len != sizeof(*fdt_phys)) {
			pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
				name, len);
			continue;
		}
		err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
					  phys_to_virt(*fdt_phys));
		if (err) {
			pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
				err);
			continue;
		}
	}

	return 0;

err_rmdir:
	debugfs_remove_recursive(kho_in.dir);
	return err;
}
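
/*
 * Resulting debugfs layout under /sys/kernel/debug/kho/ (the "in" directory
 * only exists when the kernel was booted with KHO handover data, see
 * kho_in_debugfs_init() above):
 *
 *	out/fdt			root KHO FDT, appears once finalized
 *	out/finalize		write 1 to finalize, 0 to abort
 *	out/scratch_phys	physical addresses of the scratch areas
 *	out/scratch_len		sizes of the scratch areas
 *	out/sub_fdts/<name>	sub FDTs registered via kho_add_subtree()
 *	in/fdt			FDT handed over by the previous kernel
 *	in/sub_fdts/<name>	sub FDTs found in the handover FDT
 */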

static __init int kho_init(void)
{
	int err = 0;
	const void *fdt = kho_get_fdt();

	if (!kho_enable)
		return 0;

	kho_out.ser.fdt = alloc_page(GFP_KERNEL);
	if (!kho_out.ser.fdt) {
		err = -ENOMEM;
		goto err_free_scratch;
	}

	debugfs_root = debugfs_create_dir("kho", NULL);
	if (IS_ERR(debugfs_root)) {
		err = -ENOENT;
		goto err_free_fdt;
	}

	err = kho_out_debugfs_init();
	if (err)
		goto err_free_fdt;

	if (fdt) {
		err = kho_in_debugfs_init(fdt);
		/*
		 * Failure to create /sys/kernel/debug/kho/in does not prevent
		 * reviving state from KHO and setting up KHO for the next
		 * kexec.
		 */
		if (err)
			pr_err("failed exposing handover FDT in debugfs: %d\n",
			       err);

		return 0;
	}

	for (int i = 0; i < kho_scratch_cnt; i++) {
		unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
		unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
		unsigned long pfn;

		for (pfn = base_pfn; pfn < base_pfn + count;
		     pfn += pageblock_nr_pages)
			init_cma_reserved_pageblock(pfn_to_page(pfn));
	}

	return 0;

err_free_fdt:
	put_page(kho_out.ser.fdt);
	kho_out.ser.fdt = NULL;
err_free_scratch:
	for (int i = 0; i < kho_scratch_cnt; i++) {
		void *start = __va(kho_scratch[i].addr);
		void *end = start + kho_scratch[i].size;

		free_reserved_area(start, end, -1, "");
	}
	kho_enable = false;
	return err;
}
late_initcall(kho_init);

static void __init kho_release_scratch(void)
{
	phys_addr_t start, end;
	u64 i;

	memmap_init_kho_scratch_pages();

	/*
	 * Mark scratch mem as CMA before we return it. That way we
	 * ensure that no kernel allocations happen on it. That means
	 * we can reuse it as scratch memory again later.
	 */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
		ulong end_pfn = pageblock_align(PFN_UP(end));
		ulong pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
			set_pageblock_migratetype(pfn_to_page(pfn),
						  MIGRATE_CMA);
	}
}

void __init kho_memory_init(void)
{
	struct folio *folio;

	if (kho_in.scratch_phys) {
		kho_scratch = phys_to_virt(kho_in.scratch_phys);
		kho_release_scratch();

		kho_mem_deserialize(kho_get_fdt());
		folio = kho_restore_folio(kho_in.fdt_phys);
		if (!folio)
			pr_warn("failed to restore folio for KHO fdt\n");
	} else {
		kho_reserve_scratch();
	}
}

void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
			 phys_addr_t scratch_phys, u64 scratch_len)
{
	void *fdt = NULL;
	struct kho_scratch *scratch = NULL;
	int err = 0;
	unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);

	/* Validate the input FDT */
	fdt = early_memremap(fdt_phys, fdt_len);
	if (!fdt) {
		pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
		err = -EFAULT;
		goto out;
	}
	err = fdt_check_header(fdt);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
			fdt_phys, err);
		err = -EINVAL;
		goto out;
	}
	err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
	if (err) {
		pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
			fdt_phys, KHO_FDT_COMPATIBLE, err);
		err = -EINVAL;
		goto out;
	}

	scratch = early_memremap(scratch_phys, scratch_len);
	if (!scratch) {
		pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
			scratch_phys, scratch_len);
		err = -EFAULT;
		goto out;
	}

	/*
	 * The previous kernel passed us safe contiguous blocks of memory to
	 * use for early boot purposes, so that we can resize the memblock
	 * array as needed.
	 */
	for (int i = 0; i < scratch_cnt; i++) {
		struct kho_scratch *area = &scratch[i];
		u64 size = area->size;

		memblock_add(area->addr, size);
		err = memblock_mark_kho_scratch(area->addr, size);
		if (WARN_ON(err)) {
			pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
				&area->addr, &size, err);
			goto out;
		}
		pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
	}

	memblock_reserve(scratch_phys, scratch_len);

	/*
	 * Now that we have a viable region of scratch memory, let's tell
	 * the memblock allocator to only use that for any allocations.
	 * That way we ensure that nothing scribbles over in-use data while
	 * we initialize the page tables, which we will need in order to
	 * ingest all memory reservations from the previous kernel.
	 */
	memblock_set_kho_scratch_only();

	kho_in.fdt_phys = fdt_phys;
	kho_in.scratch_phys = scratch_phys;
	kho_scratch_cnt = scratch_cnt;
	pr_info("found kexec handover data. Will skip init for some devices\n");

out:
	if (fdt)
		early_memunmap(fdt, fdt_len);
	if (scratch)
		early_memunmap(scratch, scratch_len);
	if (err)
		pr_warn("disabling KHO revival: %d\n", err);
}

/* Helper functions for kexec_file_load */

int kho_fill_kimage(struct kimage *image)
{
	ssize_t scratch_size;
	int err = 0;
	struct kexec_buf scratch;

	if (!kho_enable)
		return 0;

	image->kho.fdt = page_to_phys(kho_out.ser.fdt);

	scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
	scratch = (struct kexec_buf){
		.image = image,
		.buffer = kho_scratch,
		.bufsz = scratch_size,
		.mem = KEXEC_BUF_MEM_UNKNOWN,
		.memsz = scratch_size,
		.buf_align = SZ_64K, /* Makes it easier to map */
		.buf_max = ULONG_MAX,
		.top_down = true,
	};
	err = kexec_add_buffer(&scratch);
	if (err)
		return err;
	image->kho.scratch = &image->segment[image->nr_segments - 1];

	return 0;
}

static int kho_walk_scratch(struct kexec_buf *kbuf,
			    int (*func)(struct resource *, void *))
{
	int ret = 0;
	int i;

	for (i = 0; i < kho_scratch_cnt; i++) {
		struct resource res = {
			.start = kho_scratch[i].addr,
			.end = kho_scratch[i].addr + kho_scratch[i].size - 1,
		};

		/* Try to fit the kimage into our KHO scratch region */
		ret = func(&res, kbuf);
		if (ret)
			break;
	}

	return ret;
}

int kho_locate_mem_hole(struct kexec_buf *kbuf,
			int (*func)(struct resource *, void *))
{
	int ret;

	if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
		return 1;

	ret = kho_walk_scratch(kbuf, func);

	return ret == 1 ? 0 : -EADDRNOTAVAIL;
}