1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * mm_init.c - Memory initialisation verification and debugging
4 *
5 * Copyright 2008 IBM Corporation, 2008
6 * Author Mel Gorman <mel@csn.ul.ie>
7 *
8 */
9 #include <linux/kernel.h>
10 #include <linux/init.h>
11 #include <linux/kobject.h>
12 #include <linux/export.h>
13 #include <linux/memory.h>
14 #include <linux/notifier.h>
15 #include <linux/sched.h>
16 #include <linux/mman.h>
17 #include <linux/memblock.h>
18 #include <linux/page-isolation.h>
19 #include <linux/padata.h>
20 #include <linux/nmi.h>
21 #include <linux/buffer_head.h>
22 #include <linux/kmemleak.h>
23 #include <linux/kfence.h>
24 #include <linux/page_ext.h>
25 #include <linux/pti.h>
26 #include <linux/pgtable.h>
27 #include <linux/stackdepot.h>
28 #include <linux/swap.h>
29 #include <linux/cma.h>
30 #include <linux/crash_dump.h>
31 #include <linux/execmem.h>
32 #include <linux/vmstat.h>
33 #include <linux/hugetlb.h>
34 #include "internal.h"
35 #include "slab.h"
36 #include "shuffle.h"
37
38 #include <asm/setup.h>
39
40 #ifndef CONFIG_NUMA
41 unsigned long max_mapnr;
42 EXPORT_SYMBOL(max_mapnr);
43
44 struct page *mem_map;
45 EXPORT_SYMBOL(mem_map);
46 #endif
47
48 /*
49 * high_memory defines the upper bound on direct map memory, the end
50 * of ZONE_NORMAL.
51 */
52 void *high_memory;
53 EXPORT_SYMBOL(high_memory);
54
55 #ifdef CONFIG_DEBUG_MEMORY_INIT
56 int __meminitdata mminit_loglevel;
57
58 /* The zonelists are simply reported, validation is manual. */
59 void __init mminit_verify_zonelist(void)
60 {
61 int nid;
62
63 if (mminit_loglevel < MMINIT_VERIFY)
64 return;
65
66 for_each_online_node(nid) {
67 pg_data_t *pgdat = NODE_DATA(nid);
68 struct zone *zone;
69 struct zoneref *z;
70 struct zonelist *zonelist;
71 int i, listid, zoneid;
72
73 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
74
75 /* Identify the zone and nodelist */
76 zoneid = i % MAX_NR_ZONES;
77 listid = i / MAX_NR_ZONES;
78 zonelist = &pgdat->node_zonelists[listid];
79 zone = &pgdat->node_zones[zoneid];
80 if (!populated_zone(zone))
81 continue;
82
83 /* Print information about the zonelist */
84 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
85 listid > 0 ? "thisnode" : "general", nid,
86 zone->name);
87
88 /* Iterate the zonelist */
89 for_each_zone_zonelist(zone, z, zonelist, zoneid)
90 pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
91 pr_cont("\n");
92 }
93 }
94 }
95
96 void __init mminit_verify_pageflags_layout(void)
97 {
98 int shift, width;
99 unsigned long or_mask, add_mask;
100
101 shift = BITS_PER_LONG;
102 width = shift - NR_NON_PAGEFLAG_BITS;
103 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
104 "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
105 SECTIONS_WIDTH,
106 NODES_WIDTH,
107 ZONES_WIDTH,
108 LAST_CPUPID_WIDTH,
109 KASAN_TAG_WIDTH,
110 LRU_GEN_WIDTH,
111 LRU_REFS_WIDTH,
112 NR_PAGEFLAGS);
113 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
114 "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
115 SECTIONS_SHIFT,
116 NODES_SHIFT,
117 ZONES_SHIFT,
118 LAST_CPUPID_SHIFT,
119 KASAN_TAG_WIDTH);
120 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
121 "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
122 (unsigned long)SECTIONS_PGSHIFT,
123 (unsigned long)NODES_PGSHIFT,
124 (unsigned long)ZONES_PGSHIFT,
125 (unsigned long)LAST_CPUPID_PGSHIFT,
126 (unsigned long)KASAN_TAG_PGSHIFT);
127 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
128 "Node/Zone ID: %lu -> %lu\n",
129 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
130 (unsigned long)ZONEID_PGOFF);
131 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
132 "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
133 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
134 #ifdef NODE_NOT_IN_PAGE_FLAGS
135 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
136 "Node not in page flags");
137 #endif
138 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
139 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
140 "Last cpupid not in page flags");
141 #endif
142
143 if (SECTIONS_WIDTH) {
144 shift -= SECTIONS_WIDTH;
145 BUG_ON(shift != SECTIONS_PGSHIFT);
146 }
147 if (NODES_WIDTH) {
148 shift -= NODES_WIDTH;
149 BUG_ON(shift != NODES_PGSHIFT);
150 }
151 if (ZONES_WIDTH) {
152 shift -= ZONES_WIDTH;
153 BUG_ON(shift != ZONES_PGSHIFT);
154 }
155
156 /* Check for bitmask overlaps */
157 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
158 (NODES_MASK << NODES_PGSHIFT) |
159 (SECTIONS_MASK << SECTIONS_PGSHIFT);
160 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
161 (NODES_MASK << NODES_PGSHIFT) +
162 (SECTIONS_MASK << SECTIONS_PGSHIFT);
163 BUG_ON(or_mask != add_mask);
164 }
165
166 static __init int set_mminit_loglevel(char *str)
167 {
168 get_option(&str, &mminit_loglevel);
169 return 0;
170 }
171 early_param("mminit_loglevel", set_mminit_loglevel);
172 #endif /* CONFIG_DEBUG_MEMORY_INIT */
173
174 struct kobject *mm_kobj;
175
176 #ifdef CONFIG_SMP
177 s32 vm_committed_as_batch = 32;
178
179 void mm_compute_batch(int overcommit_policy)
180 {
181 u64 memsized_batch;
182 s32 nr = num_present_cpus();
183 s32 batch = max_t(s32, nr*2, 32);
184 unsigned long ram_pages = totalram_pages();
185
186 /*
187 * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
188 * (total memory/#cpus), and lift it to 25% for other policies
189 * to ease the possible lock contention for the percpu_counter
190 * vm_committed_as, while the max limit is INT_MAX
191 */
192 if (overcommit_policy == OVERCOMMIT_NEVER)
193 memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
194 else
195 memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
196
197 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
198 }
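/*
 * Illustrative sketch (not part of the kernel build, numbers assumed):
 * on a machine with 16 present CPUs and 64 GiB of RAM with 4 KiB pages,
 * totalram_pages() is 16777216, so the computation above gives:
 *
 *	batch             = max(16 * 2, 32)      = 32
 *	OVERCOMMIT_NEVER:   16777216 / 16 / 256  = 4096
 *	other policies:     16777216 / 16 / 4    = 262144
 *
 * i.e. vm_committed_as_batch becomes 4096 or 262144 respectively; the
 * strict policy keeps the percpu_counter closer to the exact value.
 */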
199
200 static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
201 unsigned long action, void *arg)
202 {
203 switch (action) {
204 case MEM_ONLINE:
205 case MEM_OFFLINE:
206 mm_compute_batch(sysctl_overcommit_memory);
207 break;
208 default:
209 break;
210 }
211 return NOTIFY_OK;
212 }
213
214 static int __init mm_compute_batch_init(void)
215 {
216 mm_compute_batch(sysctl_overcommit_memory);
217 hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
218 return 0;
219 }
220
221 __initcall(mm_compute_batch_init);
222
223 #endif
224
225 static int __init mm_sysfs_init(void)
226 {
227 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
228 if (!mm_kobj)
229 return -ENOMEM;
230
231 return 0;
232 }
233 postcore_initcall(mm_sysfs_init);
234
235 static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
236 static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
237 static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
238
239 static unsigned long required_kernelcore __initdata;
240 static unsigned long required_kernelcore_percent __initdata;
241 static unsigned long required_movablecore __initdata;
242 static unsigned long required_movablecore_percent __initdata;
243
244 static unsigned long nr_kernel_pages __initdata;
245 static unsigned long nr_all_pages __initdata;
246
247 static bool deferred_struct_pages __meminitdata;
248
249 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
250
251 static int __init cmdline_parse_core(char *p, unsigned long *core,
252 unsigned long *percent)
253 {
254 unsigned long long coremem;
255 char *endptr;
256
257 if (!p)
258 return -EINVAL;
259
260 /* Value may be a percentage of total memory, otherwise bytes */
261 coremem = simple_strtoull(p, &endptr, 0);
262 if (*endptr == '%') {
263 /* Paranoid check for percent values greater than 100 */
264 WARN_ON(coremem > 100);
265
266 *percent = coremem;
267 } else {
268 coremem = memparse(p, &p);
269 /* Paranoid check that UL is enough for the coremem value */
270 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
271
272 *core = coremem >> PAGE_SHIFT;
273 *percent = 0UL;
274 }
275 return 0;
276 }
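/*
 * Example (informational only, assuming 4 KiB pages): "kernelcore=512M"
 * does not end in '%', so it takes the memparse() path and stores
 * *core = 512M >> PAGE_SHIFT = 131072 pages with *percent = 0, while
 * "kernelcore=30%" stores *percent = 30 and leaves *core untouched; the
 * percentage is converted to pages later in
 * find_zone_movable_pfns_for_nodes().
 */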
277
278 bool mirrored_kernelcore __initdata_memblock;
279
280 /*
281 * kernelcore=size sets the amount of memory for use for allocations that
282 * cannot be reclaimed or migrated.
283 */
284 static int __init cmdline_parse_kernelcore(char *p)
285 {
286 /* parse kernelcore=mirror */
287 if (parse_option_str(p, "mirror")) {
288 mirrored_kernelcore = true;
289 return 0;
290 }
291
292 return cmdline_parse_core(p, &required_kernelcore,
293 &required_kernelcore_percent);
294 }
295 early_param("kernelcore", cmdline_parse_kernelcore);
296
297 /*
298 * movablecore=size sets the amount of memory for use for allocations that
299 * can be reclaimed or migrated.
300 */
301 static int __init cmdline_parse_movablecore(char *p)
302 {
303 return cmdline_parse_core(p, &required_movablecore,
304 &required_movablecore_percent);
305 }
306 early_param("movablecore", cmdline_parse_movablecore);
307
308 /*
309 * early_calculate_totalpages()
310 * Sum pages in active regions for movable zone.
311 * Populate N_MEMORY for calculating usable_nodes.
312 */
313 static unsigned long __init early_calculate_totalpages(void)
314 {
315 unsigned long totalpages = 0;
316 unsigned long start_pfn, end_pfn;
317 int i, nid;
318
319 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
320 unsigned long pages = end_pfn - start_pfn;
321
322 totalpages += pages;
323 if (pages)
324 node_set_state(nid, N_MEMORY);
325 }
326 return totalpages;
327 }
328
329 /*
330 * This finds a zone that can be used for ZONE_MOVABLE pages. The
331 * assumption is made that zones within a node are ordered by monotonically
332 * increasing memory addresses so that the "highest" populated zone is used
333 */
334 static void __init find_usable_zone_for_movable(void)
335 {
336 int zone_index;
337 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
338 if (zone_index == ZONE_MOVABLE)
339 continue;
340
341 if (arch_zone_highest_possible_pfn[zone_index] >
342 arch_zone_lowest_possible_pfn[zone_index])
343 break;
344 }
345
346 VM_BUG_ON(zone_index == -1);
347 movable_zone = zone_index;
348 }
349
350 /*
351 * Find the PFN the Movable zone begins in each node. Kernel memory
352 * is spread evenly between nodes as long as the nodes have enough
353 * memory. When they don't, some nodes will have more kernelcore than
354 * others
355 */
356 static void __init find_zone_movable_pfns_for_nodes(void)
357 {
358 int i, nid;
359 unsigned long usable_startpfn;
360 unsigned long kernelcore_node, kernelcore_remaining;
361 /* save the state before borrowing the nodemask */
362 nodemask_t saved_node_state = node_states[N_MEMORY];
363 unsigned long totalpages = early_calculate_totalpages();
364 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
365 struct memblock_region *r;
366
367 /* Need to find movable_zone earlier when movable_node is specified. */
368 find_usable_zone_for_movable();
369
370 /*
371 * If movable_node is specified, ignore kernelcore and movablecore
372 * options.
373 */
374 if (movable_node_is_enabled()) {
375 for_each_mem_region(r) {
376 if (!memblock_is_hotpluggable(r))
377 continue;
378
379 nid = memblock_get_region_node(r);
380
381 usable_startpfn = memblock_region_memory_base_pfn(r);
382 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
383 min(usable_startpfn, zone_movable_pfn[nid]) :
384 usable_startpfn;
385 }
386
387 goto out2;
388 }
389
390 /*
391 * If kernelcore=mirror is specified, ignore movablecore option
392 */
393 if (mirrored_kernelcore) {
394 bool mem_below_4gb_not_mirrored = false;
395
396 if (!memblock_has_mirror()) {
397 pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
398 goto out;
399 }
400
401 if (is_kdump_kernel()) {
402 pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");
403 goto out;
404 }
405
406 for_each_mem_region(r) {
407 if (memblock_is_mirror(r))
408 continue;
409
410 nid = memblock_get_region_node(r);
411
412 usable_startpfn = memblock_region_memory_base_pfn(r);
413
414 if (usable_startpfn < PHYS_PFN(SZ_4G)) {
415 mem_below_4gb_not_mirrored = true;
416 continue;
417 }
418
419 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
420 min(usable_startpfn, zone_movable_pfn[nid]) :
421 usable_startpfn;
422 }
423
424 if (mem_below_4gb_not_mirrored)
425 pr_warn("This configuration results in unmirrored kernel memory.\n");
426
427 goto out2;
428 }
429
430 /*
431 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
432 * amount of necessary memory.
433 */
434 if (required_kernelcore_percent)
435 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
436 10000UL;
437 if (required_movablecore_percent)
438 required_movablecore = (totalpages * 100 * required_movablecore_percent) /
439 10000UL;
440
441 /*
442 * If movablecore= was specified, calculate what size of
443 * kernelcore that corresponds so that memory usable for
444 * any allocation type is evenly spread. If both kernelcore
445 * and movablecore are specified, then the value of kernelcore
446 * will be used for required_kernelcore if it's greater than
447 * what movablecore would have allowed.
448 */
449 if (required_movablecore) {
450 unsigned long corepages;
451
452 /*
453 * Round-up so that ZONE_MOVABLE is at least as large as what
454 * was requested by the user
455 */
456 required_movablecore =
457 round_up(required_movablecore, MAX_ORDER_NR_PAGES);
458 required_movablecore = min(totalpages, required_movablecore);
459 corepages = totalpages - required_movablecore;
460
461 required_kernelcore = max(required_kernelcore, corepages);
462 }
463
464 /*
465 * If kernelcore was not specified or kernelcore size is larger
466 * than totalpages, there is no ZONE_MOVABLE.
467 */
468 if (!required_kernelcore || required_kernelcore >= totalpages)
469 goto out;
470
471 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
472 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
473
474 restart:
475 /* Spread kernelcore memory as evenly as possible throughout nodes */
476 kernelcore_node = required_kernelcore / usable_nodes;
477 for_each_node_state(nid, N_MEMORY) {
478 unsigned long start_pfn, end_pfn;
479
480 /*
481 * Recalculate kernelcore_node if the division per node
482 * now exceeds what is necessary to satisfy the requested
483 * amount of memory for the kernel
484 */
485 if (required_kernelcore < kernelcore_node)
486 kernelcore_node = required_kernelcore / usable_nodes;
487
488 /*
489 * As the map is walked, we track how much memory is usable
490 * by the kernel using kernelcore_remaining. When it is
491 * 0, the rest of the node is usable by ZONE_MOVABLE
492 */
493 kernelcore_remaining = kernelcore_node;
494
495 /* Go through each range of PFNs within this node */
496 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
497 unsigned long size_pages;
498
499 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
500 if (start_pfn >= end_pfn)
501 continue;
502
503 /* Account for what is only usable for kernelcore */
504 if (start_pfn < usable_startpfn) {
505 unsigned long kernel_pages;
506 kernel_pages = min(end_pfn, usable_startpfn)
507 - start_pfn;
508
509 kernelcore_remaining -= min(kernel_pages,
510 kernelcore_remaining);
511 required_kernelcore -= min(kernel_pages,
512 required_kernelcore);
513
514 /* Continue if range is now fully accounted */
515 if (end_pfn <= usable_startpfn) {
516
517 /*
518 * Push zone_movable_pfn to the end so
519 * that if we have to rebalance
520 * kernelcore across nodes, we will
521 * not double account here
522 */
523 zone_movable_pfn[nid] = end_pfn;
524 continue;
525 }
526 start_pfn = usable_startpfn;
527 }
528
529 /*
530 * The usable PFN range for ZONE_MOVABLE is from
531 * start_pfn->end_pfn. Calculate size_pages as the
532 * number of pages used as kernelcore
533 */
534 size_pages = end_pfn - start_pfn;
535 if (size_pages > kernelcore_remaining)
536 size_pages = kernelcore_remaining;
537 zone_movable_pfn[nid] = start_pfn + size_pages;
538
539 /*
540 * Some kernelcore has been met, update counts and
541 * break if the kernelcore for this node has been
542 * satisfied
543 */
544 required_kernelcore -= min(required_kernelcore,
545 size_pages);
546 kernelcore_remaining -= size_pages;
547 if (!kernelcore_remaining)
548 break;
549 }
550 }
551
552 /*
553 * If there is still required_kernelcore, we do another pass with one
554 * less node in the count. This will push zone_movable_pfn[nid] further
555 * along on the nodes that still have memory until kernelcore is
556 * satisfied
557 */
558 usable_nodes--;
559 if (usable_nodes && required_kernelcore > usable_nodes)
560 goto restart;
561
562 out2:
563 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
564 for_each_node_state(nid, N_MEMORY) {
565 unsigned long start_pfn, end_pfn;
566
567 zone_movable_pfn[nid] =
568 round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
569
570 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
571 if (zone_movable_pfn[nid] >= end_pfn)
572 zone_movable_pfn[nid] = 0;
573 }
574
575 out:
576 /* restore the node_state */
577 node_states[N_MEMORY] = saved_node_state;
578 }
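/*
 * Worked example (a sketch with hypothetical numbers): two nodes with
 * 4 GiB of memory each and "kernelcore=2G" on the command line. With
 * usable_nodes == 2, kernelcore_node starts at 1 GiB worth of pages per
 * node, so roughly the first 1 GiB of each node stays in the kernel
 * zones and zone_movable_pfn[nid] ends up about 1 GiB into each node
 * (rounded up to MAX_ORDER_NR_PAGES); the remaining ~3 GiB per node
 * becomes ZONE_MOVABLE.
 */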
579
580 void __meminit __init_single_page(struct page *page, unsigned long pfn,
581 unsigned long zone, int nid)
582 {
583 mm_zero_struct_page(page);
584 set_page_links(page, zone, nid, pfn);
585 init_page_count(page);
586 atomic_set(&page->_mapcount, -1);
587 page_cpupid_reset_last(page);
588 page_kasan_tag_reset(page);
589
590 INIT_LIST_HEAD(&page->lru);
591 #ifdef WANT_PAGE_VIRTUAL
592 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
593 if (!is_highmem_idx(zone))
594 set_page_address(page, __va(pfn << PAGE_SHIFT));
595 #endif
596 }
597
598 #ifdef CONFIG_NUMA
599 /*
600 * During memory init memblocks map pfns to nids. The search is expensive and
601 * this caches recent lookups. The implementation of __early_pfn_to_nid
602 * treats start/end as pfns.
603 */
604 struct mminit_pfnnid_cache {
605 unsigned long last_start;
606 unsigned long last_end;
607 int last_nid;
608 };
609
610 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
611
612 /*
613 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
614 */
615 static int __meminit __early_pfn_to_nid(unsigned long pfn,
616 struct mminit_pfnnid_cache *state)
617 {
618 unsigned long start_pfn, end_pfn;
619 int nid;
620
621 if (state->last_start <= pfn && pfn < state->last_end)
622 return state->last_nid;
623
624 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
625 if (nid != NUMA_NO_NODE) {
626 state->last_start = start_pfn;
627 state->last_end = end_pfn;
628 state->last_nid = nid;
629 }
630
631 return nid;
632 }
633
634 int __meminit early_pfn_to_nid(unsigned long pfn)
635 {
636 static DEFINE_SPINLOCK(early_pfn_lock);
637 int nid;
638
639 spin_lock(&early_pfn_lock);
640 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
641 if (nid < 0)
642 nid = first_online_node;
643 spin_unlock(&early_pfn_lock);
644
645 return nid;
646 }
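/*
 * Usage sketch (an illustrative caller, not taken from this file): early
 * boot code that walks PFNs before the memmap is fully set up can do
 *
 *	int nid = early_pfn_to_nid(pfn);
 *	pg_data_t *pgdat = NODE_DATA(nid);
 *
 * Repeated lookups within the same memblock region hit the
 * early_pfnnid_cache fast path and skip the memblock search.
 */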
647
648 int hashdist = HASHDIST_DEFAULT;
649
650 static int __init set_hashdist(char *str)
651 {
652 if (!str)
653 return 0;
654 hashdist = simple_strtoul(str, &str, 0);
655 return 1;
656 }
657 __setup("hashdist=", set_hashdist);
658
659 static inline void fixup_hashdist(void)
660 {
661 if (num_node_state(N_MEMORY) == 1)
662 hashdist = 0;
663 }
664 #else
665 static inline void fixup_hashdist(void) {}
666 #endif /* CONFIG_NUMA */
667
668 /*
669 * Initialize a reserved page unconditionally, finding its zone first.
670 */
671 void __meminit __init_page_from_nid(unsigned long pfn, int nid)
672 {
673 pg_data_t *pgdat;
674 int zid;
675
676 pgdat = NODE_DATA(nid);
677
678 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
679 struct zone *zone = &pgdat->node_zones[zid];
680
681 if (zone_spans_pfn(zone, pfn))
682 break;
683 }
684 __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
685
686 if (pageblock_aligned(pfn))
687 set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
688 }
689
690 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
691 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
692 {
693 pgdat->first_deferred_pfn = ULONG_MAX;
694 }
695
696 /* Returns true if the struct page for the pfn is initialised */
697 static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
698 {
699 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
700 return false;
701
702 return true;
703 }
704
705 /*
706 * Returns true when the remaining initialisation should be deferred until
707 * later in the boot cycle when it can be parallelised.
708 */
709 static bool __meminit
710 defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
711 {
712 static unsigned long prev_end_pfn, nr_initialised;
713
714 if (early_page_ext_enabled())
715 return false;
716
717 /* Always populate low zones for address-constrained allocations */
718 if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
719 return false;
720
721 if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
722 return true;
723
724 /*
725 * prev_end_pfn is a static that contains the end of the previous zone.
726 * No need to protect it because this is called very early in boot, before smp_init.
727 */
728 if (prev_end_pfn != end_pfn) {
729 prev_end_pfn = end_pfn;
730 nr_initialised = 0;
731 }
732
733 /*
734 * We start only with one section of pages, more pages are added as
735 * needed until the rest of deferred pages are initialized.
736 */
737 nr_initialised++;
738 if ((nr_initialised > PAGES_PER_SECTION) &&
739 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
740 NODE_DATA(nid)->first_deferred_pfn = pfn;
741 return true;
742 }
743 return false;
744 }
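/*
 * Example (illustrative, assuming x86-64 with 128 MiB sections, i.e.
 * PAGES_PER_SECTION == 32768): while walking a high zone, defer_init()
 * returns false for at least the first full section; once more than
 * 32768 pages have been initialised and the walk reaches the next
 * section-aligned pfn, first_deferred_pfn is recorded and the rest of
 * the node is left to the deferred init threads.
 */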
745
746 static void __meminit init_deferred_page(unsigned long pfn, int nid)
747 {
748 if (early_page_initialised(pfn, nid))
749 return;
750
751 __init_page_from_nid(pfn, nid);
752 }
753 #else
754 static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
755
756 static inline bool early_page_initialised(unsigned long pfn, int nid)
757 {
758 return true;
759 }
760
761 static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
762 {
763 return false;
764 }
765
766 static inline void init_deferred_page(unsigned long pfn, int nid)
767 {
768 }
769 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
770
771 /*
772 * Initialised pages do not have PageReserved set. This function is
773 * called for each range allocated by the bootmem allocator and
774 * marks the pages PageReserved. The remaining valid pages are later
775 * sent to the buddy page allocator.
776 */
777 void __meminit reserve_bootmem_region(phys_addr_t start,
778 phys_addr_t end, int nid)
779 {
780 unsigned long start_pfn = PFN_DOWN(start);
781 unsigned long end_pfn = PFN_UP(end);
782
783 for (; start_pfn < end_pfn; start_pfn++) {
784 if (pfn_valid(start_pfn)) {
785 struct page *page = pfn_to_page(start_pfn);
786
787 init_deferred_page(start_pfn, nid);
788
789 /*
790 * no need for atomic set_bit because the struct
791 * page is not visible yet so nobody should
792 * access it yet.
793 */
794 __SetPageReserved(page);
795 }
796 }
797 }
798
799 /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
800 static bool __meminit
801 overlap_memmap_init(unsigned long zone, unsigned long *pfn)
802 {
803 static struct memblock_region *r;
804
805 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
806 if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
807 for_each_mem_region(r) {
808 if (*pfn < memblock_region_memory_end_pfn(r))
809 break;
810 }
811 }
812 if (*pfn >= memblock_region_memory_base_pfn(r) &&
813 memblock_is_mirror(r)) {
814 *pfn = memblock_region_memory_end_pfn(r);
815 return true;
816 }
817 }
818 return false;
819 }
820
821 /*
822 * Only struct pages that correspond to ranges defined by memblock.memory
823 * are zeroed and initialized by going through __init_single_page() during
824 * memmap_init_zone_range().
825 *
826 * But, there could be struct pages that correspond to holes in
827 * memblock.memory. This can happen because of the following reasons:
828 * - physical memory bank size is not necessarily the exact multiple of the
829 * arbitrary section size
830 * - early reserved memory may not be listed in memblock.memory
831 * - non-memory regions covered by the contiguous flatmem mapping
832 * - memory layouts defined with memmap= kernel parameter may not align
833 * nicely with memmap sections
834 *
835 * Explicitly initialize those struct pages so that:
836 * - PG_reserved is set
837 * - zone and node links point to zone and node that span the page if the
838 * hole is in the middle of a zone
839 * - zone and node links point to adjacent zone/node if the hole falls on
840 * the zone boundary; the pages in such holes will be prepended to the
841 * zone/node above the hole except for the trailing pages in the last
842 * section that will be appended to the zone/node below.
843 */
844 static void __init init_unavailable_range(unsigned long spfn,
845 unsigned long epfn,
846 int zone, int node)
847 {
848 unsigned long pfn;
849 u64 pgcnt = 0;
850
851 for (pfn = spfn; pfn < epfn; pfn++) {
852 if (!pfn_valid(pageblock_start_pfn(pfn))) {
853 pfn = pageblock_end_pfn(pfn) - 1;
854 continue;
855 }
856 __init_single_page(pfn_to_page(pfn), pfn, zone, node);
857 __SetPageReserved(pfn_to_page(pfn));
858 pgcnt++;
859 }
860
861 if (pgcnt)
862 pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n",
863 node, zone_names[zone], pgcnt);
864 }
865
866 /*
867 * Initially all pages are reserved - free ones are freed
868 * up by memblock_free_all() once the early boot process is
869 * done. Non-atomic initialization, single-pass.
870 *
871 * All aligned pageblocks are initialized to the specified migratetype
872 * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
873 * zone stats (e.g., nr_isolate_pageblock) are touched.
874 */
875 void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
876 unsigned long start_pfn, unsigned long zone_end_pfn,
877 enum meminit_context context,
878 struct vmem_altmap *altmap, int migratetype)
879 {
880 unsigned long pfn, end_pfn = start_pfn + size;
881 struct page *page;
882
883 if (highest_memmap_pfn < end_pfn - 1)
884 highest_memmap_pfn = end_pfn - 1;
885
886 #ifdef CONFIG_ZONE_DEVICE
887 /*
888 * Honor reservation requested by the driver for this ZONE_DEVICE
889 * memory. We limit the total number of pages to initialize to just
890 * those that might contain the memory mapping. We will defer the
891 * ZONE_DEVICE page initialization until after we have released
892 * the hotplug lock.
893 */
894 if (zone == ZONE_DEVICE) {
895 if (!altmap)
896 return;
897
898 if (start_pfn == altmap->base_pfn)
899 start_pfn += altmap->reserve;
900 end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
901 }
902 #endif
903
904 for (pfn = start_pfn; pfn < end_pfn; ) {
905 /*
906 * There can be holes in boot-time mem_map[]s handed to this
907 * function. They do not exist on hotplugged memory.
908 */
909 if (context == MEMINIT_EARLY) {
910 if (overlap_memmap_init(zone, &pfn))
911 continue;
912 if (defer_init(nid, pfn, zone_end_pfn)) {
913 deferred_struct_pages = true;
914 break;
915 }
916 }
917
918 page = pfn_to_page(pfn);
919 __init_single_page(page, pfn, zone, nid);
920 if (context == MEMINIT_HOTPLUG) {
921 #ifdef CONFIG_ZONE_DEVICE
922 if (zone == ZONE_DEVICE)
923 __SetPageReserved(page);
924 else
925 #endif
926 __SetPageOffline(page);
927 }
928
929 /*
930 * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
931 * such that unmovable allocations won't be scattered all
932 * over the place during system boot.
933 */
934 if (pageblock_aligned(pfn)) {
935 set_pageblock_migratetype(page, migratetype);
936 cond_resched();
937 }
938 pfn++;
939 }
940 }
941
942 static void __init memmap_init_zone_range(struct zone *zone,
943 unsigned long start_pfn,
944 unsigned long end_pfn,
945 unsigned long *hole_pfn)
946 {
947 unsigned long zone_start_pfn = zone->zone_start_pfn;
948 unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
949 int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
950
951 start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
952 end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
953
954 if (start_pfn >= end_pfn)
955 return;
956
957 memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
958 zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
959
960 if (*hole_pfn < start_pfn)
961 init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
962
963 *hole_pfn = end_pfn;
964 }
965
966 static void __init memmap_init(void)
967 {
968 unsigned long start_pfn, end_pfn;
969 unsigned long hole_pfn = 0;
970 int i, j, zone_id = 0, nid;
971
972 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
973 struct pglist_data *node = NODE_DATA(nid);
974
975 for (j = 0; j < MAX_NR_ZONES; j++) {
976 struct zone *zone = node->node_zones + j;
977
978 if (!populated_zone(zone))
979 continue;
980
981 memmap_init_zone_range(zone, start_pfn, end_pfn,
982 &hole_pfn);
983 zone_id = j;
984 }
985 }
986
987 /*
988 * Initialize the memory map for hole in the range [memory_end,
989 * section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
990 * for FLATMEM.
991 * Append the pages in this hole to the highest zone in the last
992 * node.
993 */
994 #ifdef CONFIG_SPARSEMEM
995 end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
996 #else
997 end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
998 #endif
999 if (hole_pfn < end_pfn)
1000 init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
1001 }
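/*
 * For orientation, the early boot call chain implemented in this file
 * is roughly:
 *
 *	free_area_init()
 *	  -> free_area_init_node()		(for each node)
 *	       -> calculate_node_totalpages()
 *	       -> free_area_init_core()
 *	  -> memmap_init()
 *	       -> memmap_init_zone_range()
 *	            -> memmap_init_range()	(MEMINIT_EARLY)
 *
 * This is a simplified sketch; free_area_init() itself is defined later
 * in this file.
 */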
1002
1003 #ifdef CONFIG_ZONE_DEVICE
1004 static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
1005 unsigned long zone_idx, int nid,
1006 struct dev_pagemap *pgmap)
1007 {
1008
1009 __init_single_page(page, pfn, zone_idx, nid);
1010
1011 /*
1012 * Mark page reserved as it will need to wait for onlining
1013 * phase for it to be fully associated with a zone.
1014 *
1015 * We can use the non-atomic __set_bit operation for setting
1016 * the flag as we are still initializing the pages.
1017 */
1018 __SetPageReserved(page);
1019
1020 /*
1021 * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
1022 * and zone_device_data. It is a bug if a ZONE_DEVICE page is
1023 * ever freed or placed on a driver-private list.
1024 */
1025 page_folio(page)->pgmap = pgmap;
1026 page->zone_device_data = NULL;
1027
1028 /*
1029 * Mark the block movable so that blocks are reserved for
1030 * movable at startup. This will force kernel allocations
1031 * to reserve their blocks rather than leaking throughout
1032 * the address space during boot when many long-lived
1033 * kernel allocations are made.
1034 *
1035 * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
1036 * because this is done early in section_activate()
1037 */
1038 if (pageblock_aligned(pfn)) {
1039 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1040 cond_resched();
1041 }
1042
1043 /*
1044 * ZONE_DEVICE pages other than MEMORY_DEVICE_GENERIC are released
1045 * directly to the driver page allocator which will set the page count
1046 * to 1 when allocating the page.
1047 *
1048 * MEMORY_DEVICE_GENERIC and MEMORY_DEVICE_FS_DAX pages automatically have
1049 * their refcount reset to one whenever they are freed (ie. after
1050 * their refcount drops to 0).
1051 */
1052 switch (pgmap->type) {
1053 case MEMORY_DEVICE_FS_DAX:
1054 case MEMORY_DEVICE_PRIVATE:
1055 case MEMORY_DEVICE_COHERENT:
1056 case MEMORY_DEVICE_PCI_P2PDMA:
1057 set_page_count(page, 0);
1058 break;
1059
1060 case MEMORY_DEVICE_GENERIC:
1061 break;
1062 }
1063 }
1064
1065 /*
1066 * With compound page geometry and when struct pages are stored in RAM, most
1067 * tail pages are reused. Consequently, the number of unique struct pages to
1068 * initialize is a lot smaller than the total number of struct pages being
1069 * mapped. This is a paired / mild layering violation with explicit knowledge
1070 * of how the sparse_vmemmap internals handle compound pages in the absence
1071 * of an altmap. See vmemmap_populate_compound_pages().
1072 */
1073 static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
1074 struct dev_pagemap *pgmap)
1075 {
1076 if (!vmemmap_can_optimize(altmap, pgmap))
1077 return pgmap_vmemmap_nr(pgmap);
1078
1079 return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
1080 }
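/*
 * Example (a sketch, assuming 4 KiB pages and a 64-byte struct page):
 * for a pgmap with vmemmap_shift == 9 (2 MiB compound pages),
 * pgmap_vmemmap_nr() is 512, so without the vmemmap optimisation 512
 * struct pages are initialised per compound page. With the optimisation
 * only VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)) unique
 * struct pages exist per compound page (128 with a VMEMMAP_RESERVE_NR
 * of 2); the remaining vmemmap entries reuse those backing pages.
 */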
1081
1082 static void __ref memmap_init_compound(struct page *head,
1083 unsigned long head_pfn,
1084 unsigned long zone_idx, int nid,
1085 struct dev_pagemap *pgmap,
1086 unsigned long nr_pages)
1087 {
1088 unsigned long pfn, end_pfn = head_pfn + nr_pages;
1089 unsigned int order = pgmap->vmemmap_shift;
1090
1091 __SetPageHead(head);
1092 for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
1093 struct page *page = pfn_to_page(pfn);
1094
1095 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
1096 prep_compound_tail(head, pfn - head_pfn);
1097 set_page_count(page, 0);
1098
1099 /*
1100 * The first tail page stores important compound page info.
1101 * Call prep_compound_head() after the first tail page has
1102 * been initialized, to not have the data overwritten.
1103 */
1104 if (pfn == head_pfn + 1)
1105 prep_compound_head(head, order);
1106 }
1107 }
1108
1109 void __ref memmap_init_zone_device(struct zone *zone,
1110 unsigned long start_pfn,
1111 unsigned long nr_pages,
1112 struct dev_pagemap *pgmap)
1113 {
1114 unsigned long pfn, end_pfn = start_pfn + nr_pages;
1115 struct pglist_data *pgdat = zone->zone_pgdat;
1116 struct vmem_altmap *altmap = pgmap_altmap(pgmap);
1117 unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
1118 unsigned long zone_idx = zone_idx(zone);
1119 unsigned long start = jiffies;
1120 int nid = pgdat->node_id;
1121
1122 if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
1123 return;
1124
1125 /*
1126 * The call to memmap_init should have already taken care
1127 * of the pages reserved for the memmap, so we can just jump to
1128 * the end of that region and start processing the device pages.
1129 */
1130 if (altmap) {
1131 start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
1132 nr_pages = end_pfn - start_pfn;
1133 }
1134
1135 for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
1136 struct page *page = pfn_to_page(pfn);
1137
1138 __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
1139
1140 if (pfns_per_compound == 1)
1141 continue;
1142
1143 memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
1144 compound_nr_pages(altmap, pgmap));
1145 }
1146
1147 pr_debug("%s initialised %lu pages in %ums\n", __func__,
1148 nr_pages, jiffies_to_msecs(jiffies - start));
1149 }
1150 #endif
1151
1152 /*
1153 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
1154 * because it is sized independently of the architecture. Unlike the other zones,
1155 * the starting point for ZONE_MOVABLE is not fixed. It may be different
1156 * in each node depending on the size of each node and how evenly kernelcore
1157 * is distributed. This helper function adjusts the zone ranges
1158 * provided by the architecture for a given node by using the end of the
1159 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
1160 * zones within a node are in order of monotonically increasing memory addresses
1161 */
1162 static void __init adjust_zone_range_for_zone_movable(int nid,
1163 unsigned long zone_type,
1164 unsigned long node_end_pfn,
1165 unsigned long *zone_start_pfn,
1166 unsigned long *zone_end_pfn)
1167 {
1168 /* Only adjust if ZONE_MOVABLE is on this node */
1169 if (zone_movable_pfn[nid]) {
1170 /* Size ZONE_MOVABLE */
1171 if (zone_type == ZONE_MOVABLE) {
1172 *zone_start_pfn = zone_movable_pfn[nid];
1173 *zone_end_pfn = min(node_end_pfn,
1174 arch_zone_highest_possible_pfn[movable_zone]);
1175
1176 /* Adjust for ZONE_MOVABLE starting within this range */
1177 } else if (!mirrored_kernelcore &&
1178 *zone_start_pfn < zone_movable_pfn[nid] &&
1179 *zone_end_pfn > zone_movable_pfn[nid]) {
1180 *zone_end_pfn = zone_movable_pfn[nid];
1181
1182 /* Check if this whole range is within ZONE_MOVABLE */
1183 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
1184 *zone_start_pfn = *zone_end_pfn;
1185 }
1186 }
1187
1188 /*
1189 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
1190 * then all holes in the requested range will be accounted for.
1191 */
1192 static unsigned long __init __absent_pages_in_range(int nid,
1193 unsigned long range_start_pfn,
1194 unsigned long range_end_pfn)
1195 {
1196 unsigned long nr_absent = range_end_pfn - range_start_pfn;
1197 unsigned long start_pfn, end_pfn;
1198 int i;
1199
1200 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
1201 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
1202 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
1203 nr_absent -= end_pfn - start_pfn;
1204 }
1205 return nr_absent;
1206 }
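/*
 * Example (informational): for a request over pfns [0, 1000) on a node
 * whose memblock ranges are [0, 300) and [600, 1000), the loop subtracts
 * 300 and 400 present pages from the initial 1000, so 300 absent pages
 * are reported, i.e. the hole [300, 600).
 */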
1207
1208 /**
1209 * absent_pages_in_range - Return number of page frames in holes within a range
1210 * @start_pfn: The start PFN to start searching for holes
1211 * @end_pfn: The end PFN to stop searching for holes
1212 *
1213 * Return: the number of page frames in memory holes within a range.
1214 */
1215 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
1216 unsigned long end_pfn)
1217 {
1218 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
1219 }
1220
1221 /* Return the number of page frames in holes in a zone on a node */
1222 static unsigned long __init zone_absent_pages_in_node(int nid,
1223 unsigned long zone_type,
1224 unsigned long zone_start_pfn,
1225 unsigned long zone_end_pfn)
1226 {
1227 unsigned long nr_absent;
1228
1229 /* zone is empty, we don't have any absent pages */
1230 if (zone_start_pfn == zone_end_pfn)
1231 return 0;
1232
1233 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
1234
1235 /*
1236 * ZONE_MOVABLE handling.
1237 * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
1238 * and vice versa.
1239 */
1240 if (mirrored_kernelcore && zone_movable_pfn[nid]) {
1241 unsigned long start_pfn, end_pfn;
1242 struct memblock_region *r;
1243
1244 for_each_mem_region(r) {
1245 start_pfn = clamp(memblock_region_memory_base_pfn(r),
1246 zone_start_pfn, zone_end_pfn);
1247 end_pfn = clamp(memblock_region_memory_end_pfn(r),
1248 zone_start_pfn, zone_end_pfn);
1249
1250 if (zone_type == ZONE_MOVABLE &&
1251 memblock_is_mirror(r))
1252 nr_absent += end_pfn - start_pfn;
1253
1254 if (zone_type == ZONE_NORMAL &&
1255 !memblock_is_mirror(r))
1256 nr_absent += end_pfn - start_pfn;
1257 }
1258 }
1259
1260 return nr_absent;
1261 }
1262
1263 /*
1264 * Return the number of pages a zone spans in a node, including holes
1265 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
1266 */
1267 static unsigned long __init zone_spanned_pages_in_node(int nid,
1268 unsigned long zone_type,
1269 unsigned long node_start_pfn,
1270 unsigned long node_end_pfn,
1271 unsigned long *zone_start_pfn,
1272 unsigned long *zone_end_pfn)
1273 {
1274 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
1275 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
1276
1277 /* Get the start and end of the zone */
1278 *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
1279 *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
1280 adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
1281 zone_start_pfn, zone_end_pfn);
1282
1283 /* Check that this node has pages within the zone's required range */
1284 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
1285 return 0;
1286
1287 /* Move the zone boundaries inside the node if necessary */
1288 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
1289 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
1290
1291 /* Return the spanned pages */
1292 return *zone_end_pfn - *zone_start_pfn;
1293 }
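/*
 * Example (a sketch with made-up pfns): if ZONE_NORMAL may span
 * [1048576, 4194304) and this node covers [2097152, 8388608), the
 * clamps above give *zone_start_pfn == 2097152 and *zone_end_pfn ==
 * 4194304, so the node contributes 2097152 spanned pages to ZONE_NORMAL
 * (before any ZONE_MOVABLE adjustment).
 */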
1294
1295 static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
1296 {
1297 struct zone *z;
1298
1299 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
1300 z->zone_start_pfn = 0;
1301 z->spanned_pages = 0;
1302 z->present_pages = 0;
1303 #if defined(CONFIG_MEMORY_HOTPLUG)
1304 z->present_early_pages = 0;
1305 #endif
1306 }
1307
1308 pgdat->node_spanned_pages = 0;
1309 pgdat->node_present_pages = 0;
1310 pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
1311 }
1312
1313 static void __init calc_nr_kernel_pages(void)
1314 {
1315 unsigned long start_pfn, end_pfn;
1316 phys_addr_t start_addr, end_addr;
1317 u64 u;
1318 #ifdef CONFIG_HIGHMEM
1319 unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
1320 #endif
1321
1322 for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
1323 start_pfn = PFN_UP(start_addr);
1324 end_pfn = PFN_DOWN(end_addr);
1325
1326 if (start_pfn < end_pfn) {
1327 nr_all_pages += end_pfn - start_pfn;
1328 #ifdef CONFIG_HIGHMEM
1329 start_pfn = clamp(start_pfn, 0, high_zone_low);
1330 end_pfn = clamp(end_pfn, 0, high_zone_low);
1331 #endif
1332 nr_kernel_pages += end_pfn - start_pfn;
1333 }
1334 }
1335 }
1336
1337 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
1338 unsigned long node_start_pfn,
1339 unsigned long node_end_pfn)
1340 {
1341 unsigned long realtotalpages = 0, totalpages = 0;
1342 enum zone_type i;
1343
1344 for (i = 0; i < MAX_NR_ZONES; i++) {
1345 struct zone *zone = pgdat->node_zones + i;
1346 unsigned long zone_start_pfn, zone_end_pfn;
1347 unsigned long spanned, absent;
1348 unsigned long real_size;
1349
1350 spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
1351 node_start_pfn,
1352 node_end_pfn,
1353 &zone_start_pfn,
1354 &zone_end_pfn);
1355 absent = zone_absent_pages_in_node(pgdat->node_id, i,
1356 zone_start_pfn,
1357 zone_end_pfn);
1358
1359 real_size = spanned - absent;
1360
1361 if (spanned)
1362 zone->zone_start_pfn = zone_start_pfn;
1363 else
1364 zone->zone_start_pfn = 0;
1365 zone->spanned_pages = spanned;
1366 zone->present_pages = real_size;
1367 #if defined(CONFIG_MEMORY_HOTPLUG)
1368 zone->present_early_pages = real_size;
1369 #endif
1370
1371 totalpages += spanned;
1372 realtotalpages += real_size;
1373 }
1374
1375 pgdat->node_spanned_pages = totalpages;
1376 pgdat->node_present_pages = realtotalpages;
1377 pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1378 }
1379
1380 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1381 static void pgdat_init_split_queue(struct pglist_data *pgdat)
1382 {
1383 struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
1384
1385 spin_lock_init(&ds_queue->split_queue_lock);
1386 INIT_LIST_HEAD(&ds_queue->split_queue);
1387 ds_queue->split_queue_len = 0;
1388 }
1389 #else
1390 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
1391 #endif
1392
1393 #ifdef CONFIG_COMPACTION
1394 static void pgdat_init_kcompactd(struct pglist_data *pgdat)
1395 {
1396 init_waitqueue_head(&pgdat->kcompactd_wait);
1397 }
1398 #else
1399 static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
1400 #endif
1401
1402 static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
1403 {
1404 int i;
1405
1406 pgdat_resize_init(pgdat);
1407 pgdat_kswapd_lock_init(pgdat);
1408
1409 pgdat_init_split_queue(pgdat);
1410 pgdat_init_kcompactd(pgdat);
1411
1412 init_waitqueue_head(&pgdat->kswapd_wait);
1413 init_waitqueue_head(&pgdat->pfmemalloc_wait);
1414
1415 for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
1416 init_waitqueue_head(&pgdat->reclaim_wait[i]);
1417
1418 pgdat_page_ext_init(pgdat);
1419 lruvec_init(&pgdat->__lruvec);
1420 }
1421
1422 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
1423 unsigned long remaining_pages)
1424 {
1425 atomic_long_set(&zone->managed_pages, remaining_pages);
1426 zone_set_nid(zone, nid);
1427 zone->name = zone_names[idx];
1428 zone->zone_pgdat = NODE_DATA(nid);
1429 spin_lock_init(&zone->lock);
1430 zone_seqlock_init(zone);
1431 zone_pcp_init(zone);
1432 }
1433
1434 static void __meminit zone_init_free_lists(struct zone *zone)
1435 {
1436 unsigned int order, t;
1437 for_each_migratetype_order(order, t) {
1438 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
1439 zone->free_area[order].nr_free = 0;
1440 }
1441
1442 #ifdef CONFIG_UNACCEPTED_MEMORY
1443 INIT_LIST_HEAD(&zone->unaccepted_pages);
1444 #endif
1445 }
1446
1447 void __meminit init_currently_empty_zone(struct zone *zone,
1448 unsigned long zone_start_pfn,
1449 unsigned long size)
1450 {
1451 struct pglist_data *pgdat = zone->zone_pgdat;
1452 int zone_idx = zone_idx(zone) + 1;
1453
1454 if (zone_idx > pgdat->nr_zones)
1455 pgdat->nr_zones = zone_idx;
1456
1457 zone->zone_start_pfn = zone_start_pfn;
1458
1459 mminit_dprintk(MMINIT_TRACE, "memmap_init",
1460 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
1461 pgdat->node_id,
1462 (unsigned long)zone_idx(zone),
1463 zone_start_pfn, (zone_start_pfn + size));
1464
1465 zone_init_free_lists(zone);
1466 zone->initialized = 1;
1467 }
1468
1469 #ifndef CONFIG_SPARSEMEM
1470 /*
1471 * Calculate the size of the zone->pageblock_flags bitmap, rounded up to an
1472 * unsigned long. Start by making sure zonesize is a multiple of
1473 * pageblock_order by rounding up. Then use NR_PAGEBLOCK_BITS worth of bits
1474 * per pageblock, round what is now in bits up to the nearest long in bits,
1475 * and finally return the result in bytes.
1476 */
1477 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
1478 {
1479 unsigned long usemapsize;
1480
1481 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
1482 usemapsize = round_up(zonesize, pageblock_nr_pages);
1483 usemapsize = usemapsize >> pageblock_order;
1484 usemapsize *= NR_PAGEBLOCK_BITS;
1485 usemapsize = round_up(usemapsize, BITS_PER_LONG);
1486
1487 return usemapsize / BITS_PER_BYTE;
1488 }
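/*
 * Example (assuming pageblock_order == 9, i.e. 512-page pageblocks, and
 * NR_PAGEBLOCK_BITS == 4): a pageblock-aligned 1 GiB zone with 4 KiB
 * pages has 262144 pages -> 512 pageblocks -> 2048 bits, which is
 * already a multiple of BITS_PER_LONG and yields a 256-byte
 * pageblock_flags bitmap.
 */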
1489
1490 static void __ref setup_usemap(struct zone *zone)
1491 {
1492 unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
1493 zone->spanned_pages);
1494 zone->pageblock_flags = NULL;
1495 if (usemapsize) {
1496 zone->pageblock_flags =
1497 memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
1498 zone_to_nid(zone));
1499 if (!zone->pageblock_flags)
1500 panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
1501 usemapsize, zone->name, zone_to_nid(zone));
1502 }
1503 }
1504 #else
1505 static inline void setup_usemap(struct zone *zone) {}
1506 #endif /* CONFIG_SPARSEMEM */
1507
1508 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
1509
1510 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
1511 void __init set_pageblock_order(void)
1512 {
1513 unsigned int order = MAX_PAGE_ORDER;
1514
1515 /* Check that pageblock_nr_pages has not already been setup */
1516 if (pageblock_order)
1517 return;
1518
1519 /* Don't let pageblocks exceed the maximum allocation granularity. */
1520 if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
1521 order = HUGETLB_PAGE_ORDER;
1522
1523 /*
1524 * Assume the largest contiguous order of interest is a huge page.
1525 * This value may be variable depending on boot parameters on powerpc.
1526 */
1527 pageblock_order = order;
1528 }
1529 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1530
1531 /*
1532 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
1533 * is unused as pageblock_order is set at compile-time. See
1534 * include/linux/pageblock-flags.h for the values of pageblock_order based on
1535 * the kernel config
1536 */
1537 void __init set_pageblock_order(void)
1538 {
1539 }
1540
1541 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1542
1543 /*
1544 * Set up the zone data structures
1545 * - init pgdat internals
1546 * - init all zones belonging to this node
1547 *
1548 * NOTE: this function is only called during memory hotplug
1549 */
1550 #ifdef CONFIG_MEMORY_HOTPLUG
1551 void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
1552 {
1553 int nid = pgdat->node_id;
1554 enum zone_type z;
1555 int cpu;
1556
1557 pgdat_init_internals(pgdat);
1558
1559 if (pgdat->per_cpu_nodestats == &boot_nodestats)
1560 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
1561
1562 /*
1563 * Reset the nr_zones, order and highest_zoneidx before reuse.
1564 * Note that kswapd will init kswapd_highest_zoneidx properly
1565 * when it starts in the near future.
1566 */
1567 pgdat->nr_zones = 0;
1568 pgdat->kswapd_order = 0;
1569 pgdat->kswapd_highest_zoneidx = 0;
1570 pgdat->node_start_pfn = 0;
1571 pgdat->node_present_pages = 0;
1572
1573 for_each_online_cpu(cpu) {
1574 struct per_cpu_nodestat *p;
1575
1576 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
1577 memset(p, 0, sizeof(*p));
1578 }
1579
1580 /*
1581 * When memory is hot-added, all the memory is in offline state. So
1582 * clear all zones' present_pages and managed_pages because they will
1583 * be updated in online_pages() and offline_pages().
1584 */
1585 for (z = 0; z < MAX_NR_ZONES; z++) {
1586 struct zone *zone = pgdat->node_zones + z;
1587
1588 zone->present_pages = 0;
1589 zone_init_internals(zone, z, nid, 0);
1590 }
1591 }
1592 #endif
1593
1594 static void __init free_area_init_core(struct pglist_data *pgdat)
1595 {
1596 enum zone_type j;
1597 int nid = pgdat->node_id;
1598
1599 pgdat_init_internals(pgdat);
1600 pgdat->per_cpu_nodestats = &boot_nodestats;
1601
1602 for (j = 0; j < MAX_NR_ZONES; j++) {
1603 struct zone *zone = pgdat->node_zones + j;
1604 unsigned long size = zone->spanned_pages;
1605
1606 /*
1607 * Initialize zone->managed_pages as 0; it will be reset
1608 * when the memblock allocator frees pages into the buddy system.
1609 */
1610 zone_init_internals(zone, j, nid, zone->present_pages);
1611
1612 if (!size)
1613 continue;
1614
1615 setup_usemap(zone);
1616 init_currently_empty_zone(zone, zone->zone_start_pfn, size);
1617 }
1618 }
1619
1620 void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
1621 phys_addr_t min_addr, int nid, bool exact_nid)
1622 {
1623 void *ptr;
1624
1625 /*
1626 * Kmemleak will explicitly scan mem_map by traversing all valid
1627 * `struct page`, so memblock does not need to be added to the scan list.
1628 */
1629 if (exact_nid)
1630 ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
1631 MEMBLOCK_ALLOC_NOLEAKTRACE,
1632 nid);
1633 else
1634 ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
1635 MEMBLOCK_ALLOC_NOLEAKTRACE,
1636 nid);
1637
1638 if (ptr && size > 0)
1639 page_init_poison(ptr, size);
1640
1641 return ptr;
1642 }
1643
1644 #ifdef CONFIG_FLATMEM
1645 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
1646 {
1647 unsigned long start, offset, size, end;
1648 struct page *map;
1649
1650 /* Skip empty nodes */
1651 if (!pgdat->node_spanned_pages)
1652 return;
1653
1654 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
1655 offset = pgdat->node_start_pfn - start;
1656 /*
1657 * The zone's endpoints aren't required to be MAX_PAGE_ORDER
1658 * aligned but the node_mem_map endpoints must be in order
1659 * for the buddy allocator to function correctly.
1660 */
1661 end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES);
1662 size = (end - start) * sizeof(struct page);
1663 map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
1664 pgdat->node_id, false);
1665 if (!map)
1666 panic("Failed to allocate %ld bytes for node %d memory map\n",
1667 size, pgdat->node_id);
1668 pgdat->node_mem_map = map + offset;
1669 memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
1670 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
1671 __func__, pgdat->node_id, (unsigned long)pgdat,
1672 (unsigned long)pgdat->node_mem_map);
1673
1674 /* the global mem_map is just set as node 0's */
1675 WARN_ON(pgdat != NODE_DATA(0));
1676
1677 mem_map = pgdat->node_mem_map;
1678 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
1679 mem_map -= offset;
1680
1681 max_mapnr = end - start;
1682 }
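/*
 * Example (illustrative, FLATMEM only, assuming MAX_ORDER_NR_PAGES ==
 * 1024): for a node spanning pfns [1000, 5000), start rounds down to 0
 * and end rounds up to 5120, so struct pages for 5120 pfns are
 * allocated, node_mem_map points offset == 1000 entries into that
 * array, and max_mapnr becomes 5120.
 */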
1683 #else
1684 static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
1685 #endif /* CONFIG_FLATMEM */
1686
1687 /**
1688 * get_pfn_range_for_nid - Return the start and end page frames for a node
1689 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
1690 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
1691 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
1692 *
1693 * It returns the start and end page frame of a node based on information
1694 * provided by memblock_set_node(). If called for a node
1695 * with no available memory, the start and end PFNs will be 0.
1696 */
1697 void __init get_pfn_range_for_nid(unsigned int nid,
1698 unsigned long *start_pfn, unsigned long *end_pfn)
1699 {
1700 unsigned long this_start_pfn, this_end_pfn;
1701 int i;
1702
1703 *start_pfn = -1UL;
1704 *end_pfn = 0;
1705
1706 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
1707 *start_pfn = min(*start_pfn, this_start_pfn);
1708 *end_pfn = max(*end_pfn, this_end_pfn);
1709 }
1710
1711 if (*start_pfn == -1UL)
1712 *start_pfn = 0;
1713 }
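
/*
 * Illustrative sketch (editor's addition, intentionally not built): how a
 * boot-time caller might use get_pfn_range_for_nid(). The function name
 * example_dump_node_ranges is hypothetical; for_each_online_node() and
 * pr_info() are the usual kernel helpers.
 */
#if 0
static void __init example_dump_node_ranges(void)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	for_each_online_node(nid) {
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn == end_pfn)
			continue;	/* no memory registered for this node */
		pr_info("node %d: PFNs [%#lx, %#lx)\n", nid, start_pfn, end_pfn);
	}
}
#endif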
1714
1715 static void __init free_area_init_node(int nid)
1716 {
1717 pg_data_t *pgdat = NODE_DATA(nid);
1718 unsigned long start_pfn = 0;
1719 unsigned long end_pfn = 0;
1720
1721 /* pg_data_t should be reset to zero when it's allocated */
1722 WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
1723
1724 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
1725
1726 pgdat->node_id = nid;
1727 pgdat->node_start_pfn = start_pfn;
1728 pgdat->per_cpu_nodestats = NULL;
1729
1730 if (start_pfn != end_pfn) {
1731 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
1732 (u64)start_pfn << PAGE_SHIFT,
1733 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
1734
1735 calculate_node_totalpages(pgdat, start_pfn, end_pfn);
1736 } else {
1737 pr_info("Initmem setup node %d as memoryless\n", nid);
1738
1739 reset_memoryless_node_totalpages(pgdat);
1740 }
1741
1742 alloc_node_mem_map(pgdat);
1743 pgdat_set_deferred_range(pgdat);
1744
1745 free_area_init_core(pgdat);
1746 lru_gen_init_pgdat(pgdat);
1747 }
1748
1749 /* Any regular or high memory on that node? */
1750 static void __init check_for_memory(pg_data_t *pgdat)
1751 {
1752 enum zone_type zone_type;
1753
1754 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
1755 struct zone *zone = &pgdat->node_zones[zone_type];
1756 if (populated_zone(zone)) {
1757 if (IS_ENABLED(CONFIG_HIGHMEM))
1758 node_set_state(pgdat->node_id, N_HIGH_MEMORY);
1759 if (zone_type <= ZONE_NORMAL)
1760 node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
1761 break;
1762 }
1763 }
1764 }
1765
1766 #if MAX_NUMNODES > 1
1767 /*
1768 * Figure out the number of possible node ids.
1769 */
1770 void __init setup_nr_node_ids(void)
1771 {
1772 unsigned int highest;
1773
1774 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
1775 nr_node_ids = highest + 1;
1776 }
1777 #endif
1778
1779 /*
1780  * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
1781  * such cases we allow max_zone_pfn to be sorted in descending order.
1782 */
1783 static bool arch_has_descending_max_zone_pfns(void)
1784 {
1785 return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
1786 }
1787
1788 static void __init set_high_memory(void)
1789 {
1790 phys_addr_t highmem = memblock_end_of_DRAM();
1791
1792 /*
1793 * Some architectures (e.g. ARM) set high_memory very early and
1794 * use it in arch setup code.
1795  * If an architecture has already set high_memory, don't overwrite it.
1796 */
1797 if (high_memory)
1798 return;
1799
1800 #ifdef CONFIG_HIGHMEM
1801 if (arch_has_descending_max_zone_pfns() ||
1802 highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]))
1803 highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]);
1804 #endif
1805
1806 high_memory = phys_to_virt(highmem - 1) + 1;
1807 }
1808
1809 /**
1810 * free_area_init - Initialise all pg_data_t and zone data
1811 * @max_zone_pfn: an array of max PFNs for each zone
1812 *
1813 * This will call free_area_init_node() for each active node in the system.
1814 * Using the page ranges provided by memblock_set_node(), the size of each
1815  * zone in each node and their holes is calculated. If the maximum PFNs of
1816  * two adjacent zones match, the higher zone is assumed to be empty.
1817  * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
1818  * that ZONE_DMA32 has no pages. It is also assumed that a zone
1819 * starts where the previous one ended. For example, ZONE_DMA32 starts
1820 * at arch_max_dma_pfn.
1821 */
1822 void __init free_area_init(unsigned long *max_zone_pfn)
1823 {
1824 unsigned long start_pfn, end_pfn;
1825 int i, nid, zone;
1826 bool descending;
1827
1828 /* Record where the zone boundaries are */
1829 memset(arch_zone_lowest_possible_pfn, 0,
1830 sizeof(arch_zone_lowest_possible_pfn));
1831 memset(arch_zone_highest_possible_pfn, 0,
1832 sizeof(arch_zone_highest_possible_pfn));
1833
1834 start_pfn = PHYS_PFN(memblock_start_of_DRAM());
1835 descending = arch_has_descending_max_zone_pfns();
1836
1837 for (i = 0; i < MAX_NR_ZONES; i++) {
1838 if (descending)
1839 zone = MAX_NR_ZONES - i - 1;
1840 else
1841 zone = i;
1842
1843 if (zone == ZONE_MOVABLE)
1844 continue;
1845
1846 end_pfn = max(max_zone_pfn[zone], start_pfn);
1847 arch_zone_lowest_possible_pfn[zone] = start_pfn;
1848 arch_zone_highest_possible_pfn[zone] = end_pfn;
1849
1850 start_pfn = end_pfn;
1851 }
1852
1853 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
1854 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
1855 find_zone_movable_pfns_for_nodes();
1856
1857 /* Print out the zone ranges */
1858 pr_info("Zone ranges:\n");
1859 for (i = 0; i < MAX_NR_ZONES; i++) {
1860 if (i == ZONE_MOVABLE)
1861 continue;
1862 pr_info(" %-8s ", zone_names[i]);
1863 if (arch_zone_lowest_possible_pfn[i] ==
1864 arch_zone_highest_possible_pfn[i])
1865 pr_cont("empty\n");
1866 else
1867 pr_cont("[mem %#018Lx-%#018Lx]\n",
1868 (u64)arch_zone_lowest_possible_pfn[i]
1869 << PAGE_SHIFT,
1870 ((u64)arch_zone_highest_possible_pfn[i]
1871 << PAGE_SHIFT) - 1);
1872 }
1873
1874 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
1875 pr_info("Movable zone start for each node\n");
1876 for (i = 0; i < MAX_NUMNODES; i++) {
1877 if (zone_movable_pfn[i])
1878 pr_info(" Node %d: %#018Lx\n", i,
1879 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
1880 }
1881
1882 /*
1883 * Print out the early node map, and initialize the
1884 * subsection-map relative to active online memory ranges to
1885 * enable future "sub-section" extensions of the memory map.
1886 */
1887 pr_info("Early memory node ranges\n");
1888 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
1889 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
1890 (u64)start_pfn << PAGE_SHIFT,
1891 ((u64)end_pfn << PAGE_SHIFT) - 1);
1892 subsection_map_init(start_pfn, end_pfn - start_pfn);
1893 }
1894
1895 /* Initialise every node */
1896 mminit_verify_pageflags_layout();
1897 setup_nr_node_ids();
1898 set_pageblock_order();
1899
1900 for_each_node(nid) {
1901 pg_data_t *pgdat;
1902
1903 if (!node_online(nid))
1904 alloc_offline_node_data(nid);
1905
1906 pgdat = NODE_DATA(nid);
1907 free_area_init_node(nid);
1908
1909 /*
1910 		 * No sysfs hierarchy will be created via register_one_node()
1911 		 * for memoryless nodes because they are not marked as N_MEMORY
1912 		 * here and won't be set online later. The benefit is that
1913 		 * userspace programs won't be confused by sysfs files/directories
1914 		 * of memoryless nodes. The pgdat will get fully initialized by
1915 		 * hotadd_init_pgdat() when memory is hotplugged into this node.
1916 */
1917 if (pgdat->node_present_pages) {
1918 node_set_state(nid, N_MEMORY);
1919 check_for_memory(pgdat);
1920 }
1921 }
1922
1923 for_each_node_state(nid, N_MEMORY)
1924 sparse_vmemmap_init_nid_late(nid);
1925
1926 calc_nr_kernel_pages();
1927 memmap_init();
1928
1929 /* disable hash distribution for systems with a single node */
1930 fixup_hashdist();
1931
1932 set_high_memory();
1933 }
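
/*
 * Illustrative sketch (editor's addition, intentionally not built): how an
 * architecture typically fills in max_zone_pfn before calling
 * free_area_init(). Modelled loosely on x86's zone_sizes_init();
 * max_low_pfn, max_pfn and MAX_DMA32_PFN come from arch code, not this file,
 * and example_zone_sizes_init is a hypothetical name.
 */
#if 0
static void __init example_zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif

	free_area_init(max_zone_pfns);
}
#endif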
1934
1935 /**
1936 * node_map_pfn_alignment - determine the maximum internode alignment
1937 *
1938 * This function should be called after node map is populated and sorted.
1939 * It calculates the maximum power of two alignment which can distinguish
1940 * all the nodes.
1941 *
1942 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
1943 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
1944  * nodes are shifted by 256MiB, 256MiB is returned. Note that if only the last node is
1945 * shifted, 1GiB is enough and this function will indicate so.
1946 *
1947 * This is used to test whether pfn -> nid mapping of the chosen memory
1948 * model has fine enough granularity to avoid incorrect mapping for the
1949 * populated node map.
1950 *
1951 * Return: the determined alignment in pfn's. 0 if there is no alignment
1952 * requirement (single node).
1953 */
1954 unsigned long __init node_map_pfn_alignment(void)
1955 {
1956 unsigned long accl_mask = 0, last_end = 0;
1957 unsigned long start, end, mask;
1958 int last_nid = NUMA_NO_NODE;
1959 int i, nid;
1960
1961 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
1962 if (!start || last_nid < 0 || last_nid == nid) {
1963 last_nid = nid;
1964 last_end = end;
1965 continue;
1966 }
1967
1968 /*
1969 * Start with a mask granular enough to pin-point to the
1970 * start pfn and tick off bits one-by-one until it becomes
1971 * too coarse to separate the current node from the last.
1972 */
1973 mask = ~((1 << __ffs(start)) - 1);
1974 while (mask && last_end <= (start & (mask << 1)))
1975 mask <<= 1;
1976
1977 /* accumulate all internode masks */
1978 accl_mask |= mask;
1979 }
1980
1981 /* convert mask to number of pages */
1982 return ~accl_mask + 1;
1983 }
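
/*
 * Illustrative worked example (editor's sketch): two nodes covering PFNs
 * [0, 0x40000) and [0x50000, 0x80000). When the second range is reached,
 * __ffs(0x50000) == 16, so the initial mask is ~0xffff. The mask is widened
 * while last_end (0x40000) is still <= start & (mask << 1); it settles at
 * ~0x3ffff, so the function returns 0x40000 PFNs (1GiB with 4KiB pages) as
 * the alignment needed to tell the two nodes apart.
 */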
1984
1985 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1986 static void __init deferred_free_pages(unsigned long pfn,
1987 unsigned long nr_pages)
1988 {
1989 struct page *page;
1990 unsigned long i;
1991
1992 if (!nr_pages)
1993 return;
1994
1995 page = pfn_to_page(pfn);
1996
1997 /* Free a large naturally-aligned chunk if possible */
1998 if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
1999 for (i = 0; i < nr_pages; i += pageblock_nr_pages)
2000 set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
2001 __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
2002 return;
2003 }
2004
2005 /* Accept chunks smaller than MAX_PAGE_ORDER upfront */
2006 accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE);
2007
2008 for (i = 0; i < nr_pages; i++, page++, pfn++) {
2009 if (pageblock_aligned(pfn))
2010 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2011 __free_pages_core(page, 0, MEMINIT_EARLY);
2012 }
2013 }
2014
2015 /* Completion tracking for deferred_init_memmap() threads */
2016 static atomic_t pgdat_init_n_undone __initdata;
2017 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
2018
2019 static inline void __init pgdat_init_report_one_done(void)
2020 {
2021 if (atomic_dec_and_test(&pgdat_init_n_undone))
2022 complete(&pgdat_init_all_done_comp);
2023 }
2024
2025 /*
2026 * Initialize struct pages. We minimize pfn page lookups and scheduler checks
2027 * by performing it only once every MAX_ORDER_NR_PAGES.
2028 * Return number of pages initialized.
2029 */
2030 static unsigned long __init deferred_init_pages(struct zone *zone,
2031 unsigned long pfn, unsigned long end_pfn)
2032 {
2033 int nid = zone_to_nid(zone);
2034 unsigned long nr_pages = end_pfn - pfn;
2035 int zid = zone_idx(zone);
2036 struct page *page = pfn_to_page(pfn);
2037
2038 for (; pfn < end_pfn; pfn++, page++)
2039 __init_single_page(page, pfn, zid, nid);
2040 return nr_pages;
2041 }
2042
2043 /*
2044 * This function is meant to pre-load the iterator for the zone init from
2045 * a given point.
2046  * Specifically, it walks through the ranges starting with the initial index
2047  * passed to it until we are caught up to the first_init_pfn value and
2048  * exits there. If we never encounter the value, we return false, indicating
2049 * there are no valid ranges left.
2050 */
2051 static bool __init
2052 deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
2053 unsigned long *spfn, unsigned long *epfn,
2054 unsigned long first_init_pfn)
2055 {
2056 u64 j = *i;
2057
2058 if (j == 0)
2059 __next_mem_pfn_range_in_zone(&j, zone, spfn, epfn);
2060
2061 /*
2062 * Start out by walking through the ranges in this zone that have
2063 * already been initialized. We don't need to do anything with them
2064 * so we just need to flush them out of the system.
2065 */
2066 for_each_free_mem_pfn_range_in_zone_from(j, zone, spfn, epfn) {
2067 if (*epfn <= first_init_pfn)
2068 continue;
2069 if (*spfn < first_init_pfn)
2070 *spfn = first_init_pfn;
2071 *i = j;
2072 return true;
2073 }
2074
2075 return false;
2076 }
2077
2078 /*
2079 * Initialize and free pages. We do it in two loops: first we initialize
2080 * struct page, then free to buddy allocator, because while we are
2081 * freeing pages we can access pages that are ahead (computing buddy
2082 * page in __free_one_page()).
2083 *
2084 * In order to try and keep some memory in the cache we have the loop
2085 * broken along max page order boundaries. This way we will not cause
2086 * any issues with the buddy page computation.
2087 */
2088 static unsigned long __init
2089 deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
2090 unsigned long *end_pfn)
2091 {
2092 unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
2093 unsigned long spfn = *start_pfn, epfn = *end_pfn;
2094 unsigned long nr_pages = 0;
2095 u64 j = *i;
2096
2097 /* First we loop through and initialize the page values */
2098 for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
2099 unsigned long t;
2100
2101 if (mo_pfn <= *start_pfn)
2102 break;
2103
2104 t = min(mo_pfn, *end_pfn);
2105 nr_pages += deferred_init_pages(zone, *start_pfn, t);
2106
2107 if (mo_pfn < *end_pfn) {
2108 *start_pfn = mo_pfn;
2109 break;
2110 }
2111 }
2112
2113 /* Reset values and now loop through freeing pages as needed */
2114 swap(j, *i);
2115
2116 for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
2117 unsigned long t;
2118
2119 if (mo_pfn <= spfn)
2120 break;
2121
2122 t = min(mo_pfn, epfn);
2123 deferred_free_pages(spfn, t - spfn);
2124
2125 if (mo_pfn <= epfn)
2126 break;
2127 }
2128
2129 return nr_pages;
2130 }
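
/*
 * Illustrative worked example (editor's sketch): assuming
 * MAX_ORDER_NR_PAGES == 1024, a call entering with *start_pfn == 0x10400
 * computes mo_pfn = ALIGN(0x10401, 1024) = 0x10800, so at most one
 * max-order block's worth of pages is initialised and then freed before
 * returning. The "+ 1" guarantees progress even when *start_pfn is already
 * max-order aligned.
 */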
2131
2132 static void __init
2133 deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
2134 void *arg)
2135 {
2136 unsigned long spfn, epfn;
2137 struct zone *zone = arg;
2138 u64 i = 0;
2139
2140 deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
2141
2142 /*
2143 * Initialize and free pages in MAX_PAGE_ORDER sized increments so that
2144 * we can avoid introducing any issues with the buddy allocator.
2145 */
2146 while (spfn < end_pfn) {
2147 deferred_init_maxorder(&i, zone, &spfn, &epfn);
2148 cond_resched();
2149 }
2150 }
2151
2152 static unsigned int __init
2153 deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2154 {
2155 return max(cpumask_weight(node_cpumask), 1U);
2156 }
2157
2158 /* Initialise remaining memory on a node */
2159 static int __init deferred_init_memmap(void *data)
2160 {
2161 pg_data_t *pgdat = data;
2162 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2163 unsigned long spfn = 0, epfn = 0;
2164 unsigned long first_init_pfn, flags;
2165 unsigned long start = jiffies;
2166 struct zone *zone;
2167 int max_threads;
2168 u64 i = 0;
2169
2170 /* Bind memory initialisation thread to a local node if possible */
2171 if (!cpumask_empty(cpumask))
2172 set_cpus_allowed_ptr(current, cpumask);
2173
2174 pgdat_resize_lock(pgdat, &flags);
2175 first_init_pfn = pgdat->first_deferred_pfn;
2176 if (first_init_pfn == ULONG_MAX) {
2177 pgdat_resize_unlock(pgdat, &flags);
2178 pgdat_init_report_one_done();
2179 return 0;
2180 }
2181
2182 /* Sanity check boundaries */
2183 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
2184 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
2185 pgdat->first_deferred_pfn = ULONG_MAX;
2186
2187 /*
2188 * Once we unlock here, the zone cannot be grown anymore, thus if an
2189 * interrupt thread must allocate this early in boot, zone must be
2190 * pre-grown prior to start of deferred page initialization.
2191 */
2192 pgdat_resize_unlock(pgdat, &flags);
2193
2194 /* Only the highest zone is deferred */
2195 zone = pgdat->node_zones + pgdat->nr_zones - 1;
2196
2197 max_threads = deferred_page_init_max_threads(cpumask);
2198
2199 while (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, first_init_pfn)) {
2200 first_init_pfn = ALIGN(epfn, PAGES_PER_SECTION);
2201 struct padata_mt_job job = {
2202 .thread_fn = deferred_init_memmap_chunk,
2203 .fn_arg = zone,
2204 .start = spfn,
2205 .size = first_init_pfn - spfn,
2206 .align = PAGES_PER_SECTION,
2207 .min_chunk = PAGES_PER_SECTION,
2208 .max_threads = max_threads,
2209 .numa_aware = false,
2210 };
2211
2212 padata_do_multithreaded(&job);
2213 }
2214
2215 /* Sanity check that the next zone really is unpopulated */
2216 WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone));
2217
2218 pr_info("node %d deferred pages initialised in %ums\n",
2219 pgdat->node_id, jiffies_to_msecs(jiffies - start));
2220
2221 pgdat_init_report_one_done();
2222 return 0;
2223 }
2224
2225 /*
2226 * If this zone has deferred pages, try to grow it by initializing enough
2227 * deferred pages to satisfy the allocation specified by order, rounded up to
2228 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
2229 * of SECTION_SIZE bytes by initializing struct pages in increments of
2230 * PAGES_PER_SECTION * sizeof(struct page) bytes.
2231 *
2232 * Return true when zone was grown, otherwise return false. We return true even
2233 * when we grow less than requested, to let the caller decide if there are
2234 * enough pages to satisfy the allocation.
2235 */
2236 bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
2237 {
2238 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
2239 pg_data_t *pgdat = zone->zone_pgdat;
2240 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
2241 unsigned long spfn, epfn, flags;
2242 unsigned long nr_pages = 0;
2243 u64 i = 0;
2244
2245 /* Only the last zone may have deferred pages */
2246 if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
2247 return false;
2248
2249 pgdat_resize_lock(pgdat, &flags);
2250
2251 /*
2252 	 * If someone grew this zone while we were waiting for the spinlock, return
2253 * true, as there might be enough pages already.
2254 */
2255 if (first_deferred_pfn != pgdat->first_deferred_pfn) {
2256 pgdat_resize_unlock(pgdat, &flags);
2257 return true;
2258 }
2259
2260 /* If the zone is empty somebody else may have cleared out the zone */
2261 if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
2262 first_deferred_pfn)) {
2263 pgdat->first_deferred_pfn = ULONG_MAX;
2264 pgdat_resize_unlock(pgdat, &flags);
2265 /* Retry only once. */
2266 return first_deferred_pfn != ULONG_MAX;
2267 }
2268
2269 /*
2270 * Initialize and free pages in MAX_PAGE_ORDER sized increments so
2271 * that we can avoid introducing any issues with the buddy
2272 * allocator.
2273 */
2274 while (spfn < epfn) {
2275 /* update our first deferred PFN for this section */
2276 first_deferred_pfn = spfn;
2277
2278 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
2279 touch_nmi_watchdog();
2280
2281 /* We should only stop along section boundaries */
2282 if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
2283 continue;
2284
2285 /* If our quota has been met we can stop here */
2286 if (nr_pages >= nr_pages_needed)
2287 break;
2288 }
2289
2290 pgdat->first_deferred_pfn = spfn;
2291 pgdat_resize_unlock(pgdat, &flags);
2292
2293 return nr_pages > 0;
2294 }
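
/*
 * Illustrative worked example (editor's sketch): for an order-9 allocation
 * (512 pages) with a hypothetical PAGES_PER_SECTION of 32768,
 * nr_pages_needed = ALIGN(512, 32768) = 32768, i.e. the zone is grown by at
 * least one full section even though far fewer pages were requested.
 */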
2295
2296 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
2297
2298 #ifdef CONFIG_CMA
2299 void __init init_cma_reserved_pageblock(struct page *page)
2300 {
2301 unsigned i = pageblock_nr_pages;
2302 struct page *p = page;
2303
2304 do {
2305 __ClearPageReserved(p);
2306 set_page_count(p, 0);
2307 } while (++p, --i);
2308
2309 set_pageblock_migratetype(page, MIGRATE_CMA);
2310 set_page_refcounted(page);
2311 /* pages were reserved and not allocated */
2312 clear_page_tag_ref(page);
2313 __free_pages(page, pageblock_order);
2314
2315 adjust_managed_page_count(page, pageblock_nr_pages);
2316 page_zone(page)->cma_pages += pageblock_nr_pages;
2317 }
2318 /*
2319 * Similar to above, but only set the migrate type and stats.
2320 */
2321 void __init init_cma_pageblock(struct page *page)
2322 {
2323 set_pageblock_migratetype(page, MIGRATE_CMA);
2324 adjust_managed_page_count(page, pageblock_nr_pages);
2325 page_zone(page)->cma_pages += pageblock_nr_pages;
2326 }
2327 #endif
2328
2329 void set_zone_contiguous(struct zone *zone)
2330 {
2331 unsigned long block_start_pfn = zone->zone_start_pfn;
2332 unsigned long block_end_pfn;
2333
2334 block_end_pfn = pageblock_end_pfn(block_start_pfn);
2335 for (; block_start_pfn < zone_end_pfn(zone);
2336 block_start_pfn = block_end_pfn,
2337 block_end_pfn += pageblock_nr_pages) {
2338
2339 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
2340
2341 if (!__pageblock_pfn_to_page(block_start_pfn,
2342 block_end_pfn, zone))
2343 return;
2344 cond_resched();
2345 }
2346
2347 /* We confirm that there is no hole */
2348 zone->contiguous = true;
2349 }
2350
2351 /*
2352 * Check if a PFN range intersects multiple zones on one or more
2353 * NUMA nodes. Specify the @nid argument if it is known that this
2354 * PFN range is on one node, NUMA_NO_NODE otherwise.
2355 */
2356 bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
2357 unsigned long nr_pages)
2358 {
2359 struct zone *zone, *izone = NULL;
2360
2361 for_each_zone(zone) {
2362 if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid)
2363 continue;
2364
2365 if (zone_intersects(zone, start_pfn, nr_pages)) {
2366 if (izone != NULL)
2367 return true;
2368 izone = zone;
2369 }
2370
2371 }
2372
2373 return false;
2374 }
2375
2376 static void __init mem_init_print_info(void);
2377 void __init page_alloc_init_late(void)
2378 {
2379 struct zone *zone;
2380 int nid;
2381
2382 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
2383
2384 /* There will be num_node_state(N_MEMORY) threads */
2385 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
2386 for_each_node_state(nid, N_MEMORY) {
2387 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
2388 }
2389
2390 /* Block until all are initialised */
2391 wait_for_completion(&pgdat_init_all_done_comp);
2392
2393 /*
2394 * We initialized the rest of the deferred pages. Permanently disable
2395 * on-demand struct page initialization.
2396 */
2397 static_branch_disable(&deferred_pages);
2398
2399 /* Reinit limits that are based on free pages after the kernel is up */
2400 files_maxfiles_init();
2401 #endif
2402
2403 /* Accounting of total+free memory is stable at this point. */
2404 mem_init_print_info();
2405 buffer_init();
2406
2407 /* Discard memblock private memory */
2408 memblock_discard();
2409
2410 for_each_node_state(nid, N_MEMORY)
2411 shuffle_free_memory(NODE_DATA(nid));
2412
2413 for_each_populated_zone(zone)
2414 set_zone_contiguous(zone);
2415
2416 /* Initialize page ext after all struct pages are initialized. */
2417 if (deferred_struct_pages)
2418 page_ext_init();
2419
2420 page_alloc_sysctl_init();
2421 }
2422
2423 /*
2424 * Adaptive scale is meant to reduce sizes of hash tables on large memory
2425 * machines. As memory size is increased the scale is also increased but at
2426 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
2427 * quadruples the scale is increased by one, which means the size of hash table
2428 * only doubles, instead of quadrupling as well.
2429  * Because 32-bit systems cannot have the large amounts of physical memory
2430  * where this scaling makes sense, it is disabled on such platforms.
2431 */
2432 #if __BITS_PER_LONG > 32
2433 #define ADAPT_SCALE_BASE (64ul << 30)
2434 #define ADAPT_SCALE_SHIFT 2
2435 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
2436 #endif
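
/*
 * Illustrative worked example (editor's sketch): with 4KiB pages,
 * ADAPT_SCALE_NPAGES is 16M pages (64GiB). On a machine with roughly 512GiB
 * (~128M pages) the loop below bumps the scale twice (16M -> 64M -> 256M),
 * so the resulting hash table ends up only about 2x the size a 64GiB
 * machine would get, instead of 8x.
 */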
2437
2438 /*
2439 * allocate a large system hash table from bootmem
2440 * - it is assumed that the hash table must contain an exact power-of-2
2441 * quantity of entries
2442 * - limit is the number of hash buckets, not the total allocation size
2443 */
2444 void *__init alloc_large_system_hash(const char *tablename,
2445 unsigned long bucketsize,
2446 unsigned long numentries,
2447 int scale,
2448 int flags,
2449 unsigned int *_hash_shift,
2450 unsigned int *_hash_mask,
2451 unsigned long low_limit,
2452 unsigned long high_limit)
2453 {
2454 unsigned long long max = high_limit;
2455 unsigned long log2qty, size;
2456 void *table;
2457 gfp_t gfp_flags;
2458 bool virt;
2459 bool huge;
2460
2461 /* allow the kernel cmdline to have a say */
2462 if (!numentries) {
2463 /* round applicable memory size up to nearest megabyte */
2464 numentries = nr_kernel_pages;
2465
2466 /* It isn't necessary when PAGE_SIZE >= 1MB */
2467 if (PAGE_SIZE < SZ_1M)
2468 numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
2469
2470 #if __BITS_PER_LONG > 32
2471 if (!high_limit) {
2472 unsigned long adapt;
2473
2474 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
2475 adapt <<= ADAPT_SCALE_SHIFT)
2476 scale++;
2477 }
2478 #endif
2479
2480 /* limit to 1 bucket per 2^scale bytes of low memory */
2481 if (scale > PAGE_SHIFT)
2482 numentries >>= (scale - PAGE_SHIFT);
2483 else
2484 numentries <<= (PAGE_SHIFT - scale);
2485
2486 if (unlikely((numentries * bucketsize) < PAGE_SIZE))
2487 numentries = PAGE_SIZE / bucketsize;
2488 }
2489 numentries = roundup_pow_of_two(numentries);
2490
2491 /* limit allocation size to 1/16 total memory by default */
2492 if (max == 0) {
2493 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
2494 do_div(max, bucketsize);
2495 }
2496 max = min(max, 0x80000000ULL);
2497
2498 if (numentries < low_limit)
2499 numentries = low_limit;
2500 if (numentries > max)
2501 numentries = max;
2502
2503 log2qty = ilog2(numentries);
2504
2505 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
2506 do {
2507 virt = false;
2508 size = bucketsize << log2qty;
2509 if (flags & HASH_EARLY) {
2510 if (flags & HASH_ZERO)
2511 table = memblock_alloc(size, SMP_CACHE_BYTES);
2512 else
2513 table = memblock_alloc_raw(size,
2514 SMP_CACHE_BYTES);
2515 } else if (get_order(size) > MAX_PAGE_ORDER || hashdist) {
2516 table = vmalloc_huge(size, gfp_flags);
2517 virt = true;
2518 if (table)
2519 huge = is_vm_area_hugepages(table);
2520 } else {
2521 /*
2522 			 * If bucketsize is not a power of two, we may free
2523 			 * some pages at the end of the hash table, which
2524 			 * alloc_pages_exact() does automatically.
2525 */
2526 table = alloc_pages_exact(size, gfp_flags);
2527 kmemleak_alloc(table, size, 1, gfp_flags);
2528 }
2529 } while (!table && size > PAGE_SIZE && --log2qty);
2530
2531 if (!table)
2532 panic("Failed to allocate %s hash table\n", tablename);
2533
2534 pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
2535 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
2536 virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
2537
2538 if (_hash_shift)
2539 *_hash_shift = log2qty;
2540 if (_hash_mask)
2541 *_hash_mask = (1 << log2qty) - 1;
2542
2543 return table;
2544 }
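
/*
 * Illustrative sketch (editor's addition, intentionally not built): a
 * typical boot-time caller of alloc_large_system_hash(). The table name,
 * variables and the choice of scale are hypothetical; HASH_ZERO and
 * struct hlist_head are the usual kernel definitions.
 */
#if 0
static struct hlist_head *example_hash;
static unsigned int example_hash_shift;

static void __init example_hash_init(void)
{
	/*
	 * Size derived from nr_kernel_pages (numentries == 0), one bucket
	 * per 2^14 bytes of low memory, zeroed at allocation time.
	 */
	example_hash = alloc_large_system_hash("example",
					       sizeof(struct hlist_head),
					       0, 14, HASH_ZERO,
					       &example_hash_shift, NULL,
					       0, 0);
}
#endif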
2545
2546 void __init memblock_free_pages(struct page *page, unsigned long pfn,
2547 unsigned int order)
2548 {
2549 if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
2550 int nid = early_pfn_to_nid(pfn);
2551
2552 if (!early_page_initialised(pfn, nid))
2553 return;
2554 }
2555
2556 if (!kmsan_memblock_free_pages(page, order)) {
2557 /* KMSAN will take care of these pages. */
2558 return;
2559 }
2560
2561 /* pages were reserved and not allocated */
2562 clear_page_tag_ref(page);
2563 __free_pages_core(page, order, MEMINIT_EARLY);
2564 }
2565
2566 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
2567 EXPORT_SYMBOL(init_on_alloc);
2568
2569 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
2570 EXPORT_SYMBOL(init_on_free);
2571
2572 static bool _init_on_alloc_enabled_early __read_mostly
2573 = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
2574 static int __init early_init_on_alloc(char *buf)
2575 {
2576
2577 return kstrtobool(buf, &_init_on_alloc_enabled_early);
2578 }
2579 early_param("init_on_alloc", early_init_on_alloc);
2580
2581 static bool _init_on_free_enabled_early __read_mostly
2582 = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
2583 static int __init early_init_on_free(char *buf)
2584 {
2585 return kstrtobool(buf, &_init_on_free_enabled_early);
2586 }
2587 early_param("init_on_free", early_init_on_free);
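
/*
 * Illustrative example (editor's addition): the two knobs above are boot
 * parameters, e.g.
 *
 *   init_on_alloc=1 init_on_free=0
 *
 * on the kernel command line zeroes memory at allocation time only.
 * mem_debugging_and_hardening_init() below turns these early flags into the
 * static keys, and page poisoning, if requested, takes precedence.
 */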
2588
2589 DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
2590
2591 /*
2592 * Enable static keys related to various memory debugging and hardening options.
2593 * Some override others, and depend on early params that are evaluated in the
2594 * order of appearance. So we need to first gather the full picture of what was
2595 * enabled, and then make decisions.
2596 */
2597 static void __init mem_debugging_and_hardening_init(void)
2598 {
2599 bool page_poisoning_requested = false;
2600 bool want_check_pages = false;
2601
2602 #ifdef CONFIG_PAGE_POISONING
2603 /*
2604 	 * Page poisoning stands in for debug page alloc on arches that don't
2605 	 * support it. If either of those options is enabled, enable poisoning.
2606 */
2607 if (page_poisoning_enabled() ||
2608 (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
2609 debug_pagealloc_enabled())) {
2610 static_branch_enable(&_page_poisoning_enabled);
2611 page_poisoning_requested = true;
2612 want_check_pages = true;
2613 }
2614 #endif
2615
2616 if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
2617 page_poisoning_requested) {
2618 pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
2619 "will take precedence over init_on_alloc and init_on_free\n");
2620 _init_on_alloc_enabled_early = false;
2621 _init_on_free_enabled_early = false;
2622 }
2623
2624 if (_init_on_alloc_enabled_early) {
2625 want_check_pages = true;
2626 static_branch_enable(&init_on_alloc);
2627 } else {
2628 static_branch_disable(&init_on_alloc);
2629 }
2630
2631 if (_init_on_free_enabled_early) {
2632 want_check_pages = true;
2633 static_branch_enable(&init_on_free);
2634 } else {
2635 static_branch_disable(&init_on_free);
2636 }
2637
2638 if (IS_ENABLED(CONFIG_KMSAN) &&
2639 (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
2640 pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
2641
2642 #ifdef CONFIG_DEBUG_PAGEALLOC
2643 if (debug_pagealloc_enabled()) {
2644 want_check_pages = true;
2645 static_branch_enable(&_debug_pagealloc_enabled);
2646
2647 if (debug_guardpage_minorder())
2648 static_branch_enable(&_debug_guardpage_enabled);
2649 }
2650 #endif
2651
2652 /*
2653 * Any page debugging or hardening option also enables sanity checking
2654 * of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
2655 * enabled already.
2656 */
2657 if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
2658 static_branch_enable(&check_pages_enabled);
2659 }
2660
2661 /* Report memory auto-initialization states for this boot. */
2662 static void __init report_meminit(void)
2663 {
2664 const char *stack;
2665
2666 if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
2667 stack = "all(pattern)";
2668 else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
2669 stack = "all(zero)";
2670 else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
2671 stack = "byref_all(zero)";
2672 else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
2673 stack = "byref(zero)";
2674 else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
2675 stack = "__user(zero)";
2676 else
2677 stack = "off";
2678
2679 pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
2680 stack, str_on_off(want_init_on_alloc(GFP_KERNEL)),
2681 str_on_off(want_init_on_free()));
2682 if (want_init_on_free())
2683 pr_info("mem auto-init: clearing system memory may take some time...\n");
2684 }
2685
2686 static void __init mem_init_print_info(void)
2687 {
2688 unsigned long physpages, codesize, datasize, rosize, bss_size;
2689 unsigned long init_code_size, init_data_size;
2690
2691 physpages = get_num_physpages();
2692 codesize = _etext - _stext;
2693 datasize = _edata - _sdata;
2694 rosize = __end_rodata - __start_rodata;
2695 bss_size = __bss_stop - __bss_start;
2696 init_data_size = __init_end - __init_begin;
2697 init_code_size = _einittext - _sinittext;
2698
2699 /*
2700 * Detect special cases and adjust section sizes accordingly:
2701 * 1) .init.* may be embedded into .data sections
2702 * 2) .init.text.* may be out of [__init_begin, __init_end],
2703 * please refer to arch/tile/kernel/vmlinux.lds.S.
2704 * 3) .rodata.* may be embedded into .text or .data sections.
2705 */
2706 #define adj_init_size(start, end, size, pos, adj) \
2707 do { \
2708 if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
2709 size -= adj; \
2710 } while (0)
2711
2712 adj_init_size(__init_begin, __init_end, init_data_size,
2713 _sinittext, init_code_size);
2714 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
2715 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
2716 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
2717 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
2718
2719 #undef adj_init_size
2720
2721 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
2722 #ifdef CONFIG_HIGHMEM
2723 ", %luK highmem"
2724 #endif
2725 ")\n",
2726 K(nr_free_pages()), K(physpages),
2727 codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
2728 (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
2729 K(physpages - totalram_pages() - totalcma_pages),
2730 K(totalcma_pages)
2731 #ifdef CONFIG_HIGHMEM
2732 , K(totalhigh_pages())
2733 #endif
2734 );
2735 }
2736
2737 void __init __weak arch_mm_preinit(void)
2738 {
2739 }
2740
2741 void __init __weak mem_init(void)
2742 {
2743 }
2744
2745 /*
2746 * Set up kernel memory allocators
2747 */
2748 void __init mm_core_init(void)
2749 {
2750 arch_mm_preinit();
2751 hugetlb_bootmem_alloc();
2752
2753 /* Initializations relying on SMP setup */
2754 BUILD_BUG_ON(MAX_ZONELISTS > 2);
2755 build_all_zonelists(NULL);
2756 page_alloc_init_cpuhp();
2757 alloc_tag_sec_init();
2758 /*
2759 	 * page_ext requires a contiguous chunk of pages
2760 	 * bigger than MAX_PAGE_ORDER, unless SPARSEMEM is used.
2761 */
2762 page_ext_init_flatmem();
2763 mem_debugging_and_hardening_init();
2764 kfence_alloc_pool_and_metadata();
2765 report_meminit();
2766 kmsan_init_shadow();
2767 stack_depot_early_init();
2768 memblock_free_all();
2769 mem_init();
2770 kmem_cache_init();
2771 /*
2772 * page_owner must be initialized after buddy is ready, and also after
2773 * slab is ready so that stack_depot_init() works properly
2774 */
2775 page_ext_init_flatmem_late();
2776 kmemleak_init();
2777 ptlock_cache_init();
2778 pgtable_cache_init();
2779 debug_objects_mem_init();
2780 vmalloc_init();
2781 	/* If page_ext init was not deferred, do it now, as vmalloc is fully initialized */
2782 if (!deferred_struct_pages)
2783 page_ext_init();
2784 /* Should be run before the first non-init thread is created */
2785 init_espfix_bsp();
2786 /* Should be run after espfix64 is set up. */
2787 pti_init();
2788 kmsan_init_runtime();
2789 mm_cache_init();
2790 execmem_init();
2791 }
2792