// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <linux/numa_memblks.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/amd/nb.h>

#include "mm_internal.h"

int numa_off;
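
/*
 * Parse the "numa=" early boot parameter:
 *
 *   numa=off     - disable NUMA; x86_numa_init() falls back to the single
 *                  node set up by dummy_numa_init()
 *   numa=fake=   - enable NUMA emulation; the remainder of the option is
 *                  handed to numa_emu_cmdline() for parsing
 *   numa=noacpi  - do not use the ACPI SRAT for NUMA configuration
 *   numa=nohmat  - do not use the ACPI HMAT
 */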
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
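
/*
 * Allocate node_data[] for, and mark online, every possible node that ends
 * up with memory once the memblocks have been tagged with node IDs.  Fails
 * if more than SZ_1M of memory was left without a node assigned (see
 * memblock_validate_numa_coverage()).
 */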
static int __init numa_register_nodes(void)
{
	int nid;

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note, get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}
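
/*
 * Run one NUMA configuration method: reset the apicid->node mapping, let
 * init_func() describe the nodes and their memory, then register the
 * resulting nodes.  CPUs that were mapped to a node which did not come
 * online (e.g. a memoryless node) have their mapping cleared and are
 * round-robined over the online nodes by numa_init_array().
 */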
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}


/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and faking node case (when running a kernel compiled
 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(unsigned int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(unsigned int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
			"cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
		enable ? "numa_add_cpu" : "numa_remove_cpu",
		cpu, node, cpumask_pr_args(mask));
	return;
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
			node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
			node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid. The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}
}

u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
#endif /* CONFIG_NUMA_EMU */