// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>
#include <linux/export.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}
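
/*
 * Illustrative only: given the table above, mhp_online_type_from_str() maps
 * e.g. "online_movable" to MMOP_ONLINE_MOVABLE and any unrecognized string
 * to -EINVAL. A hypothetical caller sketch:
 *
 *	int type = mhp_online_type_from_str("online_kernel");
 *	if (type < 0)
 *		return type;	// -EINVAL for unknown input
 */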

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

int sections_per_block;
EXPORT_SYMBOL(sections_per_block);

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static const struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	/* Verify that the altmap is freed */
	WARN_ON(mem->altmap);
	kfree(mem);
}

/* Max block size to be set by memory_block_advise_max_size */
static unsigned long memory_block_advised_size;
static bool memory_block_advised_size_queried;

/**
 * memory_block_advise_max_size() - advise memory hotplug on the max suggested
 *				    block size, usually for alignment.
 * @size: suggestion for maximum block size. must be aligned on power of 2.
 *
 * Early boot software (pre-allocator init) may advise archs on the max block
 * size. This value can only decrease after initialization, as the intent is
 * to identify the largest supported alignment for all sources.
 *
 * Use of this value is arch-defined, as is min/max block size.
 *
 * Return: 0 on success
 *	   -EINVAL if size is 0 or not pow2 aligned
 *	   -EBUSY if value has already been probed
 */
int __init memory_block_advise_max_size(unsigned long size)
{
	if (!size || !is_power_of_2(size))
		return -EINVAL;

	if (memory_block_advised_size_queried)
		return -EBUSY;

	if (memory_block_advised_size)
		memory_block_advised_size = min(memory_block_advised_size, size);
	else
		memory_block_advised_size = size;

	return 0;
}
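
/*
 * A minimal usage sketch (hypothetical caller, not part of this file): early
 * boot code, such as a firmware memory-map or CXL window parser, could advise
 * a maximum block size before the arch queries it, e.g.:
 *
 *	// pre-allocator init, before the arch calls
 *	// memory_block_advised_max_size()
 *	ret = memory_block_advise_max_size(SZ_256M);
 *	if (ret)
 *		pr_warn("block size advice rejected: %d\n", ret);
 */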

/**
 * memory_block_advised_max_size() - query advised max hotplug block size.
 *
 * After the first call, the value can never change. Callers looking for the
 * actual block size should use memory_block_size_bytes. This interface is
 * intended for use by arch-init when initializing the hotplug block size.
 *
 * Return: advised size in bytes, or 0 if never set.
 */
unsigned long memory_block_advised_max_size(void)
{
	memory_block_advised_size_queried = true;
	return memory_block_advised_size;
}
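
/*
 * Arch-side sketch (illustrative; only the two helpers above are real, the
 * surrounding variable is hypothetical):
 *
 *	unsigned long advised = memory_block_advised_max_size();
 *
 *	// Any later memory_block_advise_max_size() call now returns -EBUSY,
 *	// so the value used here is stable.
 *	if (advised)
 *		block_size = min(block_size, advised);
 */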

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/* Show the memory block ID, relative to the memory block size */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);

	return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif

/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct memory_notify arg;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g., vmemmap
	 * pages belong to the same zone as the memory they back.
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	arg.altmap_start_pfn = start_pfn;
	arg.altmap_nr_pages = nr_vmemmap_pages;
	arg.start_pfn = start_pfn + nr_vmemmap_pages;
	arg.nr_pages = nr_pages - nr_vmemmap_pages;
	mem_hotplug_begin();
	ret = memory_notify(MEM_PREPARE_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto out_notifier;

	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
						zone, mem->altmap->inaccessible);
		if (ret)
			goto out;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		goto out;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
	mem_hotplug_done();
	return ret;
out:
	memory_notify(MEM_FINISH_OFFLINE, &arg);
out_notifier:
	mem_hotplug_done();
	return ret;
}

/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct memory_notify arg;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	mem_hotplug_begin();
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		goto out;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
	arg.altmap_start_pfn = start_pfn;
	arg.altmap_nr_pages = nr_vmemmap_pages;
	arg.start_pfn = start_pfn + nr_vmemmap_pages;
	arg.nr_pages = nr_pages - nr_vmemmap_pages;
	memory_notify(MEM_FINISH_OFFLINE, &arg);
out:
	mem_hotplug_done();
	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
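
/*
 * Userspace view (illustrative; block number 42 is hypothetical):
 *
 *	# cat /sys/devices/system/memory/memory42/state
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/memory42/state
 *
 * Writing any of "online", "online_kernel", "online_movable" or "offline"
 * funnels through state_store() above.
 */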

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes otherwise the page_zone is not reliable
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		return sysfs_emit(buf, "%s\n",
				  mem->zone ? mem->zone->name : "none");
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len = sysfs_emit(buf, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_get_default_online_type()]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_set_default_online_type(online_type);
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
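
/*
 * Userspace view of the auto-online policy (illustrative):
 *
 *	# cat /sys/devices/system/memory/auto_online_blocks
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/auto_online_blocks
 *
 * Newly added memory blocks are then onlined using the selected online type;
 * accepted values are the strings listed in online_type_to_str[].
 */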

#ifdef CONFIG_CRASH_HOTPLUG
#include <linux/kexec.h>
static ssize_t crash_hotplug_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
}
static DEVICE_ATTR_RO(crash_hotplug);
#endif

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
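
/*
 * Probe usage sketch (illustrative; the address below is hypothetical):
 * userspace writes the physical start address of a memory block, which must
 * be aligned to the memory block size, otherwise probe_store() returns
 * -EINVAL:
 *
 *	# echo 0x100000000 > /sys/devices/system/memory/probe
 */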
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, MF_SW_SIMULATED);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
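
/*
 * Usage sketch (illustrative; the address below is hypothetical): both
 * attributes take a physical address, not a PFN - the stores above shift the
 * input right by PAGE_SHIFT - and require CAP_SYS_ADMIN:
 *
 *	# echo 0x2fd8a3000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x2fd8a3000 > /sys/devices/system/memory/hard_offline_page
 */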
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}
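
/*
 * Caller pattern (illustrative): since a device reference is taken, every
 * successful lookup must be balanced with put_device(), e.g.:
 *
 *	mem = find_memory_block_by_id(block_id);
 *	if (mem) {
 *		... use mem ...
 *		put_device(&mem->dev);
 *	}
 */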

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

static int __add_memory_block(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret)
		device_unregister(&memory->dev);

	return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicating via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif

static int add_memory_block(unsigned long block_id, unsigned long state,
			    struct vmem_altmap *altmap,
			    struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->altmap = altmap;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updates the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = __add_memory_block(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}

static int add_hotplug_memory_block(unsigned long block_id,
				    struct vmem_altmap *altmap,
				    struct memory_group *group)
{
	return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
}

static void remove_memory_block(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				struct vmem_altmap *altmap,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = add_hotplug_memory_block(block_id, altmap, group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			remove_memory_block(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
		unregister_memory_block_under_nodes(mem);
		remove_memory_block(mem);
	}
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_CRASH_HOTPLUG
	&dev_attr_crash_hotplug.attr,
#endif
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, block_id, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found during boot
	 * and have been initialized. Use @block_id to track the last
	 * handled block and initialize it to an invalid value (ULONG_MAX)
	 * to bypass the block ID matching check for the first present
	 * block so that it can be covered.
	 */
	block_id = ULONG_MAX;
	for_each_present_section_nr(0, nr) {
		if (block_id != ULONG_MAX && memory_block_id(nr) == block_id)
			continue;

		block_id = memory_block_id(nr);
		ret = add_memory_block(block_id, MEM_ONLINE, NULL, NULL);
		if (ret) {
			panic("%s() failed to add memory block: %d\n",
			      __func__, ret);
		}
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
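
/*
 * Caller sketch (illustrative; the callback name is hypothetical). The
 * callback matches walk_memory_blocks_func_t, i.e. it takes the memory block
 * and the opaque argument and returns 0 to continue or an errno to abort:
 *
 *	static int count_online_cb(struct memory_block *mem, void *arg)
 *	{
 *		if (mem->state == MEM_ONLINE)
 *			(*(unsigned long *)arg)++;
 *		return 0;
 *	}
 *
 *	unsigned long nr_online = 0;
 *
 *	// with device_hotplug_lock held
 *	walk_memory_blocks(start, size, &nr_online, count_online_cb);
 */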

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);
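
/*
 * Driver-side sketch (illustrative; node and size are hypothetical): a driver
 * hot-adding a fixed-size device on node 0 would register the group first and
 * then reference the returned mgid when adding the memory:
 *
 *	int mgid = memory_group_register_static(0, PFN_UP(SZ_4G));
 *
 *	if (mgid < 0)
 *		return mgid;
 *	// use mgid when adding the memory, and call
 *	// memory_group_unregister(mgid) on teardown
 */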

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
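
/*
 * Driver-side sketch (illustrative; the node is hypothetical): a driver that
 * grows and shrinks in memory-block-sized units on node 0 could use:
 *
 *	unsigned long unit_pages = PHYS_PFN(memory_block_size_bytes());
 *	int mgid = memory_group_register_dynamic(0, unit_pages);
 *
 *	if (mgid < 0)
 *		return mgid;
 *	// add/remove memory in multiples of unit_pages under this mgid
 */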

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

static unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return atomic_long_read(&mem->nr_hwpoison);
}
#endif