// SPDX-License-Identifier: GPL-2.0-only
/*
 * Ram backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
#include <linux/xarray.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/debugfs.h>

#include <linux/uaccess.h>

/*
 * Each brd device has an xarray, brd_pages, that holds the pages backing
 * the block device's contents.
 */
struct brd_device {
	int			brd_number;
	struct gendisk		*brd_disk;
	struct list_head	brd_list;

	/*
	 * Backing store of pages. This is the contents of the block device.
	 */
	struct xarray		brd_pages;
	u64			brd_nr_pages;
};
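
/*
 * Pages are indexed by sector >> PAGE_SECTORS_SHIFT. For example, with
 * 4 KiB pages and 512-byte sectors (PAGE_SECTORS == 8), sectors 0..7 map
 * to page index 0, sectors 8..15 to index 1, and so on.
 */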

/*
 * Look up and return a brd's page for a given sector, with a reference
 * grabbed.
 */
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
	struct page *page;
	XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);

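	/*
	 * Lockless lookup: walk the xarray under RCU. The entry and the page
	 * it points to can change underneath us, so each step below
	 * revalidates and retries from the top on any sign of a race.
	 */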
	rcu_read_lock();
repeat:
	page = xas_load(&xas);
	if (xas_retry(&xas, page)) {
		xas_reset(&xas);
		goto repeat;
	}

	if (!page)
		goto out;

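	/*
	 * The page may already be on its way to being freed; only take a
	 * reference if its refcount has not yet dropped to zero.
	 */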
	if (!get_page_unless_zero(page)) {
		xas_reset(&xas);
		goto repeat;
	}

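	/*
	 * With the reference held, re-check that the slot still points at
	 * this page; if it was replaced meanwhile, drop the reference and
	 * retry.
	 */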
	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		xas_reset(&xas);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return page;
}

/*
 * Insert a new page for a given sector, if one does not already exist.
 * A reference is grabbed on the returned page.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
		blk_opf_t opf)
{
	gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
	struct page *page, *ret;

	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
	if (!page)
		return ERR_PTR(-ENOMEM);

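	/*
	 * Insert under the xarray lock so that brd_nr_pages stays consistent
	 * with the contents of the tree.
	 */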
	xa_lock(&brd->brd_pages);
	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
			page, gfp);
	if (!ret) {
		brd->brd_nr_pages++;
		get_page(page);
		xa_unlock(&brd->brd_pages);
		return page;
	}

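	/*
	 * A non-error, non-NULL return means another task inserted a page
	 * first; grab a reference on that one and free ours.
	 */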
	if (!xa_is_err(ret)) {
		get_page(ret);
		xa_unlock(&brd->brd_pages);
		put_page(page);
		return ret;
	}

	xa_unlock(&brd->brd_pages);
	put_page(page);
	return ERR_PTR(xa_err(ret));
}

/*
 * Free all backing store pages and the xarray. This must only be called when
 * there are no other users of the device.
 */
static void brd_free_pages(struct brd_device *brd)
{
	struct page *page;
	pgoff_t idx;

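	/*
	 * Drop the reference held by the xarray on each page; cond_resched()
	 * keeps the teardown of large ramdisks preemptible.
	 */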
	xa_for_each(&brd->brd_pages, idx, page) {
		put_page(page);
		cond_resched();
	}

	xa_destroy(&brd->brd_pages);
}

/*
 * Process a single segment. The segment is capped so that it does not cross
 * a page boundary in either the bio or the brd backing memory.
 */
static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
{
	struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
	sector_t sector = bio->bi_iter.bi_sector;
	u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
	blk_opf_t opf = bio->bi_opf;
	struct page *page;
	void *kaddr;

	bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);

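	/*
	 * Backing pages are allocated lazily: writes to an unbacked sector
	 * allocate a page here, while reads of unbacked sectors simply
	 * return zeroes below.
	 */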
	page = brd_lookup_page(brd, sector);
	if (!page && op_is_write(opf)) {
		page = brd_insert_page(brd, sector, opf);
		if (IS_ERR(page))
			goto out_error;
	}

	kaddr = bvec_kmap_local(&bv);
	if (op_is_write(opf)) {
		memcpy_to_page(page, offset, kaddr, bv.bv_len);
	} else {
		if (page)
			memcpy_from_page(kaddr, page, offset, bv.bv_len);
		else
			memset(kaddr, 0, bv.bv_len);
	}
	kunmap_local(kaddr);

	bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
	if (page)
		put_page(page);
	return true;

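	/*
	 * REQ_NOWAIT callers asked not to block, so a failed nowait
	 * allocation is reported as BLK_STS_AGAIN rather than as an I/O
	 * error.
	 */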
out_error:
	if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
		bio_wouldblock_error(bio);
	else
		bio_io_error(bio);
	return false;
}

static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
	sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
	sector_t aligned_end = round_down(
			sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
	struct page *page;

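	/*
	 * Only pages that the discard range covers completely are freed;
	 * partially covered pages at either end are kept, so their data is
	 * not cleared.
	 */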
	if (aligned_end <= aligned_sector)
		return;

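	/*
	 * rd_size is in KiB, so rd_size * 2 is the device size in 512-byte
	 * sectors; the bound keeps the loop from erasing past the end of the
	 * device.
	 */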
	xa_lock(&brd->brd_pages);
	while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
		page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
		if (page) {
			put_page(page);
			brd->brd_nr_pages--;
		}
		aligned_sector += PAGE_SECTORS;
	}
	xa_unlock(&brd->brd_pages);
}

static void brd_submit_bio(struct bio *bio)
{
	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;

	if (unlikely(op_is_discard(bio->bi_opf))) {
		brd_do_discard(brd, bio->bi_iter.bi_sector,
				bio->bi_iter.bi_size);
		bio_endio(bio);
		return;
	}

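	/*
	 * Process the bio one capped segment at a time. brd_rw_bvec()
	 * advances bi_iter itself and has already completed the bio when it
	 * returns false.
	 */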
	do {
		if (!brd_rw_bvec(brd, bio))
			return;
	} while (bio->bi_iter.bi_size);

	bio_endio(bio);
}

static const struct block_device_operations brd_fops = {
	.owner =		THIS_MODULE,
	.submit_bio =		brd_submit_bio,
};

/*
 * And now the module code and kernel interface.
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, 0444);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, ulong, 0444);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

static int max_part = 1;
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Number of minors to reserve between devices");
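
/*
 * Illustrative usage (not part of the driver):
 *
 *	modprobe brd rd_nr=2 rd_size=1048576 max_part=4
 *
 * creates /dev/ram0 and /dev/ram1, each 1 GiB, with minor numbers spaced
 * four apart to leave room for partitions.
 */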

MODULE_DESCRIPTION("Ram backed block device driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
	rd_size = simple_strtol(str, NULL, 0);
	return 1;
}
__setup("ramdisk_size=", ramdisk_size);
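/*
 * For example, booting with "ramdisk_size=65536" on the kernel command line
 * makes each ramdisk 64 MiB.
 */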
#endif

/*
 * The device scheme is derived from loop.c. Keep them in sync where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);
static struct dentry *brd_debugfs_dir;

static struct brd_device *brd_find_or_alloc_device(int i)
{
	struct brd_device *brd;

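	/*
	 * brd_devices_mutex serializes lookup and insertion, so concurrent
	 * probes of the same device number cannot both allocate.
	 */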
	mutex_lock(&brd_devices_mutex);
	list_for_each_entry(brd, &brd_devices, brd_list) {
		if (brd->brd_number == i) {
			mutex_unlock(&brd_devices_mutex);
			return ERR_PTR(-EEXIST);
		}
	}

	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
	if (!brd) {
		mutex_unlock(&brd_devices_mutex);
		return ERR_PTR(-ENOMEM);
	}
	brd->brd_number = i;
	list_add_tail(&brd->brd_list, &brd_devices);
	mutex_unlock(&brd_devices_mutex);
	return brd;
}

static void brd_free_device(struct brd_device *brd)
{
	mutex_lock(&brd_devices_mutex);
	list_del(&brd->brd_list);
	mutex_unlock(&brd_devices_mutex);
	kfree(brd);
}

static int brd_alloc(int i)
{
	struct brd_device *brd;
	struct gendisk *disk;
	char buf[DISK_NAME_LEN];
	int err = -ENOMEM;
	struct queue_limits lim = {
		/*
		 * Use a 4k physical block size so that fdisk aligns
		 * partitions on 4k, because the direct_access API needs 4k
		 * alignment when returning a PFN. (This is only a problem on
		 * very small devices <= 4M; otherwise fdisk aligns on 1M.
		 * Regardless, this is harmless.)
		 */
		.physical_block_size	= PAGE_SIZE,
		.max_hw_discard_sectors	= UINT_MAX,
		.max_discard_segments	= 1,
		.discard_granularity	= PAGE_SIZE,
		.features		= BLK_FEAT_SYNCHRONOUS |
					  BLK_FEAT_NOWAIT,
	};

	brd = brd_find_or_alloc_device(i);
	if (IS_ERR(brd))
		return PTR_ERR(brd);

	xa_init(&brd->brd_pages);

	snprintf(buf, DISK_NAME_LEN, "ram%d", i);
	if (!IS_ERR_OR_NULL(brd_debugfs_dir))
		debugfs_create_u64(buf, 0444, brd_debugfs_dir,
				&brd->brd_nr_pages);
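	/*
	 * The file created above exposes the per-device page count, typically
	 * at /sys/kernel/debug/ramdisk_pages/ram<N> when debugfs is mounted
	 * in the usual place.
	 */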

	disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		err = PTR_ERR(disk);
		goto out_free_dev;
	}
	disk->major		= RAMDISK_MAJOR;
	disk->first_minor	= i * max_part;
	disk->minors		= max_part;
	disk->fops		= &brd_fops;
	disk->private_data	= brd;
	strscpy(disk->disk_name, buf, DISK_NAME_LEN);
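	/* rd_size is in KiB; capacity is in 512-byte sectors, hence the * 2. */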
	set_capacity(disk, rd_size * 2);

	err = add_disk(disk);
	if (err)
		goto out_cleanup_disk;

	return 0;

out_cleanup_disk:
	put_disk(disk);
out_free_dev:
	brd_free_device(brd);
	return err;
}

static void brd_probe(dev_t dev)
{
	brd_alloc(MINOR(dev) / max_part);
}

static void brd_cleanup(void)
{
	struct brd_device *brd, *next;

	debugfs_remove_recursive(brd_debugfs_dir);

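	/*
	 * del_gendisk() removes the disk and drains outstanding I/O, so it
	 * is safe to free the backing pages afterwards.
	 */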
	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
		del_gendisk(brd->brd_disk);
		put_disk(brd->brd_disk);
		brd_free_pages(brd);
		brd_free_device(brd);
	}
}

static inline void brd_check_and_reset_par(void)
{
	if (unlikely(!max_part))
		max_part = 1;

	/*
	 * Make sure 'max_part' divides (1U << MINORBITS) exactly; otherwise
	 * it is possible to get the same dev_t when adding partitions.
	 */
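	/* For example, max_part == 3 is rounded up to 1UL << fls(3) == 4. */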
	if ((1U << MINORBITS) % max_part != 0)
		max_part = 1UL << fls(max_part);

	if (max_part > DISK_MAX_PARTS) {
		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
			DISK_MAX_PARTS, DISK_MAX_PARTS);
		max_part = DISK_MAX_PARTS;
	}
}

static int __init brd_init(void)
{
	int err, i;

	/*
	 * The brd module can instantiate the underlying device structure
	 * on demand, provided that the device node is accessed.
	 *
	 * (1) If rd_nr is specified, create that many devices upfront;
	 *     otherwise it defaults to CONFIG_BLK_DEV_RAM_COUNT.
	 * (2) Users can create further brd devices by creating device nodes
	 *     themselves and having the kernel instantiate the actual device
	 *     on demand. Example:
	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
	 *		fdisk -l /path/devnod_name
	 *	If (X / max_part) was not already created it will be created
	 *	dynamically.
	 */

	brd_check_and_reset_par();

	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);

	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
		err = -EIO;
		goto out_free;
	}

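	/*
	 * Create the initial rd_nr devices upfront. Failures here are not
	 * fatal: a device that could not be created now can still be
	 * instantiated later through brd_probe() on first access.
	 */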
	for (i = 0; i < rd_nr; i++)
		brd_alloc(i);

	pr_info("brd: module loaded\n");
	return 0;

out_free:
	brd_cleanup();

	pr_info("brd: module NOT loaded !!!\n");
	return err;
}

static void __exit brd_exit(void)
{
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
	brd_cleanup();

	pr_info("brd: module unloaded\n");
}

module_init(brd_init);
module_exit(brd_exit);