// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2007 Red Hat GmbH
 *
 * A target that delays reads and/or writes and can send
 * them to different devices.
 *
 * This file is released under the GPL.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/delay.h>

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "delay"

#define SLEEP_SHIFT 3

struct delay_class {
	struct dm_dev *dev;
	sector_t start;
	unsigned int delay;
	unsigned int ops;
};

struct delay_c {
	struct timer_list delay_timer;
	struct mutex process_bios_lock; /* hold while removing bios to be processed from list */
	spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */
	struct workqueue_struct *kdelayd_wq;
	struct work_struct flush_expired_bios;
	struct list_head delayed_bios;
	struct task_struct *worker;
	unsigned int worker_sleep_us;
	bool may_delay;

	struct delay_class read;
	struct delay_class write;
	struct delay_class flush;

	int argc;
};

struct dm_delay_info {
	struct delay_c *context;
	struct delay_class *class;
	struct list_head list;
	unsigned long expires;
};

static void handle_delayed_timer(struct timer_list *t)
{
	struct delay_c *dc = timer_container_of(dc, t, delay_timer);

	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
}

static void queue_timeout(struct delay_c *dc, unsigned long expires)
{
	timer_reduce(&dc->delay_timer, expires);
}

static inline bool delay_is_fast(struct delay_c *dc)
{
	return !!dc->worker;
}

static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		dm_submit_bio_remap(bio, NULL);
		bio = n;
	}
}

static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
{
	struct dm_delay_info *delayed, *next;
	struct bio_list flush_bio_list;
	LIST_HEAD(local_list);
	unsigned long next_expires = 0;
	bool start_timer = false;

	bio_list_init(&flush_bio_list);

	mutex_lock(&dc->process_bios_lock);
	spin_lock(&dc->delayed_bios_lock);
	list_replace_init(&dc->delayed_bios, &local_list);
	spin_unlock(&dc->delayed_bios_lock);
	list_for_each_entry_safe(delayed, next, &local_list, list) {
		cond_resched();
		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
			struct bio *bio = dm_bio_from_per_bio_data(delayed,
						sizeof(struct dm_delay_info));
			list_del(&delayed->list);
			bio_list_add(&flush_bio_list, bio);
			delayed->class->ops--;
			continue;
		}

		if (!delay_is_fast(dc)) {
			if (!start_timer) {
				start_timer = true;
				next_expires = delayed->expires;
			} else {
				next_expires = min(next_expires, delayed->expires);
			}
		}
	}
	spin_lock(&dc->delayed_bios_lock);
	list_splice(&local_list, &dc->delayed_bios);
	spin_unlock(&dc->delayed_bios_lock);
	mutex_unlock(&dc->process_bios_lock);

	if (start_timer)
		queue_timeout(dc, next_expires);

	flush_bios(bio_list_get(&flush_bio_list));
}

static int flush_worker_fn(void *data)
{
	struct delay_c *dc = data;

	while (!kthread_should_stop()) {
		flush_delayed_bios(dc, false);
		spin_lock(&dc->delayed_bios_lock);
		if (unlikely(list_empty(&dc->delayed_bios))) {
			set_current_state(TASK_INTERRUPTIBLE);
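			/*
			 * TASK_INTERRUPTIBLE is set while still holding the
			 * lock so a wake_up_process() from delay_bio() cannot
			 * be lost between the unlock and schedule(); if it
			 * races in, schedule() returns immediately.
			 */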
			spin_unlock(&dc->delayed_bios_lock);
			schedule();
		} else {
			spin_unlock(&dc->delayed_bios_lock);
			fsleep(dc->worker_sleep_us);
			cond_resched();
		}
	}

	return 0;
}

static void flush_expired_bios(struct work_struct *work)
{
	struct delay_c *dc;

	dc = container_of(work, struct delay_c, flush_expired_bios);
	flush_delayed_bios(dc, false);
}

static void delay_dtr(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	if (dc->kdelayd_wq) {
		timer_shutdown_sync(&dc->delay_timer);
		destroy_workqueue(dc->kdelayd_wq);
	}

	if (dc->read.dev)
		dm_put_device(ti, dc->read.dev);
	if (dc->write.dev)
		dm_put_device(ti, dc->write.dev);
	if (dc->flush.dev)
		dm_put_device(ti, dc->flush.dev);
	if (dc->worker)
		kthread_stop(dc->worker);

	mutex_destroy(&dc->process_bios_lock);

	kfree(dc);
}

static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv)
{
	int ret;
	unsigned long long tmpll;
	char dummy;

	if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
		ti->error = "Invalid device sector";
		return -EINVAL;
	}
	c->start = tmpll;

	if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) {
		ti->error = "Invalid delay";
		return -EINVAL;
	}

	ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		return ret;
	}

	return 0;
}

/*
 * Mapping parameters:
 *    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>
 *			       [<flush_device> <flush_offset> <flush_delay>]]
 *
 * With separate write parameters, the first set is only used for reads.
 * With a third set of parameters, flushes get their own device and delay.
 * Offsets are specified in sectors.
 * Delays are specified in milliseconds.
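 *
 * Hypothetical example (device path and sizes are placeholders): a table
 * line such as
 *
 *    0 2097152 delay /dev/sdc1 0 50 /dev/sdc1 0 200
 *
 * remaps a 1 GiB target to /dev/sdc1 at offset 0, delaying reads by 50 ms
 * and writes/flushes by 200 ms.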
 */
static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct delay_c *dc;
	int ret;
	unsigned int max_delay, min_delay;

	if (argc != 3 && argc != 6 && argc != 9) {
		ti->error = "Requires exactly 3, 6 or 9 arguments";
		return -EINVAL;
	}

	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
	if (!dc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}

	ti->private = dc;
	INIT_LIST_HEAD(&dc->delayed_bios);
	mutex_init(&dc->process_bios_lock);
	spin_lock_init(&dc->delayed_bios_lock);
	dc->may_delay = true;
	dc->argc = argc;

	ret = delay_class_ctr(ti, &dc->read, argv);
	if (ret)
		goto bad;
	min_delay = max_delay = dc->read.delay;

	if (argc == 3) {
		ret = delay_class_ctr(ti, &dc->write, argv);
		if (ret)
			goto bad;
		ret = delay_class_ctr(ti, &dc->flush, argv);
		if (ret)
			goto bad;
		goto out;
	}

	ret = delay_class_ctr(ti, &dc->write, argv + 3);
	if (ret)
		goto bad;
	max_delay = max(max_delay, dc->write.delay);
	min_delay = min_not_zero(min_delay, dc->write.delay);

	if (argc == 6) {
		ret = delay_class_ctr(ti, &dc->flush, argv + 3);
		if (ret)
			goto bad;
		goto out;
	}

	ret = delay_class_ctr(ti, &dc->flush, argv + 6);
	if (ret)
		goto bad;
	max_delay = max(max_delay, dc->flush.delay);
	min_delay = min_not_zero(min_delay, dc->flush.delay);

out:
	if (max_delay < 50) {
		if (min_delay >> SLEEP_SHIFT)
			dc->worker_sleep_us = 1000;
		else
			dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT;
		/*
		 * In case of small requested delays, use kthread instead of
		 * timers and workqueue to achieve better latency.
		 */
		dc->worker = kthread_run(&flush_worker_fn, dc, "dm-delay-flush-worker");
		if (IS_ERR(dc->worker)) {
			ret = PTR_ERR(dc->worker);
			dc->worker = NULL;
			goto bad;
		}
	} else {
		timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
		INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
		dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
		if (!dc->kdelayd_wq) {
			ret = -EINVAL;
			DMERR("Couldn't start kdelayd");
			goto bad;
		}
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->accounts_remapped_io = true;
	ti->per_io_data_size = sizeof(struct dm_delay_info);
	return 0;

bad:
	delay_dtr(ti);
	return ret;
}

static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
{
	struct dm_delay_info *delayed;
	unsigned long expires = 0;

	if (!c->delay)
		return DM_MAPIO_REMAPPED;

	delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));

	delayed->context = dc;
	delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);

	spin_lock(&dc->delayed_bios_lock);
	if (unlikely(!dc->may_delay)) {
		spin_unlock(&dc->delayed_bios_lock);
		return DM_MAPIO_REMAPPED;
	}
	c->ops++;
	list_add_tail(&delayed->list, &dc->delayed_bios);
	spin_unlock(&dc->delayed_bios_lock);

	if (delay_is_fast(dc))
		wake_up_process(dc->worker);
	else
		queue_timeout(dc, expires);

	return DM_MAPIO_SUBMITTED;
}

static void delay_presuspend(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	spin_lock(&dc->delayed_bios_lock);
	dc->may_delay = false;
	spin_unlock(&dc->delayed_bios_lock);

	if (!delay_is_fast(dc))
		timer_delete(&dc->delay_timer);
	flush_delayed_bios(dc, true);
}

static void delay_resume(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	dc->may_delay = true;
}

static int delay_map(struct dm_target *ti, struct bio *bio)
{
	struct delay_c *dc = ti->private;
	struct delay_class *c;
	struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));

	if (bio_data_dir(bio) == WRITE) {
		if (unlikely(bio->bi_opf & REQ_PREFLUSH))
			c = &dc->flush;
		else
			c = &dc->write;
	} else {
		c = &dc->read;
	}
	delayed->class = c;
	bio_set_dev(bio, c->dev->bdev);
	bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	return delay_bio(dc, c, bio);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int delay_report_zones(struct dm_target *ti,
		struct dm_report_zones_args *args, unsigned int nr_zones)
{
	struct delay_c *dc = ti->private;
	struct delay_class *c = &dc->read;

	return dm_report_zones(c->dev->bdev, c->start,
			c->start + dm_target_offset(ti, args->next_sector),
			args, nr_zones);
}
#else
#define delay_report_zones	NULL
#endif

#define DMEMIT_DELAY_CLASS(c) \
	DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)

static void delay_status(struct dm_target *ti, status_type_t type,
			 unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct delay_c *dc = ti->private;
	int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops);
		break;

	case STATUSTYPE_TABLE:
		DMEMIT_DELAY_CLASS(&dc->read);
		if (dc->argc >= 6) {
			DMEMIT(" ");
			DMEMIT_DELAY_CLASS(&dc->write);
		}
		if (dc->argc >= 9) {
			DMEMIT(" ");
			DMEMIT_DELAY_CLASS(&dc->flush);
		}
		break;

	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}

static int delay_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	struct delay_c *dc = ti->private;
	int ret = 0;

	ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data);
	if (ret)
		goto out;
	ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data);
	if (ret)
		goto out;
	ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data);
	if (ret)
		goto out;

out:
	return ret;
}

static struct target_type delay_target = {
	.name		= "delay",
	.version	= {1, 5, 0},
	.features	= DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
	.module		= THIS_MODULE,
	.ctr		= delay_ctr,
	.dtr		= delay_dtr,
	.map		= delay_map,
	.report_zones	= delay_report_zones,
	.presuspend	= delay_presuspend,
	.resume		= delay_resume,
	.status		= delay_status,
	.iterate_devices = delay_iterate_devices,
};
module_dm(delay);

MODULE_DESCRIPTION(DM_NAME " delay target");
MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
MODULE_LICENSE("GPL");