1 /* 2 * QEMU System Emulator block driver 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include "config-host.h" 25 #include "qemu-common.h" 26 #include "trace.h" 27 #include "monitor/monitor.h" 28 #include "block/block_int.h" 29 #include "block/blockjob.h" 30 #include "qemu/module.h" 31 #include "qapi/qmp/qjson.h" 32 #include "sysemu/sysemu.h" 33 #include "qemu/notify.h" 34 #include "block/coroutine.h" 35 #include "qmp-commands.h" 36 #include "qemu/timer.h" 37 38 #ifdef CONFIG_BSD 39 #include <sys/types.h> 40 #include <sys/stat.h> 41 #include <sys/ioctl.h> 42 #include <sys/queue.h> 43 #ifndef __DragonFly__ 44 #include <sys/disk.h> 45 #endif 46 #endif 47 48 #ifdef _WIN32 49 #include <windows.h> 50 #endif 51 52 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ 53 54 typedef enum { 55 BDRV_REQ_COPY_ON_READ = 0x1, 56 BDRV_REQ_ZERO_WRITE = 0x2, 57 } BdrvRequestFlags; 58 59 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); 60 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 61 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 62 BlockDriverCompletionFunc *cb, void *opaque); 63 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 64 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 65 BlockDriverCompletionFunc *cb, void *opaque); 66 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 67 int64_t sector_num, int nb_sectors, 68 QEMUIOVector *iov); 69 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 70 int64_t sector_num, int nb_sectors, 71 QEMUIOVector *iov); 72 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 73 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 74 BdrvRequestFlags flags); 75 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 76 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 77 BdrvRequestFlags flags); 78 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 79 int64_t sector_num, 80 QEMUIOVector *qiov, 81 int nb_sectors, 82 BlockDriverCompletionFunc *cb, 83 void *opaque, 84 bool is_write); 85 static void coroutine_fn bdrv_co_do_rw(void *opaque); 86 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 87 int64_t sector_num, int nb_sectors); 88 89 static QTAILQ_HEAD(, BlockDriverState) bdrv_states = 90 QTAILQ_HEAD_INITIALIZER(bdrv_states); 91 92 static QLIST_HEAD(, BlockDriver) bdrv_drivers = 93 QLIST_HEAD_INITIALIZER(bdrv_drivers); 94 95 /* If non-zero, use only whitelisted block drivers */ 96 static int use_bdrv_whitelist; 97 98 #ifdef _WIN32 99 static int is_windows_drive_prefix(const char *filename) 100 { 101 return (((filename[0] >= 'a' && filename[0] <= 'z') || 102 (filename[0] >= 'A' && filename[0] <= 'Z')) && 103 filename[1] == ':'); 104 } 105 106 int is_windows_drive(const char *filename) 107 { 108 if (is_windows_drive_prefix(filename) && 109 filename[2] == '\0') 110 return 1; 111 if (strstart(filename, "\\\\.\\", NULL) || 112 strstart(filename, "//./", NULL)) 113 return 1; 114 return 0; 115 } 116 #endif 117 118 /* throttling disk I/O limits */ 119 void bdrv_set_io_limits(BlockDriverState *bs, 120 ThrottleConfig *cfg) 121 { 122 int i; 123 124 throttle_config(&bs->throttle_state, cfg); 125 126 for (i = 0; i < 2; i++) { 127 qemu_co_enter_next(&bs->throttled_reqs[i]); 128 } 129 } 130 131 /* this function drain all the throttled IOs */ 132 static bool bdrv_start_throttled_reqs(BlockDriverState *bs) 133 { 134 bool drained = false; 135 bool enabled = bs->io_limits_enabled; 136 int i; 137 138 bs->io_limits_enabled = false; 139 140 for (i = 0; i < 2; i++) { 141 while (qemu_co_enter_next(&bs->throttled_reqs[i])) { 142 drained = true; 143 } 144 } 145 146 bs->io_limits_enabled = enabled; 147 148 return drained; 149 } 150 151 void bdrv_io_limits_disable(BlockDriverState *bs) 152 { 153 bs->io_limits_enabled = false; 154 155 bdrv_start_throttled_reqs(bs); 156 157 throttle_destroy(&bs->throttle_state); 158 } 159 160 static void bdrv_throttle_read_timer_cb(void *opaque) 161 { 162 BlockDriverState *bs = opaque; 163 qemu_co_enter_next(&bs->throttled_reqs[0]); 164 } 165 166 static void bdrv_throttle_write_timer_cb(void *opaque) 167 { 168 BlockDriverState *bs = opaque; 169 qemu_co_enter_next(&bs->throttled_reqs[1]); 170 } 171 172 /* should be called before bdrv_set_io_limits if a limit is set */ 173 void bdrv_io_limits_enable(BlockDriverState *bs) 174 { 175 assert(!bs->io_limits_enabled); 176 throttle_init(&bs->throttle_state, 177 QEMU_CLOCK_VIRTUAL, 178 bdrv_throttle_read_timer_cb, 179 bdrv_throttle_write_timer_cb, 180 bs); 181 bs->io_limits_enabled = true; 182 } 183 184 /* This function makes an IO wait if needed 185 * 186 * @nb_sectors: the number of sectors of the IO 187 * @is_write: is the IO a write 188 */ 189 static void bdrv_io_limits_intercept(BlockDriverState *bs, 190 int nb_sectors, 191 bool is_write) 192 { 193 /* does this io must wait */ 194 bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); 195 196 /* if must wait or any request of this type throttled queue the IO */ 197 if (must_wait || 198 !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { 199 qemu_co_queue_wait(&bs->throttled_reqs[is_write]); 200 } 201 202 /* the IO will be executed, do the accounting */ 203 throttle_account(&bs->throttle_state, 204 is_write, 205 nb_sectors * BDRV_SECTOR_SIZE); 206 207 /* if the next request must wait -> do nothing */ 208 if (throttle_schedule_timer(&bs->throttle_state, is_write)) { 209 return; 210 } 211 212 /* else queue next request for execution */ 213 qemu_co_queue_next(&bs->throttled_reqs[is_write]); 214 } 215 216 /* check if the path starts with "<protocol>:" */ 217 static int path_has_protocol(const char *path) 218 { 219 const char *p; 220 221 #ifdef _WIN32 222 if (is_windows_drive(path) || 223 is_windows_drive_prefix(path)) { 224 return 0; 225 } 226 p = path + strcspn(path, ":/\\"); 227 #else 228 p = path + strcspn(path, ":/"); 229 #endif 230 231 return *p == ':'; 232 } 233 234 int path_is_absolute(const char *path) 235 { 236 #ifdef _WIN32 237 /* specific case for names like: "\\.\d:" */ 238 if (is_windows_drive(path) || is_windows_drive_prefix(path)) { 239 return 1; 240 } 241 return (*path == '/' || *path == '\\'); 242 #else 243 return (*path == '/'); 244 #endif 245 } 246 247 /* if filename is absolute, just copy it to dest. Otherwise, build a 248 path to it by considering it is relative to base_path. URL are 249 supported. */ 250 void path_combine(char *dest, int dest_size, 251 const char *base_path, 252 const char *filename) 253 { 254 const char *p, *p1; 255 int len; 256 257 if (dest_size <= 0) 258 return; 259 if (path_is_absolute(filename)) { 260 pstrcpy(dest, dest_size, filename); 261 } else { 262 p = strchr(base_path, ':'); 263 if (p) 264 p++; 265 else 266 p = base_path; 267 p1 = strrchr(base_path, '/'); 268 #ifdef _WIN32 269 { 270 const char *p2; 271 p2 = strrchr(base_path, '\\'); 272 if (!p1 || p2 > p1) 273 p1 = p2; 274 } 275 #endif 276 if (p1) 277 p1++; 278 else 279 p1 = base_path; 280 if (p1 > p) 281 p = p1; 282 len = p - base_path; 283 if (len > dest_size - 1) 284 len = dest_size - 1; 285 memcpy(dest, base_path, len); 286 dest[len] = '\0'; 287 pstrcat(dest, dest_size, filename); 288 } 289 } 290 291 void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz) 292 { 293 if (bs->backing_file[0] == '\0' || path_has_protocol(bs->backing_file)) { 294 pstrcpy(dest, sz, bs->backing_file); 295 } else { 296 path_combine(dest, sz, bs->filename, bs->backing_file); 297 } 298 } 299 300 void bdrv_register(BlockDriver *bdrv) 301 { 302 /* Block drivers without coroutine functions need emulation */ 303 if (!bdrv->bdrv_co_readv) { 304 bdrv->bdrv_co_readv = bdrv_co_readv_em; 305 bdrv->bdrv_co_writev = bdrv_co_writev_em; 306 307 /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if 308 * the block driver lacks aio we need to emulate that too. 309 */ 310 if (!bdrv->bdrv_aio_readv) { 311 /* add AIO emulation layer */ 312 bdrv->bdrv_aio_readv = bdrv_aio_readv_em; 313 bdrv->bdrv_aio_writev = bdrv_aio_writev_em; 314 } 315 } 316 317 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list); 318 } 319 320 /* create a new block device (by default it is empty) */ 321 BlockDriverState *bdrv_new(const char *device_name) 322 { 323 BlockDriverState *bs; 324 325 bs = g_malloc0(sizeof(BlockDriverState)); 326 pstrcpy(bs->device_name, sizeof(bs->device_name), device_name); 327 if (device_name[0] != '\0') { 328 QTAILQ_INSERT_TAIL(&bdrv_states, bs, list); 329 } 330 bdrv_iostatus_disable(bs); 331 notifier_list_init(&bs->close_notifiers); 332 notifier_with_return_list_init(&bs->before_write_notifiers); 333 qemu_co_queue_init(&bs->throttled_reqs[0]); 334 qemu_co_queue_init(&bs->throttled_reqs[1]); 335 bs->refcnt = 1; 336 337 return bs; 338 } 339 340 void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify) 341 { 342 notifier_list_add(&bs->close_notifiers, notify); 343 } 344 345 BlockDriver *bdrv_find_format(const char *format_name) 346 { 347 BlockDriver *drv1; 348 QLIST_FOREACH(drv1, &bdrv_drivers, list) { 349 if (!strcmp(drv1->format_name, format_name)) { 350 return drv1; 351 } 352 } 353 return NULL; 354 } 355 356 static int bdrv_is_whitelisted(BlockDriver *drv, bool read_only) 357 { 358 static const char *whitelist_rw[] = { 359 CONFIG_BDRV_RW_WHITELIST 360 }; 361 static const char *whitelist_ro[] = { 362 CONFIG_BDRV_RO_WHITELIST 363 }; 364 const char **p; 365 366 if (!whitelist_rw[0] && !whitelist_ro[0]) { 367 return 1; /* no whitelist, anything goes */ 368 } 369 370 for (p = whitelist_rw; *p; p++) { 371 if (!strcmp(drv->format_name, *p)) { 372 return 1; 373 } 374 } 375 if (read_only) { 376 for (p = whitelist_ro; *p; p++) { 377 if (!strcmp(drv->format_name, *p)) { 378 return 1; 379 } 380 } 381 } 382 return 0; 383 } 384 385 BlockDriver *bdrv_find_whitelisted_format(const char *format_name, 386 bool read_only) 387 { 388 BlockDriver *drv = bdrv_find_format(format_name); 389 return drv && bdrv_is_whitelisted(drv, read_only) ? drv : NULL; 390 } 391 392 typedef struct CreateCo { 393 BlockDriver *drv; 394 char *filename; 395 QEMUOptionParameter *options; 396 int ret; 397 Error *err; 398 } CreateCo; 399 400 static void coroutine_fn bdrv_create_co_entry(void *opaque) 401 { 402 Error *local_err = NULL; 403 int ret; 404 405 CreateCo *cco = opaque; 406 assert(cco->drv); 407 408 ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err); 409 if (error_is_set(&local_err)) { 410 error_propagate(&cco->err, local_err); 411 } 412 cco->ret = ret; 413 } 414 415 int bdrv_create(BlockDriver *drv, const char* filename, 416 QEMUOptionParameter *options, Error **errp) 417 { 418 int ret; 419 420 Coroutine *co; 421 CreateCo cco = { 422 .drv = drv, 423 .filename = g_strdup(filename), 424 .options = options, 425 .ret = NOT_DONE, 426 .err = NULL, 427 }; 428 429 if (!drv->bdrv_create) { 430 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name); 431 ret = -ENOTSUP; 432 goto out; 433 } 434 435 if (qemu_in_coroutine()) { 436 /* Fast-path if already in coroutine context */ 437 bdrv_create_co_entry(&cco); 438 } else { 439 co = qemu_coroutine_create(bdrv_create_co_entry); 440 qemu_coroutine_enter(co, &cco); 441 while (cco.ret == NOT_DONE) { 442 qemu_aio_wait(); 443 } 444 } 445 446 ret = cco.ret; 447 if (ret < 0) { 448 if (error_is_set(&cco.err)) { 449 error_propagate(errp, cco.err); 450 } else { 451 error_setg_errno(errp, -ret, "Could not create image"); 452 } 453 } 454 455 out: 456 g_free(cco.filename); 457 return ret; 458 } 459 460 int bdrv_create_file(const char* filename, QEMUOptionParameter *options, 461 Error **errp) 462 { 463 BlockDriver *drv; 464 Error *local_err = NULL; 465 int ret; 466 467 drv = bdrv_find_protocol(filename, true); 468 if (drv == NULL) { 469 error_setg(errp, "Could not find protocol for file '%s'", filename); 470 return -ENOENT; 471 } 472 473 ret = bdrv_create(drv, filename, options, &local_err); 474 if (error_is_set(&local_err)) { 475 error_propagate(errp, local_err); 476 } 477 return ret; 478 } 479 480 /* 481 * Create a uniquely-named empty temporary file. 482 * Return 0 upon success, otherwise a negative errno value. 483 */ 484 int get_tmp_filename(char *filename, int size) 485 { 486 #ifdef _WIN32 487 char temp_dir[MAX_PATH]; 488 /* GetTempFileName requires that its output buffer (4th param) 489 have length MAX_PATH or greater. */ 490 assert(size >= MAX_PATH); 491 return (GetTempPath(MAX_PATH, temp_dir) 492 && GetTempFileName(temp_dir, "qem", 0, filename) 493 ? 0 : -GetLastError()); 494 #else 495 int fd; 496 const char *tmpdir; 497 tmpdir = getenv("TMPDIR"); 498 if (!tmpdir) 499 tmpdir = "/tmp"; 500 if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) { 501 return -EOVERFLOW; 502 } 503 fd = mkstemp(filename); 504 if (fd < 0) { 505 return -errno; 506 } 507 if (close(fd) != 0) { 508 unlink(filename); 509 return -errno; 510 } 511 return 0; 512 #endif 513 } 514 515 /* 516 * Detect host devices. By convention, /dev/cdrom[N] is always 517 * recognized as a host CDROM. 518 */ 519 static BlockDriver *find_hdev_driver(const char *filename) 520 { 521 int score_max = 0, score; 522 BlockDriver *drv = NULL, *d; 523 524 QLIST_FOREACH(d, &bdrv_drivers, list) { 525 if (d->bdrv_probe_device) { 526 score = d->bdrv_probe_device(filename); 527 if (score > score_max) { 528 score_max = score; 529 drv = d; 530 } 531 } 532 } 533 534 return drv; 535 } 536 537 BlockDriver *bdrv_find_protocol(const char *filename, 538 bool allow_protocol_prefix) 539 { 540 BlockDriver *drv1; 541 char protocol[128]; 542 int len; 543 const char *p; 544 545 /* TODO Drivers without bdrv_file_open must be specified explicitly */ 546 547 /* 548 * XXX(hch): we really should not let host device detection 549 * override an explicit protocol specification, but moving this 550 * later breaks access to device names with colons in them. 551 * Thanks to the brain-dead persistent naming schemes on udev- 552 * based Linux systems those actually are quite common. 553 */ 554 drv1 = find_hdev_driver(filename); 555 if (drv1) { 556 return drv1; 557 } 558 559 if (!path_has_protocol(filename) || !allow_protocol_prefix) { 560 return bdrv_find_format("file"); 561 } 562 563 p = strchr(filename, ':'); 564 assert(p != NULL); 565 len = p - filename; 566 if (len > sizeof(protocol) - 1) 567 len = sizeof(protocol) - 1; 568 memcpy(protocol, filename, len); 569 protocol[len] = '\0'; 570 QLIST_FOREACH(drv1, &bdrv_drivers, list) { 571 if (drv1->protocol_name && 572 !strcmp(drv1->protocol_name, protocol)) { 573 return drv1; 574 } 575 } 576 return NULL; 577 } 578 579 static int find_image_format(BlockDriverState *bs, const char *filename, 580 BlockDriver **pdrv, Error **errp) 581 { 582 int score, score_max; 583 BlockDriver *drv1, *drv; 584 uint8_t buf[2048]; 585 int ret = 0; 586 587 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */ 588 if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) { 589 drv = bdrv_find_format("raw"); 590 if (!drv) { 591 error_setg(errp, "Could not find raw image format"); 592 ret = -ENOENT; 593 } 594 *pdrv = drv; 595 return ret; 596 } 597 598 ret = bdrv_pread(bs, 0, buf, sizeof(buf)); 599 if (ret < 0) { 600 error_setg_errno(errp, -ret, "Could not read image for determining its " 601 "format"); 602 *pdrv = NULL; 603 return ret; 604 } 605 606 score_max = 0; 607 drv = NULL; 608 QLIST_FOREACH(drv1, &bdrv_drivers, list) { 609 if (drv1->bdrv_probe) { 610 score = drv1->bdrv_probe(buf, ret, filename); 611 if (score > score_max) { 612 score_max = score; 613 drv = drv1; 614 } 615 } 616 } 617 if (!drv) { 618 error_setg(errp, "Could not determine image format: No compatible " 619 "driver found"); 620 ret = -ENOENT; 621 } 622 *pdrv = drv; 623 return ret; 624 } 625 626 /** 627 * Set the current 'total_sectors' value 628 */ 629 static int refresh_total_sectors(BlockDriverState *bs, int64_t hint) 630 { 631 BlockDriver *drv = bs->drv; 632 633 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */ 634 if (bs->sg) 635 return 0; 636 637 /* query actual device if possible, otherwise just trust the hint */ 638 if (drv->bdrv_getlength) { 639 int64_t length = drv->bdrv_getlength(bs); 640 if (length < 0) { 641 return length; 642 } 643 hint = length >> BDRV_SECTOR_BITS; 644 } 645 646 bs->total_sectors = hint; 647 return 0; 648 } 649 650 /** 651 * Set open flags for a given discard mode 652 * 653 * Return 0 on success, -1 if the discard mode was invalid. 654 */ 655 int bdrv_parse_discard_flags(const char *mode, int *flags) 656 { 657 *flags &= ~BDRV_O_UNMAP; 658 659 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) { 660 /* do nothing */ 661 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) { 662 *flags |= BDRV_O_UNMAP; 663 } else { 664 return -1; 665 } 666 667 return 0; 668 } 669 670 /** 671 * Set open flags for a given cache mode 672 * 673 * Return 0 on success, -1 if the cache mode was invalid. 674 */ 675 int bdrv_parse_cache_flags(const char *mode, int *flags) 676 { 677 *flags &= ~BDRV_O_CACHE_MASK; 678 679 if (!strcmp(mode, "off") || !strcmp(mode, "none")) { 680 *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB; 681 } else if (!strcmp(mode, "directsync")) { 682 *flags |= BDRV_O_NOCACHE; 683 } else if (!strcmp(mode, "writeback")) { 684 *flags |= BDRV_O_CACHE_WB; 685 } else if (!strcmp(mode, "unsafe")) { 686 *flags |= BDRV_O_CACHE_WB; 687 *flags |= BDRV_O_NO_FLUSH; 688 } else if (!strcmp(mode, "writethrough")) { 689 /* this is the default */ 690 } else { 691 return -1; 692 } 693 694 return 0; 695 } 696 697 /** 698 * The copy-on-read flag is actually a reference count so multiple users may 699 * use the feature without worrying about clobbering its previous state. 700 * Copy-on-read stays enabled until all users have called to disable it. 701 */ 702 void bdrv_enable_copy_on_read(BlockDriverState *bs) 703 { 704 bs->copy_on_read++; 705 } 706 707 void bdrv_disable_copy_on_read(BlockDriverState *bs) 708 { 709 assert(bs->copy_on_read > 0); 710 bs->copy_on_read--; 711 } 712 713 static int bdrv_open_flags(BlockDriverState *bs, int flags) 714 { 715 int open_flags = flags | BDRV_O_CACHE_WB; 716 717 /* 718 * Clear flags that are internal to the block layer before opening the 719 * image. 720 */ 721 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); 722 723 /* 724 * Snapshots should be writable. 725 */ 726 if (bs->is_temporary) { 727 open_flags |= BDRV_O_RDWR; 728 } 729 730 return open_flags; 731 } 732 733 /* 734 * Common part for opening disk images and files 735 * 736 * Removes all processed options from *options. 737 */ 738 static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, 739 QDict *options, int flags, BlockDriver *drv, Error **errp) 740 { 741 int ret, open_flags; 742 const char *filename; 743 Error *local_err = NULL; 744 745 assert(drv != NULL); 746 assert(bs->file == NULL); 747 assert(options != NULL && bs->options != options); 748 749 if (file != NULL) { 750 filename = file->filename; 751 } else { 752 filename = qdict_get_try_str(options, "filename"); 753 } 754 755 trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name); 756 757 /* bdrv_open() with directly using a protocol as drv. This layer is already 758 * opened, so assign it to bs (while file becomes a closed BlockDriverState) 759 * and return immediately. */ 760 if (file != NULL && drv->bdrv_file_open) { 761 bdrv_swap(file, bs); 762 return 0; 763 } 764 765 bs->open_flags = flags; 766 bs->buffer_alignment = 512; 767 bs->zero_beyond_eof = true; 768 open_flags = bdrv_open_flags(bs, flags); 769 bs->read_only = !(open_flags & BDRV_O_RDWR); 770 771 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) { 772 error_setg(errp, 773 !bs->read_only && bdrv_is_whitelisted(drv, true) 774 ? "Driver '%s' can only be used for read-only devices" 775 : "Driver '%s' is not whitelisted", 776 drv->format_name); 777 return -ENOTSUP; 778 } 779 780 assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ 781 if (flags & BDRV_O_COPY_ON_READ) { 782 if (!bs->read_only) { 783 bdrv_enable_copy_on_read(bs); 784 } else { 785 error_setg(errp, "Can't use copy-on-read on read-only device"); 786 return -EINVAL; 787 } 788 } 789 790 if (filename != NULL) { 791 pstrcpy(bs->filename, sizeof(bs->filename), filename); 792 } else { 793 bs->filename[0] = '\0'; 794 } 795 796 bs->drv = drv; 797 bs->opaque = g_malloc0(drv->instance_size); 798 799 bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB); 800 801 /* Open the image, either directly or using a protocol */ 802 if (drv->bdrv_file_open) { 803 assert(file == NULL); 804 assert(!drv->bdrv_needs_filename || filename != NULL); 805 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err); 806 } else { 807 if (file == NULL) { 808 error_setg(errp, "Can't use '%s' as a block driver for the " 809 "protocol level", drv->format_name); 810 ret = -EINVAL; 811 goto free_and_fail; 812 } 813 bs->file = file; 814 ret = drv->bdrv_open(bs, options, open_flags, &local_err); 815 } 816 817 if (ret < 0) { 818 if (error_is_set(&local_err)) { 819 error_propagate(errp, local_err); 820 } else if (bs->filename[0]) { 821 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename); 822 } else { 823 error_setg_errno(errp, -ret, "Could not open image"); 824 } 825 goto free_and_fail; 826 } 827 828 ret = refresh_total_sectors(bs, bs->total_sectors); 829 if (ret < 0) { 830 error_setg_errno(errp, -ret, "Could not refresh total sector count"); 831 goto free_and_fail; 832 } 833 834 #ifndef _WIN32 835 if (bs->is_temporary) { 836 assert(bs->filename[0] != '\0'); 837 unlink(bs->filename); 838 } 839 #endif 840 return 0; 841 842 free_and_fail: 843 bs->file = NULL; 844 g_free(bs->opaque); 845 bs->opaque = NULL; 846 bs->drv = NULL; 847 return ret; 848 } 849 850 /* 851 * Opens a file using a protocol (file, host_device, nbd, ...) 852 * 853 * options is a QDict of options to pass to the block drivers, or NULL for an 854 * empty set of options. The reference to the QDict belongs to the block layer 855 * after the call (even on failure), so if the caller intends to reuse the 856 * dictionary, it needs to use QINCREF() before calling bdrv_file_open. 857 */ 858 int bdrv_file_open(BlockDriverState **pbs, const char *filename, 859 QDict *options, int flags, Error **errp) 860 { 861 BlockDriverState *bs; 862 BlockDriver *drv; 863 const char *drvname; 864 bool allow_protocol_prefix = false; 865 Error *local_err = NULL; 866 int ret; 867 868 /* NULL means an empty set of options */ 869 if (options == NULL) { 870 options = qdict_new(); 871 } 872 873 bs = bdrv_new(""); 874 bs->options = options; 875 options = qdict_clone_shallow(options); 876 877 /* Fetch the file name from the options QDict if necessary */ 878 if (!filename) { 879 filename = qdict_get_try_str(options, "filename"); 880 } else if (filename && !qdict_haskey(options, "filename")) { 881 qdict_put(options, "filename", qstring_from_str(filename)); 882 allow_protocol_prefix = true; 883 } else { 884 error_setg(errp, "Can't specify 'file' and 'filename' options at the " 885 "same time"); 886 ret = -EINVAL; 887 goto fail; 888 } 889 890 /* Find the right block driver */ 891 drvname = qdict_get_try_str(options, "driver"); 892 if (drvname) { 893 drv = bdrv_find_format(drvname); 894 if (!drv) { 895 error_setg(errp, "Unknown driver '%s'", drvname); 896 } 897 qdict_del(options, "driver"); 898 } else if (filename) { 899 drv = bdrv_find_protocol(filename, allow_protocol_prefix); 900 if (!drv) { 901 error_setg(errp, "Unknown protocol"); 902 } 903 } else { 904 error_setg(errp, "Must specify either driver or file"); 905 drv = NULL; 906 } 907 908 if (!drv) { 909 /* errp has been set already */ 910 ret = -ENOENT; 911 goto fail; 912 } 913 914 /* Parse the filename and open it */ 915 if (drv->bdrv_parse_filename && filename) { 916 drv->bdrv_parse_filename(filename, options, &local_err); 917 if (error_is_set(&local_err)) { 918 error_propagate(errp, local_err); 919 ret = -EINVAL; 920 goto fail; 921 } 922 qdict_del(options, "filename"); 923 } else if (drv->bdrv_needs_filename && !filename) { 924 error_setg(errp, "The '%s' block driver requires a file name", 925 drv->format_name); 926 ret = -EINVAL; 927 goto fail; 928 } 929 930 ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err); 931 if (ret < 0) { 932 error_propagate(errp, local_err); 933 goto fail; 934 } 935 936 /* Check if any unknown options were used */ 937 if (qdict_size(options) != 0) { 938 const QDictEntry *entry = qdict_first(options); 939 error_setg(errp, "Block protocol '%s' doesn't support the option '%s'", 940 drv->format_name, entry->key); 941 ret = -EINVAL; 942 goto fail; 943 } 944 QDECREF(options); 945 946 bs->growable = 1; 947 *pbs = bs; 948 return 0; 949 950 fail: 951 QDECREF(options); 952 if (!bs->drv) { 953 QDECREF(bs->options); 954 } 955 bdrv_unref(bs); 956 return ret; 957 } 958 959 /* 960 * Opens the backing file for a BlockDriverState if not yet open 961 * 962 * options is a QDict of options to pass to the block drivers, or NULL for an 963 * empty set of options. The reference to the QDict is transferred to this 964 * function (even on failure), so if the caller intends to reuse the dictionary, 965 * it needs to use QINCREF() before calling bdrv_file_open. 966 */ 967 int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp) 968 { 969 char backing_filename[PATH_MAX]; 970 int back_flags, ret; 971 BlockDriver *back_drv = NULL; 972 Error *local_err = NULL; 973 974 if (bs->backing_hd != NULL) { 975 QDECREF(options); 976 return 0; 977 } 978 979 /* NULL means an empty set of options */ 980 if (options == NULL) { 981 options = qdict_new(); 982 } 983 984 bs->open_flags &= ~BDRV_O_NO_BACKING; 985 if (qdict_haskey(options, "file.filename")) { 986 backing_filename[0] = '\0'; 987 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) { 988 QDECREF(options); 989 return 0; 990 } else { 991 bdrv_get_full_backing_filename(bs, backing_filename, 992 sizeof(backing_filename)); 993 } 994 995 bs->backing_hd = bdrv_new(""); 996 997 if (bs->backing_format[0] != '\0') { 998 back_drv = bdrv_find_format(bs->backing_format); 999 } 1000 1001 /* backing files always opened read-only */ 1002 back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | 1003 BDRV_O_COPY_ON_READ); 1004 1005 ret = bdrv_open(bs->backing_hd, 1006 *backing_filename ? backing_filename : NULL, options, 1007 back_flags, back_drv, &local_err); 1008 if (ret < 0) { 1009 bdrv_unref(bs->backing_hd); 1010 bs->backing_hd = NULL; 1011 bs->open_flags |= BDRV_O_NO_BACKING; 1012 error_propagate(errp, local_err); 1013 return ret; 1014 } 1015 pstrcpy(bs->backing_file, sizeof(bs->backing_file), 1016 bs->backing_hd->file->filename); 1017 return 0; 1018 } 1019 1020 /* 1021 * Opens a disk image (raw, qcow2, vmdk, ...) 1022 * 1023 * options is a QDict of options to pass to the block drivers, or NULL for an 1024 * empty set of options. The reference to the QDict belongs to the block layer 1025 * after the call (even on failure), so if the caller intends to reuse the 1026 * dictionary, it needs to use QINCREF() before calling bdrv_open. 1027 */ 1028 int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, 1029 int flags, BlockDriver *drv, Error **errp) 1030 { 1031 int ret; 1032 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ 1033 char tmp_filename[PATH_MAX + 1]; 1034 BlockDriverState *file = NULL; 1035 QDict *file_options = NULL; 1036 const char *drvname; 1037 Error *local_err = NULL; 1038 1039 /* NULL means an empty set of options */ 1040 if (options == NULL) { 1041 options = qdict_new(); 1042 } 1043 1044 bs->options = options; 1045 options = qdict_clone_shallow(options); 1046 1047 /* For snapshot=on, create a temporary qcow2 overlay */ 1048 if (flags & BDRV_O_SNAPSHOT) { 1049 BlockDriverState *bs1; 1050 int64_t total_size; 1051 BlockDriver *bdrv_qcow2; 1052 QEMUOptionParameter *create_options; 1053 char backing_filename[PATH_MAX]; 1054 1055 if (qdict_size(options) != 0) { 1056 error_setg(errp, "Can't use snapshot=on with driver-specific options"); 1057 ret = -EINVAL; 1058 goto fail; 1059 } 1060 assert(filename != NULL); 1061 1062 /* if snapshot, we create a temporary backing file and open it 1063 instead of opening 'filename' directly */ 1064 1065 /* if there is a backing file, use it */ 1066 bs1 = bdrv_new(""); 1067 ret = bdrv_open(bs1, filename, NULL, 0, drv, &local_err); 1068 if (ret < 0) { 1069 bdrv_unref(bs1); 1070 goto fail; 1071 } 1072 total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK; 1073 1074 bdrv_unref(bs1); 1075 1076 ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename)); 1077 if (ret < 0) { 1078 error_setg_errno(errp, -ret, "Could not get temporary filename"); 1079 goto fail; 1080 } 1081 1082 /* Real path is meaningless for protocols */ 1083 if (path_has_protocol(filename)) { 1084 snprintf(backing_filename, sizeof(backing_filename), 1085 "%s", filename); 1086 } else if (!realpath(filename, backing_filename)) { 1087 error_setg_errno(errp, errno, "Could not resolve path '%s'", filename); 1088 ret = -errno; 1089 goto fail; 1090 } 1091 1092 bdrv_qcow2 = bdrv_find_format("qcow2"); 1093 create_options = parse_option_parameters("", bdrv_qcow2->create_options, 1094 NULL); 1095 1096 set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size); 1097 set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE, 1098 backing_filename); 1099 if (drv) { 1100 set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT, 1101 drv->format_name); 1102 } 1103 1104 ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err); 1105 free_option_parameters(create_options); 1106 if (ret < 0) { 1107 error_setg_errno(errp, -ret, "Could not create temporary overlay " 1108 "'%s': %s", tmp_filename, 1109 error_get_pretty(local_err)); 1110 error_free(local_err); 1111 local_err = NULL; 1112 goto fail; 1113 } 1114 1115 filename = tmp_filename; 1116 drv = bdrv_qcow2; 1117 bs->is_temporary = 1; 1118 } 1119 1120 /* Open image file without format layer */ 1121 if (flags & BDRV_O_RDWR) { 1122 flags |= BDRV_O_ALLOW_RDWR; 1123 } 1124 1125 qdict_extract_subqdict(options, &file_options, "file."); 1126 1127 ret = bdrv_file_open(&file, filename, file_options, 1128 bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err); 1129 if (ret < 0) { 1130 goto fail; 1131 } 1132 1133 /* Find the right image format driver */ 1134 drvname = qdict_get_try_str(options, "driver"); 1135 if (drvname) { 1136 drv = bdrv_find_format(drvname); 1137 qdict_del(options, "driver"); 1138 } 1139 1140 if (!drv) { 1141 ret = find_image_format(file, filename, &drv, &local_err); 1142 } 1143 1144 if (!drv) { 1145 goto unlink_and_fail; 1146 } 1147 1148 /* Open the image */ 1149 ret = bdrv_open_common(bs, file, options, flags, drv, &local_err); 1150 if (ret < 0) { 1151 goto unlink_and_fail; 1152 } 1153 1154 if (bs->file != file) { 1155 bdrv_unref(file); 1156 file = NULL; 1157 } 1158 1159 /* If there is a backing file, use it */ 1160 if ((flags & BDRV_O_NO_BACKING) == 0) { 1161 QDict *backing_options; 1162 1163 qdict_extract_subqdict(options, &backing_options, "backing."); 1164 ret = bdrv_open_backing_file(bs, backing_options, &local_err); 1165 if (ret < 0) { 1166 goto close_and_fail; 1167 } 1168 } 1169 1170 /* Check if any unknown options were used */ 1171 if (qdict_size(options) != 0) { 1172 const QDictEntry *entry = qdict_first(options); 1173 error_setg(errp, "Block format '%s' used by device '%s' doesn't " 1174 "support the option '%s'", drv->format_name, bs->device_name, 1175 entry->key); 1176 1177 ret = -EINVAL; 1178 goto close_and_fail; 1179 } 1180 QDECREF(options); 1181 1182 if (!bdrv_key_required(bs)) { 1183 bdrv_dev_change_media_cb(bs, true); 1184 } 1185 1186 return 0; 1187 1188 unlink_and_fail: 1189 if (file != NULL) { 1190 bdrv_unref(file); 1191 } 1192 if (bs->is_temporary) { 1193 unlink(filename); 1194 } 1195 fail: 1196 QDECREF(bs->options); 1197 QDECREF(options); 1198 bs->options = NULL; 1199 if (error_is_set(&local_err)) { 1200 error_propagate(errp, local_err); 1201 } 1202 return ret; 1203 1204 close_and_fail: 1205 bdrv_close(bs); 1206 QDECREF(options); 1207 if (error_is_set(&local_err)) { 1208 error_propagate(errp, local_err); 1209 } 1210 return ret; 1211 } 1212 1213 typedef struct BlockReopenQueueEntry { 1214 bool prepared; 1215 BDRVReopenState state; 1216 QSIMPLEQ_ENTRY(BlockReopenQueueEntry) entry; 1217 } BlockReopenQueueEntry; 1218 1219 /* 1220 * Adds a BlockDriverState to a simple queue for an atomic, transactional 1221 * reopen of multiple devices. 1222 * 1223 * bs_queue can either be an existing BlockReopenQueue that has had QSIMPLE_INIT 1224 * already performed, or alternatively may be NULL a new BlockReopenQueue will 1225 * be created and initialized. This newly created BlockReopenQueue should be 1226 * passed back in for subsequent calls that are intended to be of the same 1227 * atomic 'set'. 1228 * 1229 * bs is the BlockDriverState to add to the reopen queue. 1230 * 1231 * flags contains the open flags for the associated bs 1232 * 1233 * returns a pointer to bs_queue, which is either the newly allocated 1234 * bs_queue, or the existing bs_queue being used. 1235 * 1236 */ 1237 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, 1238 BlockDriverState *bs, int flags) 1239 { 1240 assert(bs != NULL); 1241 1242 BlockReopenQueueEntry *bs_entry; 1243 if (bs_queue == NULL) { 1244 bs_queue = g_new0(BlockReopenQueue, 1); 1245 QSIMPLEQ_INIT(bs_queue); 1246 } 1247 1248 if (bs->file) { 1249 bdrv_reopen_queue(bs_queue, bs->file, flags); 1250 } 1251 1252 bs_entry = g_new0(BlockReopenQueueEntry, 1); 1253 QSIMPLEQ_INSERT_TAIL(bs_queue, bs_entry, entry); 1254 1255 bs_entry->state.bs = bs; 1256 bs_entry->state.flags = flags; 1257 1258 return bs_queue; 1259 } 1260 1261 /* 1262 * Reopen multiple BlockDriverStates atomically & transactionally. 1263 * 1264 * The queue passed in (bs_queue) must have been built up previous 1265 * via bdrv_reopen_queue(). 1266 * 1267 * Reopens all BDS specified in the queue, with the appropriate 1268 * flags. All devices are prepared for reopen, and failure of any 1269 * device will cause all device changes to be abandonded, and intermediate 1270 * data cleaned up. 1271 * 1272 * If all devices prepare successfully, then the changes are committed 1273 * to all devices. 1274 * 1275 */ 1276 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) 1277 { 1278 int ret = -1; 1279 BlockReopenQueueEntry *bs_entry, *next; 1280 Error *local_err = NULL; 1281 1282 assert(bs_queue != NULL); 1283 1284 bdrv_drain_all(); 1285 1286 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { 1287 if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) { 1288 error_propagate(errp, local_err); 1289 goto cleanup; 1290 } 1291 bs_entry->prepared = true; 1292 } 1293 1294 /* If we reach this point, we have success and just need to apply the 1295 * changes 1296 */ 1297 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { 1298 bdrv_reopen_commit(&bs_entry->state); 1299 } 1300 1301 ret = 0; 1302 1303 cleanup: 1304 QSIMPLEQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { 1305 if (ret && bs_entry->prepared) { 1306 bdrv_reopen_abort(&bs_entry->state); 1307 } 1308 g_free(bs_entry); 1309 } 1310 g_free(bs_queue); 1311 return ret; 1312 } 1313 1314 1315 /* Reopen a single BlockDriverState with the specified flags. */ 1316 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) 1317 { 1318 int ret = -1; 1319 Error *local_err = NULL; 1320 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, bdrv_flags); 1321 1322 ret = bdrv_reopen_multiple(queue, &local_err); 1323 if (local_err != NULL) { 1324 error_propagate(errp, local_err); 1325 } 1326 return ret; 1327 } 1328 1329 1330 /* 1331 * Prepares a BlockDriverState for reopen. All changes are staged in the 1332 * 'opaque' field of the BDRVReopenState, which is used and allocated by 1333 * the block driver layer .bdrv_reopen_prepare() 1334 * 1335 * bs is the BlockDriverState to reopen 1336 * flags are the new open flags 1337 * queue is the reopen queue 1338 * 1339 * Returns 0 on success, non-zero on error. On error errp will be set 1340 * as well. 1341 * 1342 * On failure, bdrv_reopen_abort() will be called to clean up any data. 1343 * It is the responsibility of the caller to then call the abort() or 1344 * commit() for any other BDS that have been left in a prepare() state 1345 * 1346 */ 1347 int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, 1348 Error **errp) 1349 { 1350 int ret = -1; 1351 Error *local_err = NULL; 1352 BlockDriver *drv; 1353 1354 assert(reopen_state != NULL); 1355 assert(reopen_state->bs->drv != NULL); 1356 drv = reopen_state->bs->drv; 1357 1358 /* if we are to stay read-only, do not allow permission change 1359 * to r/w */ 1360 if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) && 1361 reopen_state->flags & BDRV_O_RDWR) { 1362 error_set(errp, QERR_DEVICE_IS_READ_ONLY, 1363 reopen_state->bs->device_name); 1364 goto error; 1365 } 1366 1367 1368 ret = bdrv_flush(reopen_state->bs); 1369 if (ret) { 1370 error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive", 1371 strerror(-ret)); 1372 goto error; 1373 } 1374 1375 if (drv->bdrv_reopen_prepare) { 1376 ret = drv->bdrv_reopen_prepare(reopen_state, queue, &local_err); 1377 if (ret) { 1378 if (local_err != NULL) { 1379 error_propagate(errp, local_err); 1380 } else { 1381 error_setg(errp, "failed while preparing to reopen image '%s'", 1382 reopen_state->bs->filename); 1383 } 1384 goto error; 1385 } 1386 } else { 1387 /* It is currently mandatory to have a bdrv_reopen_prepare() 1388 * handler for each supported drv. */ 1389 error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, 1390 drv->format_name, reopen_state->bs->device_name, 1391 "reopening of file"); 1392 ret = -1; 1393 goto error; 1394 } 1395 1396 ret = 0; 1397 1398 error: 1399 return ret; 1400 } 1401 1402 /* 1403 * Takes the staged changes for the reopen from bdrv_reopen_prepare(), and 1404 * makes them final by swapping the staging BlockDriverState contents into 1405 * the active BlockDriverState contents. 1406 */ 1407 void bdrv_reopen_commit(BDRVReopenState *reopen_state) 1408 { 1409 BlockDriver *drv; 1410 1411 assert(reopen_state != NULL); 1412 drv = reopen_state->bs->drv; 1413 assert(drv != NULL); 1414 1415 /* If there are any driver level actions to take */ 1416 if (drv->bdrv_reopen_commit) { 1417 drv->bdrv_reopen_commit(reopen_state); 1418 } 1419 1420 /* set BDS specific flags now */ 1421 reopen_state->bs->open_flags = reopen_state->flags; 1422 reopen_state->bs->enable_write_cache = !!(reopen_state->flags & 1423 BDRV_O_CACHE_WB); 1424 reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR); 1425 } 1426 1427 /* 1428 * Abort the reopen, and delete and free the staged changes in 1429 * reopen_state 1430 */ 1431 void bdrv_reopen_abort(BDRVReopenState *reopen_state) 1432 { 1433 BlockDriver *drv; 1434 1435 assert(reopen_state != NULL); 1436 drv = reopen_state->bs->drv; 1437 assert(drv != NULL); 1438 1439 if (drv->bdrv_reopen_abort) { 1440 drv->bdrv_reopen_abort(reopen_state); 1441 } 1442 } 1443 1444 1445 void bdrv_close(BlockDriverState *bs) 1446 { 1447 if (bs->job) { 1448 block_job_cancel_sync(bs->job); 1449 } 1450 bdrv_drain_all(); /* complete I/O */ 1451 bdrv_flush(bs); 1452 bdrv_drain_all(); /* in case flush left pending I/O */ 1453 notifier_list_notify(&bs->close_notifiers, bs); 1454 1455 if (bs->drv) { 1456 if (bs->backing_hd) { 1457 bdrv_unref(bs->backing_hd); 1458 bs->backing_hd = NULL; 1459 } 1460 bs->drv->bdrv_close(bs); 1461 g_free(bs->opaque); 1462 #ifdef _WIN32 1463 if (bs->is_temporary) { 1464 unlink(bs->filename); 1465 } 1466 #endif 1467 bs->opaque = NULL; 1468 bs->drv = NULL; 1469 bs->copy_on_read = 0; 1470 bs->backing_file[0] = '\0'; 1471 bs->backing_format[0] = '\0'; 1472 bs->total_sectors = 0; 1473 bs->encrypted = 0; 1474 bs->valid_key = 0; 1475 bs->sg = 0; 1476 bs->growable = 0; 1477 bs->zero_beyond_eof = false; 1478 QDECREF(bs->options); 1479 bs->options = NULL; 1480 1481 if (bs->file != NULL) { 1482 bdrv_unref(bs->file); 1483 bs->file = NULL; 1484 } 1485 } 1486 1487 bdrv_dev_change_media_cb(bs, false); 1488 1489 /*throttling disk I/O limits*/ 1490 if (bs->io_limits_enabled) { 1491 bdrv_io_limits_disable(bs); 1492 } 1493 } 1494 1495 void bdrv_close_all(void) 1496 { 1497 BlockDriverState *bs; 1498 1499 QTAILQ_FOREACH(bs, &bdrv_states, list) { 1500 bdrv_close(bs); 1501 } 1502 } 1503 1504 /* Check if any requests are in-flight (including throttled requests) */ 1505 static bool bdrv_requests_pending(BlockDriverState *bs) 1506 { 1507 if (!QLIST_EMPTY(&bs->tracked_requests)) { 1508 return true; 1509 } 1510 if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { 1511 return true; 1512 } 1513 if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { 1514 return true; 1515 } 1516 if (bs->file && bdrv_requests_pending(bs->file)) { 1517 return true; 1518 } 1519 if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { 1520 return true; 1521 } 1522 return false; 1523 } 1524 1525 static bool bdrv_requests_pending_all(void) 1526 { 1527 BlockDriverState *bs; 1528 QTAILQ_FOREACH(bs, &bdrv_states, list) { 1529 if (bdrv_requests_pending(bs)) { 1530 return true; 1531 } 1532 } 1533 return false; 1534 } 1535 1536 /* 1537 * Wait for pending requests to complete across all BlockDriverStates 1538 * 1539 * This function does not flush data to disk, use bdrv_flush_all() for that 1540 * after calling this function. 1541 * 1542 * Note that completion of an asynchronous I/O operation can trigger any 1543 * number of other I/O operations on other devices---for example a coroutine 1544 * can be arbitrarily complex and a constant flow of I/O can come until the 1545 * coroutine is complete. Because of this, it is not possible to have a 1546 * function to drain a single device's I/O queue. 1547 */ 1548 void bdrv_drain_all(void) 1549 { 1550 /* Always run first iteration so any pending completion BHs run */ 1551 bool busy = true; 1552 BlockDriverState *bs; 1553 1554 while (busy) { 1555 /* FIXME: We do not have timer support here, so this is effectively 1556 * a busy wait. 1557 */ 1558 QTAILQ_FOREACH(bs, &bdrv_states, list) { 1559 if (bdrv_start_throttled_reqs(bs)) { 1560 busy = true; 1561 } 1562 } 1563 1564 busy = bdrv_requests_pending_all(); 1565 busy |= aio_poll(qemu_get_aio_context(), busy); 1566 } 1567 } 1568 1569 /* make a BlockDriverState anonymous by removing from bdrv_state list. 1570 Also, NULL terminate the device_name to prevent double remove */ 1571 void bdrv_make_anon(BlockDriverState *bs) 1572 { 1573 if (bs->device_name[0] != '\0') { 1574 QTAILQ_REMOVE(&bdrv_states, bs, list); 1575 } 1576 bs->device_name[0] = '\0'; 1577 } 1578 1579 static void bdrv_rebind(BlockDriverState *bs) 1580 { 1581 if (bs->drv && bs->drv->bdrv_rebind) { 1582 bs->drv->bdrv_rebind(bs); 1583 } 1584 } 1585 1586 static void bdrv_move_feature_fields(BlockDriverState *bs_dest, 1587 BlockDriverState *bs_src) 1588 { 1589 /* move some fields that need to stay attached to the device */ 1590 bs_dest->open_flags = bs_src->open_flags; 1591 1592 /* dev info */ 1593 bs_dest->dev_ops = bs_src->dev_ops; 1594 bs_dest->dev_opaque = bs_src->dev_opaque; 1595 bs_dest->dev = bs_src->dev; 1596 bs_dest->buffer_alignment = bs_src->buffer_alignment; 1597 bs_dest->copy_on_read = bs_src->copy_on_read; 1598 1599 bs_dest->enable_write_cache = bs_src->enable_write_cache; 1600 1601 /* i/o throttled req */ 1602 memcpy(&bs_dest->throttle_state, 1603 &bs_src->throttle_state, 1604 sizeof(ThrottleState)); 1605 bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0]; 1606 bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1]; 1607 bs_dest->io_limits_enabled = bs_src->io_limits_enabled; 1608 1609 /* r/w error */ 1610 bs_dest->on_read_error = bs_src->on_read_error; 1611 bs_dest->on_write_error = bs_src->on_write_error; 1612 1613 /* i/o status */ 1614 bs_dest->iostatus_enabled = bs_src->iostatus_enabled; 1615 bs_dest->iostatus = bs_src->iostatus; 1616 1617 /* dirty bitmap */ 1618 bs_dest->dirty_bitmap = bs_src->dirty_bitmap; 1619 1620 /* reference count */ 1621 bs_dest->refcnt = bs_src->refcnt; 1622 1623 /* job */ 1624 bs_dest->in_use = bs_src->in_use; 1625 bs_dest->job = bs_src->job; 1626 1627 /* keep the same entry in bdrv_states */ 1628 pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name), 1629 bs_src->device_name); 1630 bs_dest->list = bs_src->list; 1631 } 1632 1633 /* 1634 * Swap bs contents for two image chains while they are live, 1635 * while keeping required fields on the BlockDriverState that is 1636 * actually attached to a device. 1637 * 1638 * This will modify the BlockDriverState fields, and swap contents 1639 * between bs_new and bs_old. Both bs_new and bs_old are modified. 1640 * 1641 * bs_new is required to be anonymous. 1642 * 1643 * This function does not create any image files. 1644 */ 1645 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old) 1646 { 1647 BlockDriverState tmp; 1648 1649 /* bs_new must be anonymous and shouldn't have anything fancy enabled */ 1650 assert(bs_new->device_name[0] == '\0'); 1651 assert(bs_new->dirty_bitmap == NULL); 1652 assert(bs_new->job == NULL); 1653 assert(bs_new->dev == NULL); 1654 assert(bs_new->in_use == 0); 1655 assert(bs_new->io_limits_enabled == false); 1656 assert(!throttle_have_timer(&bs_new->throttle_state)); 1657 1658 tmp = *bs_new; 1659 *bs_new = *bs_old; 1660 *bs_old = tmp; 1661 1662 /* there are some fields that should not be swapped, move them back */ 1663 bdrv_move_feature_fields(&tmp, bs_old); 1664 bdrv_move_feature_fields(bs_old, bs_new); 1665 bdrv_move_feature_fields(bs_new, &tmp); 1666 1667 /* bs_new shouldn't be in bdrv_states even after the swap! */ 1668 assert(bs_new->device_name[0] == '\0'); 1669 1670 /* Check a few fields that should remain attached to the device */ 1671 assert(bs_new->dev == NULL); 1672 assert(bs_new->job == NULL); 1673 assert(bs_new->in_use == 0); 1674 assert(bs_new->io_limits_enabled == false); 1675 assert(!throttle_have_timer(&bs_new->throttle_state)); 1676 1677 bdrv_rebind(bs_new); 1678 bdrv_rebind(bs_old); 1679 } 1680 1681 /* 1682 * Add new bs contents at the top of an image chain while the chain is 1683 * live, while keeping required fields on the top layer. 1684 * 1685 * This will modify the BlockDriverState fields, and swap contents 1686 * between bs_new and bs_top. Both bs_new and bs_top are modified. 1687 * 1688 * bs_new is required to be anonymous. 1689 * 1690 * This function does not create any image files. 1691 */ 1692 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) 1693 { 1694 bdrv_swap(bs_new, bs_top); 1695 1696 /* The contents of 'tmp' will become bs_top, as we are 1697 * swapping bs_new and bs_top contents. */ 1698 bs_top->backing_hd = bs_new; 1699 bs_top->open_flags &= ~BDRV_O_NO_BACKING; 1700 pstrcpy(bs_top->backing_file, sizeof(bs_top->backing_file), 1701 bs_new->filename); 1702 pstrcpy(bs_top->backing_format, sizeof(bs_top->backing_format), 1703 bs_new->drv ? bs_new->drv->format_name : ""); 1704 } 1705 1706 static void bdrv_delete(BlockDriverState *bs) 1707 { 1708 assert(!bs->dev); 1709 assert(!bs->job); 1710 assert(!bs->in_use); 1711 assert(!bs->refcnt); 1712 1713 bdrv_close(bs); 1714 1715 /* remove from list, if necessary */ 1716 bdrv_make_anon(bs); 1717 1718 g_free(bs); 1719 } 1720 1721 int bdrv_attach_dev(BlockDriverState *bs, void *dev) 1722 /* TODO change to DeviceState *dev when all users are qdevified */ 1723 { 1724 if (bs->dev) { 1725 return -EBUSY; 1726 } 1727 bs->dev = dev; 1728 bdrv_iostatus_reset(bs); 1729 return 0; 1730 } 1731 1732 /* TODO qdevified devices don't use this, remove when devices are qdevified */ 1733 void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev) 1734 { 1735 if (bdrv_attach_dev(bs, dev) < 0) { 1736 abort(); 1737 } 1738 } 1739 1740 void bdrv_detach_dev(BlockDriverState *bs, void *dev) 1741 /* TODO change to DeviceState *dev when all users are qdevified */ 1742 { 1743 assert(bs->dev == dev); 1744 bs->dev = NULL; 1745 bs->dev_ops = NULL; 1746 bs->dev_opaque = NULL; 1747 bs->buffer_alignment = 512; 1748 } 1749 1750 /* TODO change to return DeviceState * when all users are qdevified */ 1751 void *bdrv_get_attached_dev(BlockDriverState *bs) 1752 { 1753 return bs->dev; 1754 } 1755 1756 void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops, 1757 void *opaque) 1758 { 1759 bs->dev_ops = ops; 1760 bs->dev_opaque = opaque; 1761 } 1762 1763 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv, 1764 enum MonitorEvent ev, 1765 BlockErrorAction action, bool is_read) 1766 { 1767 QObject *data; 1768 const char *action_str; 1769 1770 switch (action) { 1771 case BDRV_ACTION_REPORT: 1772 action_str = "report"; 1773 break; 1774 case BDRV_ACTION_IGNORE: 1775 action_str = "ignore"; 1776 break; 1777 case BDRV_ACTION_STOP: 1778 action_str = "stop"; 1779 break; 1780 default: 1781 abort(); 1782 } 1783 1784 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }", 1785 bdrv->device_name, 1786 action_str, 1787 is_read ? "read" : "write"); 1788 monitor_protocol_event(ev, data); 1789 1790 qobject_decref(data); 1791 } 1792 1793 static void bdrv_emit_qmp_eject_event(BlockDriverState *bs, bool ejected) 1794 { 1795 QObject *data; 1796 1797 data = qobject_from_jsonf("{ 'device': %s, 'tray-open': %i }", 1798 bdrv_get_device_name(bs), ejected); 1799 monitor_protocol_event(QEVENT_DEVICE_TRAY_MOVED, data); 1800 1801 qobject_decref(data); 1802 } 1803 1804 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load) 1805 { 1806 if (bs->dev_ops && bs->dev_ops->change_media_cb) { 1807 bool tray_was_closed = !bdrv_dev_is_tray_open(bs); 1808 bs->dev_ops->change_media_cb(bs->dev_opaque, load); 1809 if (tray_was_closed) { 1810 /* tray open */ 1811 bdrv_emit_qmp_eject_event(bs, true); 1812 } 1813 if (load) { 1814 /* tray close */ 1815 bdrv_emit_qmp_eject_event(bs, false); 1816 } 1817 } 1818 } 1819 1820 bool bdrv_dev_has_removable_media(BlockDriverState *bs) 1821 { 1822 return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb); 1823 } 1824 1825 void bdrv_dev_eject_request(BlockDriverState *bs, bool force) 1826 { 1827 if (bs->dev_ops && bs->dev_ops->eject_request_cb) { 1828 bs->dev_ops->eject_request_cb(bs->dev_opaque, force); 1829 } 1830 } 1831 1832 bool bdrv_dev_is_tray_open(BlockDriverState *bs) 1833 { 1834 if (bs->dev_ops && bs->dev_ops->is_tray_open) { 1835 return bs->dev_ops->is_tray_open(bs->dev_opaque); 1836 } 1837 return false; 1838 } 1839 1840 static void bdrv_dev_resize_cb(BlockDriverState *bs) 1841 { 1842 if (bs->dev_ops && bs->dev_ops->resize_cb) { 1843 bs->dev_ops->resize_cb(bs->dev_opaque); 1844 } 1845 } 1846 1847 bool bdrv_dev_is_medium_locked(BlockDriverState *bs) 1848 { 1849 if (bs->dev_ops && bs->dev_ops->is_medium_locked) { 1850 return bs->dev_ops->is_medium_locked(bs->dev_opaque); 1851 } 1852 return false; 1853 } 1854 1855 /* 1856 * Run consistency checks on an image 1857 * 1858 * Returns 0 if the check could be completed (it doesn't mean that the image is 1859 * free of errors) or -errno when an internal error occurred. The results of the 1860 * check are stored in res. 1861 */ 1862 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) 1863 { 1864 if (bs->drv->bdrv_check == NULL) { 1865 return -ENOTSUP; 1866 } 1867 1868 memset(res, 0, sizeof(*res)); 1869 return bs->drv->bdrv_check(bs, res, fix); 1870 } 1871 1872 #define COMMIT_BUF_SECTORS 2048 1873 1874 /* commit COW file into the raw image */ 1875 int bdrv_commit(BlockDriverState *bs) 1876 { 1877 BlockDriver *drv = bs->drv; 1878 int64_t sector, total_sectors; 1879 int n, ro, open_flags; 1880 int ret = 0; 1881 uint8_t *buf; 1882 char filename[PATH_MAX]; 1883 1884 if (!drv) 1885 return -ENOMEDIUM; 1886 1887 if (!bs->backing_hd) { 1888 return -ENOTSUP; 1889 } 1890 1891 if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) { 1892 return -EBUSY; 1893 } 1894 1895 ro = bs->backing_hd->read_only; 1896 /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */ 1897 pstrcpy(filename, sizeof(filename), bs->backing_hd->filename); 1898 open_flags = bs->backing_hd->open_flags; 1899 1900 if (ro) { 1901 if (bdrv_reopen(bs->backing_hd, open_flags | BDRV_O_RDWR, NULL)) { 1902 return -EACCES; 1903 } 1904 } 1905 1906 total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; 1907 buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); 1908 1909 for (sector = 0; sector < total_sectors; sector += n) { 1910 ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n); 1911 if (ret < 0) { 1912 goto ro_cleanup; 1913 } 1914 if (ret) { 1915 if (bdrv_read(bs, sector, buf, n) != 0) { 1916 ret = -EIO; 1917 goto ro_cleanup; 1918 } 1919 1920 if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) { 1921 ret = -EIO; 1922 goto ro_cleanup; 1923 } 1924 } 1925 } 1926 1927 if (drv->bdrv_make_empty) { 1928 ret = drv->bdrv_make_empty(bs); 1929 bdrv_flush(bs); 1930 } 1931 1932 /* 1933 * Make sure all data we wrote to the backing device is actually 1934 * stable on disk. 1935 */ 1936 if (bs->backing_hd) 1937 bdrv_flush(bs->backing_hd); 1938 1939 ro_cleanup: 1940 g_free(buf); 1941 1942 if (ro) { 1943 /* ignoring error return here */ 1944 bdrv_reopen(bs->backing_hd, open_flags & ~BDRV_O_RDWR, NULL); 1945 } 1946 1947 return ret; 1948 } 1949 1950 int bdrv_commit_all(void) 1951 { 1952 BlockDriverState *bs; 1953 1954 QTAILQ_FOREACH(bs, &bdrv_states, list) { 1955 if (bs->drv && bs->backing_hd) { 1956 int ret = bdrv_commit(bs); 1957 if (ret < 0) { 1958 return ret; 1959 } 1960 } 1961 } 1962 return 0; 1963 } 1964 1965 /** 1966 * Remove an active request from the tracked requests list 1967 * 1968 * This function should be called when a tracked request is completing. 1969 */ 1970 static void tracked_request_end(BdrvTrackedRequest *req) 1971 { 1972 QLIST_REMOVE(req, list); 1973 qemu_co_queue_restart_all(&req->wait_queue); 1974 } 1975 1976 /** 1977 * Add an active request to the tracked requests list 1978 */ 1979 static void tracked_request_begin(BdrvTrackedRequest *req, 1980 BlockDriverState *bs, 1981 int64_t sector_num, 1982 int nb_sectors, bool is_write) 1983 { 1984 *req = (BdrvTrackedRequest){ 1985 .bs = bs, 1986 .sector_num = sector_num, 1987 .nb_sectors = nb_sectors, 1988 .is_write = is_write, 1989 .co = qemu_coroutine_self(), 1990 }; 1991 1992 qemu_co_queue_init(&req->wait_queue); 1993 1994 QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); 1995 } 1996 1997 /** 1998 * Round a region to cluster boundaries 1999 */ 2000 void bdrv_round_to_clusters(BlockDriverState *bs, 2001 int64_t sector_num, int nb_sectors, 2002 int64_t *cluster_sector_num, 2003 int *cluster_nb_sectors) 2004 { 2005 BlockDriverInfo bdi; 2006 2007 if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { 2008 *cluster_sector_num = sector_num; 2009 *cluster_nb_sectors = nb_sectors; 2010 } else { 2011 int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; 2012 *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); 2013 *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + 2014 nb_sectors, c); 2015 } 2016 } 2017 2018 static bool tracked_request_overlaps(BdrvTrackedRequest *req, 2019 int64_t sector_num, int nb_sectors) { 2020 /* aaaa bbbb */ 2021 if (sector_num >= req->sector_num + req->nb_sectors) { 2022 return false; 2023 } 2024 /* bbbb aaaa */ 2025 if (req->sector_num >= sector_num + nb_sectors) { 2026 return false; 2027 } 2028 return true; 2029 } 2030 2031 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, 2032 int64_t sector_num, int nb_sectors) 2033 { 2034 BdrvTrackedRequest *req; 2035 int64_t cluster_sector_num; 2036 int cluster_nb_sectors; 2037 bool retry; 2038 2039 /* If we touch the same cluster it counts as an overlap. This guarantees 2040 * that allocating writes will be serialized and not race with each other 2041 * for the same cluster. For example, in copy-on-read it ensures that the 2042 * CoR read and write operations are atomic and guest writes cannot 2043 * interleave between them. 2044 */ 2045 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 2046 &cluster_sector_num, &cluster_nb_sectors); 2047 2048 do { 2049 retry = false; 2050 QLIST_FOREACH(req, &bs->tracked_requests, list) { 2051 if (tracked_request_overlaps(req, cluster_sector_num, 2052 cluster_nb_sectors)) { 2053 /* Hitting this means there was a reentrant request, for 2054 * example, a block driver issuing nested requests. This must 2055 * never happen since it means deadlock. 2056 */ 2057 assert(qemu_coroutine_self() != req->co); 2058 2059 qemu_co_queue_wait(&req->wait_queue); 2060 retry = true; 2061 break; 2062 } 2063 } 2064 } while (retry); 2065 } 2066 2067 /* 2068 * Return values: 2069 * 0 - success 2070 * -EINVAL - backing format specified, but no file 2071 * -ENOSPC - can't update the backing file because no space is left in the 2072 * image file header 2073 * -ENOTSUP - format driver doesn't support changing the backing file 2074 */ 2075 int bdrv_change_backing_file(BlockDriverState *bs, 2076 const char *backing_file, const char *backing_fmt) 2077 { 2078 BlockDriver *drv = bs->drv; 2079 int ret; 2080 2081 /* Backing file format doesn't make sense without a backing file */ 2082 if (backing_fmt && !backing_file) { 2083 return -EINVAL; 2084 } 2085 2086 if (drv->bdrv_change_backing_file != NULL) { 2087 ret = drv->bdrv_change_backing_file(bs, backing_file, backing_fmt); 2088 } else { 2089 ret = -ENOTSUP; 2090 } 2091 2092 if (ret == 0) { 2093 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); 2094 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); 2095 } 2096 return ret; 2097 } 2098 2099 /* 2100 * Finds the image layer in the chain that has 'bs' as its backing file. 2101 * 2102 * active is the current topmost image. 2103 * 2104 * Returns NULL if bs is not found in active's image chain, 2105 * or if active == bs. 2106 */ 2107 BlockDriverState *bdrv_find_overlay(BlockDriverState *active, 2108 BlockDriverState *bs) 2109 { 2110 BlockDriverState *overlay = NULL; 2111 BlockDriverState *intermediate; 2112 2113 assert(active != NULL); 2114 assert(bs != NULL); 2115 2116 /* if bs is the same as active, then by definition it has no overlay 2117 */ 2118 if (active == bs) { 2119 return NULL; 2120 } 2121 2122 intermediate = active; 2123 while (intermediate->backing_hd) { 2124 if (intermediate->backing_hd == bs) { 2125 overlay = intermediate; 2126 break; 2127 } 2128 intermediate = intermediate->backing_hd; 2129 } 2130 2131 return overlay; 2132 } 2133 2134 typedef struct BlkIntermediateStates { 2135 BlockDriverState *bs; 2136 QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; 2137 } BlkIntermediateStates; 2138 2139 2140 /* 2141 * Drops images above 'base' up to and including 'top', and sets the image 2142 * above 'top' to have base as its backing file. 2143 * 2144 * Requires that the overlay to 'top' is opened r/w, so that the backing file 2145 * information in 'bs' can be properly updated. 2146 * 2147 * E.g., this will convert the following chain: 2148 * bottom <- base <- intermediate <- top <- active 2149 * 2150 * to 2151 * 2152 * bottom <- base <- active 2153 * 2154 * It is allowed for bottom==base, in which case it converts: 2155 * 2156 * base <- intermediate <- top <- active 2157 * 2158 * to 2159 * 2160 * base <- active 2161 * 2162 * Error conditions: 2163 * if active == top, that is considered an error 2164 * 2165 */ 2166 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, 2167 BlockDriverState *base) 2168 { 2169 BlockDriverState *intermediate; 2170 BlockDriverState *base_bs = NULL; 2171 BlockDriverState *new_top_bs = NULL; 2172 BlkIntermediateStates *intermediate_state, *next; 2173 int ret = -EIO; 2174 2175 QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete; 2176 QSIMPLEQ_INIT(&states_to_delete); 2177 2178 if (!top->drv || !base->drv) { 2179 goto exit; 2180 } 2181 2182 new_top_bs = bdrv_find_overlay(active, top); 2183 2184 if (new_top_bs == NULL) { 2185 /* we could not find the image above 'top', this is an error */ 2186 goto exit; 2187 } 2188 2189 /* special case of new_top_bs->backing_hd already pointing to base - nothing 2190 * to do, no intermediate images */ 2191 if (new_top_bs->backing_hd == base) { 2192 ret = 0; 2193 goto exit; 2194 } 2195 2196 intermediate = top; 2197 2198 /* now we will go down through the list, and add each BDS we find 2199 * into our deletion queue, until we hit the 'base' 2200 */ 2201 while (intermediate) { 2202 intermediate_state = g_malloc0(sizeof(BlkIntermediateStates)); 2203 intermediate_state->bs = intermediate; 2204 QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry); 2205 2206 if (intermediate->backing_hd == base) { 2207 base_bs = intermediate->backing_hd; 2208 break; 2209 } 2210 intermediate = intermediate->backing_hd; 2211 } 2212 if (base_bs == NULL) { 2213 /* something went wrong, we did not end at the base. safely 2214 * unravel everything, and exit with error */ 2215 goto exit; 2216 } 2217 2218 /* success - we can delete the intermediate states, and link top->base */ 2219 ret = bdrv_change_backing_file(new_top_bs, base_bs->filename, 2220 base_bs->drv ? base_bs->drv->format_name : ""); 2221 if (ret) { 2222 goto exit; 2223 } 2224 new_top_bs->backing_hd = base_bs; 2225 2226 2227 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2228 /* so that bdrv_close() does not recursively close the chain */ 2229 intermediate_state->bs->backing_hd = NULL; 2230 bdrv_unref(intermediate_state->bs); 2231 } 2232 ret = 0; 2233 2234 exit: 2235 QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { 2236 g_free(intermediate_state); 2237 } 2238 return ret; 2239 } 2240 2241 2242 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, 2243 size_t size) 2244 { 2245 int64_t len; 2246 2247 if (!bdrv_is_inserted(bs)) 2248 return -ENOMEDIUM; 2249 2250 if (bs->growable) 2251 return 0; 2252 2253 len = bdrv_getlength(bs); 2254 2255 if (offset < 0) 2256 return -EIO; 2257 2258 if ((offset > len) || (len - offset < size)) 2259 return -EIO; 2260 2261 return 0; 2262 } 2263 2264 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, 2265 int nb_sectors) 2266 { 2267 return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, 2268 nb_sectors * BDRV_SECTOR_SIZE); 2269 } 2270 2271 typedef struct RwCo { 2272 BlockDriverState *bs; 2273 int64_t sector_num; 2274 int nb_sectors; 2275 QEMUIOVector *qiov; 2276 bool is_write; 2277 int ret; 2278 BdrvRequestFlags flags; 2279 } RwCo; 2280 2281 static void coroutine_fn bdrv_rw_co_entry(void *opaque) 2282 { 2283 RwCo *rwco = opaque; 2284 2285 if (!rwco->is_write) { 2286 rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, 2287 rwco->nb_sectors, rwco->qiov, 2288 rwco->flags); 2289 } else { 2290 rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, 2291 rwco->nb_sectors, rwco->qiov, 2292 rwco->flags); 2293 } 2294 } 2295 2296 /* 2297 * Process a vectored synchronous request using coroutines 2298 */ 2299 static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, 2300 QEMUIOVector *qiov, bool is_write, 2301 BdrvRequestFlags flags) 2302 { 2303 Coroutine *co; 2304 RwCo rwco = { 2305 .bs = bs, 2306 .sector_num = sector_num, 2307 .nb_sectors = qiov->size >> BDRV_SECTOR_BITS, 2308 .qiov = qiov, 2309 .is_write = is_write, 2310 .ret = NOT_DONE, 2311 .flags = flags, 2312 }; 2313 assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0); 2314 2315 /** 2316 * In sync call context, when the vcpu is blocked, this throttling timer 2317 * will not fire; so the I/O throttling function has to be disabled here 2318 * if it has been enabled. 2319 */ 2320 if (bs->io_limits_enabled) { 2321 fprintf(stderr, "Disabling I/O throttling on '%s' due " 2322 "to synchronous I/O.\n", bdrv_get_device_name(bs)); 2323 bdrv_io_limits_disable(bs); 2324 } 2325 2326 if (qemu_in_coroutine()) { 2327 /* Fast-path if already in coroutine context */ 2328 bdrv_rw_co_entry(&rwco); 2329 } else { 2330 co = qemu_coroutine_create(bdrv_rw_co_entry); 2331 qemu_coroutine_enter(co, &rwco); 2332 while (rwco.ret == NOT_DONE) { 2333 qemu_aio_wait(); 2334 } 2335 } 2336 return rwco.ret; 2337 } 2338 2339 /* 2340 * Process a synchronous request using coroutines 2341 */ 2342 static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, 2343 int nb_sectors, bool is_write, BdrvRequestFlags flags) 2344 { 2345 QEMUIOVector qiov; 2346 struct iovec iov = { 2347 .iov_base = (void *)buf, 2348 .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 2349 }; 2350 2351 qemu_iovec_init_external(&qiov, &iov, 1); 2352 return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags); 2353 } 2354 2355 /* return < 0 if error. See bdrv_write() for the return codes */ 2356 int bdrv_read(BlockDriverState *bs, int64_t sector_num, 2357 uint8_t *buf, int nb_sectors) 2358 { 2359 return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); 2360 } 2361 2362 /* Just like bdrv_read(), but with I/O throttling temporarily disabled */ 2363 int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, 2364 uint8_t *buf, int nb_sectors) 2365 { 2366 bool enabled; 2367 int ret; 2368 2369 enabled = bs->io_limits_enabled; 2370 bs->io_limits_enabled = false; 2371 ret = bdrv_read(bs, sector_num, buf, nb_sectors); 2372 bs->io_limits_enabled = enabled; 2373 return ret; 2374 } 2375 2376 /* Return < 0 if error. Important errors are: 2377 -EIO generic I/O error (may happen for all errors) 2378 -ENOMEDIUM No media inserted. 2379 -EINVAL Invalid sector number or nb_sectors 2380 -EACCES Trying to write a read-only device 2381 */ 2382 int bdrv_write(BlockDriverState *bs, int64_t sector_num, 2383 const uint8_t *buf, int nb_sectors) 2384 { 2385 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); 2386 } 2387 2388 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) 2389 { 2390 return bdrv_rwv_co(bs, sector_num, qiov, true, 0); 2391 } 2392 2393 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors) 2394 { 2395 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, 2396 BDRV_REQ_ZERO_WRITE); 2397 } 2398 2399 int bdrv_pread(BlockDriverState *bs, int64_t offset, 2400 void *buf, int count1) 2401 { 2402 uint8_t tmp_buf[BDRV_SECTOR_SIZE]; 2403 int len, nb_sectors, count; 2404 int64_t sector_num; 2405 int ret; 2406 2407 count = count1; 2408 /* first read to align to sector start */ 2409 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); 2410 if (len > count) 2411 len = count; 2412 sector_num = offset >> BDRV_SECTOR_BITS; 2413 if (len > 0) { 2414 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) 2415 return ret; 2416 memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len); 2417 count -= len; 2418 if (count == 0) 2419 return count1; 2420 sector_num++; 2421 buf += len; 2422 } 2423 2424 /* read the sectors "in place" */ 2425 nb_sectors = count >> BDRV_SECTOR_BITS; 2426 if (nb_sectors > 0) { 2427 if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0) 2428 return ret; 2429 sector_num += nb_sectors; 2430 len = nb_sectors << BDRV_SECTOR_BITS; 2431 buf += len; 2432 count -= len; 2433 } 2434 2435 /* add data from the last sector */ 2436 if (count > 0) { 2437 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) 2438 return ret; 2439 memcpy(buf, tmp_buf, count); 2440 } 2441 return count1; 2442 } 2443 2444 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) 2445 { 2446 uint8_t tmp_buf[BDRV_SECTOR_SIZE]; 2447 int len, nb_sectors, count; 2448 int64_t sector_num; 2449 int ret; 2450 2451 count = qiov->size; 2452 2453 /* first write to align to sector start */ 2454 len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); 2455 if (len > count) 2456 len = count; 2457 sector_num = offset >> BDRV_SECTOR_BITS; 2458 if (len > 0) { 2459 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) 2460 return ret; 2461 qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), 2462 len); 2463 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) 2464 return ret; 2465 count -= len; 2466 if (count == 0) 2467 return qiov->size; 2468 sector_num++; 2469 } 2470 2471 /* write the sectors "in place" */ 2472 nb_sectors = count >> BDRV_SECTOR_BITS; 2473 if (nb_sectors > 0) { 2474 QEMUIOVector qiov_inplace; 2475 2476 qemu_iovec_init(&qiov_inplace, qiov->niov); 2477 qemu_iovec_concat(&qiov_inplace, qiov, len, 2478 nb_sectors << BDRV_SECTOR_BITS); 2479 ret = bdrv_writev(bs, sector_num, &qiov_inplace); 2480 qemu_iovec_destroy(&qiov_inplace); 2481 if (ret < 0) { 2482 return ret; 2483 } 2484 2485 sector_num += nb_sectors; 2486 len = nb_sectors << BDRV_SECTOR_BITS; 2487 count -= len; 2488 } 2489 2490 /* add data from the last sector */ 2491 if (count > 0) { 2492 if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) 2493 return ret; 2494 qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count); 2495 if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) 2496 return ret; 2497 } 2498 return qiov->size; 2499 } 2500 2501 int bdrv_pwrite(BlockDriverState *bs, int64_t offset, 2502 const void *buf, int count1) 2503 { 2504 QEMUIOVector qiov; 2505 struct iovec iov = { 2506 .iov_base = (void *) buf, 2507 .iov_len = count1, 2508 }; 2509 2510 qemu_iovec_init_external(&qiov, &iov, 1); 2511 return bdrv_pwritev(bs, offset, &qiov); 2512 } 2513 2514 /* 2515 * Writes to the file and ensures that no writes are reordered across this 2516 * request (acts as a barrier) 2517 * 2518 * Returns 0 on success, -errno in error cases. 2519 */ 2520 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, 2521 const void *buf, int count) 2522 { 2523 int ret; 2524 2525 ret = bdrv_pwrite(bs, offset, buf, count); 2526 if (ret < 0) { 2527 return ret; 2528 } 2529 2530 /* No flush needed for cache modes that already do it */ 2531 if (bs->enable_write_cache) { 2532 bdrv_flush(bs); 2533 } 2534 2535 return 0; 2536 } 2537 2538 static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, 2539 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 2540 { 2541 /* Perform I/O through a temporary buffer so that users who scribble over 2542 * their read buffer while the operation is in progress do not end up 2543 * modifying the image file. This is critical for zero-copy guest I/O 2544 * where anything might happen inside guest memory. 2545 */ 2546 void *bounce_buffer; 2547 2548 BlockDriver *drv = bs->drv; 2549 struct iovec iov; 2550 QEMUIOVector bounce_qiov; 2551 int64_t cluster_sector_num; 2552 int cluster_nb_sectors; 2553 size_t skip_bytes; 2554 int ret; 2555 2556 /* Cover entire cluster so no additional backing file I/O is required when 2557 * allocating cluster in the image file. 2558 */ 2559 bdrv_round_to_clusters(bs, sector_num, nb_sectors, 2560 &cluster_sector_num, &cluster_nb_sectors); 2561 2562 trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, 2563 cluster_sector_num, cluster_nb_sectors); 2564 2565 iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; 2566 iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); 2567 qemu_iovec_init_external(&bounce_qiov, &iov, 1); 2568 2569 ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, 2570 &bounce_qiov); 2571 if (ret < 0) { 2572 goto err; 2573 } 2574 2575 if (drv->bdrv_co_write_zeroes && 2576 buffer_is_zero(bounce_buffer, iov.iov_len)) { 2577 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, 2578 cluster_nb_sectors); 2579 } else { 2580 /* This does not change the data on the disk, it is not necessary 2581 * to flush even in cache=writethrough mode. 2582 */ 2583 ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, 2584 &bounce_qiov); 2585 } 2586 2587 if (ret < 0) { 2588 /* It might be okay to ignore write errors for guest requests. If this 2589 * is a deliberate copy-on-read then we don't want to ignore the error. 2590 * Simply report it in all cases. 2591 */ 2592 goto err; 2593 } 2594 2595 skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; 2596 qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, 2597 nb_sectors * BDRV_SECTOR_SIZE); 2598 2599 err: 2600 qemu_vfree(bounce_buffer); 2601 return ret; 2602 } 2603 2604 /* 2605 * Handle a read request in coroutine context 2606 */ 2607 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, 2608 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 2609 BdrvRequestFlags flags) 2610 { 2611 BlockDriver *drv = bs->drv; 2612 BdrvTrackedRequest req; 2613 int ret; 2614 2615 if (!drv) { 2616 return -ENOMEDIUM; 2617 } 2618 if (bdrv_check_request(bs, sector_num, nb_sectors)) { 2619 return -EIO; 2620 } 2621 2622 if (bs->copy_on_read) { 2623 flags |= BDRV_REQ_COPY_ON_READ; 2624 } 2625 if (flags & BDRV_REQ_COPY_ON_READ) { 2626 bs->copy_on_read_in_flight++; 2627 } 2628 2629 if (bs->copy_on_read_in_flight) { 2630 wait_for_overlapping_requests(bs, sector_num, nb_sectors); 2631 } 2632 2633 /* throttling disk I/O */ 2634 if (bs->io_limits_enabled) { 2635 bdrv_io_limits_intercept(bs, nb_sectors, false); 2636 } 2637 2638 tracked_request_begin(&req, bs, sector_num, nb_sectors, false); 2639 2640 if (flags & BDRV_REQ_COPY_ON_READ) { 2641 int pnum; 2642 2643 ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); 2644 if (ret < 0) { 2645 goto out; 2646 } 2647 2648 if (!ret || pnum != nb_sectors) { 2649 ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); 2650 goto out; 2651 } 2652 } 2653 2654 if (!(bs->zero_beyond_eof && bs->growable)) { 2655 ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 2656 } else { 2657 /* Read zeros after EOF of growable BDSes */ 2658 int64_t len, total_sectors, max_nb_sectors; 2659 2660 len = bdrv_getlength(bs); 2661 if (len < 0) { 2662 ret = len; 2663 goto out; 2664 } 2665 2666 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); 2667 max_nb_sectors = MAX(0, total_sectors - sector_num); 2668 if (max_nb_sectors > 0) { 2669 ret = drv->bdrv_co_readv(bs, sector_num, 2670 MIN(nb_sectors, max_nb_sectors), qiov); 2671 } else { 2672 ret = 0; 2673 } 2674 2675 /* Reading beyond end of file is supposed to produce zeroes */ 2676 if (ret == 0 && total_sectors < sector_num + nb_sectors) { 2677 uint64_t offset = MAX(0, total_sectors - sector_num); 2678 uint64_t bytes = (sector_num + nb_sectors - offset) * 2679 BDRV_SECTOR_SIZE; 2680 qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); 2681 } 2682 } 2683 2684 out: 2685 tracked_request_end(&req); 2686 2687 if (flags & BDRV_REQ_COPY_ON_READ) { 2688 bs->copy_on_read_in_flight--; 2689 } 2690 2691 return ret; 2692 } 2693 2694 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, 2695 int nb_sectors, QEMUIOVector *qiov) 2696 { 2697 trace_bdrv_co_readv(bs, sector_num, nb_sectors); 2698 2699 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); 2700 } 2701 2702 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, 2703 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) 2704 { 2705 trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); 2706 2707 return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 2708 BDRV_REQ_COPY_ON_READ); 2709 } 2710 2711 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, 2712 int64_t sector_num, int nb_sectors) 2713 { 2714 BlockDriver *drv = bs->drv; 2715 QEMUIOVector qiov; 2716 struct iovec iov; 2717 int ret; 2718 2719 /* TODO Emulate only part of misaligned requests instead of letting block 2720 * drivers return -ENOTSUP and emulate everything */ 2721 2722 /* First try the efficient write zeroes operation */ 2723 if (drv->bdrv_co_write_zeroes) { 2724 ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); 2725 if (ret != -ENOTSUP) { 2726 return ret; 2727 } 2728 } 2729 2730 /* Fall back to bounce buffer if write zeroes is unsupported */ 2731 iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; 2732 iov.iov_base = qemu_blockalign(bs, iov.iov_len); 2733 memset(iov.iov_base, 0, iov.iov_len); 2734 qemu_iovec_init_external(&qiov, &iov, 1); 2735 2736 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); 2737 2738 qemu_vfree(iov.iov_base); 2739 return ret; 2740 } 2741 2742 /* 2743 * Handle a write request in coroutine context 2744 */ 2745 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, 2746 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, 2747 BdrvRequestFlags flags) 2748 { 2749 BlockDriver *drv = bs->drv; 2750 BdrvTrackedRequest req; 2751 int ret; 2752 2753 if (!bs->drv) { 2754 return -ENOMEDIUM; 2755 } 2756 if (bs->read_only) { 2757 return -EACCES; 2758 } 2759 if (bdrv_check_request(bs, sector_num, nb_sectors)) { 2760 return -EIO; 2761 } 2762 2763 if (bs->copy_on_read_in_flight) { 2764 wait_for_overlapping_requests(bs, sector_num, nb_sectors); 2765 } 2766 2767 /* throttling disk I/O */ 2768 if (bs->io_limits_enabled) { 2769 bdrv_io_limits_intercept(bs, nb_sectors, true); 2770 } 2771 2772 tracked_request_begin(&req, bs, sector_num, nb_sectors, true); 2773 2774 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); 2775 2776 if (ret < 0) { 2777 /* Do nothing, write notifier decided to fail this request */ 2778 } else if (flags & BDRV_REQ_ZERO_WRITE) { 2779 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); 2780 } else { 2781 ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); 2782 } 2783 2784 if (ret == 0 && !bs->enable_write_cache) { 2785 ret = bdrv_co_flush(bs); 2786 } 2787 2788 if (bs->dirty_bitmap) { 2789 bdrv_set_dirty(bs, sector_num, nb_sectors); 2790 } 2791 2792 if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { 2793 bs->wr_highest_sector = sector_num + nb_sectors - 1; 2794 } 2795 if (bs->growable && ret >= 0) { 2796 bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); 2797 } 2798 2799 tracked_request_end(&req); 2800 2801 return ret; 2802 } 2803 2804 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, 2805 int nb_sectors, QEMUIOVector *qiov) 2806 { 2807 trace_bdrv_co_writev(bs, sector_num, nb_sectors); 2808 2809 return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); 2810 } 2811 2812 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, 2813 int64_t sector_num, int nb_sectors) 2814 { 2815 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); 2816 2817 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, 2818 BDRV_REQ_ZERO_WRITE); 2819 } 2820 2821 /** 2822 * Truncate file to 'offset' bytes (needed only for file protocols) 2823 */ 2824 int bdrv_truncate(BlockDriverState *bs, int64_t offset) 2825 { 2826 BlockDriver *drv = bs->drv; 2827 int ret; 2828 if (!drv) 2829 return -ENOMEDIUM; 2830 if (!drv->bdrv_truncate) 2831 return -ENOTSUP; 2832 if (bs->read_only) 2833 return -EACCES; 2834 if (bdrv_in_use(bs)) 2835 return -EBUSY; 2836 ret = drv->bdrv_truncate(bs, offset); 2837 if (ret == 0) { 2838 ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 2839 bdrv_dev_resize_cb(bs); 2840 } 2841 return ret; 2842 } 2843 2844 /** 2845 * Length of a allocated file in bytes. Sparse files are counted by actual 2846 * allocated space. Return < 0 if error or unknown. 2847 */ 2848 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) 2849 { 2850 BlockDriver *drv = bs->drv; 2851 if (!drv) { 2852 return -ENOMEDIUM; 2853 } 2854 if (drv->bdrv_get_allocated_file_size) { 2855 return drv->bdrv_get_allocated_file_size(bs); 2856 } 2857 if (bs->file) { 2858 return bdrv_get_allocated_file_size(bs->file); 2859 } 2860 return -ENOTSUP; 2861 } 2862 2863 /** 2864 * Length of a file in bytes. Return < 0 if error or unknown. 2865 */ 2866 int64_t bdrv_getlength(BlockDriverState *bs) 2867 { 2868 BlockDriver *drv = bs->drv; 2869 if (!drv) 2870 return -ENOMEDIUM; 2871 2872 if (drv->has_variable_length) { 2873 int ret = refresh_total_sectors(bs, bs->total_sectors); 2874 if (ret < 0) { 2875 return ret; 2876 } 2877 } 2878 return bs->total_sectors * BDRV_SECTOR_SIZE; 2879 } 2880 2881 /* return 0 as number of sectors if no device present or error */ 2882 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) 2883 { 2884 int64_t length; 2885 length = bdrv_getlength(bs); 2886 if (length < 0) 2887 length = 0; 2888 else 2889 length = length >> BDRV_SECTOR_BITS; 2890 *nb_sectors_ptr = length; 2891 } 2892 2893 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, 2894 BlockdevOnError on_write_error) 2895 { 2896 bs->on_read_error = on_read_error; 2897 bs->on_write_error = on_write_error; 2898 } 2899 2900 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) 2901 { 2902 return is_read ? bs->on_read_error : bs->on_write_error; 2903 } 2904 2905 BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) 2906 { 2907 BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; 2908 2909 switch (on_err) { 2910 case BLOCKDEV_ON_ERROR_ENOSPC: 2911 return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT; 2912 case BLOCKDEV_ON_ERROR_STOP: 2913 return BDRV_ACTION_STOP; 2914 case BLOCKDEV_ON_ERROR_REPORT: 2915 return BDRV_ACTION_REPORT; 2916 case BLOCKDEV_ON_ERROR_IGNORE: 2917 return BDRV_ACTION_IGNORE; 2918 default: 2919 abort(); 2920 } 2921 } 2922 2923 /* This is done by device models because, while the block layer knows 2924 * about the error, it does not know whether an operation comes from 2925 * the device or the block layer (from a job, for example). 2926 */ 2927 void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, 2928 bool is_read, int error) 2929 { 2930 assert(error >= 0); 2931 bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read); 2932 if (action == BDRV_ACTION_STOP) { 2933 vm_stop(RUN_STATE_IO_ERROR); 2934 bdrv_iostatus_set_err(bs, error); 2935 } 2936 } 2937 2938 int bdrv_is_read_only(BlockDriverState *bs) 2939 { 2940 return bs->read_only; 2941 } 2942 2943 int bdrv_is_sg(BlockDriverState *bs) 2944 { 2945 return bs->sg; 2946 } 2947 2948 int bdrv_enable_write_cache(BlockDriverState *bs) 2949 { 2950 return bs->enable_write_cache; 2951 } 2952 2953 void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) 2954 { 2955 bs->enable_write_cache = wce; 2956 2957 /* so a reopen() will preserve wce */ 2958 if (wce) { 2959 bs->open_flags |= BDRV_O_CACHE_WB; 2960 } else { 2961 bs->open_flags &= ~BDRV_O_CACHE_WB; 2962 } 2963 } 2964 2965 int bdrv_is_encrypted(BlockDriverState *bs) 2966 { 2967 if (bs->backing_hd && bs->backing_hd->encrypted) 2968 return 1; 2969 return bs->encrypted; 2970 } 2971 2972 int bdrv_key_required(BlockDriverState *bs) 2973 { 2974 BlockDriverState *backing_hd = bs->backing_hd; 2975 2976 if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) 2977 return 1; 2978 return (bs->encrypted && !bs->valid_key); 2979 } 2980 2981 int bdrv_set_key(BlockDriverState *bs, const char *key) 2982 { 2983 int ret; 2984 if (bs->backing_hd && bs->backing_hd->encrypted) { 2985 ret = bdrv_set_key(bs->backing_hd, key); 2986 if (ret < 0) 2987 return ret; 2988 if (!bs->encrypted) 2989 return 0; 2990 } 2991 if (!bs->encrypted) { 2992 return -EINVAL; 2993 } else if (!bs->drv || !bs->drv->bdrv_set_key) { 2994 return -ENOMEDIUM; 2995 } 2996 ret = bs->drv->bdrv_set_key(bs, key); 2997 if (ret < 0) { 2998 bs->valid_key = 0; 2999 } else if (!bs->valid_key) { 3000 bs->valid_key = 1; 3001 /* call the change callback now, we skipped it on open */ 3002 bdrv_dev_change_media_cb(bs, true); 3003 } 3004 return ret; 3005 } 3006 3007 const char *bdrv_get_format_name(BlockDriverState *bs) 3008 { 3009 return bs->drv ? bs->drv->format_name : NULL; 3010 } 3011 3012 void bdrv_iterate_format(void (*it)(void *opaque, const char *name), 3013 void *opaque) 3014 { 3015 BlockDriver *drv; 3016 3017 QLIST_FOREACH(drv, &bdrv_drivers, list) { 3018 it(opaque, drv->format_name); 3019 } 3020 } 3021 3022 BlockDriverState *bdrv_find(const char *name) 3023 { 3024 BlockDriverState *bs; 3025 3026 QTAILQ_FOREACH(bs, &bdrv_states, list) { 3027 if (!strcmp(name, bs->device_name)) { 3028 return bs; 3029 } 3030 } 3031 return NULL; 3032 } 3033 3034 BlockDriverState *bdrv_next(BlockDriverState *bs) 3035 { 3036 if (!bs) { 3037 return QTAILQ_FIRST(&bdrv_states); 3038 } 3039 return QTAILQ_NEXT(bs, list); 3040 } 3041 3042 void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) 3043 { 3044 BlockDriverState *bs; 3045 3046 QTAILQ_FOREACH(bs, &bdrv_states, list) { 3047 it(opaque, bs); 3048 } 3049 } 3050 3051 const char *bdrv_get_device_name(BlockDriverState *bs) 3052 { 3053 return bs->device_name; 3054 } 3055 3056 int bdrv_get_flags(BlockDriverState *bs) 3057 { 3058 return bs->open_flags; 3059 } 3060 3061 int bdrv_flush_all(void) 3062 { 3063 BlockDriverState *bs; 3064 int result = 0; 3065 3066 QTAILQ_FOREACH(bs, &bdrv_states, list) { 3067 int ret = bdrv_flush(bs); 3068 if (ret < 0 && !result) { 3069 result = ret; 3070 } 3071 } 3072 3073 return result; 3074 } 3075 3076 int bdrv_has_zero_init_1(BlockDriverState *bs) 3077 { 3078 return 1; 3079 } 3080 3081 int bdrv_has_zero_init(BlockDriverState *bs) 3082 { 3083 assert(bs->drv); 3084 3085 /* If BS is a copy on write image, it is initialized to 3086 the contents of the base image, which may not be zeroes. */ 3087 if (bs->backing_hd) { 3088 return 0; 3089 } 3090 if (bs->drv->bdrv_has_zero_init) { 3091 return bs->drv->bdrv_has_zero_init(bs); 3092 } 3093 3094 /* safe default */ 3095 return 0; 3096 } 3097 3098 typedef struct BdrvCoGetBlockStatusData { 3099 BlockDriverState *bs; 3100 BlockDriverState *base; 3101 int64_t sector_num; 3102 int nb_sectors; 3103 int *pnum; 3104 int64_t ret; 3105 bool done; 3106 } BdrvCoGetBlockStatusData; 3107 3108 /* 3109 * Returns true iff the specified sector is present in the disk image. Drivers 3110 * not implementing the functionality are assumed to not support backing files, 3111 * hence all their sectors are reported as allocated. 3112 * 3113 * If 'sector_num' is beyond the end of the disk image the return value is 0 3114 * and 'pnum' is set to 0. 3115 * 3116 * 'pnum' is set to the number of sectors (including and immediately following 3117 * the specified sector) that are known to be in the same 3118 * allocated/unallocated state. 3119 * 3120 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes 3121 * beyond the end of the disk image it will be clamped. 3122 */ 3123 static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, 3124 int64_t sector_num, 3125 int nb_sectors, int *pnum) 3126 { 3127 int64_t length; 3128 int64_t n; 3129 int64_t ret, ret2; 3130 3131 length = bdrv_getlength(bs); 3132 if (length < 0) { 3133 return length; 3134 } 3135 3136 if (sector_num >= (length >> BDRV_SECTOR_BITS)) { 3137 *pnum = 0; 3138 return 0; 3139 } 3140 3141 n = bs->total_sectors - sector_num; 3142 if (n < nb_sectors) { 3143 nb_sectors = n; 3144 } 3145 3146 if (!bs->drv->bdrv_co_get_block_status) { 3147 *pnum = nb_sectors; 3148 ret = BDRV_BLOCK_DATA; 3149 if (bs->drv->protocol_name) { 3150 ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); 3151 } 3152 return ret; 3153 } 3154 3155 ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); 3156 if (ret < 0) { 3157 *pnum = 0; 3158 return ret; 3159 } 3160 3161 if (ret & BDRV_BLOCK_RAW) { 3162 assert(ret & BDRV_BLOCK_OFFSET_VALID); 3163 return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 3164 *pnum, pnum); 3165 } 3166 3167 if (!(ret & BDRV_BLOCK_DATA)) { 3168 if (bdrv_has_zero_init(bs)) { 3169 ret |= BDRV_BLOCK_ZERO; 3170 } else if (bs->backing_hd) { 3171 BlockDriverState *bs2 = bs->backing_hd; 3172 int64_t length2 = bdrv_getlength(bs2); 3173 if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) { 3174 ret |= BDRV_BLOCK_ZERO; 3175 } 3176 } 3177 } 3178 3179 if (bs->file && 3180 (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 3181 (ret & BDRV_BLOCK_OFFSET_VALID)) { 3182 ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, 3183 *pnum, pnum); 3184 if (ret2 >= 0) { 3185 /* Ignore errors. This is just providing extra information, it 3186 * is useful but not necessary. 3187 */ 3188 ret |= (ret2 & BDRV_BLOCK_ZERO); 3189 } 3190 } 3191 3192 return ret; 3193 } 3194 3195 /* Coroutine wrapper for bdrv_get_block_status() */ 3196 static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) 3197 { 3198 BdrvCoGetBlockStatusData *data = opaque; 3199 BlockDriverState *bs = data->bs; 3200 3201 data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, 3202 data->pnum); 3203 data->done = true; 3204 } 3205 3206 /* 3207 * Synchronous wrapper around bdrv_co_get_block_status(). 3208 * 3209 * See bdrv_co_get_block_status() for details. 3210 */ 3211 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, 3212 int nb_sectors, int *pnum) 3213 { 3214 Coroutine *co; 3215 BdrvCoGetBlockStatusData data = { 3216 .bs = bs, 3217 .sector_num = sector_num, 3218 .nb_sectors = nb_sectors, 3219 .pnum = pnum, 3220 .done = false, 3221 }; 3222 3223 if (qemu_in_coroutine()) { 3224 /* Fast-path if already in coroutine context */ 3225 bdrv_get_block_status_co_entry(&data); 3226 } else { 3227 co = qemu_coroutine_create(bdrv_get_block_status_co_entry); 3228 qemu_coroutine_enter(co, &data); 3229 while (!data.done) { 3230 qemu_aio_wait(); 3231 } 3232 } 3233 return data.ret; 3234 } 3235 3236 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, 3237 int nb_sectors, int *pnum) 3238 { 3239 int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); 3240 if (ret < 0) { 3241 return ret; 3242 } 3243 return 3244 (ret & BDRV_BLOCK_DATA) || 3245 ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs)); 3246 } 3247 3248 /* 3249 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 3250 * 3251 * Return true if the given sector is allocated in any image between 3252 * BASE and TOP (inclusive). BASE can be NULL to check if the given 3253 * sector is allocated in any image of the chain. Return false otherwise. 3254 * 3255 * 'pnum' is set to the number of sectors (including and immediately following 3256 * the specified sector) that are known to be in the same 3257 * allocated/unallocated state. 3258 * 3259 */ 3260 int bdrv_is_allocated_above(BlockDriverState *top, 3261 BlockDriverState *base, 3262 int64_t sector_num, 3263 int nb_sectors, int *pnum) 3264 { 3265 BlockDriverState *intermediate; 3266 int ret, n = nb_sectors; 3267 3268 intermediate = top; 3269 while (intermediate && intermediate != base) { 3270 int pnum_inter; 3271 ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, 3272 &pnum_inter); 3273 if (ret < 0) { 3274 return ret; 3275 } else if (ret) { 3276 *pnum = pnum_inter; 3277 return 1; 3278 } 3279 3280 /* 3281 * [sector_num, nb_sectors] is unallocated on top but intermediate 3282 * might have 3283 * 3284 * [sector_num+x, nr_sectors] allocated. 3285 */ 3286 if (n > pnum_inter && 3287 (intermediate == top || 3288 sector_num + pnum_inter < intermediate->total_sectors)) { 3289 n = pnum_inter; 3290 } 3291 3292 intermediate = intermediate->backing_hd; 3293 } 3294 3295 *pnum = n; 3296 return 0; 3297 } 3298 3299 const char *bdrv_get_encrypted_filename(BlockDriverState *bs) 3300 { 3301 if (bs->backing_hd && bs->backing_hd->encrypted) 3302 return bs->backing_file; 3303 else if (bs->encrypted) 3304 return bs->filename; 3305 else 3306 return NULL; 3307 } 3308 3309 void bdrv_get_backing_filename(BlockDriverState *bs, 3310 char *filename, int filename_size) 3311 { 3312 pstrcpy(filename, filename_size, bs->backing_file); 3313 } 3314 3315 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, 3316 const uint8_t *buf, int nb_sectors) 3317 { 3318 BlockDriver *drv = bs->drv; 3319 if (!drv) 3320 return -ENOMEDIUM; 3321 if (!drv->bdrv_write_compressed) 3322 return -ENOTSUP; 3323 if (bdrv_check_request(bs, sector_num, nb_sectors)) 3324 return -EIO; 3325 3326 assert(!bs->dirty_bitmap); 3327 3328 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); 3329 } 3330 3331 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 3332 { 3333 BlockDriver *drv = bs->drv; 3334 if (!drv) 3335 return -ENOMEDIUM; 3336 if (!drv->bdrv_get_info) 3337 return -ENOTSUP; 3338 memset(bdi, 0, sizeof(*bdi)); 3339 return drv->bdrv_get_info(bs, bdi); 3340 } 3341 3342 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) 3343 { 3344 BlockDriver *drv = bs->drv; 3345 if (drv && drv->bdrv_get_specific_info) { 3346 return drv->bdrv_get_specific_info(bs); 3347 } 3348 return NULL; 3349 } 3350 3351 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 3352 int64_t pos, int size) 3353 { 3354 QEMUIOVector qiov; 3355 struct iovec iov = { 3356 .iov_base = (void *) buf, 3357 .iov_len = size, 3358 }; 3359 3360 qemu_iovec_init_external(&qiov, &iov, 1); 3361 return bdrv_writev_vmstate(bs, &qiov, pos); 3362 } 3363 3364 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 3365 { 3366 BlockDriver *drv = bs->drv; 3367 3368 if (!drv) { 3369 return -ENOMEDIUM; 3370 } else if (drv->bdrv_save_vmstate) { 3371 return drv->bdrv_save_vmstate(bs, qiov, pos); 3372 } else if (bs->file) { 3373 return bdrv_writev_vmstate(bs->file, qiov, pos); 3374 } 3375 3376 return -ENOTSUP; 3377 } 3378 3379 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 3380 int64_t pos, int size) 3381 { 3382 BlockDriver *drv = bs->drv; 3383 if (!drv) 3384 return -ENOMEDIUM; 3385 if (drv->bdrv_load_vmstate) 3386 return drv->bdrv_load_vmstate(bs, buf, pos, size); 3387 if (bs->file) 3388 return bdrv_load_vmstate(bs->file, buf, pos, size); 3389 return -ENOTSUP; 3390 } 3391 3392 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) 3393 { 3394 if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { 3395 return; 3396 } 3397 3398 bs->drv->bdrv_debug_event(bs, event); 3399 } 3400 3401 int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, 3402 const char *tag) 3403 { 3404 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { 3405 bs = bs->file; 3406 } 3407 3408 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { 3409 return bs->drv->bdrv_debug_breakpoint(bs, event, tag); 3410 } 3411 3412 return -ENOTSUP; 3413 } 3414 3415 int bdrv_debug_resume(BlockDriverState *bs, const char *tag) 3416 { 3417 while (bs && bs->drv && !bs->drv->bdrv_debug_resume) { 3418 bs = bs->file; 3419 } 3420 3421 if (bs && bs->drv && bs->drv->bdrv_debug_resume) { 3422 return bs->drv->bdrv_debug_resume(bs, tag); 3423 } 3424 3425 return -ENOTSUP; 3426 } 3427 3428 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) 3429 { 3430 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { 3431 bs = bs->file; 3432 } 3433 3434 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { 3435 return bs->drv->bdrv_debug_is_suspended(bs, tag); 3436 } 3437 3438 return false; 3439 } 3440 3441 int bdrv_is_snapshot(BlockDriverState *bs) 3442 { 3443 return !!(bs->open_flags & BDRV_O_SNAPSHOT); 3444 } 3445 3446 /* backing_file can either be relative, or absolute, or a protocol. If it is 3447 * relative, it must be relative to the chain. So, passing in bs->filename 3448 * from a BDS as backing_file should not be done, as that may be relative to 3449 * the CWD rather than the chain. */ 3450 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, 3451 const char *backing_file) 3452 { 3453 char *filename_full = NULL; 3454 char *backing_file_full = NULL; 3455 char *filename_tmp = NULL; 3456 int is_protocol = 0; 3457 BlockDriverState *curr_bs = NULL; 3458 BlockDriverState *retval = NULL; 3459 3460 if (!bs || !bs->drv || !backing_file) { 3461 return NULL; 3462 } 3463 3464 filename_full = g_malloc(PATH_MAX); 3465 backing_file_full = g_malloc(PATH_MAX); 3466 filename_tmp = g_malloc(PATH_MAX); 3467 3468 is_protocol = path_has_protocol(backing_file); 3469 3470 for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { 3471 3472 /* If either of the filename paths is actually a protocol, then 3473 * compare unmodified paths; otherwise make paths relative */ 3474 if (is_protocol || path_has_protocol(curr_bs->backing_file)) { 3475 if (strcmp(backing_file, curr_bs->backing_file) == 0) { 3476 retval = curr_bs->backing_hd; 3477 break; 3478 } 3479 } else { 3480 /* If not an absolute filename path, make it relative to the current 3481 * image's filename path */ 3482 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 3483 backing_file); 3484 3485 /* We are going to compare absolute pathnames */ 3486 if (!realpath(filename_tmp, filename_full)) { 3487 continue; 3488 } 3489 3490 /* We need to make sure the backing filename we are comparing against 3491 * is relative to the current image filename (or absolute) */ 3492 path_combine(filename_tmp, PATH_MAX, curr_bs->filename, 3493 curr_bs->backing_file); 3494 3495 if (!realpath(filename_tmp, backing_file_full)) { 3496 continue; 3497 } 3498 3499 if (strcmp(backing_file_full, filename_full) == 0) { 3500 retval = curr_bs->backing_hd; 3501 break; 3502 } 3503 } 3504 } 3505 3506 g_free(filename_full); 3507 g_free(backing_file_full); 3508 g_free(filename_tmp); 3509 return retval; 3510 } 3511 3512 int bdrv_get_backing_file_depth(BlockDriverState *bs) 3513 { 3514 if (!bs->drv) { 3515 return 0; 3516 } 3517 3518 if (!bs->backing_hd) { 3519 return 0; 3520 } 3521 3522 return 1 + bdrv_get_backing_file_depth(bs->backing_hd); 3523 } 3524 3525 BlockDriverState *bdrv_find_base(BlockDriverState *bs) 3526 { 3527 BlockDriverState *curr_bs = NULL; 3528 3529 if (!bs) { 3530 return NULL; 3531 } 3532 3533 curr_bs = bs; 3534 3535 while (curr_bs->backing_hd) { 3536 curr_bs = curr_bs->backing_hd; 3537 } 3538 return curr_bs; 3539 } 3540 3541 /**************************************************************/ 3542 /* async I/Os */ 3543 3544 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 3545 QEMUIOVector *qiov, int nb_sectors, 3546 BlockDriverCompletionFunc *cb, void *opaque) 3547 { 3548 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 3549 3550 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 3551 cb, opaque, false); 3552 } 3553 3554 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 3555 QEMUIOVector *qiov, int nb_sectors, 3556 BlockDriverCompletionFunc *cb, void *opaque) 3557 { 3558 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 3559 3560 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 3561 cb, opaque, true); 3562 } 3563 3564 3565 typedef struct MultiwriteCB { 3566 int error; 3567 int num_requests; 3568 int num_callbacks; 3569 struct { 3570 BlockDriverCompletionFunc *cb; 3571 void *opaque; 3572 QEMUIOVector *free_qiov; 3573 } callbacks[]; 3574 } MultiwriteCB; 3575 3576 static void multiwrite_user_cb(MultiwriteCB *mcb) 3577 { 3578 int i; 3579 3580 for (i = 0; i < mcb->num_callbacks; i++) { 3581 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 3582 if (mcb->callbacks[i].free_qiov) { 3583 qemu_iovec_destroy(mcb->callbacks[i].free_qiov); 3584 } 3585 g_free(mcb->callbacks[i].free_qiov); 3586 } 3587 } 3588 3589 static void multiwrite_cb(void *opaque, int ret) 3590 { 3591 MultiwriteCB *mcb = opaque; 3592 3593 trace_multiwrite_cb(mcb, ret); 3594 3595 if (ret < 0 && !mcb->error) { 3596 mcb->error = ret; 3597 } 3598 3599 mcb->num_requests--; 3600 if (mcb->num_requests == 0) { 3601 multiwrite_user_cb(mcb); 3602 g_free(mcb); 3603 } 3604 } 3605 3606 static int multiwrite_req_compare(const void *a, const void *b) 3607 { 3608 const BlockRequest *req1 = a, *req2 = b; 3609 3610 /* 3611 * Note that we can't simply subtract req2->sector from req1->sector 3612 * here as that could overflow the return value. 3613 */ 3614 if (req1->sector > req2->sector) { 3615 return 1; 3616 } else if (req1->sector < req2->sector) { 3617 return -1; 3618 } else { 3619 return 0; 3620 } 3621 } 3622 3623 /* 3624 * Takes a bunch of requests and tries to merge them. Returns the number of 3625 * requests that remain after merging. 3626 */ 3627 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, 3628 int num_reqs, MultiwriteCB *mcb) 3629 { 3630 int i, outidx; 3631 3632 // Sort requests by start sector 3633 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); 3634 3635 // Check if adjacent requests touch the same clusters. If so, combine them, 3636 // filling up gaps with zero sectors. 3637 outidx = 0; 3638 for (i = 1; i < num_reqs; i++) { 3639 int merge = 0; 3640 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; 3641 3642 // Handle exactly sequential writes and overlapping writes. 3643 if (reqs[i].sector <= oldreq_last) { 3644 merge = 1; 3645 } 3646 3647 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { 3648 merge = 0; 3649 } 3650 3651 if (merge) { 3652 size_t size; 3653 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); 3654 qemu_iovec_init(qiov, 3655 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); 3656 3657 // Add the first request to the merged one. If the requests are 3658 // overlapping, drop the last sectors of the first request. 3659 size = (reqs[i].sector - reqs[outidx].sector) << 9; 3660 qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); 3661 3662 // We should need to add any zeros between the two requests 3663 assert (reqs[i].sector <= oldreq_last); 3664 3665 // Add the second request 3666 qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); 3667 3668 reqs[outidx].nb_sectors = qiov->size >> 9; 3669 reqs[outidx].qiov = qiov; 3670 3671 mcb->callbacks[i].free_qiov = reqs[outidx].qiov; 3672 } else { 3673 outidx++; 3674 reqs[outidx].sector = reqs[i].sector; 3675 reqs[outidx].nb_sectors = reqs[i].nb_sectors; 3676 reqs[outidx].qiov = reqs[i].qiov; 3677 } 3678 } 3679 3680 return outidx + 1; 3681 } 3682 3683 /* 3684 * Submit multiple AIO write requests at once. 3685 * 3686 * On success, the function returns 0 and all requests in the reqs array have 3687 * been submitted. In error case this function returns -1, and any of the 3688 * requests may or may not be submitted yet. In particular, this means that the 3689 * callback will be called for some of the requests, for others it won't. The 3690 * caller must check the error field of the BlockRequest to wait for the right 3691 * callbacks (if error != 0, no callback will be called). 3692 * 3693 * The implementation may modify the contents of the reqs array, e.g. to merge 3694 * requests. However, the fields opaque and error are left unmodified as they 3695 * are used to signal failure for a single request to the caller. 3696 */ 3697 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) 3698 { 3699 MultiwriteCB *mcb; 3700 int i; 3701 3702 /* don't submit writes if we don't have a medium */ 3703 if (bs->drv == NULL) { 3704 for (i = 0; i < num_reqs; i++) { 3705 reqs[i].error = -ENOMEDIUM; 3706 } 3707 return -1; 3708 } 3709 3710 if (num_reqs == 0) { 3711 return 0; 3712 } 3713 3714 // Create MultiwriteCB structure 3715 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); 3716 mcb->num_requests = 0; 3717 mcb->num_callbacks = num_reqs; 3718 3719 for (i = 0; i < num_reqs; i++) { 3720 mcb->callbacks[i].cb = reqs[i].cb; 3721 mcb->callbacks[i].opaque = reqs[i].opaque; 3722 } 3723 3724 // Check for mergable requests 3725 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); 3726 3727 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); 3728 3729 /* Run the aio requests. */ 3730 mcb->num_requests = num_reqs; 3731 for (i = 0; i < num_reqs; i++) { 3732 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, 3733 reqs[i].nb_sectors, multiwrite_cb, mcb); 3734 } 3735 3736 return 0; 3737 } 3738 3739 void bdrv_aio_cancel(BlockDriverAIOCB *acb) 3740 { 3741 acb->aiocb_info->cancel(acb); 3742 } 3743 3744 /**************************************************************/ 3745 /* async block device emulation */ 3746 3747 typedef struct BlockDriverAIOCBSync { 3748 BlockDriverAIOCB common; 3749 QEMUBH *bh; 3750 int ret; 3751 /* vector translation state */ 3752 QEMUIOVector *qiov; 3753 uint8_t *bounce; 3754 int is_write; 3755 } BlockDriverAIOCBSync; 3756 3757 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) 3758 { 3759 BlockDriverAIOCBSync *acb = 3760 container_of(blockacb, BlockDriverAIOCBSync, common); 3761 qemu_bh_delete(acb->bh); 3762 acb->bh = NULL; 3763 qemu_aio_release(acb); 3764 } 3765 3766 static const AIOCBInfo bdrv_em_aiocb_info = { 3767 .aiocb_size = sizeof(BlockDriverAIOCBSync), 3768 .cancel = bdrv_aio_cancel_em, 3769 }; 3770 3771 static void bdrv_aio_bh_cb(void *opaque) 3772 { 3773 BlockDriverAIOCBSync *acb = opaque; 3774 3775 if (!acb->is_write) 3776 qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); 3777 qemu_vfree(acb->bounce); 3778 acb->common.cb(acb->common.opaque, acb->ret); 3779 qemu_bh_delete(acb->bh); 3780 acb->bh = NULL; 3781 qemu_aio_release(acb); 3782 } 3783 3784 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, 3785 int64_t sector_num, 3786 QEMUIOVector *qiov, 3787 int nb_sectors, 3788 BlockDriverCompletionFunc *cb, 3789 void *opaque, 3790 int is_write) 3791 3792 { 3793 BlockDriverAIOCBSync *acb; 3794 3795 acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); 3796 acb->is_write = is_write; 3797 acb->qiov = qiov; 3798 acb->bounce = qemu_blockalign(bs, qiov->size); 3799 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); 3800 3801 if (is_write) { 3802 qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); 3803 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); 3804 } else { 3805 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); 3806 } 3807 3808 qemu_bh_schedule(acb->bh); 3809 3810 return &acb->common; 3811 } 3812 3813 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, 3814 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 3815 BlockDriverCompletionFunc *cb, void *opaque) 3816 { 3817 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 3818 } 3819 3820 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, 3821 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 3822 BlockDriverCompletionFunc *cb, void *opaque) 3823 { 3824 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 3825 } 3826 3827 3828 typedef struct BlockDriverAIOCBCoroutine { 3829 BlockDriverAIOCB common; 3830 BlockRequest req; 3831 bool is_write; 3832 bool *done; 3833 QEMUBH* bh; 3834 } BlockDriverAIOCBCoroutine; 3835 3836 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb) 3837 { 3838 BlockDriverAIOCBCoroutine *acb = 3839 container_of(blockacb, BlockDriverAIOCBCoroutine, common); 3840 bool done = false; 3841 3842 acb->done = &done; 3843 while (!done) { 3844 qemu_aio_wait(); 3845 } 3846 } 3847 3848 static const AIOCBInfo bdrv_em_co_aiocb_info = { 3849 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine), 3850 .cancel = bdrv_aio_co_cancel_em, 3851 }; 3852 3853 static void bdrv_co_em_bh(void *opaque) 3854 { 3855 BlockDriverAIOCBCoroutine *acb = opaque; 3856 3857 acb->common.cb(acb->common.opaque, acb->req.error); 3858 3859 if (acb->done) { 3860 *acb->done = true; 3861 } 3862 3863 qemu_bh_delete(acb->bh); 3864 qemu_aio_release(acb); 3865 } 3866 3867 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ 3868 static void coroutine_fn bdrv_co_do_rw(void *opaque) 3869 { 3870 BlockDriverAIOCBCoroutine *acb = opaque; 3871 BlockDriverState *bs = acb->common.bs; 3872 3873 if (!acb->is_write) { 3874 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, 3875 acb->req.nb_sectors, acb->req.qiov, 0); 3876 } else { 3877 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, 3878 acb->req.nb_sectors, acb->req.qiov, 0); 3879 } 3880 3881 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 3882 qemu_bh_schedule(acb->bh); 3883 } 3884 3885 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, 3886 int64_t sector_num, 3887 QEMUIOVector *qiov, 3888 int nb_sectors, 3889 BlockDriverCompletionFunc *cb, 3890 void *opaque, 3891 bool is_write) 3892 { 3893 Coroutine *co; 3894 BlockDriverAIOCBCoroutine *acb; 3895 3896 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 3897 acb->req.sector = sector_num; 3898 acb->req.nb_sectors = nb_sectors; 3899 acb->req.qiov = qiov; 3900 acb->is_write = is_write; 3901 acb->done = NULL; 3902 3903 co = qemu_coroutine_create(bdrv_co_do_rw); 3904 qemu_coroutine_enter(co, acb); 3905 3906 return &acb->common; 3907 } 3908 3909 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) 3910 { 3911 BlockDriverAIOCBCoroutine *acb = opaque; 3912 BlockDriverState *bs = acb->common.bs; 3913 3914 acb->req.error = bdrv_co_flush(bs); 3915 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 3916 qemu_bh_schedule(acb->bh); 3917 } 3918 3919 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, 3920 BlockDriverCompletionFunc *cb, void *opaque) 3921 { 3922 trace_bdrv_aio_flush(bs, opaque); 3923 3924 Coroutine *co; 3925 BlockDriverAIOCBCoroutine *acb; 3926 3927 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 3928 acb->done = NULL; 3929 3930 co = qemu_coroutine_create(bdrv_aio_flush_co_entry); 3931 qemu_coroutine_enter(co, acb); 3932 3933 return &acb->common; 3934 } 3935 3936 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) 3937 { 3938 BlockDriverAIOCBCoroutine *acb = opaque; 3939 BlockDriverState *bs = acb->common.bs; 3940 3941 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); 3942 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); 3943 qemu_bh_schedule(acb->bh); 3944 } 3945 3946 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, 3947 int64_t sector_num, int nb_sectors, 3948 BlockDriverCompletionFunc *cb, void *opaque) 3949 { 3950 Coroutine *co; 3951 BlockDriverAIOCBCoroutine *acb; 3952 3953 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); 3954 3955 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); 3956 acb->req.sector = sector_num; 3957 acb->req.nb_sectors = nb_sectors; 3958 acb->done = NULL; 3959 co = qemu_coroutine_create(bdrv_aio_discard_co_entry); 3960 qemu_coroutine_enter(co, acb); 3961 3962 return &acb->common; 3963 } 3964 3965 void bdrv_init(void) 3966 { 3967 module_call_init(MODULE_INIT_BLOCK); 3968 } 3969 3970 void bdrv_init_with_whitelist(void) 3971 { 3972 use_bdrv_whitelist = 1; 3973 bdrv_init(); 3974 } 3975 3976 void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, 3977 BlockDriverCompletionFunc *cb, void *opaque) 3978 { 3979 BlockDriverAIOCB *acb; 3980 3981 acb = g_slice_alloc(aiocb_info->aiocb_size); 3982 acb->aiocb_info = aiocb_info; 3983 acb->bs = bs; 3984 acb->cb = cb; 3985 acb->opaque = opaque; 3986 return acb; 3987 } 3988 3989 void qemu_aio_release(void *p) 3990 { 3991 BlockDriverAIOCB *acb = p; 3992 g_slice_free1(acb->aiocb_info->aiocb_size, acb); 3993 } 3994 3995 /**************************************************************/ 3996 /* Coroutine block device emulation */ 3997 3998 typedef struct CoroutineIOCompletion { 3999 Coroutine *coroutine; 4000 int ret; 4001 } CoroutineIOCompletion; 4002 4003 static void bdrv_co_io_em_complete(void *opaque, int ret) 4004 { 4005 CoroutineIOCompletion *co = opaque; 4006 4007 co->ret = ret; 4008 qemu_coroutine_enter(co->coroutine, NULL); 4009 } 4010 4011 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, 4012 int nb_sectors, QEMUIOVector *iov, 4013 bool is_write) 4014 { 4015 CoroutineIOCompletion co = { 4016 .coroutine = qemu_coroutine_self(), 4017 }; 4018 BlockDriverAIOCB *acb; 4019 4020 if (is_write) { 4021 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, 4022 bdrv_co_io_em_complete, &co); 4023 } else { 4024 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, 4025 bdrv_co_io_em_complete, &co); 4026 } 4027 4028 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); 4029 if (!acb) { 4030 return -EIO; 4031 } 4032 qemu_coroutine_yield(); 4033 4034 return co.ret; 4035 } 4036 4037 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, 4038 int64_t sector_num, int nb_sectors, 4039 QEMUIOVector *iov) 4040 { 4041 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); 4042 } 4043 4044 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, 4045 int64_t sector_num, int nb_sectors, 4046 QEMUIOVector *iov) 4047 { 4048 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); 4049 } 4050 4051 static void coroutine_fn bdrv_flush_co_entry(void *opaque) 4052 { 4053 RwCo *rwco = opaque; 4054 4055 rwco->ret = bdrv_co_flush(rwco->bs); 4056 } 4057 4058 int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 4059 { 4060 int ret; 4061 4062 if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { 4063 return 0; 4064 } 4065 4066 /* Write back cached data to the OS even with cache=unsafe */ 4067 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 4068 if (bs->drv->bdrv_co_flush_to_os) { 4069 ret = bs->drv->bdrv_co_flush_to_os(bs); 4070 if (ret < 0) { 4071 return ret; 4072 } 4073 } 4074 4075 /* But don't actually force it to the disk with cache=unsafe */ 4076 if (bs->open_flags & BDRV_O_NO_FLUSH) { 4077 goto flush_parent; 4078 } 4079 4080 BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 4081 if (bs->drv->bdrv_co_flush_to_disk) { 4082 ret = bs->drv->bdrv_co_flush_to_disk(bs); 4083 } else if (bs->drv->bdrv_aio_flush) { 4084 BlockDriverAIOCB *acb; 4085 CoroutineIOCompletion co = { 4086 .coroutine = qemu_coroutine_self(), 4087 }; 4088 4089 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 4090 if (acb == NULL) { 4091 ret = -EIO; 4092 } else { 4093 qemu_coroutine_yield(); 4094 ret = co.ret; 4095 } 4096 } else { 4097 /* 4098 * Some block drivers always operate in either writethrough or unsafe 4099 * mode and don't support bdrv_flush therefore. Usually qemu doesn't 4100 * know how the server works (because the behaviour is hardcoded or 4101 * depends on server-side configuration), so we can't ensure that 4102 * everything is safe on disk. Returning an error doesn't work because 4103 * that would break guests even if the server operates in writethrough 4104 * mode. 4105 * 4106 * Let's hope the user knows what he's doing. 4107 */ 4108 ret = 0; 4109 } 4110 if (ret < 0) { 4111 return ret; 4112 } 4113 4114 /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 4115 * in the case of cache=unsafe, so there are no useless flushes. 4116 */ 4117 flush_parent: 4118 return bdrv_co_flush(bs->file); 4119 } 4120 4121 void bdrv_invalidate_cache(BlockDriverState *bs) 4122 { 4123 if (bs->drv && bs->drv->bdrv_invalidate_cache) { 4124 bs->drv->bdrv_invalidate_cache(bs); 4125 } 4126 } 4127 4128 void bdrv_invalidate_cache_all(void) 4129 { 4130 BlockDriverState *bs; 4131 4132 QTAILQ_FOREACH(bs, &bdrv_states, list) { 4133 bdrv_invalidate_cache(bs); 4134 } 4135 } 4136 4137 void bdrv_clear_incoming_migration_all(void) 4138 { 4139 BlockDriverState *bs; 4140 4141 QTAILQ_FOREACH(bs, &bdrv_states, list) { 4142 bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING); 4143 } 4144 } 4145 4146 int bdrv_flush(BlockDriverState *bs) 4147 { 4148 Coroutine *co; 4149 RwCo rwco = { 4150 .bs = bs, 4151 .ret = NOT_DONE, 4152 }; 4153 4154 if (qemu_in_coroutine()) { 4155 /* Fast-path if already in coroutine context */ 4156 bdrv_flush_co_entry(&rwco); 4157 } else { 4158 co = qemu_coroutine_create(bdrv_flush_co_entry); 4159 qemu_coroutine_enter(co, &rwco); 4160 while (rwco.ret == NOT_DONE) { 4161 qemu_aio_wait(); 4162 } 4163 } 4164 4165 return rwco.ret; 4166 } 4167 4168 static void coroutine_fn bdrv_discard_co_entry(void *opaque) 4169 { 4170 RwCo *rwco = opaque; 4171 4172 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); 4173 } 4174 4175 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, 4176 int nb_sectors) 4177 { 4178 if (!bs->drv) { 4179 return -ENOMEDIUM; 4180 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { 4181 return -EIO; 4182 } else if (bs->read_only) { 4183 return -EROFS; 4184 } 4185 4186 if (bs->dirty_bitmap) { 4187 bdrv_reset_dirty(bs, sector_num, nb_sectors); 4188 } 4189 4190 /* Do nothing if disabled. */ 4191 if (!(bs->open_flags & BDRV_O_UNMAP)) { 4192 return 0; 4193 } 4194 4195 if (bs->drv->bdrv_co_discard) { 4196 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors); 4197 } else if (bs->drv->bdrv_aio_discard) { 4198 BlockDriverAIOCB *acb; 4199 CoroutineIOCompletion co = { 4200 .coroutine = qemu_coroutine_self(), 4201 }; 4202 4203 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, 4204 bdrv_co_io_em_complete, &co); 4205 if (acb == NULL) { 4206 return -EIO; 4207 } else { 4208 qemu_coroutine_yield(); 4209 return co.ret; 4210 } 4211 } else { 4212 return 0; 4213 } 4214 } 4215 4216 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) 4217 { 4218 Coroutine *co; 4219 RwCo rwco = { 4220 .bs = bs, 4221 .sector_num = sector_num, 4222 .nb_sectors = nb_sectors, 4223 .ret = NOT_DONE, 4224 }; 4225 4226 if (qemu_in_coroutine()) { 4227 /* Fast-path if already in coroutine context */ 4228 bdrv_discard_co_entry(&rwco); 4229 } else { 4230 co = qemu_coroutine_create(bdrv_discard_co_entry); 4231 qemu_coroutine_enter(co, &rwco); 4232 while (rwco.ret == NOT_DONE) { 4233 qemu_aio_wait(); 4234 } 4235 } 4236 4237 return rwco.ret; 4238 } 4239 4240 /**************************************************************/ 4241 /* removable device support */ 4242 4243 /** 4244 * Return TRUE if the media is present 4245 */ 4246 int bdrv_is_inserted(BlockDriverState *bs) 4247 { 4248 BlockDriver *drv = bs->drv; 4249 4250 if (!drv) 4251 return 0; 4252 if (!drv->bdrv_is_inserted) 4253 return 1; 4254 return drv->bdrv_is_inserted(bs); 4255 } 4256 4257 /** 4258 * Return whether the media changed since the last call to this 4259 * function, or -ENOTSUP if we don't know. Most drivers don't know. 4260 */ 4261 int bdrv_media_changed(BlockDriverState *bs) 4262 { 4263 BlockDriver *drv = bs->drv; 4264 4265 if (drv && drv->bdrv_media_changed) { 4266 return drv->bdrv_media_changed(bs); 4267 } 4268 return -ENOTSUP; 4269 } 4270 4271 /** 4272 * If eject_flag is TRUE, eject the media. Otherwise, close the tray 4273 */ 4274 void bdrv_eject(BlockDriverState *bs, bool eject_flag) 4275 { 4276 BlockDriver *drv = bs->drv; 4277 4278 if (drv && drv->bdrv_eject) { 4279 drv->bdrv_eject(bs, eject_flag); 4280 } 4281 4282 if (bs->device_name[0] != '\0') { 4283 bdrv_emit_qmp_eject_event(bs, eject_flag); 4284 } 4285 } 4286 4287 /** 4288 * Lock or unlock the media (if it is locked, the user won't be able 4289 * to eject it manually). 4290 */ 4291 void bdrv_lock_medium(BlockDriverState *bs, bool locked) 4292 { 4293 BlockDriver *drv = bs->drv; 4294 4295 trace_bdrv_lock_medium(bs, locked); 4296 4297 if (drv && drv->bdrv_lock_medium) { 4298 drv->bdrv_lock_medium(bs, locked); 4299 } 4300 } 4301 4302 /* needed for generic scsi interface */ 4303 4304 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) 4305 { 4306 BlockDriver *drv = bs->drv; 4307 4308 if (drv && drv->bdrv_ioctl) 4309 return drv->bdrv_ioctl(bs, req, buf); 4310 return -ENOTSUP; 4311 } 4312 4313 BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, 4314 unsigned long int req, void *buf, 4315 BlockDriverCompletionFunc *cb, void *opaque) 4316 { 4317 BlockDriver *drv = bs->drv; 4318 4319 if (drv && drv->bdrv_aio_ioctl) 4320 return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); 4321 return NULL; 4322 } 4323 4324 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align) 4325 { 4326 bs->buffer_alignment = align; 4327 } 4328 4329 void *qemu_blockalign(BlockDriverState *bs, size_t size) 4330 { 4331 return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size); 4332 } 4333 4334 /* 4335 * Check if all memory in this vector is sector aligned. 4336 */ 4337 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 4338 { 4339 int i; 4340 4341 for (i = 0; i < qiov->niov; i++) { 4342 if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { 4343 return false; 4344 } 4345 } 4346 4347 return true; 4348 } 4349 4350 void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity) 4351 { 4352 int64_t bitmap_size; 4353 4354 assert((granularity & (granularity - 1)) == 0); 4355 4356 if (granularity) { 4357 granularity >>= BDRV_SECTOR_BITS; 4358 assert(!bs->dirty_bitmap); 4359 bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); 4360 bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); 4361 } else { 4362 if (bs->dirty_bitmap) { 4363 hbitmap_free(bs->dirty_bitmap); 4364 bs->dirty_bitmap = NULL; 4365 } 4366 } 4367 } 4368 4369 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector) 4370 { 4371 if (bs->dirty_bitmap) { 4372 return hbitmap_get(bs->dirty_bitmap, sector); 4373 } else { 4374 return 0; 4375 } 4376 } 4377 4378 void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi) 4379 { 4380 hbitmap_iter_init(hbi, bs->dirty_bitmap, 0); 4381 } 4382 4383 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, 4384 int nr_sectors) 4385 { 4386 hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors); 4387 } 4388 4389 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, 4390 int nr_sectors) 4391 { 4392 hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors); 4393 } 4394 4395 int64_t bdrv_get_dirty_count(BlockDriverState *bs) 4396 { 4397 if (bs->dirty_bitmap) { 4398 return hbitmap_count(bs->dirty_bitmap); 4399 } else { 4400 return 0; 4401 } 4402 } 4403 4404 /* Get a reference to bs */ 4405 void bdrv_ref(BlockDriverState *bs) 4406 { 4407 bs->refcnt++; 4408 } 4409 4410 /* Release a previously grabbed reference to bs. 4411 * If after releasing, reference count is zero, the BlockDriverState is 4412 * deleted. */ 4413 void bdrv_unref(BlockDriverState *bs) 4414 { 4415 assert(bs->refcnt > 0); 4416 if (--bs->refcnt == 0) { 4417 bdrv_delete(bs); 4418 } 4419 } 4420 4421 void bdrv_set_in_use(BlockDriverState *bs, int in_use) 4422 { 4423 assert(bs->in_use != in_use); 4424 bs->in_use = in_use; 4425 } 4426 4427 int bdrv_in_use(BlockDriverState *bs) 4428 { 4429 return bs->in_use; 4430 } 4431 4432 void bdrv_iostatus_enable(BlockDriverState *bs) 4433 { 4434 bs->iostatus_enabled = true; 4435 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; 4436 } 4437 4438 /* The I/O status is only enabled if the drive explicitly 4439 * enables it _and_ the VM is configured to stop on errors */ 4440 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs) 4441 { 4442 return (bs->iostatus_enabled && 4443 (bs->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || 4444 bs->on_write_error == BLOCKDEV_ON_ERROR_STOP || 4445 bs->on_read_error == BLOCKDEV_ON_ERROR_STOP)); 4446 } 4447 4448 void bdrv_iostatus_disable(BlockDriverState *bs) 4449 { 4450 bs->iostatus_enabled = false; 4451 } 4452 4453 void bdrv_iostatus_reset(BlockDriverState *bs) 4454 { 4455 if (bdrv_iostatus_is_enabled(bs)) { 4456 bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK; 4457 if (bs->job) { 4458 block_job_iostatus_reset(bs->job); 4459 } 4460 } 4461 } 4462 4463 void bdrv_iostatus_set_err(BlockDriverState *bs, int error) 4464 { 4465 assert(bdrv_iostatus_is_enabled(bs)); 4466 if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { 4467 bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE : 4468 BLOCK_DEVICE_IO_STATUS_FAILED; 4469 } 4470 } 4471 4472 void 4473 bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes, 4474 enum BlockAcctType type) 4475 { 4476 assert(type < BDRV_MAX_IOTYPE); 4477 4478 cookie->bytes = bytes; 4479 cookie->start_time_ns = get_clock(); 4480 cookie->type = type; 4481 } 4482 4483 void 4484 bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie) 4485 { 4486 assert(cookie->type < BDRV_MAX_IOTYPE); 4487 4488 bs->nr_bytes[cookie->type] += cookie->bytes; 4489 bs->nr_ops[cookie->type]++; 4490 bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns; 4491 } 4492 4493 void bdrv_img_create(const char *filename, const char *fmt, 4494 const char *base_filename, const char *base_fmt, 4495 char *options, uint64_t img_size, int flags, 4496 Error **errp, bool quiet) 4497 { 4498 QEMUOptionParameter *param = NULL, *create_options = NULL; 4499 QEMUOptionParameter *backing_fmt, *backing_file, *size; 4500 BlockDriverState *bs = NULL; 4501 BlockDriver *drv, *proto_drv; 4502 BlockDriver *backing_drv = NULL; 4503 Error *local_err = NULL; 4504 int ret = 0; 4505 4506 /* Find driver and parse its options */ 4507 drv = bdrv_find_format(fmt); 4508 if (!drv) { 4509 error_setg(errp, "Unknown file format '%s'", fmt); 4510 return; 4511 } 4512 4513 proto_drv = bdrv_find_protocol(filename, true); 4514 if (!proto_drv) { 4515 error_setg(errp, "Unknown protocol '%s'", filename); 4516 return; 4517 } 4518 4519 create_options = append_option_parameters(create_options, 4520 drv->create_options); 4521 create_options = append_option_parameters(create_options, 4522 proto_drv->create_options); 4523 4524 /* Create parameter list with default values */ 4525 param = parse_option_parameters("", create_options, param); 4526 4527 set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size); 4528 4529 /* Parse -o options */ 4530 if (options) { 4531 param = parse_option_parameters(options, create_options, param); 4532 if (param == NULL) { 4533 error_setg(errp, "Invalid options for file format '%s'.", fmt); 4534 goto out; 4535 } 4536 } 4537 4538 if (base_filename) { 4539 if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE, 4540 base_filename)) { 4541 error_setg(errp, "Backing file not supported for file format '%s'", 4542 fmt); 4543 goto out; 4544 } 4545 } 4546 4547 if (base_fmt) { 4548 if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) { 4549 error_setg(errp, "Backing file format not supported for file " 4550 "format '%s'", fmt); 4551 goto out; 4552 } 4553 } 4554 4555 backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE); 4556 if (backing_file && backing_file->value.s) { 4557 if (!strcmp(filename, backing_file->value.s)) { 4558 error_setg(errp, "Error: Trying to create an image with the " 4559 "same filename as the backing file"); 4560 goto out; 4561 } 4562 } 4563 4564 backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT); 4565 if (backing_fmt && backing_fmt->value.s) { 4566 backing_drv = bdrv_find_format(backing_fmt->value.s); 4567 if (!backing_drv) { 4568 error_setg(errp, "Unknown backing file format '%s'", 4569 backing_fmt->value.s); 4570 goto out; 4571 } 4572 } 4573 4574 // The size for the image must always be specified, with one exception: 4575 // If we are using a backing file, we can obtain the size from there 4576 size = get_option_parameter(param, BLOCK_OPT_SIZE); 4577 if (size && size->value.n == -1) { 4578 if (backing_file && backing_file->value.s) { 4579 uint64_t size; 4580 char buf[32]; 4581 int back_flags; 4582 4583 /* backing files always opened read-only */ 4584 back_flags = 4585 flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); 4586 4587 bs = bdrv_new(""); 4588 4589 ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags, 4590 backing_drv, &local_err); 4591 if (ret < 0) { 4592 error_setg_errno(errp, -ret, "Could not open '%s': %s", 4593 backing_file->value.s, 4594 error_get_pretty(local_err)); 4595 error_free(local_err); 4596 local_err = NULL; 4597 goto out; 4598 } 4599 bdrv_get_geometry(bs, &size); 4600 size *= 512; 4601 4602 snprintf(buf, sizeof(buf), "%" PRId64, size); 4603 set_option_parameter(param, BLOCK_OPT_SIZE, buf); 4604 } else { 4605 error_setg(errp, "Image creation needs a size parameter"); 4606 goto out; 4607 } 4608 } 4609 4610 if (!quiet) { 4611 printf("Formatting '%s', fmt=%s ", filename, fmt); 4612 print_option_parameters(param); 4613 puts(""); 4614 } 4615 ret = bdrv_create(drv, filename, param, &local_err); 4616 if (ret == -EFBIG) { 4617 /* This is generally a better message than whatever the driver would 4618 * deliver (especially because of the cluster_size_hint), since that 4619 * is most probably not much different from "image too large". */ 4620 const char *cluster_size_hint = ""; 4621 if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) { 4622 cluster_size_hint = " (try using a larger cluster size)"; 4623 } 4624 error_setg(errp, "The image size is too large for file format '%s'" 4625 "%s", fmt, cluster_size_hint); 4626 error_free(local_err); 4627 local_err = NULL; 4628 } 4629 4630 out: 4631 free_option_parameters(create_options); 4632 free_option_parameters(param); 4633 4634 if (bs) { 4635 bdrv_unref(bs); 4636 } 4637 if (error_is_set(&local_err)) { 4638 error_propagate(errp, local_err); 4639 } 4640 } 4641 4642 AioContext *bdrv_get_aio_context(BlockDriverState *bs) 4643 { 4644 /* Currently BlockDriverState always uses the main loop AioContext */ 4645 return qemu_get_aio_context(); 4646 } 4647 4648 void bdrv_add_before_write_notifier(BlockDriverState *bs, 4649 NotifierWithReturn *notifier) 4650 { 4651 notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 4652 } 4653 4654 int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options) 4655 { 4656 if (bs->drv->bdrv_amend_options == NULL) { 4657 return -ENOTSUP; 4658 } 4659 return bs->drv->bdrv_amend_options(bs, options); 4660 } 4661 4662 ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs) 4663 { 4664 if (bs->drv->bdrv_check_ext_snapshot) { 4665 return bs->drv->bdrv_check_ext_snapshot(bs); 4666 } 4667 4668 if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) { 4669 return bs->file->drv->bdrv_check_ext_snapshot(bs); 4670 } 4671 4672 /* external snapshots are allowed by default */ 4673 return EXT_SNAPSHOT_ALLOWED; 4674 } 4675 4676 ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs) 4677 { 4678 return EXT_SNAPSHOT_FORBIDDEN; 4679 } 4680