/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);
/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end = 0;
    bs->slice_time = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
         || io_limits->iops[BLOCK_IO_LIMIT_READ]
         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* We aim to preserve each request's timing, in FIFO order. Queued
     * throttled requests are not dequeued until the current request has been
     * allowed to proceed, so if the current request still exceeds the limits
     * it is re-inserted at the head, and all requests behind it remain in the
     * throttled_reqs queue.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
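/* Illustrative sketch (not part of the original file): how a caller might
 * cap a drive at roughly 1 MB/s of aggregate throughput. The helper name and
 * the limit value are hypothetical; BlockIOLimit and the BLOCK_IO_LIMIT_*
 * indices come from block_int.h. bdrv_set_io_limits() (defined later in this
 * file) records the limits, and bdrv_open() then arms the slice timer via
 * bdrv_io_limits_enable(). Kept under #if 0 so it is never compiled.
 */
#if 0
static void example_limit_drive_bandwidth(BlockDriverState *bs)
{
    BlockIOLimit limits = {
        .bps[BLOCK_IO_LIMIT_TOTAL] = 1024 * 1024, /* bytes/s, reads+writes */
    };

    bdrv_set_io_limits(bs, &limits);  /* also recomputes io_limits_enabled */
}
#endif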
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}

/* if filename is absolute, just copy it to dest. Otherwise, build a
   path to it by treating it as relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
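/* Illustrative example (not in the original source) of what path_combine()
 * produces for a typical backing-file lookup; the file names are made up.
 * Kept under #if 0 so it is never compiled.
 */
#if 0
static void example_path_combine(void)
{
    char dest[PATH_MAX];

    /* Relative backing file: resolved next to the base image. */
    path_combine(dest, sizeof(dest), "/images/vm/overlay.qcow2", "base.img");
    /* dest is now "/images/vm/base.img" */

    /* Absolute (or protocol) name: copied through unchanged. */
    path_combine(dest, sizeof(dest), "/images/vm/overlay.qcow2", "/mnt/base.img");
    /* dest is now "/mnt/base.img" */
}
#endif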
void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    if (!drv->bdrv_create)
        return -ENOTSUP;

    return drv->bdrv_create(filename, options);
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}
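/* Illustrative sketch (not in the original source): creating a 1 GiB qcow2
 * image through the generic create path above. The helper name is
 * hypothetical; bdrv_find_format(), parse_option_parameters() and the
 * BLOCK_OPT_SIZE option are the same ones the snapshot code in bdrv_open()
 * uses below. Kept under #if 0 so it is never compiled.
 */
#if 0
static int example_create_qcow2(const char *filename)
{
    BlockDriver *drv = bdrv_find_format("qcow2");
    QEMUOptionParameter *options;
    int ret;

    if (!drv) {
        return -ENOENT;
    }
    options = parse_option_parameters("", drv->create_options, NULL);
    set_option_parameter_int(options, BLOCK_OPT_SIZE, 1024 * 1024 * 1024);
    ret = bdrv_create(drv, filename, options);
    free_option_parameters(options);
    return ret;
}
#endif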
#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
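/* Illustrative summary (not in the original source): the flag combinations
 * bdrv_parse_cache_flags() above hands back for each -drive cache= mode.
 *
 *   "none"/"off"   -> BDRV_O_NOCACHE | BDRV_O_CACHE_WB
 *   "directsync"   -> BDRV_O_NOCACHE
 *   "writeback"    -> BDRV_O_CACHE_WB
 *   "unsafe"       -> BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH
 *   "writethrough" -> (no cache flags; the default)
 */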
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}
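/* Illustrative sketch (not in the original source): the typical way a drive
 * is brought up with bdrv_new() above and bdrv_open() below. The device name
 * and flags are arbitrary example values. Kept under #if 0 so it is never
 * compiled.
 */
#if 0
static BlockDriverState *example_open_image(const char *filename)
{
    BlockDriverState *bs = bdrv_new("example-disk");

    /* drv == NULL lets find_image_format() probe the image format */
    if (bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL) < 0) {
        bdrv_delete(bs);
        return NULL;
    }
    return bs;
}
#endif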
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}
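/* Illustrative note (not in the original source): with BDRV_O_SNAPSHOT the
 * code above builds the chain
 *
 *     guest writes -> temporary qcow2 (TMPDIR/vl.XXXXXX)
 *                         -> backing file: the original image
 *
 * so the original image is never written to, and on POSIX hosts the
 * temporary file is unlinked by bdrv_open_common() as soon as it is open.
 */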
void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;

    qemu_aio_flush();

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* make a BlockDriverState anonymous by removing it from the bdrv_states
   list. Also, NUL terminate the device_name to prevent double removal */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}
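/* Illustrative sketch (not in the original source): how a device model
 * wires itself up with the attach/ops functions above. The names prefixed
 * "example_" are hypothetical; the BlockDevOps field is the real one used
 * by the callbacks below. Kept under #if 0 so it is never compiled.
 */
#if 0
static void example_change_media_cb(void *opaque, bool load)
{
    /* e.g. raise a media-change interrupt in the device model */
}

static const BlockDevOps example_dev_ops = {
    .change_media_cb = example_change_media_cb,
};

static void example_attach(BlockDriverState *bs, void *dev)
{
    bdrv_attach_dev_nofail(bs, dev);
    bdrv_set_dev_ops(bs, &example_dev_ops, dev);
}
#endif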
static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
            backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
            backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}
void bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_commit(bs);
    }
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}
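/* Illustrative worked example (not in the original source) for
 * round_to_clusters(): with 64 KiB clusters (c = 65536 / 512 = 128 sectors),
 * a request for sectors [130, 140) expands to the full cluster [128, 256):
 *
 *   *cluster_sector_num = QEMU_ALIGN_DOWN(130, 128)          = 128
 *   *cluster_nb_sectors = QEMU_ALIGN_UP(130 - 128 + 10, 128) = 128
 */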
static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap. This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster. For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests. This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_change_backing_file != NULL) {
        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        return -ENOTSUP;
    }
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
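/* Illustrative note (not in the original source): bdrv_rw_co() is the
 * generic "synchronous wrapper around a coroutine" pattern used throughout
 * this file (bdrv_is_allocated() below has the same shape): if the caller is
 * already a coroutine, run the implementation directly; otherwise spawn a
 * coroutine and pump qemu_aio_wait() until it reports completion through
 * rwco.ret.
 */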
/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}
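/* Illustrative worked example (not in the original source) for the alignment
 * logic in bdrv_pread() above: a 300-byte read at offset 1000 with 512-byte
 * sectors is split into
 *
 *   head: sector 1, bytes [1000, 1024) -> 24 bytes via the bounce buffer
 *   body: none (fewer than 512 bytes remain)
 *   tail: sector 2, bytes [1024, 1300) -> 276 bytes via the bounce buffer
 */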
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that use O_DSYNC */
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
        bdrv_flush(bs);
    }

    return 0;
}
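/* Illustrative sketch (not in the original source): the kind of metadata
 * update bdrv_pwrite_sync() above exists for, e.g. a format driver
 * committing an on-disk header field before reusing the space it described.
 * The helper name and the header offset are hypothetical. Kept under #if 0
 * so it is never compiled.
 */
#if 0
static int example_update_header_field(BlockDriverState *bs, uint64_t value)
{
    uint64_t be_value = cpu_to_be64(value);

    /* offset 24 is a made-up header field location */
    return bdrv_pwrite_sync(bs->file, 24, &be_value, sizeof(be_value));
}
#endif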
static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors,
                                cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                                 &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests. If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (bs->copy_on_read) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);
    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
}
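/* Illustrative sketch (not in the original source): copy-on-read is
 * reference-counted (see bdrv_enable_copy_on_read() above), so independent
 * users simply pair enable/disable calls. The helper name is hypothetical.
 * Kept under #if 0 so it is never compiled.
 */
#if 0
static void example_copy_on_read_user(BlockDriverState *bs)
{
    bdrv_enable_copy_on_read(bs);    /* bs->copy_on_read: n -> n + 1 */
    /* ... reads through bdrv_co_do_readv() now populate unallocated
     * clusters via bdrv_co_copy_on_readv() ... */
    bdrv_disable_copy_on_read(bs);   /* n + 1 -> n */
}
#endif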
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

struct partition {
    uint8_t boot_ind;           /* 0x80 - active */
    uint8_t head;               /* starting head */
    uint8_t sector;             /* starting sector */
    uint8_t cyl;                /* starting cylinder */
    uint8_t sys_ind;            /* What partition type */
    uint8_t end_head;           /* end head */
    uint8_t end_sector;         /* end sector */
    uint8_t end_cyl;            /* end cylinder */
    uint32_t start_sect;        /* starting sector counting from 0 */
    uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;
/* try to guess the disk logical geometry from the MSDOS partition table.
   Return 0 if OK, -1 if could not guess */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;

    bdrv_get_geometry(bs, &nb_sectors);

    ret = bdrv_read(bs, 0, buf, 1);
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for (i = 0; i < 4; i++) {
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}

void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}

void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}
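/* Illustrative worked example (not in the original source) for
 * guess_disk_lchs() above: a partition entry ending at head 15, sector 63
 * on a 1032192-sector disk yields
 *
 *   heads     = 15 + 1              = 16
 *   sectors   = 63 & 63             = 63
 *   cylinders = 1032192 / (16 * 63) = 1024   (within [1, 16383], accepted)
 */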
/* throttling disk I/O limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, },
    { FDRIVE_DRV_120,  9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, },
    { FDRIVE_DRV_120,  8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};
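/* Illustrative worked example (not in the original source) for the table
 * above: a raw image of 2880 sectors (a 1.44 MB floppy) matches the first
 * entry in bdrv_get_floppy_geometry_hint() below, because
 * (max_head + 1) * max_track * last_sect = 2 * 80 * 18 = 2880.
 */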
void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
    }
}

int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}
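/* Illustrative sketch (not in the original source): what the monitor's
 * password handling boils down to with the two functions above. The helper
 * name is hypothetical. Kept under #if 0 so it is never compiled.
 */
#if 0
static int example_unlock(BlockDriverState *bs, const char *password)
{
    if (!bdrv_key_required(bs)) {
        return 0;                       /* nothing is locked */
    }
    /* unlocks the backing file first, then bs itself; on first success it
     * also fires the media-change callback that was skipped at open time */
    return bdrv_set_key(bs, password);
}
#endif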
void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
{
    if (!bs->drv) {
        buf[0] = '\0';
    } else {
        pstrcpy(buf, buf_size, bs->drv->format_name);
    }
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
            bdrv_flush(bs);
        }
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}

/* Coroutine wrapper for bdrv_is_allocated() */
static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
{
    BdrvCoIsAllocatedData *data = opaque;
    BlockDriverState *bs = data->bs;

    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
                                     data->pnum);
    data->done = true;
}
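/* Illustrative sketch (not in the original source): callers that cannot use
 * the bdrv_iterate() callback style can walk the device list with
 * bdrv_next() above; passing NULL starts the walk. Kept under #if 0 so it is
 * never compiled.
 */
#if 0
static void example_walk_devices(void)
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next(bs)) != NULL) {
        /* inspect bs, e.g. bdrv_get_device_name(bs) */
    }
}
#endif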
 */
int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                      int *pnum)
{
    Coroutine *co;
    BdrvCoIsAllocatedData data = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .pnum = pnum,
        .done = false,
    };

    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
    qemu_coroutine_enter(co, &data);
    while (!data.done) {
        qemu_aio_wait();
    }
    return data.ret;
}
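/*
 * Illustrative sketch (not from the original source): walking an image's
 * allocation map with bdrv_is_allocated().  Each call reports, via *pnum,
 * how many sectors starting at the given position share the same
 * allocated/unallocated state, so the image can be scanned in runs
 * ('end' stands for the image size in sectors):
 *
 *     int64_t sector = 0;
 *     int num;
 *
 *     while (sector < end) {
 *         int allocated = bdrv_is_allocated(bs, sector, 65536, &num);
 *         // [sector, sector + num) is one allocated or unallocated run
 *         sector += num;
 *     }
 */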
void bdrv_mon_event(const BlockDriverState *bdrv,
                    BlockMonEventAction action, int is_read)
{
    QObject *data;
    const char *action_str;

    switch (action) {
    case BDRV_ACTION_REPORT:
        action_str = "report";
        break;
    case BDRV_ACTION_IGNORE:
        action_str = "ignore";
        break;
    case BDRV_ACTION_STOP:
        action_str = "stop";
        break;
    default:
        abort();
    }

    data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
                              bdrv->device_name,
                              action_str,
                              is_read ? "read" : "write");
    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);

    qobject_decref(data);
}

BlockInfoList *qmp_query_block(Error **errp)
{
    BlockInfoList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockInfoList *info = g_malloc0(sizeof(*info));

        info->value = g_malloc0(sizeof(*info->value));
        info->value->device = g_strdup(bs->device_name);
        info->value->type = g_strdup("unknown");
        info->value->locked = bdrv_dev_is_medium_locked(bs);
        info->value->removable = bdrv_dev_has_removable_media(bs);

        if (bdrv_dev_has_removable_media(bs)) {
            info->value->has_tray_open = true;
            info->value->tray_open = bdrv_dev_is_tray_open(bs);
        }

        if (bdrv_iostatus_is_enabled(bs)) {
            info->value->has_io_status = true;
            info->value->io_status = bs->iostatus;
        }

        if (bs->drv) {
            info->value->has_inserted = true;
            info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
            info->value->inserted->file = g_strdup(bs->filename);
            info->value->inserted->ro = bs->read_only;
            info->value->inserted->drv = g_strdup(bs->drv->format_name);
            info->value->inserted->encrypted = bs->encrypted;
            if (bs->backing_file[0]) {
                info->value->inserted->has_backing_file = true;
                info->value->inserted->backing_file = g_strdup(bs->backing_file);
            }

            if (bs->io_limits_enabled) {
                info->value->inserted->bps =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->bps_rd =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
                info->value->inserted->bps_wr =
                    bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
                info->value->inserted->iops =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
                info->value->inserted->iops_rd =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
                info->value->inserted->iops_wr =
                    bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
            }
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

/* Consider exposing this as a full fledged QMP command */
static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
{
    BlockStats *s;

    s = g_malloc0(sizeof(*s));

    if (bs->device_name[0]) {
        s->has_device = true;
        s->device = g_strdup(bs->device_name);
    }

    s->stats = g_malloc0(sizeof(*s->stats));
    s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
    s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
    s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
    s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
    s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
    s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
    s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
    s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
    s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];

    if (bs->file) {
        s->has_parent = true;
        s->parent = qmp_query_blockstat(bs->file, NULL);
    }

    return s;
}

BlockStatsList *qmp_query_blockstats(Error **errp)
{
    BlockStatsList *head = NULL, *cur_item = NULL;
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        BlockStatsList *info = g_malloc0(sizeof(*info));
        info->value = qmp_query_blockstat(bs, NULL);

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return bs->backing_file;
    else if (bs->encrypted)
        return bs->filename;
    else
        return NULL;
}

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size)
{
    pstrcpy(filename, filename_size, bs->backing_file);
}

int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_write_compressed)
        return -ENOTSUP;
    if (bdrv_check_request(bs, sector_num, nb_sectors))
        return -EIO;

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
}

int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_get_info)
        return -ENOTSUP;
    memset(bdi, 0, sizeof(*bdi));
    return drv->bdrv_get_info(bs, bdi);
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_save_vmstate)
        return drv->bdrv_save_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_save_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_load_vmstate)
        return drv->bdrv_load_vmstate(bs, buf, pos, size);
    if (bs->file)
        return bdrv_load_vmstate(bs->file, buf, pos, size);
    return -ENOTSUP;
}
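/*
 * Illustrative sketch (not from the original source): the vmstate helpers
 * delegate to the format driver when it implements the hook and otherwise
 * recurse into bs->file, so a caller such as the savevm code only deals
 * with the top-level BlockDriverState.  A negative return value (e.g.
 * -ENOTSUP when no layer can store VM state) signals failure:
 *
 *     uint8_t buf[1024];
 *
 *     if (bdrv_save_vmstate(bs, buf, 0, sizeof(buf)) < 0) {
 *         // this image format cannot store VM state
 *     }
 *     ...
 *     bdrv_load_vmstate(bs, buf, 0, sizeof(buf));
 */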
void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_debug_event) {
        return;
    }

    drv->bdrv_debug_event(bs, event);
}

/**************************************************************/
/* handling of snapshots */

int bdrv_can_snapshot(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
        return 0;
    }

    if (!drv->bdrv_snapshot_create) {
        if (bs->file != NULL) {
            return bdrv_can_snapshot(bs->file);
        }
        return 0;
    }

    return 1;
}

int bdrv_is_snapshot(BlockDriverState *bs)
{
    return !!(bs->open_flags & BDRV_O_SNAPSHOT);
}

BlockDriverState *bdrv_snapshots(void)
{
    BlockDriverState *bs;

    if (bs_snapshots) {
        return bs_snapshots;
    }

    bs = NULL;
    while ((bs = bdrv_next(bs))) {
        if (bdrv_can_snapshot(bs)) {
            bs_snapshots = bs;
            return bs;
        }
    }
    return NULL;
}

int bdrv_snapshot_create(BlockDriverState *bs,
                         QEMUSnapshotInfo *sn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_create)
        return drv->bdrv_snapshot_create(bs, sn_info);
    if (bs->file)
        return bdrv_snapshot_create(bs->file, sn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_goto(BlockDriverState *bs,
                       const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    int ret, open_ret;

    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_goto)
        return drv->bdrv_snapshot_goto(bs, snapshot_id);

    if (bs->file) {
        drv->bdrv_close(bs);
        ret = bdrv_snapshot_goto(bs->file, snapshot_id);
        open_ret = drv->bdrv_open(bs, bs->open_flags);
        if (open_ret < 0) {
            bdrv_delete(bs->file);
            bs->drv = NULL;
            return open_ret;
        }
        return ret;
    }

    return -ENOTSUP;
}

int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_delete)
        return drv->bdrv_snapshot_delete(bs, snapshot_id);
    if (bs->file)
        return bdrv_snapshot_delete(bs->file, snapshot_id);
    return -ENOTSUP;
}

int bdrv_snapshot_list(BlockDriverState *bs,
                       QEMUSnapshotInfo **psn_info)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;
    if (drv->bdrv_snapshot_list)
        return drv->bdrv_snapshot_list(bs, psn_info);
    if (bs->file)
        return bdrv_snapshot_list(bs->file, psn_info);
    return -ENOTSUP;
}

int bdrv_snapshot_load_tmp(BlockDriverState *bs,
                           const char *snapshot_name)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (!bs->read_only) {
        return -EINVAL;
    }
    if (drv->bdrv_snapshot_load_tmp) {
        return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
    }
    return -ENOTSUP;
}

#define NB_SUFFIXES 4

char *get_human_readable_size(char *buf, int buf_size, int64_t size)
{
    static const char suffixes[NB_SUFFIXES] = "KMGT";
    int64_t base;
    int i;

    if (size <= 999) {
        snprintf(buf, buf_size, "%" PRId64, size);
    } else {
        base = 1024;
        for (i = 0; i < NB_SUFFIXES; i++) {
            if (size < (10 * base)) {
                snprintf(buf, buf_size, "%0.1f%c",
                         (double)size / base,
                         suffixes[i]);
                break;
            } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
                snprintf(buf, buf_size,
                         "%" PRId64 "%c",
                         ((size + (base >> 1)) / base),
                         suffixes[i]);
                break;
            }
            base = base * 1024;
        }
    }
    return buf;
}
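/*
 * Worked examples for get_human_readable_size() (added for illustration):
 * sizes up to 999 are printed verbatim ("512"); below ten units the value
 * keeps one decimal (1536 -> "1.5K", 1048576 -> "1.0M"); otherwise it is
 * rounded to a whole number of the current unit (500000 -> "488K").
 */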
"%" PRId64 "%c", 2587 ((size + (base >> 1)) / base), 2588 suffixes[i]); 2589 break; 2590 } 2591 base = base * 1024; 2592 } 2593 } 2594 return buf; 2595 } 2596 2597 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) 2598 { 2599 char buf1[128], date_buf[128], clock_buf[128]; 2600 #ifdef _WIN32 2601 struct tm *ptm; 2602 #else 2603 struct tm tm; 2604 #endif 2605 time_t ti; 2606 int64_t secs; 2607 2608 if (!sn) { 2609 snprintf(buf, buf_size, 2610 "%-10s%-20s%7s%20s%15s", 2611 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK"); 2612 } else { 2613 ti = sn->date_sec; 2614 #ifdef _WIN32 2615 ptm = localtime(&ti); 2616 strftime(date_buf, sizeof(date_buf), 2617 "%Y-%m-%d %H:%M:%S", ptm); 2618 #else 2619 localtime_r(&ti, &tm); 2620 strftime(date_buf, sizeof(date_buf), 2621 "%Y-%m-%d %H:%M:%S", &tm); 2622 #endif 2623 secs = sn->vm_clock_nsec / 1000000000; 2624 snprintf(clock_buf, sizeof(clock_buf), 2625 "%02d:%02d:%02d.%03d", 2626 (int)(secs / 3600), 2627 (int)((secs / 60) % 60), 2628 (int)(secs % 60), 2629 (int)((sn->vm_clock_nsec / 1000000) % 1000)); 2630 snprintf(buf, buf_size, 2631 "%-10s%-20s%7s%20s%15s", 2632 sn->id_str, sn->name, 2633 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size), 2634 date_buf, 2635 clock_buf); 2636 } 2637 return buf; 2638 } 2639 2640 /**************************************************************/ 2641 /* async I/Os */ 2642 2643 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, 2644 QEMUIOVector *qiov, int nb_sectors, 2645 BlockDriverCompletionFunc *cb, void *opaque) 2646 { 2647 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); 2648 2649 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 2650 cb, opaque, false); 2651 } 2652 2653 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, 2654 QEMUIOVector *qiov, int nb_sectors, 2655 BlockDriverCompletionFunc *cb, void *opaque) 2656 { 2657 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); 2658 2659 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 2660 cb, opaque, true); 2661 } 2662 2663 2664 typedef struct MultiwriteCB { 2665 int error; 2666 int num_requests; 2667 int num_callbacks; 2668 struct { 2669 BlockDriverCompletionFunc *cb; 2670 void *opaque; 2671 QEMUIOVector *free_qiov; 2672 void *free_buf; 2673 } callbacks[]; 2674 } MultiwriteCB; 2675 2676 static void multiwrite_user_cb(MultiwriteCB *mcb) 2677 { 2678 int i; 2679 2680 for (i = 0; i < mcb->num_callbacks; i++) { 2681 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); 2682 if (mcb->callbacks[i].free_qiov) { 2683 qemu_iovec_destroy(mcb->callbacks[i].free_qiov); 2684 } 2685 g_free(mcb->callbacks[i].free_qiov); 2686 qemu_vfree(mcb->callbacks[i].free_buf); 2687 } 2688 } 2689 2690 static void multiwrite_cb(void *opaque, int ret) 2691 { 2692 MultiwriteCB *mcb = opaque; 2693 2694 trace_multiwrite_cb(mcb, ret); 2695 2696 if (ret < 0 && !mcb->error) { 2697 mcb->error = ret; 2698 } 2699 2700 mcb->num_requests--; 2701 if (mcb->num_requests == 0) { 2702 multiwrite_user_cb(mcb); 2703 g_free(mcb); 2704 } 2705 } 2706 2707 static int multiwrite_req_compare(const void *a, const void *b) 2708 { 2709 const BlockRequest *req1 = a, *req2 = b; 2710 2711 /* 2712 * Note that we can't simply subtract req2->sector from req1->sector 2713 * here as that could overflow the return value. 
static int multiwrite_req_compare(const void *a, const void *b)
{
    const BlockRequest *req1 = a, *req2 = b;

    /*
     * Note that we can't simply subtract req2->sector from req1->sector
     * here as that could overflow the return value.
     */
    if (req1->sector > req2->sector) {
        return 1;
    } else if (req1->sector < req2->sector) {
        return -1;
    } else {
        return 0;
    }
}

/*
 * Takes a bunch of requests and tries to merge them. Returns the number of
 * requests that remain after merging.
 */
static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
                            int num_reqs, MultiwriteCB *mcb)
{
    int i, outidx;

    // Sort requests by start sector
    qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);

    // Check if adjacent requests touch the same clusters. If so, combine them,
    // filling up gaps with zero sectors.
    outidx = 0;
    for (i = 1; i < num_reqs; i++) {
        int merge = 0;
        int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;

        // This handles the cases that are valid for all block drivers, namely
        // exactly sequential writes and overlapping writes.
        if (reqs[i].sector <= oldreq_last) {
            merge = 1;
        }

        // The block driver may decide that it makes sense to combine requests
        // even if there is a gap of some sectors between them. In this case,
        // the gap is filled with zeros (therefore only applicable for yet
        // unused space in formats like qcow2).
        if (!merge && bs->drv->bdrv_merge_requests) {
            merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
        }

        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
            merge = 0;
        }

        if (merge) {
            size_t size;
            QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
            qemu_iovec_init(qiov,
                            reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);

            // Add the first request to the merged one. If the requests are
            // overlapping, drop the last sectors of the first request.
            size = (reqs[i].sector - reqs[outidx].sector) << 9;
            qemu_iovec_concat(qiov, reqs[outidx].qiov, size);

            // We might need to add some zeros between the two requests
            if (reqs[i].sector > oldreq_last) {
                size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
                uint8_t *buf = qemu_blockalign(bs, zero_bytes);
                memset(buf, 0, zero_bytes);
                qemu_iovec_add(qiov, buf, zero_bytes);
                mcb->callbacks[i].free_buf = buf;
            }

            // Add the second request
            qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);

            reqs[outidx].nb_sectors = qiov->size >> 9;
            reqs[outidx].qiov = qiov;

            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
        } else {
            outidx++;
            reqs[outidx].sector = reqs[i].sector;
            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
            reqs[outidx].qiov = reqs[i].qiov;
        }
    }

    return outidx + 1;
}
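/*
 * Worked example for multiwrite_merge() (added for illustration): given two
 * sorted write requests covering sectors [0, 8) and [10, 14), the generic
 * branch does not merge them (10 > 8), but a driver's bdrv_merge_requests
 * hook may.  If it does, the merged request covers [0, 14): the first qiov,
 * then 2 sectors (1024 bytes) of zeroes for the gap, then the second qiov,
 * giving nb_sectors = 14.
 */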
/*
 * Submit multiple AIO write requests at once.
 *
 * On success, the function returns 0 and all requests in the reqs array have
 * been submitted. On error, this function returns -1, and any of the requests
 * may or may not have been submitted yet. In particular, this means that the
 * callback will be called for some of the requests, for others it won't. The
 * caller must check the error field of the BlockRequest to wait for the right
 * callbacks (if error != 0, no callback will be called).
 *
 * The implementation may modify the contents of the reqs array, e.g. to merge
 * requests. However, the fields opaque and error are left unmodified as they
 * are used to signal failure for a single request to the caller.
 */
int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
{
    BlockDriverAIOCB *acb;
    MultiwriteCB *mcb;
    int i;

    /* don't submit writes if we don't have a medium */
    if (bs->drv == NULL) {
        for (i = 0; i < num_reqs; i++) {
            reqs[i].error = -ENOMEDIUM;
        }
        return -1;
    }

    if (num_reqs == 0) {
        return 0;
    }

    // Create MultiwriteCB structure
    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
    mcb->num_requests = 0;
    mcb->num_callbacks = num_reqs;

    for (i = 0; i < num_reqs; i++) {
        mcb->callbacks[i].cb = reqs[i].cb;
        mcb->callbacks[i].opaque = reqs[i].opaque;
    }

    // Check for mergeable requests
    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);

    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);

    /*
     * Run the aio requests. As soon as one request can't be submitted
     * successfully, fail all requests that are not yet submitted (we must
     * return failure for all requests anyway)
     *
     * num_requests cannot be set to the right value immediately: If
     * bdrv_aio_writev fails for some request, num_requests would be too high
     * and therefore multiwrite_cb() would never recognize the multiwrite
     * request as completed. We also cannot use the loop variable i to set it
     * when the first request fails because the callback may already have been
     * called for previously submitted requests. Thus, num_requests must be
     * incremented for each request that is submitted.
     *
     * The problem that callbacks may be called early also means that we need
     * to take care that num_requests doesn't become 0 before all requests are
     * submitted - multiwrite_cb() would consider the multiwrite request
     * completed. A dummy request that is "completed" by a manual call to
     * multiwrite_cb() takes care of this.
     */
    mcb->num_requests = 1;

    // Run the aio requests
    for (i = 0; i < num_reqs; i++) {
        mcb->num_requests++;
        acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
                              reqs[i].nb_sectors, multiwrite_cb, mcb);

        if (acb == NULL) {
            // We can only fail the whole thing if no request has been
            // submitted yet. Otherwise we'll wait for the submitted AIOs to
            // complete and report the error in the callback.
            if (i == 0) {
                trace_bdrv_aio_multiwrite_earlyfail(mcb);
                goto fail;
            } else {
                trace_bdrv_aio_multiwrite_latefail(mcb, i);
                multiwrite_cb(mcb, -EIO);
                break;
            }
        }
    }

    /* Complete the dummy request */
    multiwrite_cb(mcb, 0);

    return 0;

fail:
    for (i = 0; i < mcb->num_callbacks; i++) {
        reqs[i].error = -EIO;
    }
    g_free(mcb);
    return -1;
}

void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
    acb->pool->cancel(acb);
}

/* block I/O throttling */
static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
                                   bool is_write, double elapsed_time,
                                   uint64_t *wait)
{
    uint64_t bps_limit = 0;
    double bytes_limit, bytes_base, bytes_res;
    double slice_time, wait_time;

    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.bps[is_write]) {
        bps_limit = bs->io_limits.bps[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    bytes_limit = bps_limit * slice_time;
    bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
    }

    /* bytes_base: the bytes of data which have been read/written; it is
     * obtained from the history statistics.
     * bytes_res: the remaining bytes of data which need to be read/written.
     * (bytes_base + bytes_res) / bps_limit: used to calculate the total time
     * for completing reading/writing all the data.
     */
    bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

    if (bytes_base + bytes_res <= bytes_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

    /* When the I/O rate at runtime exceeds the limits, bs->slice_end needs
     * to be extended so that the current statistics stay valid until the
     * timer fires; the scaling factor below was tuned experimentally.
     */
    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}

static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
                                    double elapsed_time, uint64_t *wait)
{
    uint64_t iops_limit = 0;
    double ios_limit, ios_base;
    double slice_time, wait_time;

    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
    } else if (bs->io_limits.iops[is_write]) {
        iops_limit = bs->io_limits.iops[is_write];
    } else {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    slice_time = bs->slice_end - bs->slice_start;
    slice_time /= (NANOSECONDS_PER_SECOND);
    ios_limit = iops_limit * slice_time;
    ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
    }

    if (ios_base + 1 <= ios_limit) {
        if (wait) {
            *wait = 0;
        }

        return false;
    }

    /* Calc approx time to dispatch */
    wait_time = (ios_base + 1) / iops_limit;
    if (wait_time > elapsed_time) {
        wait_time = wait_time - elapsed_time;
    } else {
        wait_time = 0;
    }

    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
    if (wait) {
        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
    }

    return true;
}
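/*
 * Worked example (added for illustration): with iops_limit = 100 and a
 * slice of 0.1 s so far, ios_limit is 10.  If 10 operations have already
 * been accounted in this slice, the 11th request yields
 * wait_time = 11/100 s; after subtracting the time already elapsed in the
 * slice, the remainder is the delay before the request may be dispatched.
 */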
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
                                  bool is_write, int64_t *wait)
{
    int64_t now, max_wait;
    uint64_t bps_wait = 0, iops_wait = 0;
    double elapsed_time;
    int bps_ret, iops_ret;

    now = qemu_get_clock_ns(vm_clock);
    if ((bs->slice_start < now)
        && (bs->slice_end > now)) {
        bs->slice_end = now + bs->slice_time;
    } else {
        bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
        bs->slice_start = now;
        bs->slice_end = now + bs->slice_time;

        bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];

        bs->io_base.ios[is_write] = bs->nr_ops[is_write];
        bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
    }

    elapsed_time = now - bs->slice_start;
    elapsed_time /= (NANOSECONDS_PER_SECOND);

    bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
                                     is_write, elapsed_time, &bps_wait);
    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
                                       elapsed_time, &iops_wait);
    if (bps_ret || iops_ret) {
        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
        if (wait) {
            *wait = max_wait;
        }

        now = qemu_get_clock_ns(vm_clock);
        if (bs->slice_end < now + max_wait) {
            bs->slice_end = now + max_wait;
        }

        return true;
    }

    if (wait) {
        *wait = 0;
    }

    return false;
}

/**************************************************************/
/* async block device emulation */

typedef struct BlockDriverAIOCBSync {
    BlockDriverAIOCB common;
    QEMUBH *bh;
    int ret;
    /* vector translation state */
    QEMUIOVector *qiov;
    uint8_t *bounce;
    int is_write;
} BlockDriverAIOCBSync;

static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
    BlockDriverAIOCBSync *acb =
        container_of(blockacb, BlockDriverAIOCBSync, common);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static AIOPool bdrv_em_aio_pool = {
    .aiocb_size = sizeof(BlockDriverAIOCBSync),
    .cancel = bdrv_aio_cancel_em,
};

static void bdrv_aio_bh_cb(void *opaque)
{
    BlockDriverAIOCBSync *acb = opaque;

    if (!acb->is_write)
        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
    qemu_vfree(acb->bounce);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_bh_delete(acb->bh);
    acb->bh = NULL;
    qemu_aio_release(acb);
}

static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
                                            int64_t sector_num,
                                            QEMUIOVector *qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc *cb,
                                            void *opaque,
                                            int is_write)

{
    BlockDriverAIOCBSync *acb;

    acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
    acb->is_write = is_write;
    acb->qiov = qiov;
    acb->bounce = qemu_blockalign(bs, qiov->size);

    if (!acb->bh)
        acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);

    if (is_write) {
        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
        acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
    } else {
        acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
    }

    qemu_bh_schedule(acb->bh);

    return &acb->common;
}

static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}

static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
}


typedef struct BlockDriverAIOCBCoroutine {
    BlockDriverAIOCB common;
    BlockRequest req;
    bool is_write;
    QEMUBH *bh;
} BlockDriverAIOCBCoroutine;

static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
{
    qemu_aio_flush();
}

static AIOPool bdrv_em_co_aio_pool = {
    .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
    .cancel = bdrv_aio_co_cancel_em,
};

static void bdrv_co_em_bh(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;

    acb->common.cb(acb->common.opaque, acb->req.error);
    qemu_bh_delete(acb->bh);
    qemu_aio_release(acb);
}

/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    acb->req.qiov = qiov;
    acb->is_write = is_write;

    co = qemu_coroutine_create(bdrv_co_do_rw);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_flush(bs);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    trace_bdrv_aio_flush(bs, opaque);

    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
    qemu_bh_schedule(acb->bh);
}

BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    Coroutine *co;
    BlockDriverAIOCBCoroutine *acb;

    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);

    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
    acb->req.sector = sector_num;
    acb->req.nb_sectors = nb_sectors;
    co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
    qemu_coroutine_enter(co, acb);

    return &acb->common;
}

void bdrv_init(void)
{
    module_call_init(MODULE_INIT_BLOCK);
}

void bdrv_init_with_whitelist(void)
{
    use_bdrv_whitelist = 1;
    bdrv_init();
}

void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
                   BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriverAIOCB *acb;

    if (pool->free_aiocb) {
        acb = pool->free_aiocb;
        pool->free_aiocb = acb->next;
    } else {
        acb = g_malloc0(pool->aiocb_size);
        acb->pool = pool;
    }
    acb->bs = bs;
    acb->cb = cb;
    acb->opaque = opaque;
    return acb;
}
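/*
 * Illustrative sketch (not from the original source): a driver's AIO
 * implementation allocates its AIOCB from a pool sized for its own subtype
 * and releases it on the completion path, mirroring the emulation code
 * above (the My* names are made up for the example):
 *
 *     static AIOPool my_aio_pool = {
 *         .aiocb_size = sizeof(MyAIOCB),   // MyAIOCB embeds BlockDriverAIOCB
 *         .cancel     = my_aio_cancel,
 *     };
 *
 *     MyAIOCB *acb = qemu_aio_get(&my_aio_pool, bs, cb, opaque);
 *     ...
 *     acb->common.cb(acb->common.opaque, ret);
 *     qemu_aio_release(acb);
 */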
void qemu_aio_release(void *p)
{
    BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
    AIOPool *pool = acb->pool;
    acb->next = pool->free_aiocb;
    pool->free_aiocb = acb;
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    qemu_coroutine_enter(co->coroutine, NULL);
}

static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }

    trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
    if (!acb) {
        return -EIO;
    }
    qemu_coroutine_yield();

    return co.ret;
}

static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
}

static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov)
{
    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
}

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int ret;

    if (!bs->drv) {
        return 0;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        return 0;
    }

    if (bs->drv->bdrv_co_flush_to_disk) {
        return bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        return 0;
    }
}

void bdrv_invalidate_cache(BlockDriverState *bs)
{
    if (bs->drv && bs->drv->bdrv_invalidate_cache) {
        bs->drv->bdrv_invalidate_cache(bs);
    }
}

void bdrv_invalidate_cache_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_invalidate_cache(bs);
    }
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

static void coroutine_fn bdrv_discard_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
}

int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                 int nb_sectors)
{
    if (!bs->drv) {
        return -ENOMEDIUM;
    } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    } else if (bs->read_only) {
        return -EROFS;
    } else if (bs->drv->bdrv_co_discard) {
        return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
    } else if (bs->drv->bdrv_aio_discard) {
        BlockDriverAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                        bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    } else {
        return 0;
    }
}

int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
{
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_discard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_discard_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }

    return rwco.ret;
}

/**************************************************************/
/* removable device support */

/**
 * Return TRUE if the media is present
 */
int bdrv_is_inserted(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (!drv)
        return 0;
    if (!drv->bdrv_is_inserted)
        return 1;
    return drv->bdrv_is_inserted(bs);
}

/**
 * Return whether the media changed since the last call to this
 * function, or -ENOTSUP if we don't know. Most drivers don't know.
 */
int bdrv_media_changed(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_media_changed) {
        return drv->bdrv_media_changed(bs);
    }
    return -ENOTSUP;
}

/**
 * If eject_flag is TRUE, eject the media.
 * Otherwise, close the tray.
 */
void bdrv_eject(BlockDriverState *bs, int eject_flag)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_eject) {
        drv->bdrv_eject(bs, eject_flag);
    }
}

/**
 * Lock or unlock the media (if it is locked, the user won't be able
 * to eject it manually).
 */
void bdrv_lock_medium(BlockDriverState *bs, bool locked)
{
    BlockDriver *drv = bs->drv;

    trace_bdrv_lock_medium(bs, locked);

    if (drv && drv->bdrv_lock_medium) {
        drv->bdrv_lock_medium(bs, locked);
    }
}

/* needed for generic scsi interface */

int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_ioctl)
        return drv->bdrv_ioctl(bs, req, buf);
    return -ENOTSUP;
}

BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (drv && drv->bdrv_aio_ioctl)
        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
    return NULL;
}

void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
{
    bs->buffer_alignment = align;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign((bs && bs->buffer_alignment) ?
                         bs->buffer_alignment : 512, size);
}

void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
    int64_t bitmap_size;

    bs->dirty_count = 0;
    if (enable) {
        if (!bs->dirty_bitmap) {
            bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
                    BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
            bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

            bs->dirty_bitmap = g_malloc0(bitmap_size);
        }
    } else {
        if (bs->dirty_bitmap) {
            g_free(bs->dirty_bitmap);
            bs->dirty_bitmap = NULL;
        }
    }
}

int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (bs->dirty_bitmap &&
        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
        return !!(bs->dirty_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                      int nr_sectors)
{
    set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}

int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
    return bs->dirty_count;
}
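/*
 * Illustrative sketch (not from the original source): the dirty bitmap is
 * consumed by incremental copy code such as block migration.  One bit
 * covers BDRV_SECTORS_PER_DIRTY_CHUNK sectors, so a consumer typically
 * walks the image chunk by chunk ('end' stands for the image size in
 * sectors):
 *
 *     bdrv_set_dirty_tracking(bs, 1);
 *     ...
 *     for (sector = 0; sector < end; sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
 *         if (bdrv_get_dirty(bs, sector)) {
 *             // copy this chunk, then clear its bit
 *             bdrv_reset_dirty(bs, sector, BDRV_SECTORS_PER_DIRTY_CHUNK);
 *         }
 *     }
 */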
void bdrv_set_in_use(BlockDriverState *bs, int in_use)
{
    assert(bs->in_use != in_use);
    bs->in_use = in_use;
}

int bdrv_in_use(BlockDriverState *bs)
{
    return bs->in_use;
}

void bdrv_iostatus_enable(BlockDriverState *bs)
{
    bs->iostatus_enabled = true;
    bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool bdrv_iostatus_is_enabled(const BlockDriverState *bs)
{
    return (bs->iostatus_enabled &&
            (bs->on_write_error == BLOCK_ERR_STOP_ENOSPC ||
             bs->on_write_error == BLOCK_ERR_STOP_ANY ||
             bs->on_read_error == BLOCK_ERR_STOP_ANY));
}

void bdrv_iostatus_disable(BlockDriverState *bs)
{
    bs->iostatus_enabled = false;
}

void bdrv_iostatus_reset(BlockDriverState *bs)
{
    if (bdrv_iostatus_is_enabled(bs)) {
        bs->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

/* XXX: Today this is set by device models because it makes the implementation
   quite simple. However, the block layer knows about the error, so it's
   possible to implement this without device models being involved */
void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
{
    if (bdrv_iostatus_is_enabled(bs) &&
        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        assert(error >= 0);
        bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                         BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void
bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
                enum BlockAcctType type)
{
    assert(type < BDRV_MAX_IOTYPE);

    cookie->bytes = bytes;
    cookie->start_time_ns = get_clock();
    cookie->type = type;
}

void
bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
{
    assert(cookie->type < BDRV_MAX_IOTYPE);

    bs->nr_bytes[cookie->type] += cookie->bytes;
    bs->nr_ops[cookie->type]++;
    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
}
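/*
 * Illustrative sketch (not from the original source): device models bracket
 * each request with the accounting helpers; the cookie usually lives in the
 * device's per-request state:
 *
 *     BlockAcctCookie acct;
 *
 *     bdrv_acct_start(bs, &acct, nb_sectors * BDRV_SECTOR_SIZE,
 *                     BDRV_ACCT_READ);
 *     // ... perform the read ...
 *     bdrv_acct_done(bs, &acct);
 */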
int bdrv_img_create(const char *filename, const char *fmt,
                    const char *base_filename, const char *base_fmt,
                    char *options, uint64_t img_size, int flags)
{
    QEMUOptionParameter *param = NULL, *create_options = NULL;
    QEMUOptionParameter *backing_fmt, *backing_file, *size;
    BlockDriverState *bs = NULL;
    BlockDriver *drv, *proto_drv;
    BlockDriver *backing_drv = NULL;
    int ret = 0;

    /* Find driver and parse its options */
    drv = bdrv_find_format(fmt);
    if (!drv) {
        error_report("Unknown file format '%s'", fmt);
        ret = -EINVAL;
        goto out;
    }

    proto_drv = bdrv_find_protocol(filename);
    if (!proto_drv) {
        error_report("Unknown protocol '%s'", filename);
        ret = -EINVAL;
        goto out;
    }

    create_options = append_option_parameters(create_options,
                                              drv->create_options);
    create_options = append_option_parameters(create_options,
                                              proto_drv->create_options);

    /* Create parameter list with default values */
    param = parse_option_parameters("", create_options, param);

    set_option_parameter_int(param, BLOCK_OPT_SIZE, img_size);

    /* Parse -o options */
    if (options) {
        param = parse_option_parameters(options, create_options, param);
        if (param == NULL) {
            error_report("Invalid options for file format '%s'.", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_filename) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FILE,
                                 base_filename)) {
            error_report("Backing file not supported for file format '%s'",
                         fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    if (base_fmt) {
        if (set_option_parameter(param, BLOCK_OPT_BACKING_FMT, base_fmt)) {
            error_report("Backing file format not supported for file "
                         "format '%s'", fmt);
            ret = -EINVAL;
            goto out;
        }
    }

    backing_file = get_option_parameter(param, BLOCK_OPT_BACKING_FILE);
    if (backing_file && backing_file->value.s) {
        if (!strcmp(filename, backing_file->value.s)) {
            error_report("Error: Trying to create an image with the "
                         "same filename as the backing file");
            ret = -EINVAL;
            goto out;
        }
    }

    backing_fmt = get_option_parameter(param, BLOCK_OPT_BACKING_FMT);
    if (backing_fmt && backing_fmt->value.s) {
        backing_drv = bdrv_find_format(backing_fmt->value.s);
        if (!backing_drv) {
            error_report("Unknown backing file format '%s'",
                         backing_fmt->value.s);
            ret = -EINVAL;
            goto out;
        }
    }

    // The size for the image must always be specified, with one exception:
    // If we are using a backing file, we can obtain the size from there
    size = get_option_parameter(param, BLOCK_OPT_SIZE);
    if (size && size->value.n == -1) {
        if (backing_file && backing_file->value.s) {
            uint64_t size;
            char buf[32];

            bs = bdrv_new("");

            ret = bdrv_open(bs, backing_file->value.s, flags, backing_drv);
            if (ret < 0) {
                error_report("Could not open '%s'", backing_file->value.s);
                goto out;
            }
            bdrv_get_geometry(bs, &size);
            size *= 512;

            snprintf(buf, sizeof(buf), "%" PRIu64, size);
            set_option_parameter(param, BLOCK_OPT_SIZE, buf);
        } else {
            error_report("Image creation needs a size parameter");
            ret = -EINVAL;
            goto out;
        }
    }

    printf("Formatting '%s', fmt=%s ", filename, fmt);
    print_option_parameters(param);
    puts("");

    ret = bdrv_create(drv, filename, param);

    if (ret < 0) {
        if (ret == -ENOTSUP) {
            error_report("Formatting or formatting option not supported for "
                         "file format '%s'", fmt);
        } else if (ret == -EFBIG) {
            error_report("The image size is too large for file format '%s'",
                         fmt);
        } else {
            error_report("%s: error while creating %s: %s", filename, fmt,
                         strerror(-ret));
        }
    }

out:
    free_option_parameters(create_options);
    free_option_parameters(param);

    if (bs) {
        bdrv_delete(bs);
    }

    return ret;
}
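/*
 * Illustrative sketch (not from the original source): qemu-img style image
 * creation through bdrv_img_create().  A NULL base file/format and a NULL
 * options string request a plain image of the given size in bytes:
 *
 *     int ret = bdrv_img_create("test.qcow2", "qcow2",
 *                               NULL, NULL, NULL,
 *                               10 * 1024 * 1024, 0);
 *     if (ret < 0) {
 *         // creation failed; an error has already been reported
 *     }
 */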