/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                                      bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->role->drained_end) {
        c->role->drained_end(c, drained_end_counter);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                                    bool ignore_bds_parents,
                                    int *drained_end_counter)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->role->drained_poll) {
        return c->role->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    c->parent_quiesce_counter++;
    if (c->role->drained_begin) {
        c->role->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}
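/*
 * Illustrative sketch (not part of the code): a hypothetical BdrvChildRole
 * implementation pairs the hooks driven above as follows; the example_*
 * names are made up for illustration only.
 *
 *   static void example_drained_begin(BdrvChild *c)
 *   {
 *       // stop submitting new requests on behalf of this child
 *   }
 *
 *   static bool example_drained_poll(BdrvChild *c)
 *   {
 *       // return true while previously issued requests are still in flight
 *       return example_in_flight_requests(c) > 0;
 *   }
 *
 * bdrv_parent_drained_begin_single(c, true) calls .drained_begin and then
 * polls .drained_poll until it returns false; bdrv_parent_drained_end_single()
 * later reverses the quiesce and drops parent_quiesce_counter.
 */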
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
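/*
 * Worked example for bdrv_merge_limits() (hypothetical numbers): if the
 * parent starts with max_transfer = 0 (no limit) and opt_transfer = 4096,
 * and a child reports max_transfer = 1 MiB and opt_transfer = 64 KiB, the
 * merged result is max_transfer = 1 MiB (MIN_NON_ZERO treats 0 as
 * "unlimited") and opt_transfer = 64 KiB (the larger preferred granularity).
 * The alignment fields merge the same way: the strictest requirement wins.
 */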
/**
 * The copy-on-read flag is actually a reference count, so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until every user has called
 * bdrv_disable_copy_on_read().
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
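/*
 * Sketch of the intended pairing (hypothetical callers): two users can
 * overlap without clobbering each other's state, because the flag is a
 * counter rather than a boolean:
 *
 *   bdrv_enable_copy_on_read(bs);    // user A: count 0 -> 1
 *   bdrv_enable_copy_on_read(bs);    // user B: count 1 -> 2
 *   bdrv_disable_copy_on_read(bs);   // user A done: count 2 -> 1, still on
 *   bdrv_disable_copy_on_read(bs);   // user B done: count 1 -> 0, off
 */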
typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
    int *drained_end_counter;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    atomic_mb_set(&data->done, true);
    if (!data->begin) {
        atomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        atomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true, NULL);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}
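/*
 * Typical usage sketch (hypothetical caller): a drained section brackets an
 * operation that must not race with in-flight I/O, e.g. graph manipulation:
 *
 *   bdrv_drained_begin(bs);      // quiesce parents, wait for requests
 *   ...reconfigure or inspect bs while no I/O is in flight...
 *   bdrv_drained_end(bs);        // resume request processing
 *
 * Sections may nest; bs->quiesce_counter tracks the nesting depth.
 */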
/**
 * This function does not poll, nor must any of its recursively called
 * functions.  The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles.  Therefore, the pointer must remain valid
 * until the pointee reaches 0.  That implies that whoever sets up the
 * pointee has to poll until it is 0.
 *
 * We use atomic operations to access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}

void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}
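/*
 * Sketch of the counter protocol described above (hypothetical caller of the
 * non-polling variant): collect the drained_end operations scheduled for
 * several nodes into one counter, then poll once at the end:
 *
 *   int counter = 0;
 *   bdrv_drained_end_no_poll(bs_a, &counter);
 *   bdrv_drained_end_no_poll(bs_b, &counter);
 *   BDRV_POLL_WHILE(bs_a, atomic_read(&counter) > 0);
 *
 * The counter must stay alive until it reaches 0, as required by the comment
 * on bdrv_do_drained_end().
 */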
void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int drained_end_counter = 0;
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false,
                            &drained_end_counter);
    }

    BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay; waiting for the I/O
     * requests to finish could take forever.
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    int drained_end_counter = 0;

    /*
     * The bdrv queue is managed by record/replay; waiting for the I/O
     * requests to finish could take forever.
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
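/*
 * Usage sketch (hypothetical caller): bdrv_drain_all_begin()/end() bracket
 * operations that must see no I/O activity on any node:
 *
 *   bdrv_drain_all_begin();
 *   ...no request is in flight on any BlockDriverState here...
 *   bdrv_drain_all_end();
 *
 * bdrv_drain_all() is shorthand for an empty bracket of this kind.
 */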
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                             - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
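/*
 * Worked example for mark_request_serialising() (hypothetical numbers): a
 * request with offset = 1536 and bytes = 1024, serialised at align = 4096,
 * gets overlap_offset = 1536 & ~4095 = 0 and
 * overlap_bytes = ROUND_UP(2560, 4096) - 0 = 4096; i.e. the request then
 * conflicts with anything touching its whole aligned cluster.
 */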
static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
{
    /*
     * If the request is serialising, overlap_offset and overlap_bytes are set,
     * so we can check if the request is aligned. Otherwise, don't care and
     * return false.
     */

    return req->serialising && (req->offset == req->overlap_offset) &&
           (req->bytes == req->overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /* aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
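/*
 * Worked example for bdrv_round_to_clusters() (hypothetical numbers): with
 * cluster_size = 4096, offset = 12345 and bytes = 100, the region widens to
 * cluster_offset = QEMU_ALIGN_DOWN(12345, 4096) = 8192 and
 * cluster_bytes = QEMU_ALIGN_UP(4153 + 100, 4096) = 8192, i.e. exactly the
 * two clusters the request touches.  tracked_request_overlaps() above uses
 * half-open intervals, so [0, 4096) and a request starting at 4096 do not
 * overlap.
 */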
void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
    aio_wait_kick();
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);

    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}
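/*
 * bdrv_prwv_co() is QEMU's usual synchronous-over-coroutine bridge: if not
 * already in coroutine context, spawn a coroutine that runs the request and
 * records its result, then poll until it signals completion.  A hypothetical
 * caller sees a plain blocking call:
 *
 *   int ret = bdrv_pwrite_zeroes(child, 0, 4096, BDRV_REQ_MAY_UNMAP);
 *   if (ret < 0) {
 *       ...handle error...
 *   }
 */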
/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}
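/*
 * Usage sketch for the synchronous helpers (hypothetical buffer and child):
 *
 *   uint8_t buf[512];
 *   int ret = bdrv_pread(child, 0, buf, sizeof(buf));     // read 512 bytes
 *   if (ret == sizeof(buf)) {
 *       ret = bdrv_pwrite(child, 4096, buf, sizeof(buf)); // copy them
 *   }
 *
 * Both return the number of bytes transferred on success, or a negative
 * errno as documented below.
 */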
/* Return no. of bytes on success or < 0 on error. Important errors are:
   -EIO         generic I/O error (may happen for all errors)
   -ENOMEDIUM   No media inserted.
   -EINVAL      Invalid offset or number of bytes
   -EACCES      Trying to write a read-only device
*/
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
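/*
 * CoroutineIOCompletion bridges callback-style AIO driver interfaces back
 * into coroutine context.  Sketch of the pattern the bdrv_driver_* functions
 * below use:
 *
 *   CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self() };
 *   acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
 *                              bdrv_co_io_em_complete, &co);
 *   if (acb) {
 *       qemu_coroutine_yield();   // woken by bdrv_co_io_em_complete()
 *       ret = co.ret;
 *   }
 */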
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov,
                                           size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv_part) {
        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_preadv) {
        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
        goto out;
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
            goto out;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
            goto out;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}
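/*
 * Note the dispatch order above, which bdrv_driver_pwritev() mirrors: the
 * byte-based vectored interfaces are preferred (_part, then plain, then the
 * callback-based aio variant), and only drivers that implement none of them
 * fall back to the legacy sector-based bdrv_co_readv/writev; that is why the
 * sector alignment asserts only appear on the final path.
 */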
1156ac850bf0SVladimir Sementsov-Ogievskiy goto emulate_flags; 1157ac850bf0SVladimir Sementsov-Ogievskiy } 1158ac850bf0SVladimir Sementsov-Ogievskiy 1159ac850bf0SVladimir Sementsov-Ogievskiy if (qiov_offset > 0 || bytes != qiov->size) { 1160ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1161ac850bf0SVladimir Sementsov-Ogievskiy qiov = &local_qiov; 1162ac850bf0SVladimir Sementsov-Ogievskiy } 1163ac850bf0SVladimir Sementsov-Ogievskiy 11643fb06697SKevin Wolf if (drv->bdrv_co_pwritev) { 1165515c2f43SKevin Wolf ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, 1166515c2f43SKevin Wolf flags & bs->supported_write_flags); 1167515c2f43SKevin Wolf flags &= ~bs->supported_write_flags; 11683fb06697SKevin Wolf goto emulate_flags; 11693fb06697SKevin Wolf } 11703fb06697SKevin Wolf 1171edfab6a0SEric Blake if (drv->bdrv_aio_pwritev) { 117208844473SKevin Wolf BlockAIOCB *acb; 117308844473SKevin Wolf CoroutineIOCompletion co = { 117408844473SKevin Wolf .coroutine = qemu_coroutine_self(), 117508844473SKevin Wolf }; 117608844473SKevin Wolf 1177e31f6864SEric Blake acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, 1178e31f6864SEric Blake flags & bs->supported_write_flags, 117908844473SKevin Wolf bdrv_co_io_em_complete, &co); 1180e31f6864SEric Blake flags &= ~bs->supported_write_flags; 118108844473SKevin Wolf if (acb == NULL) { 11823fb06697SKevin Wolf ret = -EIO; 118308844473SKevin Wolf } else { 118408844473SKevin Wolf qemu_coroutine_yield(); 11853fb06697SKevin Wolf ret = co.ret; 118608844473SKevin Wolf } 1187edfab6a0SEric Blake goto emulate_flags; 1188edfab6a0SEric Blake } 1189edfab6a0SEric Blake 1190edfab6a0SEric Blake sector_num = offset >> BDRV_SECTOR_BITS; 1191edfab6a0SEric Blake nb_sectors = bytes >> BDRV_SECTOR_BITS; 1192edfab6a0SEric Blake 11931bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 11941bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 119541ae31e3SAlberto Garcia assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1196edfab6a0SEric Blake 1197e18a58b4SEric Blake assert(drv->bdrv_co_writev); 1198e18a58b4SEric Blake ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, 1199edfab6a0SEric Blake flags & bs->supported_write_flags); 1200edfab6a0SEric Blake flags &= ~bs->supported_write_flags; 120178a07294SKevin Wolf 12023fb06697SKevin Wolf emulate_flags: 12034df863f3SEric Blake if (ret == 0 && (flags & BDRV_REQ_FUA)) { 120478a07294SKevin Wolf ret = bdrv_co_flush(bs); 120578a07294SKevin Wolf } 120678a07294SKevin Wolf 1207ac850bf0SVladimir Sementsov-Ogievskiy if (qiov == &local_qiov) { 1208ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1209ac850bf0SVladimir Sementsov-Ogievskiy } 1210ac850bf0SVladimir Sementsov-Ogievskiy 121178a07294SKevin Wolf return ret; 121278a07294SKevin Wolf } 121378a07294SKevin Wolf 121429a298afSPavel Butsykin static int coroutine_fn 121529a298afSPavel Butsykin bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset, 1216ac850bf0SVladimir Sementsov-Ogievskiy uint64_t bytes, QEMUIOVector *qiov, 1217ac850bf0SVladimir Sementsov-Ogievskiy size_t qiov_offset) 121829a298afSPavel Butsykin { 121929a298afSPavel Butsykin BlockDriver *drv = bs->drv; 1220ac850bf0SVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 1221ac850bf0SVladimir Sementsov-Ogievskiy int ret; 122229a298afSPavel Butsykin 1223d470ad42SMax Reitz if (!drv) { 1224d470ad42SMax Reitz return -ENOMEDIUM; 1225d470ad42SMax Reitz } 1226d470ad42SMax Reitz 1227ac850bf0SVladimir Sementsov-Ogievskiy if 
(!block_driver_can_compress(drv)) { 122829a298afSPavel Butsykin return -ENOTSUP; 122929a298afSPavel Butsykin } 123029a298afSPavel Butsykin 1231ac850bf0SVladimir Sementsov-Ogievskiy if (drv->bdrv_co_pwritev_compressed_part) { 1232ac850bf0SVladimir Sementsov-Ogievskiy return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes, 1233ac850bf0SVladimir Sementsov-Ogievskiy qiov, qiov_offset); 1234ac850bf0SVladimir Sementsov-Ogievskiy } 1235ac850bf0SVladimir Sementsov-Ogievskiy 1236ac850bf0SVladimir Sementsov-Ogievskiy if (qiov_offset == 0) { 123729a298afSPavel Butsykin return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov); 123829a298afSPavel Butsykin } 123929a298afSPavel Butsykin 1240ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1241ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov); 1242ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1243ac850bf0SVladimir Sementsov-Ogievskiy 1244ac850bf0SVladimir Sementsov-Ogievskiy return ret; 1245ac850bf0SVladimir Sementsov-Ogievskiy } 1246ac850bf0SVladimir Sementsov-Ogievskiy 124785c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 12483299e5ecSVladimir Sementsov-Ogievskiy int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 12491143ec5eSVladimir Sementsov-Ogievskiy size_t qiov_offset, int flags) 125061007b31SStefan Hajnoczi { 125185c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 125285c97ca7SKevin Wolf 125361007b31SStefan Hajnoczi /* Perform I/O through a temporary buffer so that users who scribble over 125461007b31SStefan Hajnoczi * their read buffer while the operation is in progress do not end up 125561007b31SStefan Hajnoczi * modifying the image file. This is critical for zero-copy guest I/O 125661007b31SStefan Hajnoczi * where anything might happen inside guest memory. 125761007b31SStefan Hajnoczi */ 12582275cc90SVladimir Sementsov-Ogievskiy void *bounce_buffer = NULL; 125961007b31SStefan Hajnoczi 126061007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 1261244483e6SKevin Wolf int64_t cluster_offset; 12627cfd5275SEric Blake int64_t cluster_bytes; 126361007b31SStefan Hajnoczi size_t skip_bytes; 126461007b31SStefan Hajnoczi int ret; 1265cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1266cb2e2878SEric Blake BDRV_REQUEST_MAX_BYTES); 1267cb2e2878SEric Blake unsigned int progress = 0; 12688644476eSMax Reitz bool skip_write; 126961007b31SStefan Hajnoczi 1270d470ad42SMax Reitz if (!drv) { 1271d470ad42SMax Reitz return -ENOMEDIUM; 1272d470ad42SMax Reitz } 1273d470ad42SMax Reitz 12748644476eSMax Reitz /* 12758644476eSMax Reitz * Do not write anything when the BDS is inactive. That is not 12768644476eSMax Reitz * allowed, and it would not help. 12778644476eSMax Reitz */ 12788644476eSMax Reitz skip_write = (bs->open_flags & BDRV_O_INACTIVE); 12798644476eSMax Reitz 12801bf03e66SKevin Wolf /* FIXME We cannot require callers to have write permissions when all they 12811bf03e66SKevin Wolf * are doing is a read request. If we did things right, write permissions 12821bf03e66SKevin Wolf * would be obtained anyway, but internally by the copy-on-read code. As 1283765d9df9SEric Blake * long as it is implemented here rather than in a separate filter driver, 12841bf03e66SKevin Wolf * the copy-on-read code doesn't have its own BdrvChild, however, for which 12851bf03e66SKevin Wolf * it could request permissions. 
Therefore we have to bypass the permission 12861bf03e66SKevin Wolf * system for the moment. */ 12871bf03e66SKevin Wolf // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1288afa4b293SKevin Wolf 128961007b31SStefan Hajnoczi /* Cover entire cluster so no additional backing file I/O is required when 1290cb2e2878SEric Blake * allocating cluster in the image file. Note that this value may exceed 1291cb2e2878SEric Blake * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which 1292cb2e2878SEric Blake * is one reason we loop rather than doing it all at once. 129361007b31SStefan Hajnoczi */ 1294244483e6SKevin Wolf bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1295cb2e2878SEric Blake skip_bytes = offset - cluster_offset; 129661007b31SStefan Hajnoczi 1297244483e6SKevin Wolf trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1298244483e6SKevin Wolf cluster_offset, cluster_bytes); 129961007b31SStefan Hajnoczi 1300cb2e2878SEric Blake while (cluster_bytes) { 1301cb2e2878SEric Blake int64_t pnum; 130261007b31SStefan Hajnoczi 13038644476eSMax Reitz if (skip_write) { 13048644476eSMax Reitz ret = 1; /* "already allocated", so nothing will be copied */ 13058644476eSMax Reitz pnum = MIN(cluster_bytes, max_transfer); 13068644476eSMax Reitz } else { 1307cb2e2878SEric Blake ret = bdrv_is_allocated(bs, cluster_offset, 1308cb2e2878SEric Blake MIN(cluster_bytes, max_transfer), &pnum); 1309cb2e2878SEric Blake if (ret < 0) { 13108644476eSMax Reitz /* 13118644476eSMax Reitz * Safe to treat errors in querying allocation as if 1312cb2e2878SEric Blake * unallocated; we'll probably fail again soon on the 1313cb2e2878SEric Blake * read, but at least that will set a decent errno. 1314cb2e2878SEric Blake */ 1315cb2e2878SEric Blake pnum = MIN(cluster_bytes, max_transfer); 1316cb2e2878SEric Blake } 1317cb2e2878SEric Blake 1318b0ddcbbbSKevin Wolf /* Stop at EOF if the image ends in the middle of the cluster */ 1319b0ddcbbbSKevin Wolf if (ret == 0 && pnum == 0) { 1320b0ddcbbbSKevin Wolf assert(progress >= bytes); 1321b0ddcbbbSKevin Wolf break; 1322b0ddcbbbSKevin Wolf } 1323b0ddcbbbSKevin Wolf 1324cb2e2878SEric Blake assert(skip_bytes < pnum); 13258644476eSMax Reitz } 1326cb2e2878SEric Blake 1327cb2e2878SEric Blake if (ret <= 0) { 13281143ec5eSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 13291143ec5eSVladimir Sementsov-Ogievskiy 1330cb2e2878SEric Blake /* Must copy-on-read; use the bounce buffer */ 13310d93ed08SVladimir Sementsov-Ogievskiy pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 13322275cc90SVladimir Sementsov-Ogievskiy if (!bounce_buffer) { 13332275cc90SVladimir Sementsov-Ogievskiy int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); 13342275cc90SVladimir Sementsov-Ogievskiy int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); 13352275cc90SVladimir Sementsov-Ogievskiy int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); 13362275cc90SVladimir Sementsov-Ogievskiy 13372275cc90SVladimir Sementsov-Ogievskiy bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len); 13382275cc90SVladimir Sementsov-Ogievskiy if (!bounce_buffer) { 13392275cc90SVladimir Sementsov-Ogievskiy ret = -ENOMEM; 13402275cc90SVladimir Sementsov-Ogievskiy goto err; 13412275cc90SVladimir Sementsov-Ogievskiy } 13422275cc90SVladimir Sementsov-Ogievskiy } 13430d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); 1344cb2e2878SEric Blake 1345cb2e2878SEric Blake ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1346ac850bf0SVladimir 
Sementsov-Ogievskiy &local_qiov, 0, 0); 134761007b31SStefan Hajnoczi if (ret < 0) { 134861007b31SStefan Hajnoczi goto err; 134961007b31SStefan Hajnoczi } 135061007b31SStefan Hajnoczi 1351d855ebcdSEric Blake bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1352c1499a5eSEric Blake if (drv->bdrv_co_pwrite_zeroes && 1353cb2e2878SEric Blake buffer_is_zero(bounce_buffer, pnum)) { 1354a604fa2bSEric Blake /* FIXME: Should we (perhaps conditionally) be setting 1355a604fa2bSEric Blake * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1356a604fa2bSEric Blake * that still correctly reads as zero? */ 13577adcf59fSMax Reitz ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 13587adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 135961007b31SStefan Hajnoczi } else { 1360cb2e2878SEric Blake /* This does not change the data on the disk, it is not 1361cb2e2878SEric Blake * necessary to flush even in cache=writethrough mode. 136261007b31SStefan Hajnoczi */ 1363cb2e2878SEric Blake ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 1364ac850bf0SVladimir Sementsov-Ogievskiy &local_qiov, 0, 13657adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 136661007b31SStefan Hajnoczi } 136761007b31SStefan Hajnoczi 136861007b31SStefan Hajnoczi if (ret < 0) { 1369cb2e2878SEric Blake /* It might be okay to ignore write errors for guest 1370cb2e2878SEric Blake * requests. If this is a deliberate copy-on-read 1371cb2e2878SEric Blake * then we don't want to ignore the error. Simply 1372cb2e2878SEric Blake * report it in all cases. 137361007b31SStefan Hajnoczi */ 137461007b31SStefan Hajnoczi goto err; 137561007b31SStefan Hajnoczi } 137661007b31SStefan Hajnoczi 13773299e5ecSVladimir Sementsov-Ogievskiy if (!(flags & BDRV_REQ_PREFETCH)) { 13781143ec5eSVladimir Sementsov-Ogievskiy qemu_iovec_from_buf(qiov, qiov_offset + progress, 13791143ec5eSVladimir Sementsov-Ogievskiy bounce_buffer + skip_bytes, 1380cb2e2878SEric Blake pnum - skip_bytes); 13813299e5ecSVladimir Sementsov-Ogievskiy } 13823299e5ecSVladimir Sementsov-Ogievskiy } else if (!(flags & BDRV_REQ_PREFETCH)) { 1383cb2e2878SEric Blake /* Read directly into the destination */ 13841143ec5eSVladimir Sementsov-Ogievskiy ret = bdrv_driver_preadv(bs, offset + progress, 13851143ec5eSVladimir Sementsov-Ogievskiy MIN(pnum - skip_bytes, bytes - progress), 13861143ec5eSVladimir Sementsov-Ogievskiy qiov, qiov_offset + progress, 0); 1387cb2e2878SEric Blake if (ret < 0) { 1388cb2e2878SEric Blake goto err; 1389cb2e2878SEric Blake } 1390cb2e2878SEric Blake } 1391cb2e2878SEric Blake 1392cb2e2878SEric Blake cluster_offset += pnum; 1393cb2e2878SEric Blake cluster_bytes -= pnum; 1394cb2e2878SEric Blake progress += pnum - skip_bytes; 1395cb2e2878SEric Blake skip_bytes = 0; 1396cb2e2878SEric Blake } 1397cb2e2878SEric Blake ret = 0; 139861007b31SStefan Hajnoczi 139961007b31SStefan Hajnoczi err: 140061007b31SStefan Hajnoczi qemu_vfree(bounce_buffer); 140161007b31SStefan Hajnoczi return ret; 140261007b31SStefan Hajnoczi } 140361007b31SStefan Hajnoczi 140461007b31SStefan Hajnoczi /* 140561007b31SStefan Hajnoczi * Forwards an already correctly aligned request to the BlockDriver. This 14061a62d0acSEric Blake * handles copy on read, zeroing after EOF, and fragmentation of large 14071a62d0acSEric Blake * reads; any other features must be implemented by the caller. 
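 *
 * As a rough illustration of the fragmentation (sizes assumed for the
 * example): with bs->bl.max_transfer of 64 KiB, an aligned 1 MiB read
 * that lies entirely within the file is forwarded as sixteen
 * consecutive 64 KiB bdrv_driver_preadv() calls, while any part of the
 * request that extends past end-of-file is zeroed in @qiov with
 * qemu_iovec_memset() instead of being read.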
140861007b31SStefan Hajnoczi */ 140985c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 141061007b31SStefan Hajnoczi BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 141165cd4424SVladimir Sementsov-Ogievskiy int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 141261007b31SStefan Hajnoczi { 141385c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 1414c9d20029SKevin Wolf int64_t total_bytes, max_bytes; 14151a62d0acSEric Blake int ret = 0; 14161a62d0acSEric Blake uint64_t bytes_remaining = bytes; 14171a62d0acSEric Blake int max_transfer; 141861007b31SStefan Hajnoczi 141949c07526SKevin Wolf assert(is_power_of_2(align)); 142049c07526SKevin Wolf assert((offset & (align - 1)) == 0); 142149c07526SKevin Wolf assert((bytes & (align - 1)) == 0); 1422abb06c5aSDaniel P. Berrange assert((bs->open_flags & BDRV_O_NO_IO) == 0); 14231a62d0acSEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 14241a62d0acSEric Blake align); 1425a604fa2bSEric Blake 1426a604fa2bSEric Blake /* TODO: We would need a per-BDS .supported_read_flags and 1427a604fa2bSEric Blake * potential fallback support, if we ever implement any read flags 1428a604fa2bSEric Blake * to pass through to drivers. For now, there aren't any 1429a604fa2bSEric Blake * passthrough flags. */ 14303299e5ecSVladimir Sementsov-Ogievskiy assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ | 14313299e5ecSVladimir Sementsov-Ogievskiy BDRV_REQ_PREFETCH))); 143261007b31SStefan Hajnoczi 143361007b31SStefan Hajnoczi /* Handle Copy on Read and associated serialisation */ 143461007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 143561007b31SStefan Hajnoczi /* If we touch the same cluster it counts as an overlap. This 143661007b31SStefan Hajnoczi * guarantees that allocating writes will be serialized and not race 143761007b31SStefan Hajnoczi * with each other for the same cluster. For example, in copy-on-read 143861007b31SStefan Hajnoczi * it ensures that the CoR read and write operations are atomic and 143961007b31SStefan Hajnoczi * guest writes cannot interleave between them. 
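 * Concretely (a 64 KiB cluster size is assumed purely for
 * illustration): two 4 KiB requests landing anywhere in the same
 * 64 KiB cluster are rounded to the same serialising range by
 * mark_request_serialising() below, so the second of them waits in
 * wait_serialising_requests() until the first has finished.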
*/ 144061007b31SStefan Hajnoczi mark_request_serialising(req, bdrv_get_cluster_size(bs)); 144161007b31SStefan Hajnoczi } 144261007b31SStefan Hajnoczi 144309d2f948SVladimir Sementsov-Ogievskiy /* BDRV_REQ_SERIALISING is only for write operation */ 144409d2f948SVladimir Sementsov-Ogievskiy assert(!(flags & BDRV_REQ_SERIALISING)); 144509d2f948SVladimir Sementsov-Ogievskiy 144661408b25SFam Zheng if (!(flags & BDRV_REQ_NO_SERIALISING)) { 144761007b31SStefan Hajnoczi wait_serialising_requests(req); 144861408b25SFam Zheng } 144961007b31SStefan Hajnoczi 145061007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 1451d6a644bbSEric Blake int64_t pnum; 145261007b31SStefan Hajnoczi 145388e63df2SEric Blake ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 145461007b31SStefan Hajnoczi if (ret < 0) { 145561007b31SStefan Hajnoczi goto out; 145661007b31SStefan Hajnoczi } 145761007b31SStefan Hajnoczi 145888e63df2SEric Blake if (!ret || pnum != bytes) { 145965cd4424SVladimir Sementsov-Ogievskiy ret = bdrv_co_do_copy_on_readv(child, offset, bytes, 146065cd4424SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 14613299e5ecSVladimir Sementsov-Ogievskiy goto out; 14623299e5ecSVladimir Sementsov-Ogievskiy } else if (flags & BDRV_REQ_PREFETCH) { 146361007b31SStefan Hajnoczi goto out; 146461007b31SStefan Hajnoczi } 146561007b31SStefan Hajnoczi } 146661007b31SStefan Hajnoczi 14671a62d0acSEric Blake /* Forward the request to the BlockDriver, possibly fragmenting it */ 146849c07526SKevin Wolf total_bytes = bdrv_getlength(bs); 146949c07526SKevin Wolf if (total_bytes < 0) { 147049c07526SKevin Wolf ret = total_bytes; 147161007b31SStefan Hajnoczi goto out; 147261007b31SStefan Hajnoczi } 147361007b31SStefan Hajnoczi 147449c07526SKevin Wolf max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 14751a62d0acSEric Blake if (bytes <= max_bytes && bytes <= max_transfer) { 147665cd4424SVladimir Sementsov-Ogievskiy ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0); 14771a62d0acSEric Blake goto out; 147861007b31SStefan Hajnoczi } 147961007b31SStefan Hajnoczi 14801a62d0acSEric Blake while (bytes_remaining) { 14811a62d0acSEric Blake int num; 14821a62d0acSEric Blake 14831a62d0acSEric Blake if (max_bytes) { 14841a62d0acSEric Blake num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 14851a62d0acSEric Blake assert(num); 14861a62d0acSEric Blake 14871a62d0acSEric Blake ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 148865cd4424SVladimir Sementsov-Ogievskiy num, qiov, bytes - bytes_remaining, 0); 14891a62d0acSEric Blake max_bytes -= num; 14901a62d0acSEric Blake } else { 14911a62d0acSEric Blake num = bytes_remaining; 14921a62d0acSEric Blake ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, 14931a62d0acSEric Blake bytes_remaining); 14941a62d0acSEric Blake } 14951a62d0acSEric Blake if (ret < 0) { 14961a62d0acSEric Blake goto out; 14971a62d0acSEric Blake } 14981a62d0acSEric Blake bytes_remaining -= num; 149961007b31SStefan Hajnoczi } 150061007b31SStefan Hajnoczi 150161007b31SStefan Hajnoczi out: 15021a62d0acSEric Blake return ret < 0 ? 
ret : 0; 150361007b31SStefan Hajnoczi } 150461007b31SStefan Hajnoczi 150561007b31SStefan Hajnoczi /* 15067a3f542fSVladimir Sementsov-Ogievskiy * Request padding 15077a3f542fSVladimir Sementsov-Ogievskiy * 15087a3f542fSVladimir Sementsov-Ogievskiy * |<---- align ----->| |<----- align ---->| 15097a3f542fSVladimir Sementsov-Ogievskiy * |<- head ->|<------------- bytes ------------->|<-- tail -->| 15107a3f542fSVladimir Sementsov-Ogievskiy * | | | | | | 15117a3f542fSVladimir Sementsov-Ogievskiy * -*----------$-------*-------- ... --------*-----$------------*--- 15127a3f542fSVladimir Sementsov-Ogievskiy * | | | | | | 15137a3f542fSVladimir Sementsov-Ogievskiy * | offset | | end | 15147a3f542fSVladimir Sementsov-Ogievskiy * ALIGN_DOWN(offset) ALIGN_UP(offset) ALIGN_DOWN(end) ALIGN_UP(end) 15157a3f542fSVladimir Sementsov-Ogievskiy * [buf ... ) [tail_buf ) 15167a3f542fSVladimir Sementsov-Ogievskiy * 15177a3f542fSVladimir Sementsov-Ogievskiy * @buf is an aligned allocation needed to store @head and @tail paddings. @head 15187a3f542fSVladimir Sementsov-Ogievskiy * is placed at the beginning of @buf and @tail at the @end. 15197a3f542fSVladimir Sementsov-Ogievskiy * 15207a3f542fSVladimir Sementsov-Ogievskiy * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk 15217a3f542fSVladimir Sementsov-Ogievskiy * around tail, if tail exists. 15227a3f542fSVladimir Sementsov-Ogievskiy * 15237a3f542fSVladimir Sementsov-Ogievskiy * @merge_reads is true for small requests, 15247a3f542fSVladimir Sementsov-Ogievskiy * if @buf_len == @head + bytes + @tail. In this case it is possible that both 15257a3f542fSVladimir Sementsov-Ogievskiy * head and tail exist but @buf_len == align and @tail_buf == @buf. 152661007b31SStefan Hajnoczi */ 15277a3f542fSVladimir Sementsov-Ogievskiy typedef struct BdrvRequestPadding { 15287a3f542fSVladimir Sementsov-Ogievskiy uint8_t *buf; 15297a3f542fSVladimir Sementsov-Ogievskiy size_t buf_len; 15307a3f542fSVladimir Sementsov-Ogievskiy uint8_t *tail_buf; 15317a3f542fSVladimir Sementsov-Ogievskiy size_t head; 15327a3f542fSVladimir Sementsov-Ogievskiy size_t tail; 15337a3f542fSVladimir Sementsov-Ogievskiy bool merge_reads; 15347a3f542fSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 15357a3f542fSVladimir Sementsov-Ogievskiy } BdrvRequestPadding; 15367a3f542fSVladimir Sementsov-Ogievskiy 15377a3f542fSVladimir Sementsov-Ogievskiy static bool bdrv_init_padding(BlockDriverState *bs, 15387a3f542fSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, 15397a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad) 15407a3f542fSVladimir Sementsov-Ogievskiy { 15417a3f542fSVladimir Sementsov-Ogievskiy uint64_t align = bs->bl.request_alignment; 15427a3f542fSVladimir Sementsov-Ogievskiy size_t sum; 15437a3f542fSVladimir Sementsov-Ogievskiy 15447a3f542fSVladimir Sementsov-Ogievskiy memset(pad, 0, sizeof(*pad)); 15457a3f542fSVladimir Sementsov-Ogievskiy 15467a3f542fSVladimir Sementsov-Ogievskiy pad->head = offset & (align - 1); 15477a3f542fSVladimir Sementsov-Ogievskiy pad->tail = ((offset + bytes) & (align - 1)); 15487a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 15497a3f542fSVladimir Sementsov-Ogievskiy pad->tail = align - pad->tail; 15507a3f542fSVladimir Sementsov-Ogievskiy } 15517a3f542fSVladimir Sementsov-Ogievskiy 15527a3f542fSVladimir Sementsov-Ogievskiy if ((!pad->head && !pad->tail) || !bytes) { 15537a3f542fSVladimir Sementsov-Ogievskiy return false; 15547a3f542fSVladimir Sementsov-Ogievskiy } 15557a3f542fSVladimir Sementsov-Ogievskiy 15567a3f542fSVladimir 
Sementsov-Ogievskiy sum = pad->head + bytes + pad->tail; 15577a3f542fSVladimir Sementsov-Ogievskiy pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align; 15587a3f542fSVladimir Sementsov-Ogievskiy pad->buf = qemu_blockalign(bs, pad->buf_len); 15597a3f542fSVladimir Sementsov-Ogievskiy pad->merge_reads = sum == pad->buf_len; 15607a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 15617a3f542fSVladimir Sementsov-Ogievskiy pad->tail_buf = pad->buf + pad->buf_len - align; 15627a3f542fSVladimir Sementsov-Ogievskiy } 15637a3f542fSVladimir Sementsov-Ogievskiy 15647a3f542fSVladimir Sementsov-Ogievskiy return true; 15657a3f542fSVladimir Sementsov-Ogievskiy } 15667a3f542fSVladimir Sementsov-Ogievskiy 15677a3f542fSVladimir Sementsov-Ogievskiy static int bdrv_padding_rmw_read(BdrvChild *child, 15687a3f542fSVladimir Sementsov-Ogievskiy BdrvTrackedRequest *req, 15697a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad, 15707a3f542fSVladimir Sementsov-Ogievskiy bool zero_middle) 15717a3f542fSVladimir Sementsov-Ogievskiy { 15727a3f542fSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 15737a3f542fSVladimir Sementsov-Ogievskiy BlockDriverState *bs = child->bs; 15747a3f542fSVladimir Sementsov-Ogievskiy uint64_t align = bs->bl.request_alignment; 15757a3f542fSVladimir Sementsov-Ogievskiy int ret; 15767a3f542fSVladimir Sementsov-Ogievskiy 15777a3f542fSVladimir Sementsov-Ogievskiy assert(req->serialising && pad->buf); 15787a3f542fSVladimir Sementsov-Ogievskiy 15797a3f542fSVladimir Sementsov-Ogievskiy if (pad->head || pad->merge_reads) { 15807a3f542fSVladimir Sementsov-Ogievskiy uint64_t bytes = pad->merge_reads ? pad->buf_len : align; 15817a3f542fSVladimir Sementsov-Ogievskiy 15827a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad->buf, bytes); 15837a3f542fSVladimir Sementsov-Ogievskiy 15847a3f542fSVladimir Sementsov-Ogievskiy if (pad->head) { 15857a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 15867a3f542fSVladimir Sementsov-Ogievskiy } 15877a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads && pad->tail) { 15887a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 15897a3f542fSVladimir Sementsov-Ogievskiy } 15907a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes, 159165cd4424SVladimir Sementsov-Ogievskiy align, &local_qiov, 0, 0); 15927a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0) { 15937a3f542fSVladimir Sementsov-Ogievskiy return ret; 15947a3f542fSVladimir Sementsov-Ogievskiy } 15957a3f542fSVladimir Sementsov-Ogievskiy if (pad->head) { 15967a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 15977a3f542fSVladimir Sementsov-Ogievskiy } 15987a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads && pad->tail) { 15997a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 16007a3f542fSVladimir Sementsov-Ogievskiy } 16017a3f542fSVladimir Sementsov-Ogievskiy 16027a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads) { 16037a3f542fSVladimir Sementsov-Ogievskiy goto zero_mem; 16047a3f542fSVladimir Sementsov-Ogievskiy } 16057a3f542fSVladimir Sementsov-Ogievskiy } 16067a3f542fSVladimir Sementsov-Ogievskiy 16077a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 16087a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align); 16097a3f542fSVladimir Sementsov-Ogievskiy 16107a3f542fSVladimir Sementsov-Ogievskiy 
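/*
 * The read below targets the aligned block containing the tail of the
 * request (req->overlap_offset + req->overlap_bytes - align), so that
 * the bytes following the caller's data survive the later aligned
 * write-back unchanged.
 */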
bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 16117a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv( 16127a3f542fSVladimir Sementsov-Ogievskiy child, req, 16137a3f542fSVladimir Sementsov-Ogievskiy req->overlap_offset + req->overlap_bytes - align, 161465cd4424SVladimir Sementsov-Ogievskiy align, align, &local_qiov, 0, 0); 16157a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0) { 16167a3f542fSVladimir Sementsov-Ogievskiy return ret; 16177a3f542fSVladimir Sementsov-Ogievskiy } 16187a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 16197a3f542fSVladimir Sementsov-Ogievskiy } 16207a3f542fSVladimir Sementsov-Ogievskiy 16217a3f542fSVladimir Sementsov-Ogievskiy zero_mem: 16227a3f542fSVladimir Sementsov-Ogievskiy if (zero_middle) { 16237a3f542fSVladimir Sementsov-Ogievskiy memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail); 16247a3f542fSVladimir Sementsov-Ogievskiy } 16257a3f542fSVladimir Sementsov-Ogievskiy 16267a3f542fSVladimir Sementsov-Ogievskiy return 0; 16277a3f542fSVladimir Sementsov-Ogievskiy } 16287a3f542fSVladimir Sementsov-Ogievskiy 16297a3f542fSVladimir Sementsov-Ogievskiy static void bdrv_padding_destroy(BdrvRequestPadding *pad) 16307a3f542fSVladimir Sementsov-Ogievskiy { 16317a3f542fSVladimir Sementsov-Ogievskiy if (pad->buf) { 16327a3f542fSVladimir Sementsov-Ogievskiy qemu_vfree(pad->buf); 16337a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&pad->local_qiov); 16347a3f542fSVladimir Sementsov-Ogievskiy } 16357a3f542fSVladimir Sementsov-Ogievskiy } 16367a3f542fSVladimir Sementsov-Ogievskiy 16377a3f542fSVladimir Sementsov-Ogievskiy /* 16387a3f542fSVladimir Sementsov-Ogievskiy * bdrv_pad_request 16397a3f542fSVladimir Sementsov-Ogievskiy * 16407a3f542fSVladimir Sementsov-Ogievskiy * Exchange request parameters with padded request if needed. Don't include RMW 16417a3f542fSVladimir Sementsov-Ogievskiy * read of padding, bdrv_padding_rmw_read() should be called separately if 16427a3f542fSVladimir Sementsov-Ogievskiy * needed. 16437a3f542fSVladimir Sementsov-Ogievskiy * 16447a3f542fSVladimir Sementsov-Ogievskiy * All parameters except @bs are in-out: they represent original request at 16457a3f542fSVladimir Sementsov-Ogievskiy * function call and padded (if padding needed) at function finish. 16467a3f542fSVladimir Sementsov-Ogievskiy * 16477a3f542fSVladimir Sementsov-Ogievskiy * Function always succeeds. 
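 *
 * Worked example (numbers chosen purely for illustration): with
 * bs->bl.request_alignment of 512, a request of @offset 1000 and
 * @bytes 2000 yields pad->head == 488 and pad->tail == 72, so the
 * padded request becomes @offset 512 and @bytes 2560, with
 * pad->local_qiov stitching together the head buffer, the caller's
 * qiov and the tail buffer.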
16487a3f542fSVladimir Sementsov-Ogievskiy */ 16491acc3466SVladimir Sementsov-Ogievskiy static bool bdrv_pad_request(BlockDriverState *bs, 16501acc3466SVladimir Sementsov-Ogievskiy QEMUIOVector **qiov, size_t *qiov_offset, 16517a3f542fSVladimir Sementsov-Ogievskiy int64_t *offset, unsigned int *bytes, 16527a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad) 16537a3f542fSVladimir Sementsov-Ogievskiy { 16547a3f542fSVladimir Sementsov-Ogievskiy if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { 16557a3f542fSVladimir Sementsov-Ogievskiy return false; 16567a3f542fSVladimir Sementsov-Ogievskiy } 16577a3f542fSVladimir Sementsov-Ogievskiy 16587a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, 16591acc3466SVladimir Sementsov-Ogievskiy *qiov, *qiov_offset, *bytes, 16607a3f542fSVladimir Sementsov-Ogievskiy pad->buf + pad->buf_len - pad->tail, pad->tail); 16617a3f542fSVladimir Sementsov-Ogievskiy *bytes += pad->head + pad->tail; 16627a3f542fSVladimir Sementsov-Ogievskiy *offset -= pad->head; 16637a3f542fSVladimir Sementsov-Ogievskiy *qiov = &pad->local_qiov; 16641acc3466SVladimir Sementsov-Ogievskiy *qiov_offset = 0; 16657a3f542fSVladimir Sementsov-Ogievskiy 16667a3f542fSVladimir Sementsov-Ogievskiy return true; 16677a3f542fSVladimir Sementsov-Ogievskiy } 16687a3f542fSVladimir Sementsov-Ogievskiy 1669a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child, 167061007b31SStefan Hajnoczi int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 167161007b31SStefan Hajnoczi BdrvRequestFlags flags) 167261007b31SStefan Hajnoczi { 16731acc3466SVladimir Sementsov-Ogievskiy return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags); 16741acc3466SVladimir Sementsov-Ogievskiy } 16751acc3466SVladimir Sementsov-Ogievskiy 16761acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, 16771acc3466SVladimir Sementsov-Ogievskiy int64_t offset, unsigned int bytes, 16781acc3466SVladimir Sementsov-Ogievskiy QEMUIOVector *qiov, size_t qiov_offset, 16791acc3466SVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 16801acc3466SVladimir Sementsov-Ogievskiy { 1681a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 168261007b31SStefan Hajnoczi BdrvTrackedRequest req; 16837a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 168461007b31SStefan Hajnoczi int ret; 168561007b31SStefan Hajnoczi 16867a3f542fSVladimir Sementsov-Ogievskiy trace_bdrv_co_preadv(bs, offset, bytes, flags); 168761007b31SStefan Hajnoczi 168861007b31SStefan Hajnoczi ret = bdrv_check_byte_request(bs, offset, bytes); 168961007b31SStefan Hajnoczi if (ret < 0) { 169061007b31SStefan Hajnoczi return ret; 169161007b31SStefan Hajnoczi } 169261007b31SStefan Hajnoczi 169399723548SPaolo Bonzini bdrv_inc_in_flight(bs); 169499723548SPaolo Bonzini 16959568b511SWen Congyang /* Don't do copy-on-read if we read data before write operation */ 1696d3faa13eSPaolo Bonzini if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) { 169761007b31SStefan Hajnoczi flags |= BDRV_REQ_COPY_ON_READ; 169861007b31SStefan Hajnoczi } 169961007b31SStefan Hajnoczi 17001acc3466SVladimir Sementsov-Ogievskiy bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad); 170161007b31SStefan Hajnoczi 1702ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 17037a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv(child, &req, offset, bytes, 17047a3f542fSVladimir Sementsov-Ogievskiy bs->bl.request_alignment, 
17051acc3466SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 170661007b31SStefan Hajnoczi tracked_request_end(&req); 170799723548SPaolo Bonzini bdrv_dec_in_flight(bs); 170861007b31SStefan Hajnoczi 17097a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 171061007b31SStefan Hajnoczi 171161007b31SStefan Hajnoczi return ret; 171261007b31SStefan Hajnoczi } 171361007b31SStefan Hajnoczi 1714d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1715f5a5ca79SManos Pitsidianakis int64_t offset, int bytes, BdrvRequestFlags flags) 171661007b31SStefan Hajnoczi { 171761007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 171861007b31SStefan Hajnoczi QEMUIOVector qiov; 17190d93ed08SVladimir Sementsov-Ogievskiy void *buf = NULL; 172061007b31SStefan Hajnoczi int ret = 0; 1721465fe887SEric Blake bool need_flush = false; 1722443668caSDenis V. Lunev int head = 0; 1723443668caSDenis V. Lunev int tail = 0; 172461007b31SStefan Hajnoczi 1725cf081fcaSEric Blake int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); 1726a5b8dd2cSEric Blake int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1727a5b8dd2cSEric Blake bs->bl.request_alignment); 1728cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1729cf081fcaSEric Blake 1730d470ad42SMax Reitz if (!drv) { 1731d470ad42SMax Reitz return -ENOMEDIUM; 1732d470ad42SMax Reitz } 1733d470ad42SMax Reitz 1734fe0480d6SKevin Wolf if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) { 1735fe0480d6SKevin Wolf return -ENOTSUP; 1736fe0480d6SKevin Wolf } 1737fe0480d6SKevin Wolf 1738b8d0a980SEric Blake assert(alignment % bs->bl.request_alignment == 0); 1739b8d0a980SEric Blake head = offset % alignment; 1740f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % alignment; 1741b8d0a980SEric Blake max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1742b8d0a980SEric Blake assert(max_write_zeroes >= bs->bl.request_alignment); 174361007b31SStefan Hajnoczi 1744f5a5ca79SManos Pitsidianakis while (bytes > 0 && !ret) { 1745f5a5ca79SManos Pitsidianakis int num = bytes; 174661007b31SStefan Hajnoczi 174761007b31SStefan Hajnoczi /* Align request. Block drivers can expect the "bulk" of the request 1748443668caSDenis V. Lunev * to be aligned, and that unaligned requests do not cross cluster 1749443668caSDenis V. Lunev * boundaries. 175061007b31SStefan Hajnoczi */ 1751443668caSDenis V. Lunev if (head) { 1752b2f95feeSEric Blake /* Make a small request up to the first aligned sector. For 1753b2f95feeSEric Blake * convenience, limit this request to max_transfer even if 1754b2f95feeSEric Blake * we don't need to fall back to writes. */ 1755f5a5ca79SManos Pitsidianakis num = MIN(MIN(bytes, max_transfer), alignment - head); 1756b2f95feeSEric Blake head = (head + num) % alignment; 1757b2f95feeSEric Blake assert(num < max_write_zeroes); 1758d05aa8bbSEric Blake } else if (tail && num > alignment) { 1759443668caSDenis V. Lunev /* Shorten the request to the last aligned sector. */ 1760443668caSDenis V. 
Lunev num -= tail; 176161007b31SStefan Hajnoczi } 176261007b31SStefan Hajnoczi 176361007b31SStefan Hajnoczi /* limit request size */ 176461007b31SStefan Hajnoczi if (num > max_write_zeroes) { 176561007b31SStefan Hajnoczi num = max_write_zeroes; 176661007b31SStefan Hajnoczi } 176761007b31SStefan Hajnoczi 176861007b31SStefan Hajnoczi ret = -ENOTSUP; 176961007b31SStefan Hajnoczi /* First try the efficient write zeroes operation */ 1770d05aa8bbSEric Blake if (drv->bdrv_co_pwrite_zeroes) { 1771d05aa8bbSEric Blake ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, 1772d05aa8bbSEric Blake flags & bs->supported_zero_flags); 1773d05aa8bbSEric Blake if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && 1774d05aa8bbSEric Blake !(bs->supported_zero_flags & BDRV_REQ_FUA)) { 1775d05aa8bbSEric Blake need_flush = true; 1776d05aa8bbSEric Blake } 1777465fe887SEric Blake } else { 1778465fe887SEric Blake assert(!bs->supported_zero_flags); 177961007b31SStefan Hajnoczi } 178061007b31SStefan Hajnoczi 1781294682ccSAndrey Shinkevich if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) { 178261007b31SStefan Hajnoczi /* Fall back to bounce buffer if write zeroes is unsupported */ 1783465fe887SEric Blake BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; 1784465fe887SEric Blake 1785465fe887SEric Blake if ((flags & BDRV_REQ_FUA) && 1786465fe887SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1787465fe887SEric Blake /* No need for bdrv_driver_pwritev() to do a fallback 1788465fe887SEric Blake * flush on each chunk; use just one at the end */ 1789465fe887SEric Blake write_flags &= ~BDRV_REQ_FUA; 1790465fe887SEric Blake need_flush = true; 1791465fe887SEric Blake } 17925def6b80SEric Blake num = MIN(num, max_transfer); 17930d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) { 17940d93ed08SVladimir Sementsov-Ogievskiy buf = qemu_try_blockalign0(bs, num); 17950d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) { 179661007b31SStefan Hajnoczi ret = -ENOMEM; 179761007b31SStefan Hajnoczi goto fail; 179861007b31SStefan Hajnoczi } 179961007b31SStefan Hajnoczi } 18000d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&qiov, buf, num); 180161007b31SStefan Hajnoczi 1802ac850bf0SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags); 180361007b31SStefan Hajnoczi 180461007b31SStefan Hajnoczi /* Keep bounce buffer around if it is big enough for 180561007b31SStefan Hajnoczi * all future requests.
180661007b31SStefan Hajnoczi */ 18075def6b80SEric Blake if (num < max_transfer) { 18080d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 18090d93ed08SVladimir Sementsov-Ogievskiy buf = NULL; 181061007b31SStefan Hajnoczi } 181161007b31SStefan Hajnoczi } 181261007b31SStefan Hajnoczi 1813d05aa8bbSEric Blake offset += num; 1814f5a5ca79SManos Pitsidianakis bytes -= num; 181561007b31SStefan Hajnoczi } 181661007b31SStefan Hajnoczi 181761007b31SStefan Hajnoczi fail: 1818465fe887SEric Blake if (ret == 0 && need_flush) { 1819465fe887SEric Blake ret = bdrv_co_flush(bs); 1820465fe887SEric Blake } 18210d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 182261007b31SStefan Hajnoczi return ret; 182361007b31SStefan Hajnoczi } 182461007b31SStefan Hajnoczi 182585fe2479SFam Zheng static inline int coroutine_fn 182685fe2479SFam Zheng bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, 182785fe2479SFam Zheng BdrvTrackedRequest *req, int flags) 182885fe2479SFam Zheng { 182985fe2479SFam Zheng BlockDriverState *bs = child->bs; 183085fe2479SFam Zheng bool waited; 183185fe2479SFam Zheng int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 183285fe2479SFam Zheng 183385fe2479SFam Zheng if (bs->read_only) { 183485fe2479SFam Zheng return -EPERM; 183585fe2479SFam Zheng } 183685fe2479SFam Zheng 183785fe2479SFam Zheng /* BDRV_REQ_NO_SERIALISING is only for read operation */ 183885fe2479SFam Zheng assert(!(flags & BDRV_REQ_NO_SERIALISING)); 183985fe2479SFam Zheng assert(!(bs->open_flags & BDRV_O_INACTIVE)); 184085fe2479SFam Zheng assert((bs->open_flags & BDRV_O_NO_IO) == 0); 184185fe2479SFam Zheng assert(!(flags & ~BDRV_REQ_MASK)); 184285fe2479SFam Zheng 184385fe2479SFam Zheng if (flags & BDRV_REQ_SERIALISING) { 184485fe2479SFam Zheng mark_request_serialising(req, bdrv_get_cluster_size(bs)); 184585fe2479SFam Zheng } 184685fe2479SFam Zheng 184785fe2479SFam Zheng waited = wait_serialising_requests(req); 184885fe2479SFam Zheng 184985fe2479SFam Zheng assert(!waited || !req->serialising || 185085fe2479SFam Zheng is_request_serialising_and_aligned(req)); 185185fe2479SFam Zheng assert(req->overlap_offset <= offset); 185285fe2479SFam Zheng assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1853cd47d792SFam Zheng assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); 185485fe2479SFam Zheng 1855cd47d792SFam Zheng switch (req->type) { 1856cd47d792SFam Zheng case BDRV_TRACKED_WRITE: 1857cd47d792SFam Zheng case BDRV_TRACKED_DISCARD: 185885fe2479SFam Zheng if (flags & BDRV_REQ_WRITE_UNCHANGED) { 185985fe2479SFam Zheng assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 186085fe2479SFam Zheng } else { 186185fe2479SFam Zheng assert(child->perm & BLK_PERM_WRITE); 186285fe2479SFam Zheng } 1863cd47d792SFam Zheng return notifier_with_return_list_notify(&bs->before_write_notifiers, 1864cd47d792SFam Zheng req); 1865cd47d792SFam Zheng case BDRV_TRACKED_TRUNCATE: 1866cd47d792SFam Zheng assert(child->perm & BLK_PERM_RESIZE); 1867cd47d792SFam Zheng return 0; 1868cd47d792SFam Zheng default: 1869cd47d792SFam Zheng abort(); 1870cd47d792SFam Zheng } 187185fe2479SFam Zheng } 187285fe2479SFam Zheng 187385fe2479SFam Zheng static inline void coroutine_fn 187485fe2479SFam Zheng bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, 187585fe2479SFam Zheng BdrvTrackedRequest *req, int ret) 187685fe2479SFam Zheng { 187785fe2479SFam Zheng int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 187885fe2479SFam Zheng BlockDriverState 
*bs = child->bs; 187985fe2479SFam Zheng 188085fe2479SFam Zheng atomic_inc(&bs->write_gen); 188185fe2479SFam Zheng 188200695c27SFam Zheng /* 188300695c27SFam Zheng * Discard cannot extend the image, but in error handling cases, such as 188400695c27SFam Zheng * when reverting a qcow2 cluster allocation, the discarded range can pass 188500695c27SFam Zheng * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD 188600695c27SFam Zheng * here. Instead, just skip it, since semantically a discard request 188700695c27SFam Zheng * beyond EOF cannot expand the image anyway. 188800695c27SFam Zheng */ 18897f8f03efSFam Zheng if (ret == 0 && 1890cd47d792SFam Zheng (req->type == BDRV_TRACKED_TRUNCATE || 1891cd47d792SFam Zheng end_sector > bs->total_sectors) && 189200695c27SFam Zheng req->type != BDRV_TRACKED_DISCARD) { 18937f8f03efSFam Zheng bs->total_sectors = end_sector; 18947f8f03efSFam Zheng bdrv_parent_cb_resize(bs); 18957f8f03efSFam Zheng bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); 189685fe2479SFam Zheng } 189700695c27SFam Zheng if (req->bytes) { 189800695c27SFam Zheng switch (req->type) { 189900695c27SFam Zheng case BDRV_TRACKED_WRITE: 190000695c27SFam Zheng stat64_max(&bs->wr_highest_offset, offset + bytes); 190100695c27SFam Zheng /* fall through, to set dirty bits */ 190200695c27SFam Zheng case BDRV_TRACKED_DISCARD: 19037f8f03efSFam Zheng bdrv_set_dirty(bs, offset, bytes); 190400695c27SFam Zheng break; 190500695c27SFam Zheng default: 190600695c27SFam Zheng break; 190700695c27SFam Zheng } 190800695c27SFam Zheng } 190985fe2479SFam Zheng } 191085fe2479SFam Zheng 191161007b31SStefan Hajnoczi /* 191204ed95f4SEric Blake * Forwards an already correctly aligned write request to the BlockDriver, 191304ed95f4SEric Blake * after possibly fragmenting it. 
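 *
 * For instance (sizes assumed for the example): a 1 MiB BDRV_REQ_FUA
 * write against a driver with a 256 KiB max_transfer and no native
 * BDRV_REQ_FUA support is issued as four 256 KiB driver writes, with
 * the flush-based FUA emulation applied only after the final fragment.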
191461007b31SStefan Hajnoczi */ 191585c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 191661007b31SStefan Hajnoczi BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 191728c4da28SVladimir Sementsov-Ogievskiy int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 191861007b31SStefan Hajnoczi { 191985c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 192061007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 192161007b31SStefan Hajnoczi int ret; 192261007b31SStefan Hajnoczi 192304ed95f4SEric Blake uint64_t bytes_remaining = bytes; 192404ed95f4SEric Blake int max_transfer; 192561007b31SStefan Hajnoczi 1926d470ad42SMax Reitz if (!drv) { 1927d470ad42SMax Reitz return -ENOMEDIUM; 1928d470ad42SMax Reitz } 1929d470ad42SMax Reitz 1930d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 1931d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 1932d6883bc9SVladimir Sementsov-Ogievskiy } 1933d6883bc9SVladimir Sementsov-Ogievskiy 1934cff86b38SEric Blake assert(is_power_of_2(align)); 1935cff86b38SEric Blake assert((offset & (align - 1)) == 0); 1936cff86b38SEric Blake assert((bytes & (align - 1)) == 0); 193728c4da28SVladimir Sementsov-Ogievskiy assert(!qiov || qiov_offset + bytes <= qiov->size); 193804ed95f4SEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 193904ed95f4SEric Blake align); 194061007b31SStefan Hajnoczi 194185fe2479SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 194261007b31SStefan Hajnoczi 194361007b31SStefan Hajnoczi if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1944c1499a5eSEric Blake !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 194528c4da28SVladimir Sementsov-Ogievskiy qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 194661007b31SStefan Hajnoczi flags |= BDRV_REQ_ZERO_WRITE; 194761007b31SStefan Hajnoczi if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 194861007b31SStefan Hajnoczi flags |= BDRV_REQ_MAY_UNMAP; 194961007b31SStefan Hajnoczi } 195061007b31SStefan Hajnoczi } 195161007b31SStefan Hajnoczi 195261007b31SStefan Hajnoczi if (ret < 0) { 195361007b31SStefan Hajnoczi /* Do nothing, write notifier decided to fail this request */ 195461007b31SStefan Hajnoczi } else if (flags & BDRV_REQ_ZERO_WRITE) { 19559a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 19569896c876SKevin Wolf ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 19573ea1a091SPavel Butsykin } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 195828c4da28SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 195928c4da28SVladimir Sementsov-Ogievskiy qiov, qiov_offset); 196004ed95f4SEric Blake } else if (bytes <= max_transfer) { 19619a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV); 196228c4da28SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 196304ed95f4SEric Blake } else { 196404ed95f4SEric Blake bdrv_debug_event(bs, BLKDBG_PWRITEV); 196504ed95f4SEric Blake while (bytes_remaining) { 196604ed95f4SEric Blake int num = MIN(bytes_remaining, max_transfer); 196704ed95f4SEric Blake int local_flags = flags; 196804ed95f4SEric Blake 196904ed95f4SEric Blake assert(num); 197004ed95f4SEric Blake if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 197104ed95f4SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 197204ed95f4SEric Blake /* If FUA is going to be emulated by flush, we only 197304ed95f4SEric Blake * need to flush on 
the last iteration */ 197404ed95f4SEric Blake local_flags &= ~BDRV_REQ_FUA; 197504ed95f4SEric Blake } 197604ed95f4SEric Blake 197704ed95f4SEric Blake ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 197828c4da28SVladimir Sementsov-Ogievskiy num, qiov, bytes - bytes_remaining, 197928c4da28SVladimir Sementsov-Ogievskiy local_flags); 198004ed95f4SEric Blake if (ret < 0) { 198104ed95f4SEric Blake break; 198204ed95f4SEric Blake } 198304ed95f4SEric Blake bytes_remaining -= num; 198404ed95f4SEric Blake } 198561007b31SStefan Hajnoczi } 19869a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 198761007b31SStefan Hajnoczi 198861007b31SStefan Hajnoczi if (ret >= 0) { 198904ed95f4SEric Blake ret = 0; 199061007b31SStefan Hajnoczi } 199185fe2479SFam Zheng bdrv_co_write_req_finish(child, offset, bytes, req, ret); 199261007b31SStefan Hajnoczi 199361007b31SStefan Hajnoczi return ret; 199461007b31SStefan Hajnoczi } 199561007b31SStefan Hajnoczi 199685c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 19979eeb6dd1SFam Zheng int64_t offset, 19989eeb6dd1SFam Zheng unsigned int bytes, 19999eeb6dd1SFam Zheng BdrvRequestFlags flags, 20009eeb6dd1SFam Zheng BdrvTrackedRequest *req) 20019eeb6dd1SFam Zheng { 200285c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 20039eeb6dd1SFam Zheng QEMUIOVector local_qiov; 2004a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 20059eeb6dd1SFam Zheng int ret = 0; 20067a3f542fSVladimir Sementsov-Ogievskiy bool padding; 20077a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 20089eeb6dd1SFam Zheng 20097a3f542fSVladimir Sementsov-Ogievskiy padding = bdrv_init_padding(bs, offset, bytes, &pad); 20107a3f542fSVladimir Sementsov-Ogievskiy if (padding) { 20119eeb6dd1SFam Zheng mark_request_serialising(req, align); 20129eeb6dd1SFam Zheng wait_serialising_requests(req); 20139eeb6dd1SFam Zheng 20147a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_rmw_read(child, req, &pad, true); 20157a3f542fSVladimir Sementsov-Ogievskiy 20167a3f542fSVladimir Sementsov-Ogievskiy if (pad.head || pad.merge_reads) { 20177a3f542fSVladimir Sementsov-Ogievskiy int64_t aligned_offset = offset & ~(align - 1); 20187a3f542fSVladimir Sementsov-Ogievskiy int64_t write_bytes = pad.merge_reads ? pad.buf_len : align; 20197a3f542fSVladimir Sementsov-Ogievskiy 20207a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 20217a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 202228c4da28SVladimir Sementsov-Ogievskiy align, &local_qiov, 0, 20239eeb6dd1SFam Zheng flags & ~BDRV_REQ_ZERO_WRITE); 20247a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0 || pad.merge_reads) { 20257a3f542fSVladimir Sementsov-Ogievskiy /* Error or all work is done */ 20267a3f542fSVladimir Sementsov-Ogievskiy goto out; 20279eeb6dd1SFam Zheng } 20287a3f542fSVladimir Sementsov-Ogievskiy offset += write_bytes - pad.head; 20297a3f542fSVladimir Sementsov-Ogievskiy bytes -= write_bytes - pad.head; 20307a3f542fSVladimir Sementsov-Ogievskiy } 20319eeb6dd1SFam Zheng } 20329eeb6dd1SFam Zheng 20339eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 20349eeb6dd1SFam Zheng if (bytes >= align) { 20359eeb6dd1SFam Zheng /* Write the aligned part in the middle. 
*/ 20369eeb6dd1SFam Zheng uint64_t aligned_bytes = bytes & ~(align - 1); 203785c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 203828c4da28SVladimir Sementsov-Ogievskiy NULL, 0, flags); 20399eeb6dd1SFam Zheng if (ret < 0) { 20407a3f542fSVladimir Sementsov-Ogievskiy goto out; 20419eeb6dd1SFam Zheng } 20429eeb6dd1SFam Zheng bytes -= aligned_bytes; 20439eeb6dd1SFam Zheng offset += aligned_bytes; 20449eeb6dd1SFam Zheng } 20459eeb6dd1SFam Zheng 20469eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 20479eeb6dd1SFam Zheng if (bytes) { 20487a3f542fSVladimir Sementsov-Ogievskiy assert(align == pad.tail + bytes); 20499eeb6dd1SFam Zheng 20507a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align); 205185c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, align, align, 205228c4da28SVladimir Sementsov-Ogievskiy &local_qiov, 0, 205328c4da28SVladimir Sementsov-Ogievskiy flags & ~BDRV_REQ_ZERO_WRITE); 20549eeb6dd1SFam Zheng } 20559eeb6dd1SFam Zheng 20567a3f542fSVladimir Sementsov-Ogievskiy out: 20577a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 20587a3f542fSVladimir Sementsov-Ogievskiy 20597a3f542fSVladimir Sementsov-Ogievskiy return ret; 20609eeb6dd1SFam Zheng } 20619eeb6dd1SFam Zheng 206261007b31SStefan Hajnoczi /* 206361007b31SStefan Hajnoczi * Handle a write request in coroutine context 206461007b31SStefan Hajnoczi */ 2065a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 206661007b31SStefan Hajnoczi int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 206761007b31SStefan Hajnoczi BdrvRequestFlags flags) 206861007b31SStefan Hajnoczi { 20691acc3466SVladimir Sementsov-Ogievskiy return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); 20701acc3466SVladimir Sementsov-Ogievskiy } 20711acc3466SVladimir Sementsov-Ogievskiy 20721acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, 20731acc3466SVladimir Sementsov-Ogievskiy int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, 20741acc3466SVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 20751acc3466SVladimir Sementsov-Ogievskiy { 2076a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 207761007b31SStefan Hajnoczi BdrvTrackedRequest req; 2078a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 20797a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 208061007b31SStefan Hajnoczi int ret; 208161007b31SStefan Hajnoczi 2082f42cf447SDaniel P. Berrange trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); 2083f42cf447SDaniel P. Berrange 208461007b31SStefan Hajnoczi if (!bs->drv) { 208561007b31SStefan Hajnoczi return -ENOMEDIUM; 208661007b31SStefan Hajnoczi } 208761007b31SStefan Hajnoczi 208861007b31SStefan Hajnoczi ret = bdrv_check_byte_request(bs, offset, bytes); 208961007b31SStefan Hajnoczi if (ret < 0) { 209061007b31SStefan Hajnoczi return ret; 209161007b31SStefan Hajnoczi } 209261007b31SStefan Hajnoczi 209399723548SPaolo Bonzini bdrv_inc_in_flight(bs); 209461007b31SStefan Hajnoczi /* 209561007b31SStefan Hajnoczi * Align write if necessary by performing a read-modify-write cycle. 209661007b31SStefan Hajnoczi * Pad qiov with the read parts and be sure to have a tracked request not 209761007b31SStefan Hajnoczi * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
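 * For example (numbers purely illustrative), with a 512-byte
 * request_alignment a 200-byte write at offset 4000 is padded to the
 * aligned range [3584, 4608): the surrounding 512-byte blocks are read
 * in first (as a single merged read here, since the padded request
 * spans just two blocks), the caller's data is spliced in between, and
 * the whole aligned range is written back.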
209861007b31SStefan Hajnoczi */ 2099ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 210061007b31SStefan Hajnoczi 210118a59f03SAnton Nefedov if (flags & BDRV_REQ_ZERO_WRITE) { 210285c97ca7SKevin Wolf ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 21039eeb6dd1SFam Zheng goto out; 21049eeb6dd1SFam Zheng } 21059eeb6dd1SFam Zheng 21061acc3466SVladimir Sementsov-Ogievskiy if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) { 210761007b31SStefan Hajnoczi mark_request_serialising(&req, align); 210861007b31SStefan Hajnoczi wait_serialising_requests(&req); 21097a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_rmw_read(child, &req, &pad, false); 211061007b31SStefan Hajnoczi } 211161007b31SStefan Hajnoczi 211285c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 21131acc3466SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 211461007b31SStefan Hajnoczi 21157a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 211661007b31SStefan Hajnoczi 21179eeb6dd1SFam Zheng out: 21189eeb6dd1SFam Zheng tracked_request_end(&req); 211999723548SPaolo Bonzini bdrv_dec_in_flight(bs); 21207a3f542fSVladimir Sementsov-Ogievskiy 212161007b31SStefan Hajnoczi return ret; 212261007b31SStefan Hajnoczi } 212361007b31SStefan Hajnoczi 2124a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2125f5a5ca79SManos Pitsidianakis int bytes, BdrvRequestFlags flags) 212661007b31SStefan Hajnoczi { 2127f5a5ca79SManos Pitsidianakis trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 212861007b31SStefan Hajnoczi 2129a03ef88fSKevin Wolf if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 213061007b31SStefan Hajnoczi flags &= ~BDRV_REQ_MAY_UNMAP; 213161007b31SStefan Hajnoczi } 213261007b31SStefan Hajnoczi 2133f5a5ca79SManos Pitsidianakis return bdrv_co_pwritev(child, offset, bytes, NULL, 213461007b31SStefan Hajnoczi BDRV_REQ_ZERO_WRITE | flags); 213561007b31SStefan Hajnoczi } 213661007b31SStefan Hajnoczi 21374085f5c7SJohn Snow /* 21384085f5c7SJohn Snow * Flush ALL BDSes regardless of whether they are reachable via a BlkBackend or not.
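 * This is typically invoked on paths that stop the VM, which is also
 * why the record/replay guard below refuses to generate such flushes
 * during replay.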
21394085f5c7SJohn Snow */ 21404085f5c7SJohn Snow int bdrv_flush_all(void) 21414085f5c7SJohn Snow { 21424085f5c7SJohn Snow BdrvNextIterator it; 21434085f5c7SJohn Snow BlockDriverState *bs = NULL; 21444085f5c7SJohn Snow int result = 0; 21454085f5c7SJohn Snow 2146*c8aa7895SPavel Dovgalyuk /* 2147*c8aa7895SPavel Dovgalyuk * bdrv queue is managed by record/replay, 2148*c8aa7895SPavel Dovgalyuk * creating new flush request for stopping 2149*c8aa7895SPavel Dovgalyuk * the VM may break the determinism 2150*c8aa7895SPavel Dovgalyuk */ 2151*c8aa7895SPavel Dovgalyuk if (replay_events_enabled()) { 2152*c8aa7895SPavel Dovgalyuk return result; 2153*c8aa7895SPavel Dovgalyuk } 2154*c8aa7895SPavel Dovgalyuk 21554085f5c7SJohn Snow for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 21564085f5c7SJohn Snow AioContext *aio_context = bdrv_get_aio_context(bs); 21574085f5c7SJohn Snow int ret; 21584085f5c7SJohn Snow 21594085f5c7SJohn Snow aio_context_acquire(aio_context); 21604085f5c7SJohn Snow ret = bdrv_flush(bs); 21614085f5c7SJohn Snow if (ret < 0 && !result) { 21624085f5c7SJohn Snow result = ret; 21634085f5c7SJohn Snow } 21644085f5c7SJohn Snow aio_context_release(aio_context); 21654085f5c7SJohn Snow } 21664085f5c7SJohn Snow 21674085f5c7SJohn Snow return result; 21684085f5c7SJohn Snow } 21694085f5c7SJohn Snow 21704085f5c7SJohn Snow 21714bcd936eSEric Blake typedef struct BdrvCoBlockStatusData { 217261007b31SStefan Hajnoczi BlockDriverState *bs; 217361007b31SStefan Hajnoczi BlockDriverState *base; 2174c9ce8c4dSEric Blake bool want_zero; 21754bcd936eSEric Blake int64_t offset; 21764bcd936eSEric Blake int64_t bytes; 21774bcd936eSEric Blake int64_t *pnum; 21784bcd936eSEric Blake int64_t *map; 2179c9ce8c4dSEric Blake BlockDriverState **file; 21804bcd936eSEric Blake int ret; 218161007b31SStefan Hajnoczi bool done; 21824bcd936eSEric Blake } BdrvCoBlockStatusData; 218361007b31SStefan Hajnoczi 21843e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, 21853e4d0e72SEric Blake bool want_zero, 21863e4d0e72SEric Blake int64_t offset, 21873e4d0e72SEric Blake int64_t bytes, 21883e4d0e72SEric Blake int64_t *pnum, 21893e4d0e72SEric Blake int64_t *map, 2190f7cc69b3SManos Pitsidianakis BlockDriverState **file) 2191f7cc69b3SManos Pitsidianakis { 2192f7cc69b3SManos Pitsidianakis assert(bs->file && bs->file->bs); 21933e4d0e72SEric Blake *pnum = bytes; 21943e4d0e72SEric Blake *map = offset; 2195f7cc69b3SManos Pitsidianakis *file = bs->file->bs; 21963e4d0e72SEric Blake return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2197f7cc69b3SManos Pitsidianakis } 2198f7cc69b3SManos Pitsidianakis 21993e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, 22003e4d0e72SEric Blake bool want_zero, 22013e4d0e72SEric Blake int64_t offset, 22023e4d0e72SEric Blake int64_t bytes, 22033e4d0e72SEric Blake int64_t *pnum, 22043e4d0e72SEric Blake int64_t *map, 2205f7cc69b3SManos Pitsidianakis BlockDriverState **file) 2206f7cc69b3SManos Pitsidianakis { 2207f7cc69b3SManos Pitsidianakis assert(bs->backing && bs->backing->bs); 22083e4d0e72SEric Blake *pnum = bytes; 22093e4d0e72SEric Blake *map = offset; 2210f7cc69b3SManos Pitsidianakis *file = bs->backing->bs; 22113e4d0e72SEric Blake return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2212f7cc69b3SManos Pitsidianakis } 2213f7cc69b3SManos Pitsidianakis 221461007b31SStefan Hajnoczi /* 221561007b31SStefan Hajnoczi * Returns the allocation status of the specified sectors. 
221661007b31SStefan Hajnoczi * Drivers not implementing the functionality are assumed to not support 221761007b31SStefan Hajnoczi * backing files, hence all their sectors are reported as allocated. 221861007b31SStefan Hajnoczi * 221986a3d5c6SEric Blake * If 'want_zero' is true, the caller is querying for mapping 222086a3d5c6SEric Blake * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 222186a3d5c6SEric Blake * _ZERO where possible; otherwise, the result favors larger 'pnum', 222286a3d5c6SEric Blake * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2223c9ce8c4dSEric Blake * 22242e8bc787SEric Blake * If 'offset' is beyond the end of the disk image the return value is 2225fb0d8654SEric Blake * BDRV_BLOCK_EOF and 'pnum' is set to 0. 222661007b31SStefan Hajnoczi * 22272e8bc787SEric Blake * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2228fb0d8654SEric Blake * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2229fb0d8654SEric Blake * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 223067a0fd2aSFam Zheng * 22312e8bc787SEric Blake * 'pnum' is set to the number of bytes (including and immediately 22322e8bc787SEric Blake * following the specified offset) that are easily known to be in the 22332e8bc787SEric Blake * same allocated/unallocated state. Note that a second call starting 22342e8bc787SEric Blake * at the original offset plus returned pnum may have the same status. 22352e8bc787SEric Blake * The returned value is non-zero on success except at end-of-file. 22362e8bc787SEric Blake * 22372e8bc787SEric Blake * Returns negative errno on failure. Otherwise, if the 22382e8bc787SEric Blake * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 22392e8bc787SEric Blake * set to the host mapping and BDS corresponding to the guest offset. 
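 *
 * A minimal usage sketch of the synchronous wrapper defined further down
 * (illustrative only; the local variables here are hypothetical):
 *
 *   int64_t offset = 0, pnum, map;
 *   BlockDriverState *file;
 *   int64_t total = bdrv_getlength(bs);
 *   while (total > 0 && offset < total) {
 *       int ret = bdrv_block_status(bs, offset, total - offset,
 *                                   &pnum, &map, &file);
 *       if (ret < 0 || pnum == 0) {
 *           break;
 *       }
 *       offset += pnum; /* resume where this status run ends */
 *   }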
224061007b31SStefan Hajnoczi */ 22412e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2242c9ce8c4dSEric Blake bool want_zero, 22432e8bc787SEric Blake int64_t offset, int64_t bytes, 22442e8bc787SEric Blake int64_t *pnum, int64_t *map, 224567a0fd2aSFam Zheng BlockDriverState **file) 224661007b31SStefan Hajnoczi { 22472e8bc787SEric Blake int64_t total_size; 22482e8bc787SEric Blake int64_t n; /* bytes */ 2249efa6e2edSEric Blake int ret; 22502e8bc787SEric Blake int64_t local_map = 0; 2251298a1665SEric Blake BlockDriverState *local_file = NULL; 2252efa6e2edSEric Blake int64_t aligned_offset, aligned_bytes; 2253efa6e2edSEric Blake uint32_t align; 225461007b31SStefan Hajnoczi 2255298a1665SEric Blake assert(pnum); 2256298a1665SEric Blake *pnum = 0; 22572e8bc787SEric Blake total_size = bdrv_getlength(bs); 22582e8bc787SEric Blake if (total_size < 0) { 22592e8bc787SEric Blake ret = total_size; 2260298a1665SEric Blake goto early_out; 226161007b31SStefan Hajnoczi } 226261007b31SStefan Hajnoczi 22632e8bc787SEric Blake if (offset >= total_size) { 2264298a1665SEric Blake ret = BDRV_BLOCK_EOF; 2265298a1665SEric Blake goto early_out; 226661007b31SStefan Hajnoczi } 22672e8bc787SEric Blake if (!bytes) { 2268298a1665SEric Blake ret = 0; 2269298a1665SEric Blake goto early_out; 22709cdcfd9fSEric Blake } 227161007b31SStefan Hajnoczi 22722e8bc787SEric Blake n = total_size - offset; 22732e8bc787SEric Blake if (n < bytes) { 22742e8bc787SEric Blake bytes = n; 227561007b31SStefan Hajnoczi } 227661007b31SStefan Hajnoczi 2277d470ad42SMax Reitz /* Must be non-NULL or bdrv_getlength() would have failed */ 2278d470ad42SMax Reitz assert(bs->drv); 2279636cb512SEric Blake if (!bs->drv->bdrv_co_block_status) { 22802e8bc787SEric Blake *pnum = bytes; 228161007b31SStefan Hajnoczi ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 22822e8bc787SEric Blake if (offset + bytes == total_size) { 2283fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2284fb0d8654SEric Blake } 228561007b31SStefan Hajnoczi if (bs->drv->protocol_name) { 22862e8bc787SEric Blake ret |= BDRV_BLOCK_OFFSET_VALID; 22872e8bc787SEric Blake local_map = offset; 2288298a1665SEric Blake local_file = bs; 228961007b31SStefan Hajnoczi } 2290298a1665SEric Blake goto early_out; 229161007b31SStefan Hajnoczi } 229261007b31SStefan Hajnoczi 229399723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2294efa6e2edSEric Blake 2295efa6e2edSEric Blake /* Round out to request_alignment boundaries */ 229686a3d5c6SEric Blake align = bs->bl.request_alignment; 2297efa6e2edSEric Blake aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2298efa6e2edSEric Blake aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2299efa6e2edSEric Blake 230086a3d5c6SEric Blake ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 230186a3d5c6SEric Blake aligned_bytes, pnum, &local_map, 230286a3d5c6SEric Blake &local_file); 230386a3d5c6SEric Blake if (ret < 0) { 230486a3d5c6SEric Blake *pnum = 0; 230586a3d5c6SEric Blake goto out; 230686a3d5c6SEric Blake } 2307efa6e2edSEric Blake 2308efa6e2edSEric Blake /* 2309636cb512SEric Blake * The driver's result must be a non-zero multiple of request_alignment. 2310efa6e2edSEric Blake * Clamp pnum and adjust map to original request. 
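 *
 * Worked example (illustrative numbers): with request_alignment 512, a
 * query for 100 bytes at offset 700 is widened to aligned_offset 512 and
 * aligned_bytes 512. If the driver reports *pnum = 512, the code below
 * computes *pnum = 512 - (700 - 512) = 324, clamps it to the requested
 * 100 bytes, and advances local_map by the same 188-byte head.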
2311efa6e2edSEric Blake */ 2312636cb512SEric Blake assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2313636cb512SEric Blake align > offset - aligned_offset); 231469f47505SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_RECURSE) { 231569f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_DATA); 231669f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_OFFSET_VALID); 231769f47505SVladimir Sementsov-Ogievskiy assert(!(ret & BDRV_BLOCK_ZERO)); 231869f47505SVladimir Sementsov-Ogievskiy } 231969f47505SVladimir Sementsov-Ogievskiy 2320efa6e2edSEric Blake *pnum -= offset - aligned_offset; 2321efa6e2edSEric Blake if (*pnum > bytes) { 2322efa6e2edSEric Blake *pnum = bytes; 2323efa6e2edSEric Blake } 2324efa6e2edSEric Blake if (ret & BDRV_BLOCK_OFFSET_VALID) { 2325efa6e2edSEric Blake local_map += offset - aligned_offset; 2326efa6e2edSEric Blake } 232761007b31SStefan Hajnoczi 232861007b31SStefan Hajnoczi if (ret & BDRV_BLOCK_RAW) { 2329298a1665SEric Blake assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 23302e8bc787SEric Blake ret = bdrv_co_block_status(local_file, want_zero, local_map, 23312e8bc787SEric Blake *pnum, pnum, &local_map, &local_file); 233299723548SPaolo Bonzini goto out; 233361007b31SStefan Hajnoczi } 233461007b31SStefan Hajnoczi 233561007b31SStefan Hajnoczi if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 233661007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ALLOCATED; 2337c9ce8c4dSEric Blake } else if (want_zero) { 233861007b31SStefan Hajnoczi if (bdrv_unallocated_blocks_are_zero(bs)) { 233961007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 2340760e0063SKevin Wolf } else if (bs->backing) { 2341760e0063SKevin Wolf BlockDriverState *bs2 = bs->backing->bs; 23422e8bc787SEric Blake int64_t size2 = bdrv_getlength(bs2); 2343c9ce8c4dSEric Blake 23442e8bc787SEric Blake if (size2 >= 0 && offset >= size2) { 234561007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 234661007b31SStefan Hajnoczi } 234761007b31SStefan Hajnoczi } 234861007b31SStefan Hajnoczi } 234961007b31SStefan Hajnoczi 235069f47505SVladimir Sementsov-Ogievskiy if (want_zero && ret & BDRV_BLOCK_RECURSE && 235169f47505SVladimir Sementsov-Ogievskiy local_file && local_file != bs && 235261007b31SStefan Hajnoczi (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 235361007b31SStefan Hajnoczi (ret & BDRV_BLOCK_OFFSET_VALID)) { 23542e8bc787SEric Blake int64_t file_pnum; 23552e8bc787SEric Blake int ret2; 235661007b31SStefan Hajnoczi 23572e8bc787SEric Blake ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 23582e8bc787SEric Blake *pnum, &file_pnum, NULL, NULL); 235961007b31SStefan Hajnoczi if (ret2 >= 0) { 236061007b31SStefan Hajnoczi /* Ignore errors. This is just providing extra information; it 236161007b31SStefan Hajnoczi * is useful but not necessary. 236261007b31SStefan Hajnoczi */ 2363c61e684eSEric Blake if (ret2 & BDRV_BLOCK_EOF && 2364c61e684eSEric Blake (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2365c61e684eSEric Blake /* 2366c61e684eSEric Blake * It is valid for the format block driver to read 2367c61e684eSEric Blake * beyond the end of the underlying file's current 2368c61e684eSEric Blake * size; such areas read as zero.
2369c61e684eSEric Blake */ 237061007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 237161007b31SStefan Hajnoczi } else { 237261007b31SStefan Hajnoczi /* Limit request to the range reported by the protocol driver */ 237361007b31SStefan Hajnoczi *pnum = file_pnum; 237461007b31SStefan Hajnoczi ret |= (ret2 & BDRV_BLOCK_ZERO); 237561007b31SStefan Hajnoczi } 237661007b31SStefan Hajnoczi } 237761007b31SStefan Hajnoczi } 237861007b31SStefan Hajnoczi 237999723548SPaolo Bonzini out: 238099723548SPaolo Bonzini bdrv_dec_in_flight(bs); 23812e8bc787SEric Blake if (ret >= 0 && offset + *pnum == total_size) { 2382fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2383fb0d8654SEric Blake } 2384298a1665SEric Blake early_out: 2385298a1665SEric Blake if (file) { 2386298a1665SEric Blake *file = local_file; 2387298a1665SEric Blake } 23882e8bc787SEric Blake if (map) { 23892e8bc787SEric Blake *map = local_map; 23902e8bc787SEric Blake } 239161007b31SStefan Hajnoczi return ret; 239261007b31SStefan Hajnoczi } 239361007b31SStefan Hajnoczi 23945b648c67SEric Blake static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2395ba3f0e25SFam Zheng BlockDriverState *base, 2396c9ce8c4dSEric Blake bool want_zero, 23975b648c67SEric Blake int64_t offset, 23985b648c67SEric Blake int64_t bytes, 23995b648c67SEric Blake int64_t *pnum, 24005b648c67SEric Blake int64_t *map, 240167a0fd2aSFam Zheng BlockDriverState **file) 2402ba3f0e25SFam Zheng { 2403ba3f0e25SFam Zheng BlockDriverState *p; 24045b648c67SEric Blake int ret = 0; 2405c61e684eSEric Blake bool first = true; 2406ba3f0e25SFam Zheng 2407ba3f0e25SFam Zheng assert(bs != base); 2408760e0063SKevin Wolf for (p = bs; p != base; p = backing_bs(p)) { 24095b648c67SEric Blake ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 24105b648c67SEric Blake file); 2411c61e684eSEric Blake if (ret < 0) { 2412c61e684eSEric Blake break; 2413c61e684eSEric Blake } 2414c61e684eSEric Blake if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2415c61e684eSEric Blake /* 2416c61e684eSEric Blake * Reading beyond the end of the file continues to read 2417c61e684eSEric Blake * zeroes, but we can only widen the result to the 2418c61e684eSEric Blake * unallocated length we learned from an earlier 2419c61e684eSEric Blake * iteration. 2420c61e684eSEric Blake */ 24215b648c67SEric Blake *pnum = bytes; 2422c61e684eSEric Blake } 2423c61e684eSEric Blake if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2424ba3f0e25SFam Zheng break; 2425ba3f0e25SFam Zheng } 24265b648c67SEric Blake /* [offset, pnum] unallocated on this layer, which could be only 24275b648c67SEric Blake * the first part of [offset, bytes]. 
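 * Shrinking 'bytes' here means the deeper (backing) layers are only
 * queried for the prefix whose status is still unknown.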
*/ 24285b648c67SEric Blake bytes = MIN(bytes, *pnum); 2429c61e684eSEric Blake first = false; 2430ba3f0e25SFam Zheng } 2431ba3f0e25SFam Zheng return ret; 2432ba3f0e25SFam Zheng } 2433ba3f0e25SFam Zheng 243431826642SEric Blake /* Coroutine wrapper for bdrv_block_status_above() */ 24355b648c67SEric Blake static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 243661007b31SStefan Hajnoczi { 24374bcd936eSEric Blake BdrvCoBlockStatusData *data = opaque; 243861007b31SStefan Hajnoczi 24395b648c67SEric Blake data->ret = bdrv_co_block_status_above(data->bs, data->base, 2440c9ce8c4dSEric Blake data->want_zero, 24415b648c67SEric Blake data->offset, data->bytes, 24425b648c67SEric Blake data->pnum, data->map, data->file); 244361007b31SStefan Hajnoczi data->done = true; 24444720cbeeSKevin Wolf aio_wait_kick(); 244561007b31SStefan Hajnoczi } 244661007b31SStefan Hajnoczi 244761007b31SStefan Hajnoczi /* 24485b648c67SEric Blake * Synchronous wrapper around bdrv_co_block_status_above(). 244961007b31SStefan Hajnoczi * 24505b648c67SEric Blake * See bdrv_co_block_status_above() for details. 245161007b31SStefan Hajnoczi */ 24527ddb99b9SEric Blake static int bdrv_common_block_status_above(BlockDriverState *bs, 2453ba3f0e25SFam Zheng BlockDriverState *base, 24547ddb99b9SEric Blake bool want_zero, int64_t offset, 24557ddb99b9SEric Blake int64_t bytes, int64_t *pnum, 24567ddb99b9SEric Blake int64_t *map, 245767a0fd2aSFam Zheng BlockDriverState **file) 245861007b31SStefan Hajnoczi { 245961007b31SStefan Hajnoczi Coroutine *co; 24604bcd936eSEric Blake BdrvCoBlockStatusData data = { 246161007b31SStefan Hajnoczi .bs = bs, 2462ba3f0e25SFam Zheng .base = base, 2463c9ce8c4dSEric Blake .want_zero = want_zero, 24647ddb99b9SEric Blake .offset = offset, 24657ddb99b9SEric Blake .bytes = bytes, 24667ddb99b9SEric Blake .pnum = pnum, 24677ddb99b9SEric Blake .map = map, 2468c9ce8c4dSEric Blake .file = file, 246961007b31SStefan Hajnoczi .done = false, 247061007b31SStefan Hajnoczi }; 247161007b31SStefan Hajnoczi 247261007b31SStefan Hajnoczi if (qemu_in_coroutine()) { 247361007b31SStefan Hajnoczi /* Fast-path if already in coroutine context */ 24745b648c67SEric Blake bdrv_block_status_above_co_entry(&data); 247561007b31SStefan Hajnoczi } else { 24765b648c67SEric Blake co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data); 2477e92f0e19SFam Zheng bdrv_coroutine_enter(bs, co); 247888b062c2SPaolo Bonzini BDRV_POLL_WHILE(bs, !data.done); 247961007b31SStefan Hajnoczi } 248061007b31SStefan Hajnoczi return data.ret; 248161007b31SStefan Hajnoczi } 248261007b31SStefan Hajnoczi 248331826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 248431826642SEric Blake int64_t offset, int64_t bytes, int64_t *pnum, 248531826642SEric Blake int64_t *map, BlockDriverState **file) 2486c9ce8c4dSEric Blake { 248731826642SEric Blake return bdrv_common_block_status_above(bs, base, true, offset, bytes, 248831826642SEric Blake pnum, map, file); 2489c9ce8c4dSEric Blake } 2490c9ce8c4dSEric Blake 2491237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2492237d78f8SEric Blake int64_t *pnum, int64_t *map, BlockDriverState **file) 2493ba3f0e25SFam Zheng { 249431826642SEric Blake return bdrv_block_status_above(bs, backing_bs(bs), 249531826642SEric Blake offset, bytes, pnum, map, file); 2496ba3f0e25SFam Zheng } 2497ba3f0e25SFam Zheng 2498d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2499d6a644bbSEric Blake int64_t 
bytes, int64_t *pnum) 250061007b31SStefan Hajnoczi { 25017ddb99b9SEric Blake int ret; 25027ddb99b9SEric Blake int64_t dummy; 2503d6a644bbSEric Blake 25047ddb99b9SEric Blake ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, 25057ddb99b9SEric Blake bytes, pnum ? pnum : &dummy, NULL, 2506298a1665SEric Blake NULL); 250761007b31SStefan Hajnoczi if (ret < 0) { 250861007b31SStefan Hajnoczi return ret; 250961007b31SStefan Hajnoczi } 251061007b31SStefan Hajnoczi return !!(ret & BDRV_BLOCK_ALLOCATED); 251161007b31SStefan Hajnoczi } 251261007b31SStefan Hajnoczi 251361007b31SStefan Hajnoczi /* 251461007b31SStefan Hajnoczi * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 251561007b31SStefan Hajnoczi * 2516170d3bd3SAndrey Shinkevich * Return 1 if (a prefix of) the given range is allocated in any image 2517170d3bd3SAndrey Shinkevich * between BASE and TOP (BASE is only included if include_base is set). 2518170d3bd3SAndrey Shinkevich * BASE can be NULL to check if the given offset is allocated in any 2519170d3bd3SAndrey Shinkevich * image of the chain. Return 0 otherwise, or negative errno on 2520170d3bd3SAndrey Shinkevich * failure. 252161007b31SStefan Hajnoczi * 252251b0a488SEric Blake * 'pnum' is set to the number of bytes (including and immediately 252351b0a488SEric Blake * following the specified offset) that are known to be in the same 252451b0a488SEric Blake * allocated/unallocated state. Note that a subsequent call starting 252551b0a488SEric Blake * at 'offset + *pnum' may return the same allocation status (in other 252651b0a488SEric Blake * words, the result is not necessarily the maximum possible range); 252751b0a488SEric Blake * but 'pnum' will only be 0 when end of file is reached. 252861007b31SStefan Hajnoczi * 252961007b31SStefan Hajnoczi */ 253061007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top, 253161007b31SStefan Hajnoczi BlockDriverState *base, 2532170d3bd3SAndrey Shinkevich bool include_base, int64_t offset, 2533170d3bd3SAndrey Shinkevich int64_t bytes, int64_t *pnum) 253461007b31SStefan Hajnoczi { 253561007b31SStefan Hajnoczi BlockDriverState *intermediate; 253651b0a488SEric Blake int ret; 253751b0a488SEric Blake int64_t n = bytes; 253861007b31SStefan Hajnoczi 2539170d3bd3SAndrey Shinkevich assert(base || !include_base); 2540170d3bd3SAndrey Shinkevich 254161007b31SStefan Hajnoczi intermediate = top; 2542170d3bd3SAndrey Shinkevich while (include_base || intermediate != base) { 2543d6a644bbSEric Blake int64_t pnum_inter; 2544c00716beSEric Blake int64_t size_inter; 2545d6a644bbSEric Blake 2546170d3bd3SAndrey Shinkevich assert(intermediate); 254751b0a488SEric Blake ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 254861007b31SStefan Hajnoczi if (ret < 0) { 254961007b31SStefan Hajnoczi return ret; 2550d6a644bbSEric Blake } 2551d6a644bbSEric Blake if (ret) { 255251b0a488SEric Blake *pnum = pnum_inter; 255361007b31SStefan Hajnoczi return 1; 255461007b31SStefan Hajnoczi } 255561007b31SStefan Hajnoczi 255651b0a488SEric Blake size_inter = bdrv_getlength(intermediate); 2557c00716beSEric Blake if (size_inter < 0) { 2558c00716beSEric Blake return size_inter; 2559c00716beSEric Blake } 256051b0a488SEric Blake if (n > pnum_inter && 256151b0a488SEric Blake (intermediate == top || offset + pnum_inter < size_inter)) { 256251b0a488SEric Blake n = pnum_inter; 256361007b31SStefan Hajnoczi } 256461007b31SStefan Hajnoczi 2565170d3bd3SAndrey Shinkevich if (intermediate == base) { 2566170d3bd3SAndrey Shinkevich break; 
2567170d3bd3SAndrey Shinkevich } 2568170d3bd3SAndrey Shinkevich 2569760e0063SKevin Wolf intermediate = backing_bs(intermediate); 257061007b31SStefan Hajnoczi } 257161007b31SStefan Hajnoczi 257261007b31SStefan Hajnoczi *pnum = n; 257361007b31SStefan Hajnoczi return 0; 257461007b31SStefan Hajnoczi } 257561007b31SStefan Hajnoczi 25761a8ae822SKevin Wolf typedef struct BdrvVmstateCo { 25771a8ae822SKevin Wolf BlockDriverState *bs; 25781a8ae822SKevin Wolf QEMUIOVector *qiov; 25791a8ae822SKevin Wolf int64_t pos; 25801a8ae822SKevin Wolf bool is_read; 25811a8ae822SKevin Wolf int ret; 25821a8ae822SKevin Wolf } BdrvVmstateCo; 25831a8ae822SKevin Wolf 25841a8ae822SKevin Wolf static int coroutine_fn 25851a8ae822SKevin Wolf bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 25861a8ae822SKevin Wolf bool is_read) 25871a8ae822SKevin Wolf { 25881a8ae822SKevin Wolf BlockDriver *drv = bs->drv; 2589dc88a467SStefan Hajnoczi int ret = -ENOTSUP; 2590dc88a467SStefan Hajnoczi 2591dc88a467SStefan Hajnoczi bdrv_inc_in_flight(bs); 25921a8ae822SKevin Wolf 25931a8ae822SKevin Wolf if (!drv) { 2594dc88a467SStefan Hajnoczi ret = -ENOMEDIUM; 25951a8ae822SKevin Wolf } else if (drv->bdrv_load_vmstate) { 2596dc88a467SStefan Hajnoczi if (is_read) { 2597dc88a467SStefan Hajnoczi ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2598dc88a467SStefan Hajnoczi } else { 2599dc88a467SStefan Hajnoczi ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2600dc88a467SStefan Hajnoczi } 26011a8ae822SKevin Wolf } else if (bs->file) { 2602dc88a467SStefan Hajnoczi ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); 26031a8ae822SKevin Wolf } 26041a8ae822SKevin Wolf 2605dc88a467SStefan Hajnoczi bdrv_dec_in_flight(bs); 2606dc88a467SStefan Hajnoczi return ret; 26071a8ae822SKevin Wolf } 26081a8ae822SKevin Wolf 26091a8ae822SKevin Wolf static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) 26101a8ae822SKevin Wolf { 26111a8ae822SKevin Wolf BdrvVmstateCo *co = opaque; 26121a8ae822SKevin Wolf co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); 26134720cbeeSKevin Wolf aio_wait_kick(); 26141a8ae822SKevin Wolf } 26151a8ae822SKevin Wolf 26161a8ae822SKevin Wolf static inline int 26171a8ae822SKevin Wolf bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 26181a8ae822SKevin Wolf bool is_read) 26191a8ae822SKevin Wolf { 26201a8ae822SKevin Wolf if (qemu_in_coroutine()) { 26211a8ae822SKevin Wolf return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); 26221a8ae822SKevin Wolf } else { 26231a8ae822SKevin Wolf BdrvVmstateCo data = { 26241a8ae822SKevin Wolf .bs = bs, 26251a8ae822SKevin Wolf .qiov = qiov, 26261a8ae822SKevin Wolf .pos = pos, 26271a8ae822SKevin Wolf .is_read = is_read, 26281a8ae822SKevin Wolf .ret = -EINPROGRESS, 26291a8ae822SKevin Wolf }; 26300b8b8753SPaolo Bonzini Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); 26311a8ae822SKevin Wolf 2632e92f0e19SFam Zheng bdrv_coroutine_enter(bs, co); 2633ea17c9d2SStefan Hajnoczi BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS); 26341a8ae822SKevin Wolf return data.ret; 26351a8ae822SKevin Wolf } 26361a8ae822SKevin Wolf } 26371a8ae822SKevin Wolf 263861007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 263961007b31SStefan Hajnoczi int64_t pos, int size) 264061007b31SStefan Hajnoczi { 26410d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2642b433d942SKevin Wolf int ret; 264361007b31SStefan Hajnoczi 2644b433d942SKevin Wolf ret = bdrv_writev_vmstate(bs, &qiov, pos); 
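    /* A negative value is an error; on success, return the full byte count,
     * which is the convention callers of this convenience wrapper expect. */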
2645b433d942SKevin Wolf if (ret < 0) { 2646b433d942SKevin Wolf return ret; 2647b433d942SKevin Wolf } 2648b433d942SKevin Wolf 2649b433d942SKevin Wolf return size; 265061007b31SStefan Hajnoczi } 265161007b31SStefan Hajnoczi 265261007b31SStefan Hajnoczi int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 265361007b31SStefan Hajnoczi { 26541a8ae822SKevin Wolf return bdrv_rw_vmstate(bs, qiov, pos, false); 265561007b31SStefan Hajnoczi } 265661007b31SStefan Hajnoczi 265761007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 265861007b31SStefan Hajnoczi int64_t pos, int size) 265961007b31SStefan Hajnoczi { 26600d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2661b433d942SKevin Wolf int ret; 26625ddda0b8SKevin Wolf 2663b433d942SKevin Wolf ret = bdrv_readv_vmstate(bs, &qiov, pos); 2664b433d942SKevin Wolf if (ret < 0) { 2665b433d942SKevin Wolf return ret; 2666b433d942SKevin Wolf } 2667b433d942SKevin Wolf 2668b433d942SKevin Wolf return size; 26695ddda0b8SKevin Wolf } 26705ddda0b8SKevin Wolf 26715ddda0b8SKevin Wolf int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 26725ddda0b8SKevin Wolf { 26731a8ae822SKevin Wolf return bdrv_rw_vmstate(bs, qiov, pos, true); 267461007b31SStefan Hajnoczi } 267561007b31SStefan Hajnoczi 267661007b31SStefan Hajnoczi /**************************************************************/ 267761007b31SStefan Hajnoczi /* async I/Os */ 267861007b31SStefan Hajnoczi 267961007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb) 268061007b31SStefan Hajnoczi { 268161007b31SStefan Hajnoczi qemu_aio_ref(acb); 268261007b31SStefan Hajnoczi bdrv_aio_cancel_async(acb); 268361007b31SStefan Hajnoczi while (acb->refcnt > 1) { 268461007b31SStefan Hajnoczi if (acb->aiocb_info->get_aio_context) { 268561007b31SStefan Hajnoczi aio_poll(acb->aiocb_info->get_aio_context(acb), true); 268661007b31SStefan Hajnoczi } else if (acb->bs) { 26872f47da5fSPaolo Bonzini /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 26882f47da5fSPaolo Bonzini * assert that we're not using an I/O thread. Thread-safe 26892f47da5fSPaolo Bonzini * code should use bdrv_aio_cancel_async exclusively. 26902f47da5fSPaolo Bonzini */ 26912f47da5fSPaolo Bonzini assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 269261007b31SStefan Hajnoczi aio_poll(bdrv_get_aio_context(acb->bs), true); 269361007b31SStefan Hajnoczi } else { 269461007b31SStefan Hajnoczi abort(); 269561007b31SStefan Hajnoczi } 269661007b31SStefan Hajnoczi } 269761007b31SStefan Hajnoczi qemu_aio_unref(acb); 269861007b31SStefan Hajnoczi } 269961007b31SStefan Hajnoczi 270061007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements 270161007b31SStefan Hajnoczi * cancel_async; otherwise we do nothing and let the request complete normally. 270261007b31SStefan Hajnoczi * In either case the completion callback must be called.
*/ 270361007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb) 270461007b31SStefan Hajnoczi { 270561007b31SStefan Hajnoczi if (acb->aiocb_info->cancel_async) { 270661007b31SStefan Hajnoczi acb->aiocb_info->cancel_async(acb); 270761007b31SStefan Hajnoczi } 270861007b31SStefan Hajnoczi } 270961007b31SStefan Hajnoczi 271061007b31SStefan Hajnoczi /**************************************************************/ 271161007b31SStefan Hajnoczi /* Coroutine block device emulation */ 271261007b31SStefan Hajnoczi 2713e293b7a3SKevin Wolf typedef struct FlushCo { 2714e293b7a3SKevin Wolf BlockDriverState *bs; 2715e293b7a3SKevin Wolf int ret; 2716e293b7a3SKevin Wolf } FlushCo; 2717e293b7a3SKevin Wolf 2718e293b7a3SKevin Wolf 271961007b31SStefan Hajnoczi static void coroutine_fn bdrv_flush_co_entry(void *opaque) 272061007b31SStefan Hajnoczi { 2721e293b7a3SKevin Wolf FlushCo *rwco = opaque; 272261007b31SStefan Hajnoczi 272361007b31SStefan Hajnoczi rwco->ret = bdrv_co_flush(rwco->bs); 27244720cbeeSKevin Wolf aio_wait_kick(); 272561007b31SStefan Hajnoczi } 272661007b31SStefan Hajnoczi 272761007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 272861007b31SStefan Hajnoczi { 272949ca6259SFam Zheng int current_gen; 273049ca6259SFam Zheng int ret = 0; 273161007b31SStefan Hajnoczi 273299723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2733c32b82afSPavel Dovgalyuk 2734e914404eSFam Zheng if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 273549ca6259SFam Zheng bdrv_is_sg(bs)) { 273649ca6259SFam Zheng goto early_exit; 273749ca6259SFam Zheng } 273849ca6259SFam Zheng 27393783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 274047fec599SPaolo Bonzini current_gen = atomic_read(&bs->write_gen); 27413ff2f67aSEvgeny Yakovlev 27423ff2f67aSEvgeny Yakovlev /* Wait until any previous flushes are completed */ 274399723548SPaolo Bonzini while (bs->active_flush_req) { 27443783fa3dSPaolo Bonzini qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 27453ff2f67aSEvgeny Yakovlev } 27463ff2f67aSEvgeny Yakovlev 27473783fa3dSPaolo Bonzini /* Flushes reach this point in nondecreasing current_gen order. 
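 * (current_gen is sampled while reqs_lock is held, and any flush that
 * sampled a newer generation waits in flush_queue until this one
 * completes and wakes it below).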
*/ 274899723548SPaolo Bonzini bs->active_flush_req = true; 27493783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 27503ff2f67aSEvgeny Yakovlev 2751c32b82afSPavel Dovgalyuk /* Write back all layers by calling one driver function */ 2752c32b82afSPavel Dovgalyuk if (bs->drv->bdrv_co_flush) { 2753c32b82afSPavel Dovgalyuk ret = bs->drv->bdrv_co_flush(bs); 2754c32b82afSPavel Dovgalyuk goto out; 2755c32b82afSPavel Dovgalyuk } 2756c32b82afSPavel Dovgalyuk 275761007b31SStefan Hajnoczi /* Write back cached data to the OS even with cache=unsafe */ 275861007b31SStefan Hajnoczi BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); 275961007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_os) { 276061007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_os(bs); 276161007b31SStefan Hajnoczi if (ret < 0) { 2762cdb5e315SFam Zheng goto out; 276361007b31SStefan Hajnoczi } 276461007b31SStefan Hajnoczi } 276561007b31SStefan Hajnoczi 276661007b31SStefan Hajnoczi /* But don't actually force it to the disk with cache=unsafe */ 276761007b31SStefan Hajnoczi if (bs->open_flags & BDRV_O_NO_FLUSH) { 276861007b31SStefan Hajnoczi goto flush_parent; 276961007b31SStefan Hajnoczi } 277061007b31SStefan Hajnoczi 27713ff2f67aSEvgeny Yakovlev /* Check if we really need to flush anything */ 27723ff2f67aSEvgeny Yakovlev if (bs->flushed_gen == current_gen) { 27733ff2f67aSEvgeny Yakovlev goto flush_parent; 27743ff2f67aSEvgeny Yakovlev } 27753ff2f67aSEvgeny Yakovlev 277661007b31SStefan Hajnoczi BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); 2777d470ad42SMax Reitz if (!bs->drv) { 2778d470ad42SMax Reitz /* bs->drv->bdrv_co_flush() might have ejected the BDS 2779d470ad42SMax Reitz * (even in case of apparent success) */ 2780d470ad42SMax Reitz ret = -ENOMEDIUM; 2781d470ad42SMax Reitz goto out; 2782d470ad42SMax Reitz } 278361007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_disk) { 278461007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_disk(bs); 278561007b31SStefan Hajnoczi } else if (bs->drv->bdrv_aio_flush) { 278661007b31SStefan Hajnoczi BlockAIOCB *acb; 278761007b31SStefan Hajnoczi CoroutineIOCompletion co = { 278861007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 278961007b31SStefan Hajnoczi }; 279061007b31SStefan Hajnoczi 279161007b31SStefan Hajnoczi acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 279261007b31SStefan Hajnoczi if (acb == NULL) { 279361007b31SStefan Hajnoczi ret = -EIO; 279461007b31SStefan Hajnoczi } else { 279561007b31SStefan Hajnoczi qemu_coroutine_yield(); 279661007b31SStefan Hajnoczi ret = co.ret; 279761007b31SStefan Hajnoczi } 279861007b31SStefan Hajnoczi } else { 279961007b31SStefan Hajnoczi /* 280061007b31SStefan Hajnoczi * Some block drivers always operate in either writethrough or unsafe 280161007b31SStefan Hajnoczi * mode and therefore don't support bdrv_flush. Usually qemu doesn't 280261007b31SStefan Hajnoczi * know how the server works (because the behaviour is hardcoded or 280361007b31SStefan Hajnoczi * depends on server-side configuration), so we can't ensure that 280461007b31SStefan Hajnoczi * everything is safe on disk. Returning an error doesn't work because 280561007b31SStefan Hajnoczi * that would break guests even if the server operates in writethrough 280661007b31SStefan Hajnoczi * mode. 280761007b31SStefan Hajnoczi * 280861007b31SStefan Hajnoczi * Let's hope the user knows what he's doing.
280961007b31SStefan Hajnoczi */ 281061007b31SStefan Hajnoczi ret = 0; 281161007b31SStefan Hajnoczi } 28123ff2f67aSEvgeny Yakovlev 281361007b31SStefan Hajnoczi if (ret < 0) { 2814cdb5e315SFam Zheng goto out; 281561007b31SStefan Hajnoczi } 281661007b31SStefan Hajnoczi 281761007b31SStefan Hajnoczi /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 281861007b31SStefan Hajnoczi * in the case of cache=unsafe, so there are no useless flushes. 281961007b31SStefan Hajnoczi */ 282061007b31SStefan Hajnoczi flush_parent: 2821cdb5e315SFam Zheng ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; 2822cdb5e315SFam Zheng out: 28233ff2f67aSEvgeny Yakovlev /* Notify any pending flushes that we have completed */ 2824e6af1e08SKevin Wolf if (ret == 0) { 28253ff2f67aSEvgeny Yakovlev bs->flushed_gen = current_gen; 2826e6af1e08SKevin Wolf } 28273783fa3dSPaolo Bonzini 28283783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 282999723548SPaolo Bonzini bs->active_flush_req = false; 2830156af3acSDenis V. Lunev /* Return value is ignored - it's ok if wait queue is empty */ 2831156af3acSDenis V. Lunev qemu_co_queue_next(&bs->flush_queue); 28323783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 28333ff2f67aSEvgeny Yakovlev 283449ca6259SFam Zheng early_exit: 283599723548SPaolo Bonzini bdrv_dec_in_flight(bs); 2836cdb5e315SFam Zheng return ret; 283761007b31SStefan Hajnoczi } 283861007b31SStefan Hajnoczi 283961007b31SStefan Hajnoczi int bdrv_flush(BlockDriverState *bs) 284061007b31SStefan Hajnoczi { 284161007b31SStefan Hajnoczi Coroutine *co; 2842e293b7a3SKevin Wolf FlushCo flush_co = { 284361007b31SStefan Hajnoczi .bs = bs, 284461007b31SStefan Hajnoczi .ret = NOT_DONE, 284561007b31SStefan Hajnoczi }; 284661007b31SStefan Hajnoczi 284761007b31SStefan Hajnoczi if (qemu_in_coroutine()) { 284861007b31SStefan Hajnoczi /* Fast-path if already in coroutine context */ 2849e293b7a3SKevin Wolf bdrv_flush_co_entry(&flush_co); 285061007b31SStefan Hajnoczi } else { 28510b8b8753SPaolo Bonzini co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); 2852e92f0e19SFam Zheng bdrv_coroutine_enter(bs, co); 285388b062c2SPaolo Bonzini BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE); 285461007b31SStefan Hajnoczi } 285561007b31SStefan Hajnoczi 2856e293b7a3SKevin Wolf return flush_co.ret; 285761007b31SStefan Hajnoczi } 285861007b31SStefan Hajnoczi 285961007b31SStefan Hajnoczi typedef struct DiscardCo { 28600b9fd3f4SFam Zheng BdrvChild *child; 28610c51a893SEric Blake int64_t offset; 2862d93e5726SVladimir Sementsov-Ogievskiy int64_t bytes; 286361007b31SStefan Hajnoczi int ret; 286461007b31SStefan Hajnoczi } DiscardCo; 28650c51a893SEric Blake static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) 286661007b31SStefan Hajnoczi { 286761007b31SStefan Hajnoczi DiscardCo *rwco = opaque; 286861007b31SStefan Hajnoczi 28690b9fd3f4SFam Zheng rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); 28704720cbeeSKevin Wolf aio_wait_kick(); 287161007b31SStefan Hajnoczi } 287261007b31SStefan Hajnoczi 2873d93e5726SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2874d93e5726SVladimir Sementsov-Ogievskiy int64_t bytes) 287561007b31SStefan Hajnoczi { 2876b1066c87SFam Zheng BdrvTrackedRequest req; 28779f1963b3SEric Blake int max_pdiscard, ret; 28783482b9bcSEric Blake int head, tail, align; 28790b9fd3f4SFam Zheng BlockDriverState *bs = child->bs; 288061007b31SStefan Hajnoczi 2881d93e5726SVladimir Sementsov-Ogievskiy if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 
288261007b31SStefan Hajnoczi return -ENOMEDIUM; 288361007b31SStefan Hajnoczi } 288461007b31SStefan Hajnoczi 2885d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 2886d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 2887d6883bc9SVladimir Sementsov-Ogievskiy } 2888d6883bc9SVladimir Sementsov-Ogievskiy 2889d93e5726SVladimir Sementsov-Ogievskiy if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) { 2890d93e5726SVladimir Sementsov-Ogievskiy return -EIO; 289161007b31SStefan Hajnoczi } 289261007b31SStefan Hajnoczi 289361007b31SStefan Hajnoczi /* Do nothing if disabled. */ 289461007b31SStefan Hajnoczi if (!(bs->open_flags & BDRV_O_UNMAP)) { 289561007b31SStefan Hajnoczi return 0; 289661007b31SStefan Hajnoczi } 289761007b31SStefan Hajnoczi 289802aefe43SEric Blake if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 289961007b31SStefan Hajnoczi return 0; 290061007b31SStefan Hajnoczi } 290161007b31SStefan Hajnoczi 29023482b9bcSEric Blake /* Discard is advisory, but some devices track and coalesce 29033482b9bcSEric Blake * unaligned requests, so we must pass everything down rather than 29043482b9bcSEric Blake * round here. Still, most devices will just silently ignore 29053482b9bcSEric Blake * unaligned requests (by returning -ENOTSUP), so we must fragment 29063482b9bcSEric Blake * the request accordingly. */ 290702aefe43SEric Blake align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2908b8d0a980SEric Blake assert(align % bs->bl.request_alignment == 0); 2909b8d0a980SEric Blake head = offset % align; 2910f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % align; 29119f1963b3SEric Blake 291299723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2913f5a5ca79SManos Pitsidianakis tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 291450824995SFam Zheng 291500695c27SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2916ec050f77SDenis V. Lunev if (ret < 0) { 2917ec050f77SDenis V. Lunev goto out; 2918ec050f77SDenis V. Lunev } 2919ec050f77SDenis V. Lunev 29209f1963b3SEric Blake max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 29219f1963b3SEric Blake align); 29223482b9bcSEric Blake assert(max_pdiscard >= bs->bl.request_alignment); 29239f1963b3SEric Blake 2924f5a5ca79SManos Pitsidianakis while (bytes > 0) { 2925d93e5726SVladimir Sementsov-Ogievskiy int64_t num = bytes; 29263482b9bcSEric Blake 29273482b9bcSEric Blake if (head) { 29283482b9bcSEric Blake /* Make small requests to get to alignment boundaries. */ 2929f5a5ca79SManos Pitsidianakis num = MIN(bytes, align - head); 29303482b9bcSEric Blake if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 29313482b9bcSEric Blake num %= bs->bl.request_alignment; 29323482b9bcSEric Blake } 29333482b9bcSEric Blake head = (head + num) % align; 29343482b9bcSEric Blake assert(num < max_pdiscard); 29353482b9bcSEric Blake } else if (tail) { 29363482b9bcSEric Blake if (num > align) { 29373482b9bcSEric Blake /* Shorten the request to the last aligned cluster. 
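 * The sub-aligned tail that is cut off here is then issued as its own,
 * shorter request on a later iteration of this loop.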
*/ 29383482b9bcSEric Blake num -= tail; 29393482b9bcSEric Blake } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 29403482b9bcSEric Blake tail > bs->bl.request_alignment) { 29413482b9bcSEric Blake tail %= bs->bl.request_alignment; 29423482b9bcSEric Blake num -= tail; 29433482b9bcSEric Blake } 29443482b9bcSEric Blake } 29453482b9bcSEric Blake /* limit request size */ 29463482b9bcSEric Blake if (num > max_pdiscard) { 29473482b9bcSEric Blake num = max_pdiscard; 29483482b9bcSEric Blake } 294961007b31SStefan Hajnoczi 2950d470ad42SMax Reitz if (!bs->drv) { 2951d470ad42SMax Reitz ret = -ENOMEDIUM; 2952d470ad42SMax Reitz goto out; 2953d470ad42SMax Reitz } 295447a5486dSEric Blake if (bs->drv->bdrv_co_pdiscard) { 295547a5486dSEric Blake ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 295661007b31SStefan Hajnoczi } else { 295761007b31SStefan Hajnoczi BlockAIOCB *acb; 295861007b31SStefan Hajnoczi CoroutineIOCompletion co = { 295961007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 296061007b31SStefan Hajnoczi }; 296161007b31SStefan Hajnoczi 29624da444a0SEric Blake acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 296361007b31SStefan Hajnoczi bdrv_co_io_em_complete, &co); 296461007b31SStefan Hajnoczi if (acb == NULL) { 2965b1066c87SFam Zheng ret = -EIO; 2966b1066c87SFam Zheng goto out; 296761007b31SStefan Hajnoczi } else { 296861007b31SStefan Hajnoczi qemu_coroutine_yield(); 296961007b31SStefan Hajnoczi ret = co.ret; 297061007b31SStefan Hajnoczi } 297161007b31SStefan Hajnoczi } 297261007b31SStefan Hajnoczi if (ret && ret != -ENOTSUP) { 2973b1066c87SFam Zheng goto out; 297461007b31SStefan Hajnoczi } 297561007b31SStefan Hajnoczi 29769f1963b3SEric Blake offset += num; 2977f5a5ca79SManos Pitsidianakis bytes -= num; 297861007b31SStefan Hajnoczi } 2979b1066c87SFam Zheng ret = 0; 2980b1066c87SFam Zheng out: 298100695c27SFam Zheng bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 2982b1066c87SFam Zheng tracked_request_end(&req); 298399723548SPaolo Bonzini bdrv_dec_in_flight(bs); 2984b1066c87SFam Zheng return ret; 298561007b31SStefan Hajnoczi } 298661007b31SStefan Hajnoczi 2987d93e5726SVladimir Sementsov-Ogievskiy int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes) 298861007b31SStefan Hajnoczi { 298961007b31SStefan Hajnoczi Coroutine *co; 299061007b31SStefan Hajnoczi DiscardCo rwco = { 29910b9fd3f4SFam Zheng .child = child, 29920c51a893SEric Blake .offset = offset, 2993f5a5ca79SManos Pitsidianakis .bytes = bytes, 299461007b31SStefan Hajnoczi .ret = NOT_DONE, 299561007b31SStefan Hajnoczi }; 299661007b31SStefan Hajnoczi 299761007b31SStefan Hajnoczi if (qemu_in_coroutine()) { 299861007b31SStefan Hajnoczi /* Fast-path if already in coroutine context */ 29990c51a893SEric Blake bdrv_pdiscard_co_entry(&rwco); 300061007b31SStefan Hajnoczi } else { 30010c51a893SEric Blake co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); 30020b9fd3f4SFam Zheng bdrv_coroutine_enter(child->bs, co); 30030b9fd3f4SFam Zheng BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); 300461007b31SStefan Hajnoczi } 300561007b31SStefan Hajnoczi 300661007b31SStefan Hajnoczi return rwco.ret; 300761007b31SStefan Hajnoczi } 300861007b31SStefan Hajnoczi 300948af776aSKevin Wolf int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 301061007b31SStefan Hajnoczi { 301161007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 30125c5ae76aSFam Zheng CoroutineIOCompletion co = { 30135c5ae76aSFam Zheng .coroutine = qemu_coroutine_self(), 30145c5ae76aSFam Zheng }; 30155c5ae76aSFam Zheng BlockAIOCB *acb; 
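    /* Prefer the driver's native coroutine ioctl callback below; otherwise
     * bridge the callback-based bdrv_aio_ioctl() through
     * CoroutineIOCompletion and yield until it completes. */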
301661007b31SStefan Hajnoczi 301799723548SPaolo Bonzini bdrv_inc_in_flight(bs); 301816a389dcSKevin Wolf if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 30195c5ae76aSFam Zheng co.ret = -ENOTSUP; 30205c5ae76aSFam Zheng goto out; 30215c5ae76aSFam Zheng } 30225c5ae76aSFam Zheng 302316a389dcSKevin Wolf if (drv->bdrv_co_ioctl) { 302416a389dcSKevin Wolf co.ret = drv->bdrv_co_ioctl(bs, req, buf); 302516a389dcSKevin Wolf } else { 30265c5ae76aSFam Zheng acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 30275c5ae76aSFam Zheng if (!acb) { 3028c8a9fd80SFam Zheng co.ret = -ENOTSUP; 3029c8a9fd80SFam Zheng goto out; 30305c5ae76aSFam Zheng } 30315c5ae76aSFam Zheng qemu_coroutine_yield(); 303216a389dcSKevin Wolf } 30335c5ae76aSFam Zheng out: 303499723548SPaolo Bonzini bdrv_dec_in_flight(bs); 30355c5ae76aSFam Zheng return co.ret; 30365c5ae76aSFam Zheng } 30375c5ae76aSFam Zheng 303861007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size) 303961007b31SStefan Hajnoczi { 304061007b31SStefan Hajnoczi return qemu_memalign(bdrv_opt_mem_align(bs), size); 304161007b31SStefan Hajnoczi } 304261007b31SStefan Hajnoczi 304361007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size) 304461007b31SStefan Hajnoczi { 304561007b31SStefan Hajnoczi return memset(qemu_blockalign(bs, size), 0, size); 304661007b31SStefan Hajnoczi } 304761007b31SStefan Hajnoczi 304861007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 304961007b31SStefan Hajnoczi { 305061007b31SStefan Hajnoczi size_t align = bdrv_opt_mem_align(bs); 305161007b31SStefan Hajnoczi 305261007b31SStefan Hajnoczi /* Ensure that NULL is never returned on success */ 305361007b31SStefan Hajnoczi assert(align > 0); 305461007b31SStefan Hajnoczi if (size == 0) { 305561007b31SStefan Hajnoczi size = align; 305661007b31SStefan Hajnoczi } 305761007b31SStefan Hajnoczi 305861007b31SStefan Hajnoczi return qemu_try_memalign(align, size); 305961007b31SStefan Hajnoczi } 306061007b31SStefan Hajnoczi 306161007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 306261007b31SStefan Hajnoczi { 306361007b31SStefan Hajnoczi void *mem = qemu_try_blockalign(bs, size); 306461007b31SStefan Hajnoczi 306561007b31SStefan Hajnoczi if (mem) { 306661007b31SStefan Hajnoczi memset(mem, 0, size); 306761007b31SStefan Hajnoczi } 306861007b31SStefan Hajnoczi 306961007b31SStefan Hajnoczi return mem; 307061007b31SStefan Hajnoczi } 307161007b31SStefan Hajnoczi 307261007b31SStefan Hajnoczi /* 307361007b31SStefan Hajnoczi * Check if all memory in this vector is sector aligned. 307461007b31SStefan Hajnoczi */ 307561007b31SStefan Hajnoczi bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 307661007b31SStefan Hajnoczi { 307761007b31SStefan Hajnoczi int i; 30784196d2f0SDenis V. 
Lunev size_t alignment = bdrv_min_mem_align(bs); 307961007b31SStefan Hajnoczi 308061007b31SStefan Hajnoczi for (i = 0; i < qiov->niov; i++) { 308161007b31SStefan Hajnoczi if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 308261007b31SStefan Hajnoczi return false; 308361007b31SStefan Hajnoczi } 308461007b31SStefan Hajnoczi if (qiov->iov[i].iov_len % alignment) { 308561007b31SStefan Hajnoczi return false; 308661007b31SStefan Hajnoczi } 308761007b31SStefan Hajnoczi } 308861007b31SStefan Hajnoczi 308961007b31SStefan Hajnoczi return true; 309061007b31SStefan Hajnoczi } 309161007b31SStefan Hajnoczi 309261007b31SStefan Hajnoczi void bdrv_add_before_write_notifier(BlockDriverState *bs, 309361007b31SStefan Hajnoczi NotifierWithReturn *notifier) 309461007b31SStefan Hajnoczi { 309561007b31SStefan Hajnoczi notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 309661007b31SStefan Hajnoczi } 309761007b31SStefan Hajnoczi 309861007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs) 309961007b31SStefan Hajnoczi { 31006b98bd64SPaolo Bonzini BdrvChild *child; 31016b98bd64SPaolo Bonzini 31026b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 31036b98bd64SPaolo Bonzini bdrv_io_plug(child->bs); 31046b98bd64SPaolo Bonzini } 31056b98bd64SPaolo Bonzini 3106850d54a2SPaolo Bonzini if (atomic_fetch_inc(&bs->io_plugged) == 0) { 310761007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 310861007b31SStefan Hajnoczi if (drv && drv->bdrv_io_plug) { 310961007b31SStefan Hajnoczi drv->bdrv_io_plug(bs); 31106b98bd64SPaolo Bonzini } 311161007b31SStefan Hajnoczi } 311261007b31SStefan Hajnoczi } 311361007b31SStefan Hajnoczi 311461007b31SStefan Hajnoczi void bdrv_io_unplug(BlockDriverState *bs) 311561007b31SStefan Hajnoczi { 31166b98bd64SPaolo Bonzini BdrvChild *child; 31176b98bd64SPaolo Bonzini 31186b98bd64SPaolo Bonzini assert(bs->io_plugged); 3119850d54a2SPaolo Bonzini if (atomic_fetch_dec(&bs->io_plugged) == 1) { 312061007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 312161007b31SStefan Hajnoczi if (drv && drv->bdrv_io_unplug) { 312261007b31SStefan Hajnoczi drv->bdrv_io_unplug(bs); 312361007b31SStefan Hajnoczi } 312461007b31SStefan Hajnoczi } 312561007b31SStefan Hajnoczi 31266b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 31276b98bd64SPaolo Bonzini bdrv_io_unplug(child->bs); 31286b98bd64SPaolo Bonzini } 31296b98bd64SPaolo Bonzini } 313023d0ba93SFam Zheng 313123d0ba93SFam Zheng void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 313223d0ba93SFam Zheng { 313323d0ba93SFam Zheng BdrvChild *child; 313423d0ba93SFam Zheng 313523d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_register_buf) { 313623d0ba93SFam Zheng bs->drv->bdrv_register_buf(bs, host, size); 313723d0ba93SFam Zheng } 313823d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 313923d0ba93SFam Zheng bdrv_register_buf(child->bs, host, size); 314023d0ba93SFam Zheng } 314123d0ba93SFam Zheng } 314223d0ba93SFam Zheng 314323d0ba93SFam Zheng void bdrv_unregister_buf(BlockDriverState *bs, void *host) 314423d0ba93SFam Zheng { 314523d0ba93SFam Zheng BdrvChild *child; 314623d0ba93SFam Zheng 314723d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_unregister_buf) { 314823d0ba93SFam Zheng bs->drv->bdrv_unregister_buf(bs, host); 314923d0ba93SFam Zheng } 315023d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 315123d0ba93SFam Zheng bdrv_unregister_buf(child->bs, host); 315223d0ba93SFam Zheng } 315323d0ba93SFam Zheng } 3154fcc67678SFam Zheng 315567b51fb9SVladimir Sementsov-Ogievskiy static int 
coroutine_fn bdrv_co_copy_range_internal( 315667b51fb9SVladimir Sementsov-Ogievskiy BdrvChild *src, uint64_t src_offset, BdrvChild *dst, 315767b51fb9SVladimir Sementsov-Ogievskiy uint64_t dst_offset, uint64_t bytes, 315867b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3159fcc67678SFam Zheng bool recurse_src) 3160fcc67678SFam Zheng { 3161999658a0SVladimir Sementsov-Ogievskiy BdrvTrackedRequest req; 3162fcc67678SFam Zheng int ret; 3163fcc67678SFam Zheng 3164fe0480d6SKevin Wolf /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3165fe0480d6SKevin Wolf assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3166fe0480d6SKevin Wolf assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 3167fe0480d6SKevin Wolf 3168d4d3e5a0SFam Zheng if (!dst || !dst->bs) { 3169fcc67678SFam Zheng return -ENOMEDIUM; 3170fcc67678SFam Zheng } 3171fcc67678SFam Zheng ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes); 3172fcc67678SFam Zheng if (ret) { 3173fcc67678SFam Zheng return ret; 3174fcc67678SFam Zheng } 317567b51fb9SVladimir Sementsov-Ogievskiy if (write_flags & BDRV_REQ_ZERO_WRITE) { 317667b51fb9SVladimir Sementsov-Ogievskiy return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3177fcc67678SFam Zheng } 3178fcc67678SFam Zheng 3179d4d3e5a0SFam Zheng if (!src || !src->bs) { 3180d4d3e5a0SFam Zheng return -ENOMEDIUM; 3181d4d3e5a0SFam Zheng } 3182d4d3e5a0SFam Zheng ret = bdrv_check_byte_request(src->bs, src_offset, bytes); 3183d4d3e5a0SFam Zheng if (ret) { 3184d4d3e5a0SFam Zheng return ret; 3185d4d3e5a0SFam Zheng } 3186d4d3e5a0SFam Zheng 3187fcc67678SFam Zheng if (!src->bs->drv->bdrv_co_copy_range_from 3188fcc67678SFam Zheng || !dst->bs->drv->bdrv_co_copy_range_to 3189fcc67678SFam Zheng || src->bs->encrypted || dst->bs->encrypted) { 3190fcc67678SFam Zheng return -ENOTSUP; 3191fcc67678SFam Zheng } 3192999658a0SVladimir Sementsov-Ogievskiy 3193999658a0SVladimir Sementsov-Ogievskiy if (recurse_src) { 3194d4d3e5a0SFam Zheng bdrv_inc_in_flight(src->bs); 3195999658a0SVladimir Sementsov-Ogievskiy tracked_request_begin(&req, src->bs, src_offset, bytes, 3196999658a0SVladimir Sementsov-Ogievskiy BDRV_TRACKED_READ); 319737aec7d7SFam Zheng 319809d2f948SVladimir Sementsov-Ogievskiy /* BDRV_REQ_SERIALISING is only for write operation */ 319909d2f948SVladimir Sementsov-Ogievskiy assert(!(read_flags & BDRV_REQ_SERIALISING)); 320067b51fb9SVladimir Sementsov-Ogievskiy if (!(read_flags & BDRV_REQ_NO_SERIALISING)) { 3201999658a0SVladimir Sementsov-Ogievskiy wait_serialising_requests(&req); 3202dee12de8SFam Zheng } 3203999658a0SVladimir Sementsov-Ogievskiy 320437aec7d7SFam Zheng ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3205fcc67678SFam Zheng src, src_offset, 3206fcc67678SFam Zheng dst, dst_offset, 320767b51fb9SVladimir Sementsov-Ogievskiy bytes, 320867b51fb9SVladimir Sementsov-Ogievskiy read_flags, write_flags); 3209999658a0SVladimir Sementsov-Ogievskiy 3210999658a0SVladimir Sementsov-Ogievskiy tracked_request_end(&req); 3211999658a0SVladimir Sementsov-Ogievskiy bdrv_dec_in_flight(src->bs); 3212fcc67678SFam Zheng } else { 3213999658a0SVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(dst->bs); 3214999658a0SVladimir Sementsov-Ogievskiy tracked_request_begin(&req, dst->bs, dst_offset, bytes, 3215999658a0SVladimir Sementsov-Ogievskiy BDRV_TRACKED_WRITE); 32160eb1e891SFam Zheng ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 32170eb1e891SFam Zheng write_flags); 32180eb1e891SFam Zheng if (!ret) { 321937aec7d7SFam Zheng ret = 
dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3220fcc67678SFam Zheng src, src_offset, 3221fcc67678SFam Zheng dst, dst_offset, 322267b51fb9SVladimir Sementsov-Ogievskiy bytes, 322367b51fb9SVladimir Sementsov-Ogievskiy read_flags, write_flags); 32240eb1e891SFam Zheng } 32250eb1e891SFam Zheng bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3226999658a0SVladimir Sementsov-Ogievskiy tracked_request_end(&req); 3227d4d3e5a0SFam Zheng bdrv_dec_in_flight(dst->bs); 3228999658a0SVladimir Sementsov-Ogievskiy } 3229999658a0SVladimir Sementsov-Ogievskiy 323037aec7d7SFam Zheng return ret; 3231fcc67678SFam Zheng } 3232fcc67678SFam Zheng 3233fcc67678SFam Zheng /* Copy range from @src to @dst. 3234fcc67678SFam Zheng * 3235fcc67678SFam Zheng * See the comment of bdrv_co_copy_range for the parameter and return value 3236fcc67678SFam Zheng * semantics. */ 3237fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, 3238fcc67678SFam Zheng BdrvChild *dst, uint64_t dst_offset, 323967b51fb9SVladimir Sementsov-Ogievskiy uint64_t bytes, 324067b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, 324167b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags write_flags) 3242fcc67678SFam Zheng { 3243ecc983a5SFam Zheng trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3244ecc983a5SFam Zheng read_flags, write_flags); 3245fcc67678SFam Zheng return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 324667b51fb9SVladimir Sementsov-Ogievskiy bytes, read_flags, write_flags, true); 3247fcc67678SFam Zheng } 3248fcc67678SFam Zheng 3249fcc67678SFam Zheng /* Copy range from @src to @dst. 3250fcc67678SFam Zheng * 3251fcc67678SFam Zheng * See the comment of bdrv_co_copy_range for the parameter and return value 3252fcc67678SFam Zheng * semantics. 
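 *
 * Illustrative call shape only (coroutine context assumed; 'src_child',
 * 'dst_child' and the offsets/length are hypothetical):
 *
 *   ret = bdrv_co_copy_range(src_child, 0, dst_child, 0, 65536, 0, 0);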
*/ 3253fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, 3254fcc67678SFam Zheng BdrvChild *dst, uint64_t dst_offset, 325567b51fb9SVladimir Sementsov-Ogievskiy uint64_t bytes, 325667b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, 325767b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags write_flags) 3258fcc67678SFam Zheng { 3259ecc983a5SFam Zheng trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3260ecc983a5SFam Zheng read_flags, write_flags); 3261fcc67678SFam Zheng return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 326267b51fb9SVladimir Sementsov-Ogievskiy bytes, read_flags, write_flags, false); 3263fcc67678SFam Zheng } 3264fcc67678SFam Zheng 3265fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, 3266fcc67678SFam Zheng BdrvChild *dst, uint64_t dst_offset, 326767b51fb9SVladimir Sementsov-Ogievskiy uint64_t bytes, BdrvRequestFlags read_flags, 326867b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags write_flags) 3269fcc67678SFam Zheng { 327037aec7d7SFam Zheng return bdrv_co_copy_range_from(src, src_offset, 3271fcc67678SFam Zheng dst, dst_offset, 327267b51fb9SVladimir Sementsov-Ogievskiy bytes, read_flags, write_flags); 3273fcc67678SFam Zheng } 32743d9f2d2aSKevin Wolf 32753d9f2d2aSKevin Wolf static void bdrv_parent_cb_resize(BlockDriverState *bs) 32763d9f2d2aSKevin Wolf { 32773d9f2d2aSKevin Wolf BdrvChild *c; 32783d9f2d2aSKevin Wolf QLIST_FOREACH(c, &bs->parents, next_parent) { 32793d9f2d2aSKevin Wolf if (c->role->resize) { 32803d9f2d2aSKevin Wolf c->role->resize(c); 32813d9f2d2aSKevin Wolf } 32823d9f2d2aSKevin Wolf } 32833d9f2d2aSKevin Wolf } 32843d9f2d2aSKevin Wolf 32853d9f2d2aSKevin Wolf /** 32863d9f2d2aSKevin Wolf * Truncate file to 'offset' bytes (needed only for file protocols) 32873d9f2d2aSKevin Wolf */ 32883d9f2d2aSKevin Wolf int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, 32893d9f2d2aSKevin Wolf PreallocMode prealloc, Error **errp) 32903d9f2d2aSKevin Wolf { 32913d9f2d2aSKevin Wolf BlockDriverState *bs = child->bs; 32923d9f2d2aSKevin Wolf BlockDriver *drv = bs->drv; 32931bc5f09fSKevin Wolf BdrvTrackedRequest req; 32941bc5f09fSKevin Wolf int64_t old_size, new_bytes; 32953d9f2d2aSKevin Wolf int ret; 32963d9f2d2aSKevin Wolf 32973d9f2d2aSKevin Wolf 32983d9f2d2aSKevin Wolf /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 32993d9f2d2aSKevin Wolf if (!drv) { 33003d9f2d2aSKevin Wolf error_setg(errp, "No medium inserted"); 33013d9f2d2aSKevin Wolf return -ENOMEDIUM; 33023d9f2d2aSKevin Wolf } 33033d9f2d2aSKevin Wolf if (offset < 0) { 33043d9f2d2aSKevin Wolf error_setg(errp, "Image size cannot be negative"); 33053d9f2d2aSKevin Wolf return -EINVAL; 33063d9f2d2aSKevin Wolf } 33073d9f2d2aSKevin Wolf 33081bc5f09fSKevin Wolf old_size = bdrv_getlength(bs); 33091bc5f09fSKevin Wolf if (old_size < 0) { 33101bc5f09fSKevin Wolf error_setg_errno(errp, -old_size, "Failed to get old image size"); 33111bc5f09fSKevin Wolf return old_size; 33121bc5f09fSKevin Wolf } 33131bc5f09fSKevin Wolf 33141bc5f09fSKevin Wolf if (offset > old_size) { 33151bc5f09fSKevin Wolf new_bytes = offset - old_size; 33161bc5f09fSKevin Wolf } else { 33171bc5f09fSKevin Wolf new_bytes = 0; 33181bc5f09fSKevin Wolf } 33191bc5f09fSKevin Wolf 33203d9f2d2aSKevin Wolf bdrv_inc_in_flight(bs); 33215416a11eSFam Zheng tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, 33225416a11eSFam Zheng BDRV_TRACKED_TRUNCATE); 33231bc5f09fSKevin Wolf 33241bc5f09fSKevin 
Wolf /* If we are growing the image and potentially using preallocation for the 33251bc5f09fSKevin Wolf * new area, we need to make sure that no write requests are made to it 33261bc5f09fSKevin Wolf * concurrently or they might be overwritten by preallocation. */ 33271bc5f09fSKevin Wolf if (new_bytes) { 33281bc5f09fSKevin Wolf mark_request_serialising(&req, 1); 3329cd47d792SFam Zheng } 3330cd47d792SFam Zheng if (bs->read_only) { 3331cd47d792SFam Zheng error_setg(errp, "Image is read-only"); 3332cd47d792SFam Zheng ret = -EACCES; 3333cd47d792SFam Zheng goto out; 3334cd47d792SFam Zheng } 3335cd47d792SFam Zheng ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, 3336cd47d792SFam Zheng 0); 3337cd47d792SFam Zheng if (ret < 0) { 3338cd47d792SFam Zheng error_setg_errno(errp, -ret, 3339cd47d792SFam Zheng "Failed to prepare request for truncation"); 3340cd47d792SFam Zheng goto out; 33411bc5f09fSKevin Wolf } 33423d9f2d2aSKevin Wolf 33433d9f2d2aSKevin Wolf if (!drv->bdrv_co_truncate) { 33443d9f2d2aSKevin Wolf if (bs->file && drv->is_filter) { 33453d9f2d2aSKevin Wolf ret = bdrv_co_truncate(bs->file, offset, prealloc, errp); 33463d9f2d2aSKevin Wolf goto out; 33473d9f2d2aSKevin Wolf } 33483d9f2d2aSKevin Wolf error_setg(errp, "Image format driver does not support resize"); 33493d9f2d2aSKevin Wolf ret = -ENOTSUP; 33503d9f2d2aSKevin Wolf goto out; 33513d9f2d2aSKevin Wolf } 33523d9f2d2aSKevin Wolf 33533d9f2d2aSKevin Wolf ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp); 33543d9f2d2aSKevin Wolf if (ret < 0) { 33553d9f2d2aSKevin Wolf goto out; 33563d9f2d2aSKevin Wolf } 33573d9f2d2aSKevin Wolf ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 33583d9f2d2aSKevin Wolf if (ret < 0) { 33593d9f2d2aSKevin Wolf error_setg_errno(errp, -ret, "Could not refresh total sector count"); 33603d9f2d2aSKevin Wolf } else { 33613d9f2d2aSKevin Wolf offset = bs->total_sectors * BDRV_SECTOR_SIZE; 33623d9f2d2aSKevin Wolf } 3363cd47d792SFam Zheng /* It's possible that truncation succeeded but refresh_total_sectors 3364cd47d792SFam Zheng * failed, but the latter doesn't affect how we should finish the request. 3365cd47d792SFam Zheng * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. 
*/ 3366cd47d792SFam Zheng bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); 33673d9f2d2aSKevin Wolf 33683d9f2d2aSKevin Wolf out: 33691bc5f09fSKevin Wolf tracked_request_end(&req); 33703d9f2d2aSKevin Wolf bdrv_dec_in_flight(bs); 33711bc5f09fSKevin Wolf 33723d9f2d2aSKevin Wolf return ret; 33733d9f2d2aSKevin Wolf } 33743d9f2d2aSKevin Wolf 33753d9f2d2aSKevin Wolf typedef struct TruncateCo { 33763d9f2d2aSKevin Wolf BdrvChild *child; 33773d9f2d2aSKevin Wolf int64_t offset; 33783d9f2d2aSKevin Wolf PreallocMode prealloc; 33793d9f2d2aSKevin Wolf Error **errp; 33803d9f2d2aSKevin Wolf int ret; 33813d9f2d2aSKevin Wolf } TruncateCo; 33823d9f2d2aSKevin Wolf 33833d9f2d2aSKevin Wolf static void coroutine_fn bdrv_truncate_co_entry(void *opaque) 33843d9f2d2aSKevin Wolf { 33853d9f2d2aSKevin Wolf TruncateCo *tco = opaque; 33863d9f2d2aSKevin Wolf tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc, 33873d9f2d2aSKevin Wolf tco->errp); 33884720cbeeSKevin Wolf aio_wait_kick(); 33893d9f2d2aSKevin Wolf } 33903d9f2d2aSKevin Wolf 33913d9f2d2aSKevin Wolf int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc, 33923d9f2d2aSKevin Wolf Error **errp) 33933d9f2d2aSKevin Wolf { 33943d9f2d2aSKevin Wolf Coroutine *co; 33953d9f2d2aSKevin Wolf TruncateCo tco = { 33963d9f2d2aSKevin Wolf .child = child, 33973d9f2d2aSKevin Wolf .offset = offset, 33983d9f2d2aSKevin Wolf .prealloc = prealloc, 33993d9f2d2aSKevin Wolf .errp = errp, 34003d9f2d2aSKevin Wolf .ret = NOT_DONE, 34013d9f2d2aSKevin Wolf }; 34023d9f2d2aSKevin Wolf 34033d9f2d2aSKevin Wolf if (qemu_in_coroutine()) { 34043d9f2d2aSKevin Wolf /* Fast-path if already in coroutine context */ 34053d9f2d2aSKevin Wolf bdrv_truncate_co_entry(&tco); 34063d9f2d2aSKevin Wolf } else { 34073d9f2d2aSKevin Wolf co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco); 34084720cbeeSKevin Wolf bdrv_coroutine_enter(child->bs, co); 34093d9f2d2aSKevin Wolf BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE); 34103d9f2d2aSKevin Wolf } 34113d9f2d2aSKevin Wolf 34123d9f2d2aSKevin Wolf return tco.ret; 34133d9f2d2aSKevin Wolf } 3414
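
/*
 * Example use of the synchronous wrapper above from non-coroutine context
 * (an illustrative sketch only: 'child' is a hypothetical BdrvChild, and
 * GiB is assumed to come from qemu/units.h):
 *
 *   Error *local_err = NULL;
 *   if (bdrv_truncate(child, 2 * GiB, PREALLOC_MODE_OFF, &local_err) < 0) {
 *       error_report_err(local_err);
 *   }
 */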