/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static AioWait drain_all_aio_wait;

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                               bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}
void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                             bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->role->drained_poll) {
        return c->role->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    if (c->role->drained_begin) {
        c->role->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}
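/*
 * Worked example (illustrative only, not exercised by this file): merging a
 * child with max_transfer = 0, meaning "no limit", into a parent with
 * max_transfer = 1 MiB keeps the 1 MiB cap, because MIN_NON_ZERO() treats
 * zero as the absence of a limit; the alignment fields combine with MAX(),
 * so the stricter of the two requirements wins.
 */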
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
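/*
 * Illustrative sequence (a sketch of the reference-count semantics above):
 *
 *     bdrv_enable_copy_on_read(bs);    // count 0 -> 1, COR enabled
 *     bdrv_enable_copy_on_read(bs);    // count 1 -> 2, still enabled
 *     bdrv_disable_copy_on_read(bs);   // count 2 -> 1, still enabled
 *     bdrv_disable_copy_on_read(bs);   // count 1 -> 0, COR disabled
 */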
typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_dec_in_flight(bs);

    if (data->begin) {
        g_free(data);
    }
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    /* Execute pending BHs first and check everything else only after the BHs
     * have executed. */
    while (aio_poll(bs->aio_context, false));

    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
    };
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}
void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL, false);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL, false);
}
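/*
 * Illustrative caller pattern (a sketch, no such caller appears in this
 * file): code that manipulates the graph brackets its critical section with
 * a drained section so that no new requests can slip in:
 *
 *     bdrv_drained_begin(bs);
 *     ... detach or attach children, change bs state ...
 *     bdrv_drained_end(bs);
 */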
void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* Execute pending BHs first (may modify the graph) and check everything
     * else only after the BHs have executed. */
    while (aio_poll(qemu_get_aio_context(), false));

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
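/*
 * Illustrative pairing (a sketch): callers that need the whole system
 * quiescent for an extended operation use the begin/end pair directly;
 * bdrv_drain_all() above is exactly this pair with nothing in between:
 *
 *     bdrv_drain_all_begin();
 *     ... inspect or reconfigure the graph ...
 *     bdrv_drain_all_end();
 */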
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
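/*
 * Worked example (illustrative): with bdi.cluster_size = 65536, a request at
 * offset = 70000 with bytes = 1000 is widened to
 *
 *     *cluster_offset = QEMU_ALIGN_DOWN(70000, 65536)              = 65536
 *     *cluster_bytes  = QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536) = 65536
 *
 * i.e. exactly the one cluster containing the request.
 */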
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
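/*
 * Worked example (illustrative): a tracked request with overlap_offset = 0
 * and overlap_bytes = 4096 overlaps a request at offset = 4000 with
 * bytes = 200 (4000 < 0 + 4096 and 0 < 4000 + 200), but not one starting at
 * offset = 4096, because the ranges are half-open.
 */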
void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick(bdrv_get_aio_wait(bs));
    aio_wait_kick(&drain_all_aio_wait);
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}
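/*
 * Note (illustrative): this is the usual synchronous-over-coroutine pattern
 * in the block layer. Outside coroutine context the request is wrapped in a
 * new coroutine and BDRV_POLL_WHILE() runs the event loop until
 * bdrv_rw_co_entry() replaces the NOT_DONE marker with the real return
 * value.
 */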
/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}
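/*
 * Illustrative equivalence (a sketch): with 512-byte sectors,
 *
 *     bdrv_read(child, 4, buf, 2)
 *
 * reads the same data as the byte-based
 *
 *     bdrv_pread(child, 4 * BDRV_SECTOR_SIZE, buf, 2 * BDRV_SECTOR_SIZE)
 *
 * i.e. 1024 bytes starting at byte offset 2048.
 */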
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        offset += bytes;
    }
}
int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}
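/*
 * Illustrative caller pattern (a sketch; "header" is a hypothetical metadata
 * struct): data that must be durable before any dependent write is issued
 * goes through the flushing variant:
 *
 *     ret = bdrv_pwrite_sync(child, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;   // the header may not have reached the disk
 *     }
 */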
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
    assert(drv->bdrv_co_readv);

    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}
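/*
 * Note (illustrative summary): the dispatch above prefers the byte-based
 * coroutine interface, then the byte-based AIO interface, and only falls
 * back to the legacy sector-based bdrv_co_readv(), which is why the final
 * path asserts sector alignment of offset and bytes.
 */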
static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}
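/*
 * Note (illustrative): a driver that does not advertise BDRV_REQ_FUA in
 * bs->supported_write_flags leaves the flag set after the masking above, so
 * the emulate_flags path converts it into an explicit bdrv_co_flush() after
 * a successful write.
 */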
static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
117561007b31SStefan Hajnoczi */ 1176244483e6SKevin Wolf bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1177cb2e2878SEric Blake skip_bytes = offset - cluster_offset; 117861007b31SStefan Hajnoczi 1179244483e6SKevin Wolf trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1180244483e6SKevin Wolf cluster_offset, cluster_bytes); 118161007b31SStefan Hajnoczi 1182cb2e2878SEric Blake bounce_buffer = qemu_try_blockalign(bs, 1183cb2e2878SEric Blake MIN(MIN(max_transfer, cluster_bytes), 1184cb2e2878SEric Blake MAX_BOUNCE_BUFFER)); 118561007b31SStefan Hajnoczi if (bounce_buffer == NULL) { 118661007b31SStefan Hajnoczi ret = -ENOMEM; 118761007b31SStefan Hajnoczi goto err; 118861007b31SStefan Hajnoczi } 118961007b31SStefan Hajnoczi 1190cb2e2878SEric Blake while (cluster_bytes) { 1191cb2e2878SEric Blake int64_t pnum; 119261007b31SStefan Hajnoczi 1193cb2e2878SEric Blake ret = bdrv_is_allocated(bs, cluster_offset, 1194cb2e2878SEric Blake MIN(cluster_bytes, max_transfer), &pnum); 1195cb2e2878SEric Blake if (ret < 0) { 1196cb2e2878SEric Blake /* Safe to treat errors in querying allocation as if 1197cb2e2878SEric Blake * unallocated; we'll probably fail again soon on the 1198cb2e2878SEric Blake * read, but at least that will set a decent errno. 1199cb2e2878SEric Blake */ 1200cb2e2878SEric Blake pnum = MIN(cluster_bytes, max_transfer); 1201cb2e2878SEric Blake } 1202cb2e2878SEric Blake 1203b0ddcbbbSKevin Wolf /* Stop at EOF if the image ends in the middle of the cluster */ 1204b0ddcbbbSKevin Wolf if (ret == 0 && pnum == 0) { 1205b0ddcbbbSKevin Wolf assert(progress >= bytes); 1206b0ddcbbbSKevin Wolf break; 1207b0ddcbbbSKevin Wolf } 1208b0ddcbbbSKevin Wolf 1209cb2e2878SEric Blake assert(skip_bytes < pnum); 1210cb2e2878SEric Blake 1211cb2e2878SEric Blake if (ret <= 0) { 1212cb2e2878SEric Blake /* Must copy-on-read; use the bounce buffer */ 1213cb2e2878SEric Blake iov.iov_base = bounce_buffer; 1214cb2e2878SEric Blake iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 1215cb2e2878SEric Blake qemu_iovec_init_external(&local_qiov, &iov, 1); 1216cb2e2878SEric Blake 1217cb2e2878SEric Blake ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1218cb2e2878SEric Blake &local_qiov, 0); 121961007b31SStefan Hajnoczi if (ret < 0) { 122061007b31SStefan Hajnoczi goto err; 122161007b31SStefan Hajnoczi } 122261007b31SStefan Hajnoczi 1223d855ebcdSEric Blake bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1224c1499a5eSEric Blake if (drv->bdrv_co_pwrite_zeroes && 1225cb2e2878SEric Blake buffer_is_zero(bounce_buffer, pnum)) { 1226a604fa2bSEric Blake /* FIXME: Should we (perhaps conditionally) be setting 1227a604fa2bSEric Blake * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1228a604fa2bSEric Blake * that still correctly reads as zero? */ 12297adcf59fSMax Reitz ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 12307adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 123161007b31SStefan Hajnoczi } else { 1232cb2e2878SEric Blake /* This does not change the data on the disk, it is not 1233cb2e2878SEric Blake * necessary to flush even in cache=writethrough mode. 123461007b31SStefan Hajnoczi */ 1235cb2e2878SEric Blake ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 12367adcf59fSMax Reitz &local_qiov, 12377adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 123861007b31SStefan Hajnoczi } 123961007b31SStefan Hajnoczi 124061007b31SStefan Hajnoczi if (ret < 0) { 1241cb2e2878SEric Blake /* It might be okay to ignore write errors for guest 1242cb2e2878SEric Blake * requests. 
If this is a deliberate copy-on-read 1243cb2e2878SEric Blake * then we don't want to ignore the error. Simply 1244cb2e2878SEric Blake * report it in all cases. 124561007b31SStefan Hajnoczi */ 124661007b31SStefan Hajnoczi goto err; 124761007b31SStefan Hajnoczi } 124861007b31SStefan Hajnoczi 1249cb2e2878SEric Blake qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes, 1250cb2e2878SEric Blake pnum - skip_bytes); 1251cb2e2878SEric Blake } else { 1252cb2e2878SEric Blake /* Read directly into the destination */ 1253cb2e2878SEric Blake qemu_iovec_init(&local_qiov, qiov->niov); 1254cb2e2878SEric Blake qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes); 1255cb2e2878SEric Blake ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size, 1256cb2e2878SEric Blake &local_qiov, 0); 1257cb2e2878SEric Blake qemu_iovec_destroy(&local_qiov); 1258cb2e2878SEric Blake if (ret < 0) { 1259cb2e2878SEric Blake goto err; 1260cb2e2878SEric Blake } 1261cb2e2878SEric Blake } 1262cb2e2878SEric Blake 1263cb2e2878SEric Blake cluster_offset += pnum; 1264cb2e2878SEric Blake cluster_bytes -= pnum; 1265cb2e2878SEric Blake progress += pnum - skip_bytes; 1266cb2e2878SEric Blake skip_bytes = 0; 1267cb2e2878SEric Blake } 1268cb2e2878SEric Blake ret = 0; 126961007b31SStefan Hajnoczi 127061007b31SStefan Hajnoczi err: 127161007b31SStefan Hajnoczi qemu_vfree(bounce_buffer); 127261007b31SStefan Hajnoczi return ret; 127361007b31SStefan Hajnoczi } 127461007b31SStefan Hajnoczi 127561007b31SStefan Hajnoczi /* 127661007b31SStefan Hajnoczi * Forwards an already correctly aligned request to the BlockDriver. This 12771a62d0acSEric Blake * handles copy on read, zeroing after EOF, and fragmentation of large 12781a62d0acSEric Blake * reads; any other features must be implemented by the caller. 127961007b31SStefan Hajnoczi */ 128085c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 128161007b31SStefan Hajnoczi BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 128261007b31SStefan Hajnoczi int64_t align, QEMUIOVector *qiov, int flags) 128361007b31SStefan Hajnoczi { 128485c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 1285c9d20029SKevin Wolf int64_t total_bytes, max_bytes; 12861a62d0acSEric Blake int ret = 0; 12871a62d0acSEric Blake uint64_t bytes_remaining = bytes; 12881a62d0acSEric Blake int max_transfer; 128961007b31SStefan Hajnoczi 129049c07526SKevin Wolf assert(is_power_of_2(align)); 129149c07526SKevin Wolf assert((offset & (align - 1)) == 0); 129249c07526SKevin Wolf assert((bytes & (align - 1)) == 0); 129361007b31SStefan Hajnoczi assert(!qiov || bytes == qiov->size); 1294abb06c5aSDaniel P. Berrange assert((bs->open_flags & BDRV_O_NO_IO) == 0); 12951a62d0acSEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 12961a62d0acSEric Blake align); 1297a604fa2bSEric Blake 1298a604fa2bSEric Blake /* TODO: We would need a per-BDS .supported_read_flags and 1299a604fa2bSEric Blake * potential fallback support, if we ever implement any read flags 1300a604fa2bSEric Blake * to pass through to drivers. For now, there aren't any 1301a604fa2bSEric Blake * passthrough flags. */ 1302a604fa2bSEric Blake assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); 130361007b31SStefan Hajnoczi 130461007b31SStefan Hajnoczi /* Handle Copy on Read and associated serialisation */ 130561007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 130661007b31SStefan Hajnoczi /* If we touch the same cluster it counts as an overlap. 
This 130761007b31SStefan Hajnoczi * guarantees that allocating writes will be serialized and not race 130861007b31SStefan Hajnoczi * with each other for the same cluster. For example, in copy-on-read 130961007b31SStefan Hajnoczi * it ensures that the CoR read and write operations are atomic and 131061007b31SStefan Hajnoczi * guest writes cannot interleave between them. */ 131161007b31SStefan Hajnoczi mark_request_serialising(req, bdrv_get_cluster_size(bs)); 131261007b31SStefan Hajnoczi } 131361007b31SStefan Hajnoczi 131461408b25SFam Zheng if (!(flags & BDRV_REQ_NO_SERIALISING)) { 131561007b31SStefan Hajnoczi wait_serialising_requests(req); 131661408b25SFam Zheng } 131761007b31SStefan Hajnoczi 131861007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 1319d6a644bbSEric Blake int64_t pnum; 132061007b31SStefan Hajnoczi 132188e63df2SEric Blake ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 132261007b31SStefan Hajnoczi if (ret < 0) { 132361007b31SStefan Hajnoczi goto out; 132461007b31SStefan Hajnoczi } 132561007b31SStefan Hajnoczi 132688e63df2SEric Blake if (!ret || pnum != bytes) { 132785c97ca7SKevin Wolf ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov); 132861007b31SStefan Hajnoczi goto out; 132961007b31SStefan Hajnoczi } 133061007b31SStefan Hajnoczi } 133161007b31SStefan Hajnoczi 13321a62d0acSEric Blake /* Forward the request to the BlockDriver, possibly fragmenting it */ 133349c07526SKevin Wolf total_bytes = bdrv_getlength(bs); 133449c07526SKevin Wolf if (total_bytes < 0) { 133549c07526SKevin Wolf ret = total_bytes; 133661007b31SStefan Hajnoczi goto out; 133761007b31SStefan Hajnoczi } 133861007b31SStefan Hajnoczi 133949c07526SKevin Wolf max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 13401a62d0acSEric Blake if (bytes <= max_bytes && bytes <= max_transfer) { 1341166fe960SKevin Wolf ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); 13421a62d0acSEric Blake goto out; 134361007b31SStefan Hajnoczi } 134461007b31SStefan Hajnoczi 13451a62d0acSEric Blake while (bytes_remaining) { 13461a62d0acSEric Blake int num; 13471a62d0acSEric Blake 13481a62d0acSEric Blake if (max_bytes) { 13491a62d0acSEric Blake QEMUIOVector local_qiov; 13501a62d0acSEric Blake 13511a62d0acSEric Blake num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 13521a62d0acSEric Blake assert(num); 13531a62d0acSEric Blake qemu_iovec_init(&local_qiov, qiov->niov); 13541a62d0acSEric Blake qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 13551a62d0acSEric Blake 13561a62d0acSEric Blake ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 13571a62d0acSEric Blake num, &local_qiov, 0); 13581a62d0acSEric Blake max_bytes -= num; 13591a62d0acSEric Blake qemu_iovec_destroy(&local_qiov); 13601a62d0acSEric Blake } else { 13611a62d0acSEric Blake num = bytes_remaining; 13621a62d0acSEric Blake ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, 13631a62d0acSEric Blake bytes_remaining); 13641a62d0acSEric Blake } 13651a62d0acSEric Blake if (ret < 0) { 13661a62d0acSEric Blake goto out; 13671a62d0acSEric Blake } 13681a62d0acSEric Blake bytes_remaining -= num; 136961007b31SStefan Hajnoczi } 137061007b31SStefan Hajnoczi 137161007b31SStefan Hajnoczi out: 13721a62d0acSEric Blake return ret < 0 ? 
ret : 0; 137361007b31SStefan Hajnoczi } 137461007b31SStefan Hajnoczi 137561007b31SStefan Hajnoczi /* 137661007b31SStefan Hajnoczi * Handle a read request in coroutine context 137761007b31SStefan Hajnoczi */ 1378a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child, 137961007b31SStefan Hajnoczi int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 138061007b31SStefan Hajnoczi BdrvRequestFlags flags) 138161007b31SStefan Hajnoczi { 1382a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 138361007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 138461007b31SStefan Hajnoczi BdrvTrackedRequest req; 138561007b31SStefan Hajnoczi 1386a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 138761007b31SStefan Hajnoczi uint8_t *head_buf = NULL; 138861007b31SStefan Hajnoczi uint8_t *tail_buf = NULL; 138961007b31SStefan Hajnoczi QEMUIOVector local_qiov; 139061007b31SStefan Hajnoczi bool use_local_qiov = false; 139161007b31SStefan Hajnoczi int ret; 139261007b31SStefan Hajnoczi 1393f42cf447SDaniel P. Berrange trace_bdrv_co_preadv(child->bs, offset, bytes, flags); 1394f42cf447SDaniel P. Berrange 139561007b31SStefan Hajnoczi if (!drv) { 139661007b31SStefan Hajnoczi return -ENOMEDIUM; 139761007b31SStefan Hajnoczi } 139861007b31SStefan Hajnoczi 139961007b31SStefan Hajnoczi ret = bdrv_check_byte_request(bs, offset, bytes); 140061007b31SStefan Hajnoczi if (ret < 0) { 140161007b31SStefan Hajnoczi return ret; 140261007b31SStefan Hajnoczi } 140361007b31SStefan Hajnoczi 140499723548SPaolo Bonzini bdrv_inc_in_flight(bs); 140599723548SPaolo Bonzini 14069568b511SWen Congyang /* Don't do copy-on-read if we read data before write operation */ 1407d3faa13eSPaolo Bonzini if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) { 140861007b31SStefan Hajnoczi flags |= BDRV_REQ_COPY_ON_READ; 140961007b31SStefan Hajnoczi } 141061007b31SStefan Hajnoczi 141161007b31SStefan Hajnoczi /* Align read if necessary by padding qiov */ 141261007b31SStefan Hajnoczi if (offset & (align - 1)) { 141361007b31SStefan Hajnoczi head_buf = qemu_blockalign(bs, align); 141461007b31SStefan Hajnoczi qemu_iovec_init(&local_qiov, qiov->niov + 2); 141561007b31SStefan Hajnoczi qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 141661007b31SStefan Hajnoczi qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 141761007b31SStefan Hajnoczi use_local_qiov = true; 141861007b31SStefan Hajnoczi 141961007b31SStefan Hajnoczi bytes += offset & (align - 1); 142061007b31SStefan Hajnoczi offset = offset & ~(align - 1); 142161007b31SStefan Hajnoczi } 142261007b31SStefan Hajnoczi 142361007b31SStefan Hajnoczi if ((offset + bytes) & (align - 1)) { 142461007b31SStefan Hajnoczi if (!use_local_qiov) { 142561007b31SStefan Hajnoczi qemu_iovec_init(&local_qiov, qiov->niov + 1); 142661007b31SStefan Hajnoczi qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 142761007b31SStefan Hajnoczi use_local_qiov = true; 142861007b31SStefan Hajnoczi } 142961007b31SStefan Hajnoczi tail_buf = qemu_blockalign(bs, align); 143061007b31SStefan Hajnoczi qemu_iovec_add(&local_qiov, tail_buf, 143161007b31SStefan Hajnoczi align - ((offset + bytes) & (align - 1))); 143261007b31SStefan Hajnoczi 143361007b31SStefan Hajnoczi bytes = ROUND_UP(bytes, align); 143461007b31SStefan Hajnoczi } 143561007b31SStefan Hajnoczi 1436ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 143785c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, &req, offset, bytes, align, 143861007b31SStefan Hajnoczi use_local_qiov ? 
&local_qiov : qiov, 143961007b31SStefan Hajnoczi flags); 144061007b31SStefan Hajnoczi tracked_request_end(&req); 144199723548SPaolo Bonzini bdrv_dec_in_flight(bs); 144261007b31SStefan Hajnoczi 144361007b31SStefan Hajnoczi if (use_local_qiov) { 144461007b31SStefan Hajnoczi qemu_iovec_destroy(&local_qiov); 144561007b31SStefan Hajnoczi qemu_vfree(head_buf); 144661007b31SStefan Hajnoczi qemu_vfree(tail_buf); 144761007b31SStefan Hajnoczi } 144861007b31SStefan Hajnoczi 144961007b31SStefan Hajnoczi return ret; 145061007b31SStefan Hajnoczi } 145161007b31SStefan Hajnoczi 1452d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1453f5a5ca79SManos Pitsidianakis int64_t offset, int bytes, BdrvRequestFlags flags) 145461007b31SStefan Hajnoczi { 145561007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 145661007b31SStefan Hajnoczi QEMUIOVector qiov; 145761007b31SStefan Hajnoczi struct iovec iov = {0}; 145861007b31SStefan Hajnoczi int ret = 0; 1459465fe887SEric Blake bool need_flush = false; 1460443668caSDenis V. Lunev int head = 0; 1461443668caSDenis V. Lunev int tail = 0; 146261007b31SStefan Hajnoczi 1463cf081fcaSEric Blake int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); 1464a5b8dd2cSEric Blake int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1465a5b8dd2cSEric Blake bs->bl.request_alignment); 1466cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1467cf081fcaSEric Blake 1468d470ad42SMax Reitz if (!drv) { 1469d470ad42SMax Reitz return -ENOMEDIUM; 1470d470ad42SMax Reitz } 1471d470ad42SMax Reitz 1472b8d0a980SEric Blake assert(alignment % bs->bl.request_alignment == 0); 1473b8d0a980SEric Blake head = offset % alignment; 1474f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % alignment; 1475b8d0a980SEric Blake max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1476b8d0a980SEric Blake assert(max_write_zeroes >= bs->bl.request_alignment); 147761007b31SStefan Hajnoczi 1478f5a5ca79SManos Pitsidianakis while (bytes > 0 && !ret) { 1479f5a5ca79SManos Pitsidianakis int num = bytes; 148061007b31SStefan Hajnoczi 148161007b31SStefan Hajnoczi /* Align request. Block drivers can expect the "bulk" of the request 1482443668caSDenis V. Lunev * to be aligned, and that unaligned requests do not cross cluster 1483443668caSDenis V. Lunev * boundaries. 148461007b31SStefan Hajnoczi */ 1485443668caSDenis V. Lunev if (head) { 1486b2f95feeSEric Blake /* Make a small request up to the first aligned sector. For 1487b2f95feeSEric Blake * convenience, limit this request to max_transfer even if 1488b2f95feeSEric Blake * we don't need to fall back to writes. */ 1489f5a5ca79SManos Pitsidianakis num = MIN(MIN(bytes, max_transfer), alignment - head); 1490b2f95feeSEric Blake head = (head + num) % alignment; 1491b2f95feeSEric Blake assert(num < max_write_zeroes); 1492d05aa8bbSEric Blake } else if (tail && num > alignment) { 1493443668caSDenis V. Lunev /* Shorten the request to the last aligned sector. */ 1494443668caSDenis V. 
Lunev num -= tail; 149561007b31SStefan Hajnoczi } 149661007b31SStefan Hajnoczi 149761007b31SStefan Hajnoczi /* limit request size */ 149861007b31SStefan Hajnoczi if (num > max_write_zeroes) { 149961007b31SStefan Hajnoczi num = max_write_zeroes; 150061007b31SStefan Hajnoczi } 150161007b31SStefan Hajnoczi 150261007b31SStefan Hajnoczi ret = -ENOTSUP; 150361007b31SStefan Hajnoczi /* First try the efficient write zeroes operation */ 1504d05aa8bbSEric Blake if (drv->bdrv_co_pwrite_zeroes) { 1505d05aa8bbSEric Blake ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, 1506d05aa8bbSEric Blake flags & bs->supported_zero_flags); 1507d05aa8bbSEric Blake if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && 1508d05aa8bbSEric Blake !(bs->supported_zero_flags & BDRV_REQ_FUA)) { 1509d05aa8bbSEric Blake need_flush = true; 1510d05aa8bbSEric Blake } 1511465fe887SEric Blake } else { 1512465fe887SEric Blake assert(!bs->supported_zero_flags); 151361007b31SStefan Hajnoczi } 151461007b31SStefan Hajnoczi 151561007b31SStefan Hajnoczi if (ret == -ENOTSUP) { 151661007b31SStefan Hajnoczi /* Fall back to bounce buffer if write zeroes is unsupported */ 1517465fe887SEric Blake BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; 1518465fe887SEric Blake 1519465fe887SEric Blake if ((flags & BDRV_REQ_FUA) && 1520465fe887SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1521465fe887SEric Blake /* No need for bdrv_driver_pwritev() to do a fallback 1522465fe887SEric Blake * flush on each chunk; use just one at the end */ 1523465fe887SEric Blake write_flags &= ~BDRV_REQ_FUA; 1524465fe887SEric Blake need_flush = true; 1525465fe887SEric Blake } 15265def6b80SEric Blake num = MIN(num, max_transfer); 1527d05aa8bbSEric Blake iov.iov_len = num; 152861007b31SStefan Hajnoczi if (iov.iov_base == NULL) { 1529d05aa8bbSEric Blake iov.iov_base = qemu_try_blockalign(bs, num); 153061007b31SStefan Hajnoczi if (iov.iov_base == NULL) { 153161007b31SStefan Hajnoczi ret = -ENOMEM; 153261007b31SStefan Hajnoczi goto fail; 153361007b31SStefan Hajnoczi } 1534d05aa8bbSEric Blake memset(iov.iov_base, 0, num); 153561007b31SStefan Hajnoczi } 153661007b31SStefan Hajnoczi qemu_iovec_init_external(&qiov, &iov, 1); 153761007b31SStefan Hajnoczi 1538d05aa8bbSEric Blake ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags); 153961007b31SStefan Hajnoczi 154061007b31SStefan Hajnoczi /* Keep bounce buffer around if it is big enough for all 154161007b31SStefan Hajnoczi * future requests. 154261007b31SStefan Hajnoczi */ 15435def6b80SEric Blake if (num < max_transfer) { 154461007b31SStefan Hajnoczi qemu_vfree(iov.iov_base); 154561007b31SStefan Hajnoczi iov.iov_base = NULL; 154661007b31SStefan Hajnoczi } 154761007b31SStefan Hajnoczi } 154861007b31SStefan Hajnoczi 1549d05aa8bbSEric Blake offset += num; 1550f5a5ca79SManos Pitsidianakis bytes -= num; 155161007b31SStefan Hajnoczi } 155261007b31SStefan Hajnoczi 155361007b31SStefan Hajnoczi fail: 1554465fe887SEric Blake if (ret == 0 && need_flush) { 1555465fe887SEric Blake ret = bdrv_co_flush(bs); 1556465fe887SEric Blake } 155761007b31SStefan Hajnoczi qemu_vfree(iov.iov_base); 155861007b31SStefan Hajnoczi return ret; 155961007b31SStefan Hajnoczi } 156061007b31SStefan Hajnoczi 156161007b31SStefan Hajnoczi /* 156204ed95f4SEric Blake * Forwards an already correctly aligned write request to the BlockDriver, 156304ed95f4SEric Blake * after possibly fragmenting it.
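 *
 * For example (illustrative values): with bs->bl.max_transfer at 1 MiB,
 * an aligned 3 MiB write is issued to the driver as three 1 MiB calls.
 * If BDRV_REQ_FUA has to be emulated by a flush, the flag is dropped
 * from all but the final fragment so that only one flush is needed.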
156461007b31SStefan Hajnoczi */ 156585c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 156661007b31SStefan Hajnoczi BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1567cff86b38SEric Blake int64_t align, QEMUIOVector *qiov, int flags) 156861007b31SStefan Hajnoczi { 156985c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 157061007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 157161007b31SStefan Hajnoczi bool waited; 157261007b31SStefan Hajnoczi int ret; 157361007b31SStefan Hajnoczi 15749896c876SKevin Wolf int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 157504ed95f4SEric Blake uint64_t bytes_remaining = bytes; 157604ed95f4SEric Blake int max_transfer; 157761007b31SStefan Hajnoczi 1578d470ad42SMax Reitz if (!drv) { 1579d470ad42SMax Reitz return -ENOMEDIUM; 1580d470ad42SMax Reitz } 1581d470ad42SMax Reitz 1582d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 1583d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 1584d6883bc9SVladimir Sementsov-Ogievskiy } 1585d6883bc9SVladimir Sementsov-Ogievskiy 1586cff86b38SEric Blake assert(is_power_of_2(align)); 1587cff86b38SEric Blake assert((offset & (align - 1)) == 0); 1588cff86b38SEric Blake assert((bytes & (align - 1)) == 0); 158961007b31SStefan Hajnoczi assert(!qiov || bytes == qiov->size); 1590abb06c5aSDaniel P. Berrange assert((bs->open_flags & BDRV_O_NO_IO) == 0); 1591fa166538SEric Blake assert(!(flags & ~BDRV_REQ_MASK)); 159204ed95f4SEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 159304ed95f4SEric Blake align); 159461007b31SStefan Hajnoczi 1595999658a0SVladimir Sementsov-Ogievskiy /* BDRV_REQ_NO_SERIALISING is only for read operation */ 1596999658a0SVladimir Sementsov-Ogievskiy assert(!(flags & BDRV_REQ_NO_SERIALISING)); 159761007b31SStefan Hajnoczi waited = wait_serialising_requests(req); 159861007b31SStefan Hajnoczi assert(!waited || !req->serialising); 159961007b31SStefan Hajnoczi assert(req->overlap_offset <= offset); 160061007b31SStefan Hajnoczi assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1601c6035964SMax Reitz if (flags & BDRV_REQ_WRITE_UNCHANGED) { 1602c6035964SMax Reitz assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1603c6035964SMax Reitz } else { 1604362b3786SMax Reitz assert(child->perm & BLK_PERM_WRITE); 1605c6035964SMax Reitz } 1606362b3786SMax Reitz assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); 160761007b31SStefan Hajnoczi 160861007b31SStefan Hajnoczi ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); 160961007b31SStefan Hajnoczi 161061007b31SStefan Hajnoczi if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1611c1499a5eSEric Blake !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 161261007b31SStefan Hajnoczi qemu_iovec_is_zero(qiov)) { 161361007b31SStefan Hajnoczi flags |= BDRV_REQ_ZERO_WRITE; 161461007b31SStefan Hajnoczi if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 161561007b31SStefan Hajnoczi flags |= BDRV_REQ_MAY_UNMAP; 161661007b31SStefan Hajnoczi } 161761007b31SStefan Hajnoczi } 161861007b31SStefan Hajnoczi 161961007b31SStefan Hajnoczi if (ret < 0) { 162061007b31SStefan Hajnoczi /* Do nothing, write notifier decided to fail this request */ 162161007b31SStefan Hajnoczi } else if (flags & BDRV_REQ_ZERO_WRITE) { 16229a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 16239896c876SKevin Wolf ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, 
flags); 16243ea1a091SPavel Butsykin } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 16253ea1a091SPavel Butsykin ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov); 162604ed95f4SEric Blake } else if (bytes <= max_transfer) { 16279a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV); 162878a07294SKevin Wolf ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); 162904ed95f4SEric Blake } else { 163004ed95f4SEric Blake bdrv_debug_event(bs, BLKDBG_PWRITEV); 163104ed95f4SEric Blake while (bytes_remaining) { 163204ed95f4SEric Blake int num = MIN(bytes_remaining, max_transfer); 163304ed95f4SEric Blake QEMUIOVector local_qiov; 163404ed95f4SEric Blake int local_flags = flags; 163504ed95f4SEric Blake 163604ed95f4SEric Blake assert(num); 163704ed95f4SEric Blake if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 163804ed95f4SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 163904ed95f4SEric Blake /* If FUA is going to be emulated by flush, we only 164004ed95f4SEric Blake * need to flush on the last iteration */ 164104ed95f4SEric Blake local_flags &= ~BDRV_REQ_FUA; 164204ed95f4SEric Blake } 164304ed95f4SEric Blake qemu_iovec_init(&local_qiov, qiov->niov); 164404ed95f4SEric Blake qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 164504ed95f4SEric Blake 164604ed95f4SEric Blake ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 164704ed95f4SEric Blake num, &local_qiov, local_flags); 164804ed95f4SEric Blake qemu_iovec_destroy(&local_qiov); 164904ed95f4SEric Blake if (ret < 0) { 165004ed95f4SEric Blake break; 165104ed95f4SEric Blake } 165204ed95f4SEric Blake bytes_remaining -= num; 165304ed95f4SEric Blake } 165461007b31SStefan Hajnoczi } 16559a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 165661007b31SStefan Hajnoczi 165747fec599SPaolo Bonzini atomic_inc(&bs->write_gen); 16580fdf1a4fSEric Blake bdrv_set_dirty(bs, offset, bytes); 165961007b31SStefan Hajnoczi 1660f7946da2SPaolo Bonzini stat64_max(&bs->wr_highest_offset, offset + bytes); 166161007b31SStefan Hajnoczi 166261007b31SStefan Hajnoczi if (ret >= 0) { 16639896c876SKevin Wolf bs->total_sectors = MAX(bs->total_sectors, end_sector); 166404ed95f4SEric Blake ret = 0; 166561007b31SStefan Hajnoczi } 166661007b31SStefan Hajnoczi 166761007b31SStefan Hajnoczi return ret; 166861007b31SStefan Hajnoczi } 166961007b31SStefan Hajnoczi 167085c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 16719eeb6dd1SFam Zheng int64_t offset, 16729eeb6dd1SFam Zheng unsigned int bytes, 16739eeb6dd1SFam Zheng BdrvRequestFlags flags, 16749eeb6dd1SFam Zheng BdrvTrackedRequest *req) 16759eeb6dd1SFam Zheng { 167685c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 16779eeb6dd1SFam Zheng uint8_t *buf = NULL; 16789eeb6dd1SFam Zheng QEMUIOVector local_qiov; 16799eeb6dd1SFam Zheng struct iovec iov; 1680a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 16819eeb6dd1SFam Zheng unsigned int head_padding_bytes, tail_padding_bytes; 16829eeb6dd1SFam Zheng int ret = 0; 16839eeb6dd1SFam Zheng 16849eeb6dd1SFam Zheng head_padding_bytes = offset & (align - 1); 1685f13ce1beSDenis V. 
Lunev tail_padding_bytes = (align - (offset + bytes)) & (align - 1); 16869eeb6dd1SFam Zheng 16879eeb6dd1SFam Zheng 16889eeb6dd1SFam Zheng assert(flags & BDRV_REQ_ZERO_WRITE); 16899eeb6dd1SFam Zheng if (head_padding_bytes || tail_padding_bytes) { 16909eeb6dd1SFam Zheng buf = qemu_blockalign(bs, align); 16919eeb6dd1SFam Zheng iov = (struct iovec) { 16929eeb6dd1SFam Zheng .iov_base = buf, 16939eeb6dd1SFam Zheng .iov_len = align, 16949eeb6dd1SFam Zheng }; 16959eeb6dd1SFam Zheng qemu_iovec_init_external(&local_qiov, &iov, 1); 16969eeb6dd1SFam Zheng } 16979eeb6dd1SFam Zheng if (head_padding_bytes) { 16989eeb6dd1SFam Zheng uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 16999eeb6dd1SFam Zheng 17009eeb6dd1SFam Zheng /* RMW the unaligned part before head. */ 17019eeb6dd1SFam Zheng mark_request_serialising(req, align); 17029eeb6dd1SFam Zheng wait_serialising_requests(req); 17039a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 170485c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align, 17059eeb6dd1SFam Zheng align, &local_qiov, 0); 17069eeb6dd1SFam Zheng if (ret < 0) { 17079eeb6dd1SFam Zheng goto fail; 17089eeb6dd1SFam Zheng } 17099a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 17109eeb6dd1SFam Zheng 17119eeb6dd1SFam Zheng memset(buf + head_padding_bytes, 0, zero_bytes); 171285c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align, 1713cff86b38SEric Blake align, &local_qiov, 17149eeb6dd1SFam Zheng flags & ~BDRV_REQ_ZERO_WRITE); 17159eeb6dd1SFam Zheng if (ret < 0) { 17169eeb6dd1SFam Zheng goto fail; 17179eeb6dd1SFam Zheng } 17189eeb6dd1SFam Zheng offset += zero_bytes; 17199eeb6dd1SFam Zheng bytes -= zero_bytes; 17209eeb6dd1SFam Zheng } 17219eeb6dd1SFam Zheng 17229eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 17239eeb6dd1SFam Zheng if (bytes >= align) { 17249eeb6dd1SFam Zheng /* Write the aligned part in the middle. */ 17259eeb6dd1SFam Zheng uint64_t aligned_bytes = bytes & ~(align - 1); 172685c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 17279eeb6dd1SFam Zheng NULL, flags); 17289eeb6dd1SFam Zheng if (ret < 0) { 17299eeb6dd1SFam Zheng goto fail; 17309eeb6dd1SFam Zheng } 17319eeb6dd1SFam Zheng bytes -= aligned_bytes; 17329eeb6dd1SFam Zheng offset += aligned_bytes; 17339eeb6dd1SFam Zheng } 17349eeb6dd1SFam Zheng 17359eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 17369eeb6dd1SFam Zheng if (bytes) { 17379eeb6dd1SFam Zheng assert(align == tail_padding_bytes + bytes); 17389eeb6dd1SFam Zheng /* RMW the unaligned part after tail. 
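 * For example, with a 4 KiB alignment and 1 KiB of the zeroing request
 * remaining (illustrative sizes), this reads the full 4 KiB block,
 * clears the first 1 KiB of the bounce buffer, and writes the whole
 * block back.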
*/ 17399eeb6dd1SFam Zheng mark_request_serialising(req, align); 17409eeb6dd1SFam Zheng wait_serialising_requests(req); 17419a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 174285c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, req, offset, align, 17439eeb6dd1SFam Zheng align, &local_qiov, 0); 17449eeb6dd1SFam Zheng if (ret < 0) { 17459eeb6dd1SFam Zheng goto fail; 17469eeb6dd1SFam Zheng } 17479a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 17489eeb6dd1SFam Zheng 17499eeb6dd1SFam Zheng memset(buf, 0, bytes); 175085c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, align, align, 17519eeb6dd1SFam Zheng &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 17529eeb6dd1SFam Zheng } 17539eeb6dd1SFam Zheng fail: 17549eeb6dd1SFam Zheng qemu_vfree(buf); 17559eeb6dd1SFam Zheng return ret; 17569eeb6dd1SFam Zheng 17579eeb6dd1SFam Zheng } 17589eeb6dd1SFam Zheng 175961007b31SStefan Hajnoczi /* 176061007b31SStefan Hajnoczi * Handle a write request in coroutine context 176161007b31SStefan Hajnoczi */ 1762a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 176361007b31SStefan Hajnoczi int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 176461007b31SStefan Hajnoczi BdrvRequestFlags flags) 176561007b31SStefan Hajnoczi { 1766a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 176761007b31SStefan Hajnoczi BdrvTrackedRequest req; 1768a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 176961007b31SStefan Hajnoczi uint8_t *head_buf = NULL; 177061007b31SStefan Hajnoczi uint8_t *tail_buf = NULL; 177161007b31SStefan Hajnoczi QEMUIOVector local_qiov; 177261007b31SStefan Hajnoczi bool use_local_qiov = false; 177361007b31SStefan Hajnoczi int ret; 177461007b31SStefan Hajnoczi 1775f42cf447SDaniel P. Berrange trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); 1776f42cf447SDaniel P. Berrange 177761007b31SStefan Hajnoczi if (!bs->drv) { 177861007b31SStefan Hajnoczi return -ENOMEDIUM; 177961007b31SStefan Hajnoczi } 178061007b31SStefan Hajnoczi if (bs->read_only) { 1781eaf5fe2dSPaolo Bonzini return -EPERM; 178261007b31SStefan Hajnoczi } 178304c01a5cSKevin Wolf assert(!(bs->open_flags & BDRV_O_INACTIVE)); 178461007b31SStefan Hajnoczi 178561007b31SStefan Hajnoczi ret = bdrv_check_byte_request(bs, offset, bytes); 178661007b31SStefan Hajnoczi if (ret < 0) { 178761007b31SStefan Hajnoczi return ret; 178861007b31SStefan Hajnoczi } 178961007b31SStefan Hajnoczi 179099723548SPaolo Bonzini bdrv_inc_in_flight(bs); 179161007b31SStefan Hajnoczi /* 179261007b31SStefan Hajnoczi * Align write if necessary by performing a read-modify-write cycle. 179361007b31SStefan Hajnoczi * Pad qiov with the read parts and be sure to have a tracked request not 179461007b31SStefan Hajnoczi * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
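 *
 * As an illustration (assumed values): with 512-byte alignment, a
 * 2000-byte write at offset 1124 grows into a 2560-byte write at
 * offset 1024, the leading 100 and trailing 460 bytes being filled
 * from reads of the partially covered blocks.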
179561007b31SStefan Hajnoczi */ 1796ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 179761007b31SStefan Hajnoczi 179818a59f03SAnton Nefedov if (flags & BDRV_REQ_ZERO_WRITE) { 179985c97ca7SKevin Wolf ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 18009eeb6dd1SFam Zheng goto out; 18019eeb6dd1SFam Zheng } 18029eeb6dd1SFam Zheng 180361007b31SStefan Hajnoczi if (offset & (align - 1)) { 180461007b31SStefan Hajnoczi QEMUIOVector head_qiov; 180561007b31SStefan Hajnoczi struct iovec head_iov; 180661007b31SStefan Hajnoczi 180761007b31SStefan Hajnoczi mark_request_serialising(&req, align); 180861007b31SStefan Hajnoczi wait_serialising_requests(&req); 180961007b31SStefan Hajnoczi 181061007b31SStefan Hajnoczi head_buf = qemu_blockalign(bs, align); 181161007b31SStefan Hajnoczi head_iov = (struct iovec) { 181261007b31SStefan Hajnoczi .iov_base = head_buf, 181361007b31SStefan Hajnoczi .iov_len = align, 181461007b31SStefan Hajnoczi }; 181561007b31SStefan Hajnoczi qemu_iovec_init_external(&head_qiov, &head_iov, 1); 181661007b31SStefan Hajnoczi 18179a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 181885c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align, 181961007b31SStefan Hajnoczi align, &head_qiov, 0); 182061007b31SStefan Hajnoczi if (ret < 0) { 182161007b31SStefan Hajnoczi goto fail; 182261007b31SStefan Hajnoczi } 18239a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 182461007b31SStefan Hajnoczi 182561007b31SStefan Hajnoczi qemu_iovec_init(&local_qiov, qiov->niov + 2); 182661007b31SStefan Hajnoczi qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 182761007b31SStefan Hajnoczi qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 182861007b31SStefan Hajnoczi use_local_qiov = true; 182961007b31SStefan Hajnoczi 183061007b31SStefan Hajnoczi bytes += offset & (align - 1); 183161007b31SStefan Hajnoczi offset = offset & ~(align - 1); 1832117bc3faSPeter Lieven 1833117bc3faSPeter Lieven /* We have read the tail already if the request is smaller 1834117bc3faSPeter Lieven * than one aligned block. 
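 * (E.g., assuming 512-byte alignment, a 200-byte write at offset 100
 * becomes a single 512-byte RMW of the first block: the head read
 * already holds bytes 300..511, so they are appended from head_buf
 * instead of issuing a second read.)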
1835117bc3faSPeter Lieven */ 1836117bc3faSPeter Lieven if (bytes < align) { 1837117bc3faSPeter Lieven qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes); 1838117bc3faSPeter Lieven bytes = align; 1839117bc3faSPeter Lieven } 184061007b31SStefan Hajnoczi } 184161007b31SStefan Hajnoczi 184261007b31SStefan Hajnoczi if ((offset + bytes) & (align - 1)) { 184361007b31SStefan Hajnoczi QEMUIOVector tail_qiov; 184461007b31SStefan Hajnoczi struct iovec tail_iov; 184561007b31SStefan Hajnoczi size_t tail_bytes; 184661007b31SStefan Hajnoczi bool waited; 184761007b31SStefan Hajnoczi 184861007b31SStefan Hajnoczi mark_request_serialising(&req, align); 184961007b31SStefan Hajnoczi waited = wait_serialising_requests(&req); 185061007b31SStefan Hajnoczi assert(!waited || !use_local_qiov); 185161007b31SStefan Hajnoczi 185261007b31SStefan Hajnoczi tail_buf = qemu_blockalign(bs, align); 185361007b31SStefan Hajnoczi tail_iov = (struct iovec) { 185461007b31SStefan Hajnoczi .iov_base = tail_buf, 185561007b31SStefan Hajnoczi .iov_len = align, 185661007b31SStefan Hajnoczi }; 185761007b31SStefan Hajnoczi qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); 185861007b31SStefan Hajnoczi 18599a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 186085c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1), 186185c97ca7SKevin Wolf align, align, &tail_qiov, 0); 186261007b31SStefan Hajnoczi if (ret < 0) { 186361007b31SStefan Hajnoczi goto fail; 186461007b31SStefan Hajnoczi } 18659a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 186661007b31SStefan Hajnoczi 186761007b31SStefan Hajnoczi if (!use_local_qiov) { 186861007b31SStefan Hajnoczi qemu_iovec_init(&local_qiov, qiov->niov + 1); 186961007b31SStefan Hajnoczi qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 187061007b31SStefan Hajnoczi use_local_qiov = true; 187161007b31SStefan Hajnoczi } 187261007b31SStefan Hajnoczi 187361007b31SStefan Hajnoczi tail_bytes = (offset + bytes) & (align - 1); 187461007b31SStefan Hajnoczi qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 187561007b31SStefan Hajnoczi 187661007b31SStefan Hajnoczi bytes = ROUND_UP(bytes, align); 187761007b31SStefan Hajnoczi } 187861007b31SStefan Hajnoczi 187985c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 188061007b31SStefan Hajnoczi use_local_qiov ? 
&local_qiov : qiov, 188161007b31SStefan Hajnoczi flags); 188261007b31SStefan Hajnoczi 188361007b31SStefan Hajnoczi fail: 188461007b31SStefan Hajnoczi 188561007b31SStefan Hajnoczi if (use_local_qiov) { 188661007b31SStefan Hajnoczi qemu_iovec_destroy(&local_qiov); 188761007b31SStefan Hajnoczi } 188861007b31SStefan Hajnoczi qemu_vfree(head_buf); 188961007b31SStefan Hajnoczi qemu_vfree(tail_buf); 18909eeb6dd1SFam Zheng out: 18919eeb6dd1SFam Zheng tracked_request_end(&req); 189299723548SPaolo Bonzini bdrv_dec_in_flight(bs); 189361007b31SStefan Hajnoczi return ret; 189461007b31SStefan Hajnoczi } 189561007b31SStefan Hajnoczi 1896a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 1897f5a5ca79SManos Pitsidianakis int bytes, BdrvRequestFlags flags) 189861007b31SStefan Hajnoczi { 1899f5a5ca79SManos Pitsidianakis trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 190061007b31SStefan Hajnoczi 1901a03ef88fSKevin Wolf if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 190261007b31SStefan Hajnoczi flags &= ~BDRV_REQ_MAY_UNMAP; 190361007b31SStefan Hajnoczi } 190461007b31SStefan Hajnoczi 1905f5a5ca79SManos Pitsidianakis return bdrv_co_pwritev(child, offset, bytes, NULL, 190661007b31SStefan Hajnoczi BDRV_REQ_ZERO_WRITE | flags); 190761007b31SStefan Hajnoczi } 190861007b31SStefan Hajnoczi 19094085f5c7SJohn Snow /* 19104085f5c7SJohn Snow * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not. 19114085f5c7SJohn Snow */ 19124085f5c7SJohn Snow int bdrv_flush_all(void) 19134085f5c7SJohn Snow { 19144085f5c7SJohn Snow BdrvNextIterator it; 19154085f5c7SJohn Snow BlockDriverState *bs = NULL; 19164085f5c7SJohn Snow int result = 0; 19174085f5c7SJohn Snow 19184085f5c7SJohn Snow for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 19194085f5c7SJohn Snow AioContext *aio_context = bdrv_get_aio_context(bs); 19204085f5c7SJohn Snow int ret; 19214085f5c7SJohn Snow 19224085f5c7SJohn Snow aio_context_acquire(aio_context); 19234085f5c7SJohn Snow ret = bdrv_flush(bs); 19244085f5c7SJohn Snow if (ret < 0 && !result) { 19254085f5c7SJohn Snow result = ret; 19264085f5c7SJohn Snow } 19274085f5c7SJohn Snow aio_context_release(aio_context); 19284085f5c7SJohn Snow } 19294085f5c7SJohn Snow 19304085f5c7SJohn Snow return result; 19314085f5c7SJohn Snow } 19324085f5c7SJohn Snow 19334085f5c7SJohn Snow 19344bcd936eSEric Blake typedef struct BdrvCoBlockStatusData { 193561007b31SStefan Hajnoczi BlockDriverState *bs; 193661007b31SStefan Hajnoczi BlockDriverState *base; 1937c9ce8c4dSEric Blake bool want_zero; 19384bcd936eSEric Blake int64_t offset; 19394bcd936eSEric Blake int64_t bytes; 19404bcd936eSEric Blake int64_t *pnum; 19414bcd936eSEric Blake int64_t *map; 1942c9ce8c4dSEric Blake BlockDriverState **file; 19434bcd936eSEric Blake int ret; 194461007b31SStefan Hajnoczi bool done; 19454bcd936eSEric Blake } BdrvCoBlockStatusData; 194661007b31SStefan Hajnoczi 19473e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, 19483e4d0e72SEric Blake bool want_zero, 19493e4d0e72SEric Blake int64_t offset, 19503e4d0e72SEric Blake int64_t bytes, 19513e4d0e72SEric Blake int64_t *pnum, 19523e4d0e72SEric Blake int64_t *map, 1953f7cc69b3SManos Pitsidianakis BlockDriverState **file) 1954f7cc69b3SManos Pitsidianakis { 1955f7cc69b3SManos Pitsidianakis assert(bs->file && bs->file->bs); 19563e4d0e72SEric Blake *pnum = bytes; 19573e4d0e72SEric Blake *map = offset; 1958f7cc69b3SManos Pitsidianakis *file = bs->file->bs; 19593e4d0e72SEric Blake return BDRV_BLOCK_RAW | 
BDRV_BLOCK_OFFSET_VALID; 1960f7cc69b3SManos Pitsidianakis } 1961f7cc69b3SManos Pitsidianakis 19623e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, 19633e4d0e72SEric Blake bool want_zero, 19643e4d0e72SEric Blake int64_t offset, 19653e4d0e72SEric Blake int64_t bytes, 19663e4d0e72SEric Blake int64_t *pnum, 19673e4d0e72SEric Blake int64_t *map, 1968f7cc69b3SManos Pitsidianakis BlockDriverState **file) 1969f7cc69b3SManos Pitsidianakis { 1970f7cc69b3SManos Pitsidianakis assert(bs->backing && bs->backing->bs); 19713e4d0e72SEric Blake *pnum = bytes; 19723e4d0e72SEric Blake *map = offset; 1973f7cc69b3SManos Pitsidianakis *file = bs->backing->bs; 19743e4d0e72SEric Blake return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 1975f7cc69b3SManos Pitsidianakis } 1976f7cc69b3SManos Pitsidianakis 197761007b31SStefan Hajnoczi /* 197861007b31SStefan Hajnoczi * Returns the allocation status of the specified sectors. 197961007b31SStefan Hajnoczi * Drivers not implementing the functionality are assumed to not support 198061007b31SStefan Hajnoczi * backing files, hence all their sectors are reported as allocated. 198161007b31SStefan Hajnoczi * 198286a3d5c6SEric Blake * If 'want_zero' is true, the caller is querying for mapping 198386a3d5c6SEric Blake * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 198486a3d5c6SEric Blake * _ZERO where possible; otherwise, the result favors larger 'pnum', 198586a3d5c6SEric Blake * with a focus on accurate BDRV_BLOCK_ALLOCATED. 1986c9ce8c4dSEric Blake * 19872e8bc787SEric Blake * If 'offset' is beyond the end of the disk image the return value is 1988fb0d8654SEric Blake * BDRV_BLOCK_EOF and 'pnum' is set to 0. 198961007b31SStefan Hajnoczi * 19902e8bc787SEric Blake * 'bytes' is the max value 'pnum' should be set to. If bytes goes 1991fb0d8654SEric Blake * beyond the end of the disk image it will be clamped; if 'pnum' is set to 1992fb0d8654SEric Blake * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 199367a0fd2aSFam Zheng * 19942e8bc787SEric Blake * 'pnum' is set to the number of bytes (including and immediately 19952e8bc787SEric Blake * following the specified offset) that are easily known to be in the 19962e8bc787SEric Blake * same allocated/unallocated state. Note that a second call starting 19972e8bc787SEric Blake * at the original offset plus returned pnum may have the same status. 19982e8bc787SEric Blake * The returned value is non-zero on success except at end-of-file. 19992e8bc787SEric Blake * 20002e8bc787SEric Blake * Returns negative errno on failure. Otherwise, if the 20012e8bc787SEric Blake * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 20022e8bc787SEric Blake * set to the host mapping and BDS corresponding to the guest offset. 
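 *
 * For example, a protocol driver that implements no
 * bdrv_co_block_status callback reports the whole request as
 * BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED and, because guest and host
 * offsets coincide there, adds BDRV_BLOCK_OFFSET_VALID with
 * *map == offset and *file == bs (see the fallback below).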
200361007b31SStefan Hajnoczi */ 20042e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2005c9ce8c4dSEric Blake bool want_zero, 20062e8bc787SEric Blake int64_t offset, int64_t bytes, 20072e8bc787SEric Blake int64_t *pnum, int64_t *map, 200867a0fd2aSFam Zheng BlockDriverState **file) 200961007b31SStefan Hajnoczi { 20102e8bc787SEric Blake int64_t total_size; 20112e8bc787SEric Blake int64_t n; /* bytes */ 2012efa6e2edSEric Blake int ret; 20132e8bc787SEric Blake int64_t local_map = 0; 2014298a1665SEric Blake BlockDriverState *local_file = NULL; 2015efa6e2edSEric Blake int64_t aligned_offset, aligned_bytes; 2016efa6e2edSEric Blake uint32_t align; 201761007b31SStefan Hajnoczi 2018298a1665SEric Blake assert(pnum); 2019298a1665SEric Blake *pnum = 0; 20202e8bc787SEric Blake total_size = bdrv_getlength(bs); 20212e8bc787SEric Blake if (total_size < 0) { 20222e8bc787SEric Blake ret = total_size; 2023298a1665SEric Blake goto early_out; 202461007b31SStefan Hajnoczi } 202561007b31SStefan Hajnoczi 20262e8bc787SEric Blake if (offset >= total_size) { 2027298a1665SEric Blake ret = BDRV_BLOCK_EOF; 2028298a1665SEric Blake goto early_out; 202961007b31SStefan Hajnoczi } 20302e8bc787SEric Blake if (!bytes) { 2031298a1665SEric Blake ret = 0; 2032298a1665SEric Blake goto early_out; 20339cdcfd9fSEric Blake } 203461007b31SStefan Hajnoczi 20352e8bc787SEric Blake n = total_size - offset; 20362e8bc787SEric Blake if (n < bytes) { 20372e8bc787SEric Blake bytes = n; 203861007b31SStefan Hajnoczi } 203961007b31SStefan Hajnoczi 2040d470ad42SMax Reitz /* Must be non-NULL or bdrv_getlength() would have failed */ 2041d470ad42SMax Reitz assert(bs->drv); 2042636cb512SEric Blake if (!bs->drv->bdrv_co_block_status) { 20432e8bc787SEric Blake *pnum = bytes; 204461007b31SStefan Hajnoczi ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 20452e8bc787SEric Blake if (offset + bytes == total_size) { 2046fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2047fb0d8654SEric Blake } 204861007b31SStefan Hajnoczi if (bs->drv->protocol_name) { 20492e8bc787SEric Blake ret |= BDRV_BLOCK_OFFSET_VALID; 20502e8bc787SEric Blake local_map = offset; 2051298a1665SEric Blake local_file = bs; 205261007b31SStefan Hajnoczi } 2053298a1665SEric Blake goto early_out; 205461007b31SStefan Hajnoczi } 205561007b31SStefan Hajnoczi 205699723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2057efa6e2edSEric Blake 2058efa6e2edSEric Blake /* Round out to request_alignment boundaries */ 205986a3d5c6SEric Blake align = bs->bl.request_alignment; 2060efa6e2edSEric Blake aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2061efa6e2edSEric Blake aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2062efa6e2edSEric Blake 206386a3d5c6SEric Blake ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 206486a3d5c6SEric Blake aligned_bytes, pnum, &local_map, 206586a3d5c6SEric Blake &local_file); 206686a3d5c6SEric Blake if (ret < 0) { 206786a3d5c6SEric Blake *pnum = 0; 206886a3d5c6SEric Blake goto out; 206986a3d5c6SEric Blake } 2070efa6e2edSEric Blake 2071efa6e2edSEric Blake /* 2072636cb512SEric Blake * The driver's result must be a non-zero multiple of request_alignment. 2073efa6e2edSEric Blake * Clamp pnum and adjust map to original request. 
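 * E.g. (illustrative): a query at offset 1000 with 512-byte
 * request_alignment is sent to the driver at aligned_offset 512;
 * afterwards *pnum is reduced by the 488-byte shift, clamped to the
 * caller's bytes, and local_map is advanced by those same 488 bytes.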
2074efa6e2edSEric Blake */ 2075636cb512SEric Blake assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2076636cb512SEric Blake align > offset - aligned_offset); 2077efa6e2edSEric Blake *pnum -= offset - aligned_offset; 2078efa6e2edSEric Blake if (*pnum > bytes) { 2079efa6e2edSEric Blake *pnum = bytes; 2080efa6e2edSEric Blake } 2081efa6e2edSEric Blake if (ret & BDRV_BLOCK_OFFSET_VALID) { 2082efa6e2edSEric Blake local_map += offset - aligned_offset; 2083efa6e2edSEric Blake } 208461007b31SStefan Hajnoczi 208561007b31SStefan Hajnoczi if (ret & BDRV_BLOCK_RAW) { 2086298a1665SEric Blake assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 20872e8bc787SEric Blake ret = bdrv_co_block_status(local_file, want_zero, local_map, 20882e8bc787SEric Blake *pnum, pnum, &local_map, &local_file); 208999723548SPaolo Bonzini goto out; 209061007b31SStefan Hajnoczi } 209161007b31SStefan Hajnoczi 209261007b31SStefan Hajnoczi if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 209361007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ALLOCATED; 2094c9ce8c4dSEric Blake } else if (want_zero) { 209561007b31SStefan Hajnoczi if (bdrv_unallocated_blocks_are_zero(bs)) { 209661007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 2097760e0063SKevin Wolf } else if (bs->backing) { 2098760e0063SKevin Wolf BlockDriverState *bs2 = bs->backing->bs; 20992e8bc787SEric Blake int64_t size2 = bdrv_getlength(bs2); 2100c9ce8c4dSEric Blake 21012e8bc787SEric Blake if (size2 >= 0 && offset >= size2) { 210261007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 210361007b31SStefan Hajnoczi } 210461007b31SStefan Hajnoczi } 210561007b31SStefan Hajnoczi } 210661007b31SStefan Hajnoczi 2107c9ce8c4dSEric Blake if (want_zero && local_file && local_file != bs && 210861007b31SStefan Hajnoczi (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 210961007b31SStefan Hajnoczi (ret & BDRV_BLOCK_OFFSET_VALID)) { 21102e8bc787SEric Blake int64_t file_pnum; 21112e8bc787SEric Blake int ret2; 211261007b31SStefan Hajnoczi 21132e8bc787SEric Blake ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 21142e8bc787SEric Blake *pnum, &file_pnum, NULL, NULL); 211561007b31SStefan Hajnoczi if (ret2 >= 0) { 211661007b31SStefan Hajnoczi /* Ignore errors. This is just providing extra information, it 211761007b31SStefan Hajnoczi * is useful but not necessary. 211861007b31SStefan Hajnoczi */ 2119c61e684eSEric Blake if (ret2 & BDRV_BLOCK_EOF && 2120c61e684eSEric Blake (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2121c61e684eSEric Blake /* 2122c61e684eSEric Blake * It is valid for the format block driver to read 2123c61e684eSEric Blake * beyond the end of the underlying file's current 2124c61e684eSEric Blake * size; such areas read as zero. 
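 * (One way this can happen, for instance, is a qcow2 image created
 * with metadata preallocation, where clusters may map to host offsets
 * beyond the data file's current end.)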
2125c61e684eSEric Blake */ 212661007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 212761007b31SStefan Hajnoczi } else { 212861007b31SStefan Hajnoczi /* Limit request to the range reported by the protocol driver */ 212961007b31SStefan Hajnoczi *pnum = file_pnum; 213061007b31SStefan Hajnoczi ret |= (ret2 & BDRV_BLOCK_ZERO); 213161007b31SStefan Hajnoczi } 213261007b31SStefan Hajnoczi } 213361007b31SStefan Hajnoczi } 213461007b31SStefan Hajnoczi 213599723548SPaolo Bonzini out: 213699723548SPaolo Bonzini bdrv_dec_in_flight(bs); 21372e8bc787SEric Blake if (ret >= 0 && offset + *pnum == total_size) { 2138fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2139fb0d8654SEric Blake } 2140298a1665SEric Blake early_out: 2141298a1665SEric Blake if (file) { 2142298a1665SEric Blake *file = local_file; 2143298a1665SEric Blake } 21442e8bc787SEric Blake if (map) { 21452e8bc787SEric Blake *map = local_map; 21462e8bc787SEric Blake } 214761007b31SStefan Hajnoczi return ret; 214861007b31SStefan Hajnoczi } 214961007b31SStefan Hajnoczi 21505b648c67SEric Blake static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2151ba3f0e25SFam Zheng BlockDriverState *base, 2152c9ce8c4dSEric Blake bool want_zero, 21535b648c67SEric Blake int64_t offset, 21545b648c67SEric Blake int64_t bytes, 21555b648c67SEric Blake int64_t *pnum, 21565b648c67SEric Blake int64_t *map, 215767a0fd2aSFam Zheng BlockDriverState **file) 2158ba3f0e25SFam Zheng { 2159ba3f0e25SFam Zheng BlockDriverState *p; 21605b648c67SEric Blake int ret = 0; 2161c61e684eSEric Blake bool first = true; 2162ba3f0e25SFam Zheng 2163ba3f0e25SFam Zheng assert(bs != base); 2164760e0063SKevin Wolf for (p = bs; p != base; p = backing_bs(p)) { 21655b648c67SEric Blake ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 21665b648c67SEric Blake file); 2167c61e684eSEric Blake if (ret < 0) { 2168c61e684eSEric Blake break; 2169c61e684eSEric Blake } 2170c61e684eSEric Blake if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2171c61e684eSEric Blake /* 2172c61e684eSEric Blake * Reading beyond the end of the file continues to read 2173c61e684eSEric Blake * zeroes, but we can only widen the result to the 2174c61e684eSEric Blake * unallocated length we learned from an earlier 2175c61e684eSEric Blake * iteration. 2176c61e684eSEric Blake */ 21775b648c67SEric Blake *pnum = bytes; 2178c61e684eSEric Blake } 2179c61e684eSEric Blake if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2180ba3f0e25SFam Zheng break; 2181ba3f0e25SFam Zheng } 21825b648c67SEric Blake /* [offset, pnum] unallocated on this layer, which could be only 21835b648c67SEric Blake * the first part of [offset, bytes]. 
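 * For example, if the caller asked about 128k but the active layer
 * reports only its first 64k as unallocated, the next iteration
 * queries the backing file for just those 64k (illustrative sizes).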
*/ 21845b648c67SEric Blake bytes = MIN(bytes, *pnum); 2185c61e684eSEric Blake first = false; 2186ba3f0e25SFam Zheng } 2187ba3f0e25SFam Zheng return ret; 2188ba3f0e25SFam Zheng } 2189ba3f0e25SFam Zheng 219031826642SEric Blake /* Coroutine wrapper for bdrv_block_status_above() */ 21915b648c67SEric Blake static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 219261007b31SStefan Hajnoczi { 21934bcd936eSEric Blake BdrvCoBlockStatusData *data = opaque; 219461007b31SStefan Hajnoczi 21955b648c67SEric Blake data->ret = bdrv_co_block_status_above(data->bs, data->base, 2196c9ce8c4dSEric Blake data->want_zero, 21975b648c67SEric Blake data->offset, data->bytes, 21985b648c67SEric Blake data->pnum, data->map, data->file); 219961007b31SStefan Hajnoczi data->done = true; 220061007b31SStefan Hajnoczi } 220161007b31SStefan Hajnoczi 220261007b31SStefan Hajnoczi /* 22035b648c67SEric Blake * Synchronous wrapper around bdrv_co_block_status_above(). 220461007b31SStefan Hajnoczi * 22055b648c67SEric Blake * See bdrv_co_block_status_above() for details. 220661007b31SStefan Hajnoczi */ 22077ddb99b9SEric Blake static int bdrv_common_block_status_above(BlockDriverState *bs, 2208ba3f0e25SFam Zheng BlockDriverState *base, 22097ddb99b9SEric Blake bool want_zero, int64_t offset, 22107ddb99b9SEric Blake int64_t bytes, int64_t *pnum, 22117ddb99b9SEric Blake int64_t *map, 221267a0fd2aSFam Zheng BlockDriverState **file) 221361007b31SStefan Hajnoczi { 221461007b31SStefan Hajnoczi Coroutine *co; 22154bcd936eSEric Blake BdrvCoBlockStatusData data = { 221661007b31SStefan Hajnoczi .bs = bs, 2217ba3f0e25SFam Zheng .base = base, 2218c9ce8c4dSEric Blake .want_zero = want_zero, 22197ddb99b9SEric Blake .offset = offset, 22207ddb99b9SEric Blake .bytes = bytes, 22217ddb99b9SEric Blake .pnum = pnum, 22227ddb99b9SEric Blake .map = map, 2223c9ce8c4dSEric Blake .file = file, 222461007b31SStefan Hajnoczi .done = false, 222561007b31SStefan Hajnoczi }; 222661007b31SStefan Hajnoczi 222761007b31SStefan Hajnoczi if (qemu_in_coroutine()) { 222861007b31SStefan Hajnoczi /* Fast-path if already in coroutine context */ 22295b648c67SEric Blake bdrv_block_status_above_co_entry(&data); 223061007b31SStefan Hajnoczi } else { 22315b648c67SEric Blake co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data); 2232e92f0e19SFam Zheng bdrv_coroutine_enter(bs, co); 223388b062c2SPaolo Bonzini BDRV_POLL_WHILE(bs, !data.done); 223461007b31SStefan Hajnoczi } 223561007b31SStefan Hajnoczi return data.ret; 223661007b31SStefan Hajnoczi } 223761007b31SStefan Hajnoczi 223831826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 223931826642SEric Blake int64_t offset, int64_t bytes, int64_t *pnum, 224031826642SEric Blake int64_t *map, BlockDriverState **file) 2241c9ce8c4dSEric Blake { 224231826642SEric Blake return bdrv_common_block_status_above(bs, base, true, offset, bytes, 224331826642SEric Blake pnum, map, file); 2244c9ce8c4dSEric Blake } 2245c9ce8c4dSEric Blake 2246237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2247237d78f8SEric Blake int64_t *pnum, int64_t *map, BlockDriverState **file) 2248ba3f0e25SFam Zheng { 224931826642SEric Blake return bdrv_block_status_above(bs, backing_bs(bs), 225031826642SEric Blake offset, bytes, pnum, map, file); 2251ba3f0e25SFam Zheng } 2252ba3f0e25SFam Zheng 2253d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2254d6a644bbSEric Blake int64_t bytes, int64_t *pnum) 225561007b31SStefan 
Hajnoczi { 22567ddb99b9SEric Blake int ret; 22577ddb99b9SEric Blake int64_t dummy; 2258d6a644bbSEric Blake 22597ddb99b9SEric Blake ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, 22607ddb99b9SEric Blake bytes, pnum ? pnum : &dummy, NULL, 2261298a1665SEric Blake NULL); 226261007b31SStefan Hajnoczi if (ret < 0) { 226361007b31SStefan Hajnoczi return ret; 226461007b31SStefan Hajnoczi } 226561007b31SStefan Hajnoczi return !!(ret & BDRV_BLOCK_ALLOCATED); 226661007b31SStefan Hajnoczi } 226761007b31SStefan Hajnoczi 226861007b31SStefan Hajnoczi /* 226961007b31SStefan Hajnoczi * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 227061007b31SStefan Hajnoczi * 227151b0a488SEric Blake * Return true if (a prefix of) the given range is allocated in any image 227251b0a488SEric Blake * between BASE and TOP (inclusive). BASE can be NULL to check if the given 227351b0a488SEric Blake * offset is allocated in any image of the chain. Return false otherwise, 2274d6a644bbSEric Blake * or negative errno on failure. 227561007b31SStefan Hajnoczi * 227651b0a488SEric Blake * 'pnum' is set to the number of bytes (including and immediately 227751b0a488SEric Blake * following the specified offset) that are known to be in the same 227851b0a488SEric Blake * allocated/unallocated state. Note that a subsequent call starting 227951b0a488SEric Blake * at 'offset + *pnum' may return the same allocation status (in other 228051b0a488SEric Blake * words, the result is not necessarily the maximum possible range); 228151b0a488SEric Blake * but 'pnum' will only be 0 when end of file is reached. 228261007b31SStefan Hajnoczi * 228361007b31SStefan Hajnoczi */ 228461007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top, 228561007b31SStefan Hajnoczi BlockDriverState *base, 228651b0a488SEric Blake int64_t offset, int64_t bytes, int64_t *pnum) 228761007b31SStefan Hajnoczi { 228861007b31SStefan Hajnoczi BlockDriverState *intermediate; 228951b0a488SEric Blake int ret; 229051b0a488SEric Blake int64_t n = bytes; 229161007b31SStefan Hajnoczi 229261007b31SStefan Hajnoczi intermediate = top; 229361007b31SStefan Hajnoczi while (intermediate && intermediate != base) { 2294d6a644bbSEric Blake int64_t pnum_inter; 2295c00716beSEric Blake int64_t size_inter; 2296d6a644bbSEric Blake 229751b0a488SEric Blake ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 229861007b31SStefan Hajnoczi if (ret < 0) { 229961007b31SStefan Hajnoczi return ret; 2300d6a644bbSEric Blake } 2301d6a644bbSEric Blake if (ret) { 230251b0a488SEric Blake *pnum = pnum_inter; 230361007b31SStefan Hajnoczi return 1; 230461007b31SStefan Hajnoczi } 230561007b31SStefan Hajnoczi 230651b0a488SEric Blake size_inter = bdrv_getlength(intermediate); 2307c00716beSEric Blake if (size_inter < 0) { 2308c00716beSEric Blake return size_inter; 2309c00716beSEric Blake } 231051b0a488SEric Blake if (n > pnum_inter && 231151b0a488SEric Blake (intermediate == top || offset + pnum_inter < size_inter)) { 231251b0a488SEric Blake n = pnum_inter; 231361007b31SStefan Hajnoczi } 231461007b31SStefan Hajnoczi 2315760e0063SKevin Wolf intermediate = backing_bs(intermediate); 231661007b31SStefan Hajnoczi } 231761007b31SStefan Hajnoczi 231861007b31SStefan Hajnoczi *pnum = n; 231961007b31SStefan Hajnoczi return 0; 232061007b31SStefan Hajnoczi } 232161007b31SStefan Hajnoczi 23221a8ae822SKevin Wolf typedef struct BdrvVmstateCo { 23231a8ae822SKevin Wolf BlockDriverState *bs; 23241a8ae822SKevin Wolf QEMUIOVector *qiov; 23251a8ae822SKevin Wolf int64_t 
typedef struct BdrvVmstateCo {
    BlockDriverState *bs;
    QEMUIOVector *qiov;
    int64_t pos;
    bool is_read;
    int ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
    int ret = -ENOTSUP;

    bdrv_inc_in_flight(bs);

    if (!drv) {
        ret = -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
    } else if (bs->file) {
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    bdrv_dec_in_flight(bs);
    return ret;
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs = bs,
            .qiov = qiov,
            .pos = pos,
            .is_read = is_read,
            .ret = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
        return data.ret;
    }
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}
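
/*
 * Usage sketch (hypothetical helper, not in the original file): the
 * vmstate helpers address a separate per-image state area, so a blob
 * written at byte offset 0 can be read back from the same offset.
 */
static int __attribute__((unused))
example_vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[512] = { 0x42 };
    uint8_t in[512];
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
    if (ret < 0) {
        return ret; /* negative errno */
    }
    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    if (ret < 0) {
        return ret;
    }
    return memcmp(in, out, sizeof(out)) == 0 ? 0 : -EIO;
}
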
/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel.  The caller is not blocked if the acb
 * implements cancel_async; otherwise we do nothing and let the request
 * complete normally.  In either case the completion callback must be called.
 */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}
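
/*
 * Usage sketch (hypothetical helper): from code that must remain
 * thread-safe, only the async variant may be used; the blocking variant
 * polls the AioContext until the completion callback has run.
 */
static void __attribute__((unused))
example_cancel(BlockAIOCB *acb, bool may_block)
{
    if (may_block) {
        bdrv_aio_cancel(acb);       /* returns after the callback ran */
    } else {
        bdrv_aio_cancel_async(acb); /* request may still complete normally */
    }
}
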
/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and therefore don't support bdrv_flush.  Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk.  Returning an error doesn't work
         * because that would break guests even if the server operates in
         * writethrough mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * set in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}
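
/*
 * Sketch (hypothetical helper): for drivers without a combined
 * .bdrv_co_flush callback, the write_gen/flushed_gen pair above makes a
 * redundant flush cheap -- the second call sees flushed_gen == current_gen
 * and skips the flush-to-disk step, only forwarding to the protocol layer.
 */
static int __attribute__((unused))
example_flush_twice(BlockDriverState *bs)
{
    int ret = bdrv_flush(bs);   /* pays the full cost */

    if (ret < 0) {
        return ret;
    }
    return bdrv_flush(bs);      /* no new writes: fast path */
}
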
typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int bytes;
    int ret;
} DiscardCo;

static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
}

int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled. */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, req.offset, req.bytes);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
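
/*
 * Worked example for the head/tail arithmetic above (values chosen for
 * illustration): with request_alignment = 512 and pdiscard_alignment =
 * 4096, a discard at offset 1536 of 10752 bytes gives align = 4096,
 * head = 1536 and tail = 0.  The first fragment is then 2560 bytes,
 * which reaches the 4096 boundary; the remaining 8192 bytes go down in
 * aligned chunks.
 */
static void __attribute__((unused))
example_discard_alignment_math(void)
{
    const int64_t align = 4096;
    const int64_t offset = 1536;
    const int64_t bytes = 10752;

    assert(offset % align == 1536);                      /* head */
    assert((offset + bytes) % align == 0);               /* tail */
    assert(MIN(bytes, align - offset % align) == 2560);  /* first fragment */
}
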
int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}
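
/*
 * Usage sketch (hypothetical helper): discarding a whole device.  The
 * request is advisory; with BDRV_O_UNMAP clear or an -ENOTSUP driver it
 * silently becomes a no-op.
 */
static int __attribute__((unused))
example_discard_all(BlockDriverState *bs)
{
    int64_t len = bdrv_getlength(bs);

    if (len < 0) {
        return len;
    }
    /* bdrv_pdiscard takes an int byte count, so clamp for this sketch */
    return bdrv_pdiscard(bs, 0, MIN(len, INT_MAX));
}
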
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}
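
/*
 * Sketch (hypothetical helper): a single-element vector built from a
 * qemu_blockalign() buffer passes bdrv_qiov_is_aligned(), provided the
 * length is a multiple of the minimum alignment and the driver's optimal
 * alignment is at least its minimum one, which is the usual case.
 */
static bool __attribute__((unused))
example_qiov_alignment_check(BlockDriverState *bs, size_t size)
{
    QEMUIOVector qiov;
    struct iovec iov;
    void *buf = qemu_blockalign(bs, size);
    bool ok;

    iov = (struct iovec) {
        .iov_base = buf,
        .iov_len = size,
    };
    qemu_iovec_init_external(&qiov, &iov, 1);
    ok = bdrv_qiov_is_aligned(bs, &qiov);
    qemu_vfree(buf);
    return ok;
}
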
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}

void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_register_buf) {
        bs->drv->bdrv_register_buf(bs, host, size);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_register_buf(child->bs, host, size);
    }
}

void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_unregister_buf) {
        bs->drv->bdrv_unregister_buf(bs, host);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_unregister_buf(child->bs, host);
    }
}
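
/*
 * Usage sketch (hypothetical helper): plug/unplug calls nest per
 * BlockDriverState, so only the outermost unplug lets a driver such as
 * the Linux AIO backend submit the whole queued batch at once.
 */
static void __attribute__((unused))
example_batched_submission(BlockDriverState *bs)
{
    bdrv_io_plug(bs);
    /* ... issue several asynchronous requests here ... */
    bdrv_io_unplug(bs); /* driver may now submit them in one go */
}
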
static int coroutine_fn bdrv_co_copy_range_internal(
        BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
        uint64_t dst_offset, uint64_t bytes,
        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
        bool recurse_src)
{
    BdrvTrackedRequest req;
    int ret;

    if (!dst || !dst->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
    if (ret) {
        return ret;
    }
    if (write_flags & BDRV_REQ_ZERO_WRITE) {
        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
    }

    if (!src || !src->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
    if (ret) {
        return ret;
    }

    if (!src->bs->drv->bdrv_co_copy_range_from
        || !dst->bs->drv->bdrv_co_copy_range_to
        || src->bs->encrypted || dst->bs->encrypted) {
        return -ENOTSUP;
    }

    if (recurse_src) {
        bdrv_inc_in_flight(src->bs);
        tracked_request_begin(&req, src->bs, src_offset, bytes,
                              BDRV_TRACKED_READ);

        if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
            wait_serialising_requests(&req);
        }

        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                    src, src_offset,
                                                    dst, dst_offset,
                                                    bytes,
                                                    read_flags, write_flags);

        tracked_request_end(&req);
        bdrv_dec_in_flight(src->bs);
    } else {
        bdrv_inc_in_flight(dst->bs);
        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
                              BDRV_TRACKED_WRITE);

        /* BDRV_REQ_NO_SERIALISING is only for read operation */
        assert(!(write_flags & BDRV_REQ_NO_SERIALISING));
        wait_serialising_requests(&req);

        ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
                                                  src, src_offset,
                                                  dst, dst_offset,
                                                  bytes,
                                                  read_flags, write_flags);
        tracked_request_end(&req);
        bdrv_dec_in_flight(dst->bs);
    }

    return ret;
}

/* Copy range from @src to @dst.
 *
 * See the comment on bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
                                         BdrvChild *dst, uint64_t dst_offset,
                                         uint64_t bytes,
                                         BdrvRequestFlags read_flags,
                                         BdrvRequestFlags write_flags)
{
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, read_flags, write_flags, true);
}

/* Copy range from @src to @dst.
 *
 * See the comment on bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
                                       BdrvChild *dst, uint64_t dst_offset,
                                       uint64_t bytes,
                                       BdrvRequestFlags read_flags,
                                       BdrvRequestFlags write_flags)
{
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, read_flags, write_flags, false);
}

int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
                                    BdrvChild *dst, uint64_t dst_offset,
                                    uint64_t bytes, BdrvRequestFlags read_flags,
                                    BdrvRequestFlags write_flags)
{
    return bdrv_co_copy_range_from(src, src_offset,
                                   dst, dst_offset,
                                   bytes, read_flags, write_flags);
}

static void bdrv_parent_cb_resize(BlockDriverState *bs)
{
    BdrvChild *c;
    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->role->resize) {
            c->role->resize(c);
        }
    }
}
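
/*
 * Usage sketch (hypothetical helper): offloaded copy between two
 * children.  On -ENOTSUP the caller is expected to fall back to a
 * bounce-buffer read/write loop itself.
 */
static int coroutine_fn __attribute__((unused))
example_clone_region(BdrvChild *src, BdrvChild *dst, uint64_t nbytes)
{
    int ret = bdrv_co_copy_range(src, 0, dst, 0, nbytes, 0, 0);

    if (ret == -ENOTSUP) {
        /* e.g. bounce through a qemu_blockalign() buffer instead */
    }
    return ret;
}
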
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
                                  PreallocMode prealloc, Error **errp)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int64_t old_size, new_bytes;
    int ret;

    assert(child->perm & BLK_PERM_RESIZE);

    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        error_setg(errp, "No medium inserted");
        return -ENOMEDIUM;
    }
    if (offset < 0) {
        error_setg(errp, "Image size cannot be negative");
        return -EINVAL;
    }

    old_size = bdrv_getlength(bs);
    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get old image size");
        return old_size;
    }

    if (offset > old_size) {
        new_bytes = offset - old_size;
    } else {
        new_bytes = 0;
    }

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, new_bytes, BDRV_TRACKED_TRUNCATE);

    /* If we are growing the image and potentially using preallocation for the
     * new area, we need to make sure that no write requests are made to it
     * concurrently or they might be overwritten by preallocation. */
    if (new_bytes) {
        mark_request_serialising(&req, 1);
        wait_serialising_requests(&req);
    }

    if (!drv->bdrv_co_truncate) {
        if (bs->file && drv->is_filter) {
            ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
            goto out;
        }
        error_setg(errp, "Image format driver does not support resize");
        ret = -ENOTSUP;
        goto out;
    }
    if (bs->read_only) {
        error_setg(errp, "Image is read-only");
        ret = -EACCES;
        goto out;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
    if (ret < 0) {
        goto out;
    }
    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
    } else {
        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    }
    bdrv_dirty_bitmap_truncate(bs, offset);
    bdrv_parent_cb_resize(bs);
    atomic_inc(&bs->write_gen);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}
typedef struct TruncateCo {
    BdrvChild *child;
    int64_t offset;
    PreallocMode prealloc;
    Error **errp;
    int ret;
} TruncateCo;

static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
{
    TruncateCo *tco = opaque;
    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
                                tco->errp);
}

int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
                  Error **errp)
{
    Coroutine *co;
    TruncateCo tco = {
        .child = child,
        .offset = offset,
        .prealloc = prealloc,
        .errp = errp,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_truncate_co_entry(&tco);
    } else {
        co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
        qemu_coroutine_enter(co);
        BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
    }

    return tco.ret;
}
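
/*
 * Usage sketch (hypothetical helper): growing an image by 1 MiB without
 * preallocating the new area, following the errp conventions used above.
 */
static int __attribute__((unused))
example_grow_image(BdrvChild *child, Error **errp)
{
    int64_t size = bdrv_getlength(child->bs);

    if (size < 0) {
        error_setg_errno(errp, -size, "Failed to query image size");
        return size;
    }
    return bdrv_truncate(child, size + 1048576, PREALLOC_MODE_OFF, errp);
}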