/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "block/coroutines.h"
#include "block/write-threshold.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                                      bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}
static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->klass->drained_end) {
        c->klass->drained_end(c, drained_end_counter);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    IO_OR_GS_CODE();
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, qatomic_read(&drained_end_counter) > 0);
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                                    bool ignore_bds_parents,
                                    int *drained_end_counter)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    IO_OR_GS_CODE();
    c->parent_quiesce_counter++;
    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}
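/*
 * Summary of the begin/poll/end protocol implemented by the helpers above
 * (an illustrative sketch, not part of the original file): a parent is
 * quiesced by bumping c->parent_quiesce_counter and invoking its
 * ->drained_begin() callback; ->drained_poll() is then polled until no
 * requests are pending, and ->drained_end() undoes the quiesce. A
 * hypothetical caller draining a single parent link would do roughly:
 *
 *     bdrv_parent_drained_begin_single(c, true);   // quiesce, then poll
 *     // ... c->bs is quiescent from this parent's point of view ...
 *     bdrv_parent_drained_end_single(c);           // resume, poll counter to 0
 */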
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
                                  src->pdiscard_alignment);
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
                                        src->max_hw_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
    dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov);
}

typedef struct BdrvRefreshLimitsState {
    BlockDriverState *bs;
    BlockLimits old_bl;
} BdrvRefreshLimitsState;

static void bdrv_refresh_limits_abort(void *opaque)
{
    BdrvRefreshLimitsState *s = opaque;

    s->bs->bl = s->old_bl;
}

static TransactionActionDrv bdrv_refresh_limits_drv = {
    .abort = bdrv_refresh_limits_abort,
    .clean = g_free,
};
/* @tran is allowed to be NULL, in this case no rollback is possible. */
void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
{
    ERRP_GUARD();
    BlockDriver *drv = bs->drv;
    BdrvChild *c;
    bool have_limits;

    GLOBAL_STATE_CODE();

    if (tran) {
        BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
        *s = (BdrvRefreshLimitsState) {
            .bs = bs,
            .old_bl = bs->bl,
        };
        tran_add(tran, &bdrv_refresh_limits_drv, s);
    }

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    have_limits = false;
    QLIST_FOREACH(c, &bs->children, next) {
        if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
        {
            bdrv_refresh_limits(c->bs, tran, errp);
            if (*errp) {
                return;
            }
            bdrv_merge_limits(&bs->bl, &c->bs->bl);
            have_limits = true;
        }
    }

    if (!have_limits) {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size;

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
        if (*errp) {
            return;
        }
    }

    if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
        error_setg(errp, "Driver requires too large request alignment");
    }
}
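/*
 * Example (a sketch, assuming the tran_new()/tran_commit()/tran_abort()
 * API from util/transactions.c): a caller updating several nodes can pass
 * a transaction so that a failed refresh rolls every node back to its
 * previous limits via bdrv_refresh_limits_abort():
 *
 *     Transaction *tran = tran_new();
 *     Error *local_err = NULL;
 *
 *     bdrv_refresh_limits(bs, tran, &local_err);
 *     if (local_err) {
 *         tran_abort(tran);    // restores old_bl on every touched node
 *     } else {
 *         tran_commit(tran);
 *     }
 */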
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = qatomic_fetch_dec(&bs->copy_on_read);
    IO_CODE();
    assert(old >= 1);
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
    int *drained_end_counter;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    qatomic_mb_set(&data->done, true);
    if (!data->begin) {
        qatomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        qatomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    IO_OR_GS_CODE();

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (qatomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        aio_context_acquire(ctx);
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        aio_context_release(ctx);
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;
    Coroutine *self = qemu_coroutine_self();
    AioContext *ctx = bdrv_get_aio_context(bs);
    AioContext *co_ctx = qemu_coroutine_get_aio_context(self);

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = self,
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }

    /*
     * Temporarily drop the lock across yield or we would get deadlocks.
     * bdrv_co_drain_bh_cb() reacquires the lock as needed.
     *
     * When we yield below, the lock for the current context will be
     * released, so if this is actually the lock that protects bs, don't drop
     * it a second time.
     */
    if (ctx != co_ctx) {
        aio_context_release(ctx);
    }
    replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);

    /* Reacquire the AioContext of bs if we dropped it */
    if (ctx != co_ctx) {
        aio_context_acquire(ctx);
    }
}
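/*
 * The sequence implemented above, summarized (sketch): the coroutine fills
 * a stack-allocated BdrvCoDrainData, schedules bdrv_co_drain_bh_cb() as a
 * one-shot bottom half in the node's AioContext and yields; the BH performs
 * the actual drained begin/end outside of coroutine context and then wakes
 * the coroutine with aio_co_wake(). This is what makes bdrv_drained_begin()
 * and friends safe to call from coroutine context.
 */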
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    IO_OR_GS_CODE();
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true, NULL);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}
void bdrv_drained_begin(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}
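/*
 * Typical use of a drained section (sketch): bracket any operation that
 * must not race with in-flight I/O on @bs, e.g. graph manipulation:
 *
 *     bdrv_drained_begin(bs);
 *     // ... modify bs or its children while no requests are in flight ...
 *     bdrv_drained_end(bs);
 *
 * bdrv_subtree_drained_begin()/end() work the same way but also quiesce
 * all of bs' children recursively.
 */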
/**
 * This function does not poll, nor must any of its recursively called
 * functions.  The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles.  Therefore, the pointer must remain valid
 * until the pointee reaches 0.  That implies that whoever sets up the
 * pointee has to poll until it is 0.
 *
 * We use atomic operations to access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    IO_OR_GS_CODE();
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    IO_CODE();
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    IO_OR_GS_CODE();
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;
    IO_OR_GS_CODE();

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}
void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int drained_end_counter = 0;
    int i;
    IO_OR_GS_CODE();

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false,
                            &drained_end_counter);
    }

    BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(qatomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;
    GLOBAL_STATE_CODE();

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;
    GLOBAL_STATE_CODE();

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * bdrv queue is managed by record/replay,
     * waiting for finishing the I/O requests may
     * be infinite
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    GLOBAL_STATE_CODE();

    g_assert(bs->quiesce_counter > 0);
    g_assert(!bs->refcnt);

    while (bs->quiesce_counter) {
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
    }
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    int drained_end_counter = 0;
    GLOBAL_STATE_CODE();

    /*
     * bdrv queue is managed by record/replay,
     * waiting for finishing the I/O requests may
     * be endless
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    GLOBAL_STATE_CODE();
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
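/*
 * As with the single-node variants, bdrv_drain_all_begin() and
 * bdrv_drain_all_end() bracket a section during which no request is in
 * flight anywhere (sketch):
 *
 *     bdrv_drain_all_begin();
 *     // ... no I/O is in flight on any BlockDriverState here ...
 *     bdrv_drain_all_end();
 *
 * bdrv_drain_all() above is exactly this pairing with an empty section.
 */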
/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        qatomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  int64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    bdrv_check_request(offset, bytes, &error_abort);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, int64_t bytes)
{
    bdrv_check_request(offset, bytes, &error_abort);

    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

/* Called with self->bs->reqs_lock held */
static BdrvTrackedRequest *
bdrv_find_conflicting_request(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;

    QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
        if (req == self || (!req->serialising && !self->serialising)) {
            continue;
        }
        if (tracked_request_overlaps(req, self->overlap_offset,
                                     self->overlap_bytes))
        {
            /*
             * Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests.  This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            /*
             * If the request is already (indirectly) waiting for us, or
             * will wait for us as soon as it wakes up, then just go on
             * (instead of producing a deadlock in the former case).
             */
            if (!req->waiting_for) {
                return req;
            }
        }
    }

    return NULL;
}

/* Called with self->bs->reqs_lock held */
static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool waited = false;

    while ((req = bdrv_find_conflicting_request(self))) {
        self->waiting_for = req;
        qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
        self->waiting_for = NULL;
        waited = true;
    }

    return waited;
}

/* Called with req->bs->reqs_lock held */
static void tracked_request_set_serialising(BdrvTrackedRequest *req,
                                            uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    int64_t overlap_bytes =
        ROUND_UP(req->offset + req->bytes, align) - overlap_offset;

    bdrv_check_request(req->offset, req->bytes, &error_abort);

    if (!req->serialising) {
        qatomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
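/*
 * Worked example for the alignment math above (illustrative): with
 * align = 4096, a request with offset = 5000 and bytes = 1000 gets
 * overlap_offset = 5000 & ~4095 = 4096 and
 * overlap_bytes = ROUND_UP(6000, 4096) - 4096 = 8192 - 4096 = 4096,
 * i.e. the serialised region is widened to the containing aligned window
 * [4096, 8192).  Conflicting requests are detected against this widened
 * region by tracked_request_overlaps().
 */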
/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();
    IO_CODE();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;
    IO_CODE();
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
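/*
 * Worked example (illustrative): on an image with bdi.cluster_size = 65536,
 * rounding offset = 65540, bytes = 100 yields
 * *cluster_offset = QEMU_ALIGN_DOWN(65540, 65536) = 65536 and
 * *cluster_bytes = QEMU_ALIGN_UP(4 + 100, 65536) = 65536,
 * i.e. the region is widened to the single cluster [65536, 131072) that
 * contains it.
 */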
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    IO_CODE();
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    bool waited = false;

    if (!qatomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    waited = bdrv_wait_serialising_requests_locked(self);
    qemu_co_mutex_unlock(&bs->reqs_lock);

    return waited;
}

bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
                                                uint64_t align)
{
    bool waited;
    IO_CODE();

    qemu_co_mutex_lock(&req->bs->reqs_lock);

    tracked_request_set_serialising(req, align);
    waited = bdrv_wait_serialising_requests_locked(req);

    qemu_co_mutex_unlock(&req->bs->reqs_lock);

    return waited;
}
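/*
 * bdrv_make_request_serialising() combines the two steps above under
 * reqs_lock: it widens the request's serialised region to @align and then
 * waits for any already-tracked overlapping request, returning true if it
 * had to wait.  Requests that must not run concurrently with overlapping
 * ones (e.g. copy-on-read updates) use this before touching the data.
 */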
int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
                            QEMUIOVector *qiov, size_t qiov_offset,
                            Error **errp)
{
    /*
     * Check generic offset/bytes correctness
     */

    if (offset < 0) {
        error_setg(errp, "offset is negative: %" PRIi64, offset);
        return -EIO;
    }

    if (bytes < 0) {
        error_setg(errp, "bytes is negative: %" PRIi64, bytes);
        return -EIO;
    }

    if (bytes > BDRV_MAX_LENGTH) {
        error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   bytes, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH) {
        error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   offset, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH - bytes) {
        error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
                   "exceeds maximum(%" PRIi64 ")", offset, bytes,
                   BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (!qiov) {
        return 0;
    }

    /*
     * Check qiov and qiov_offset
     */

    if (qiov_offset > qiov->size) {
        error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
                   qiov_offset, qiov->size);
        return -EIO;
    }

    if (bytes > qiov->size - qiov_offset) {
        error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
                   "vector size(%zu)", bytes, qiov_offset, qiov->size);
        return -EIO;
    }

    return 0;
}

int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
{
    return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
}

static int bdrv_check_request32(int64_t offset, int64_t bytes,
                                QEMUIOVector *qiov, size_t qiov_offset)
{
    int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
    if (ret < 0) {
        return ret;
    }

    if (bytes > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    return 0;
}
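/*
 * Note on the ordering of the checks in bdrv_check_qiov_request() above:
 * because offset and bytes have each already been validated against
 * BDRV_MAX_LENGTH, the final bound can be written as
 * "offset > BDRV_MAX_LENGTH - bytes" rather than
 * "offset + bytes > BDRV_MAX_LENGTH", so no intermediate sum that could
 * overflow int64_t is ever formed.  The qiov checks follow the same idea
 * with "bytes > qiov->size - qiov_offset".
 */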
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int64_t bytes, BdrvRequestFlags flags)
{
    IO_CODE();
    return bdrv_pwritev(child, offset, bytes, NULL,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;
    IO_CODE();

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes)
{
    int ret;
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
    IO_CODE();

    if (bytes < 0) {
        return -EINVAL;
    }

    ret = bdrv_preadv(child, offset, bytes, &qiov, 0);

    return ret < 0 ? ret : bytes;
}

/* Return no. of bytes on success or < 0 on error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid offset or number of bytes
  -EACCES      Trying to write a read-only device
*/
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf,
                int64_t bytes)
{
    int ret;
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
    IO_CODE();

    if (bytes < 0) {
        return -EINVAL;
    }

    ret = bdrv_pwritev(child, offset, bytes, &qiov, 0);

    return ret < 0 ? ret : bytes;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int64_t count)
{
    int ret;
    IO_CODE();

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
BDRV_REQ_NO_FALLBACK)); 1207fa166538SEric Blake 1208d470ad42SMax Reitz if (!drv) { 1209d470ad42SMax Reitz return -ENOMEDIUM; 1210d470ad42SMax Reitz } 1211d470ad42SMax Reitz 1212ac850bf0SVladimir Sementsov-Ogievskiy if (drv->bdrv_co_preadv_part) { 1213ac850bf0SVladimir Sementsov-Ogievskiy return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset, 1214ac850bf0SVladimir Sementsov-Ogievskiy flags); 1215ac850bf0SVladimir Sementsov-Ogievskiy } 1216ac850bf0SVladimir Sementsov-Ogievskiy 1217ac850bf0SVladimir Sementsov-Ogievskiy if (qiov_offset > 0 || bytes != qiov->size) { 1218ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1219ac850bf0SVladimir Sementsov-Ogievskiy qiov = &local_qiov; 1220ac850bf0SVladimir Sementsov-Ogievskiy } 1221ac850bf0SVladimir Sementsov-Ogievskiy 12223fb06697SKevin Wolf if (drv->bdrv_co_preadv) { 1223ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags); 1224ac850bf0SVladimir Sementsov-Ogievskiy goto out; 12253fb06697SKevin Wolf } 12263fb06697SKevin Wolf 1227edfab6a0SEric Blake if (drv->bdrv_aio_preadv) { 122808844473SKevin Wolf BlockAIOCB *acb; 122908844473SKevin Wolf CoroutineIOCompletion co = { 123008844473SKevin Wolf .coroutine = qemu_coroutine_self(), 123108844473SKevin Wolf }; 123208844473SKevin Wolf 1233e31f6864SEric Blake acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags, 123408844473SKevin Wolf bdrv_co_io_em_complete, &co); 123508844473SKevin Wolf if (acb == NULL) { 1236ac850bf0SVladimir Sementsov-Ogievskiy ret = -EIO; 1237ac850bf0SVladimir Sementsov-Ogievskiy goto out; 123808844473SKevin Wolf } else { 123908844473SKevin Wolf qemu_coroutine_yield(); 1240ac850bf0SVladimir Sementsov-Ogievskiy ret = co.ret; 1241ac850bf0SVladimir Sementsov-Ogievskiy goto out; 124208844473SKevin Wolf } 124308844473SKevin Wolf } 1244edfab6a0SEric Blake 1245edfab6a0SEric Blake sector_num = offset >> BDRV_SECTOR_BITS; 1246edfab6a0SEric Blake nb_sectors = bytes >> BDRV_SECTOR_BITS; 1247edfab6a0SEric Blake 12481bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 12491bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 125041ae31e3SAlberto Garcia assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1251edfab6a0SEric Blake assert(drv->bdrv_co_readv); 1252edfab6a0SEric Blake 1253ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 1254ac850bf0SVladimir Sementsov-Ogievskiy 1255ac850bf0SVladimir Sementsov-Ogievskiy out: 1256ac850bf0SVladimir Sementsov-Ogievskiy if (qiov == &local_qiov) { 1257ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1258ac850bf0SVladimir Sementsov-Ogievskiy } 1259ac850bf0SVladimir Sementsov-Ogievskiy 1260ac850bf0SVladimir Sementsov-Ogievskiy return ret; 1261166fe960SKevin Wolf } 1262166fe960SKevin Wolf 126378a07294SKevin Wolf static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, 126417abcbeeSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, 1265ac850bf0SVladimir Sementsov-Ogievskiy QEMUIOVector *qiov, 1266e75abedaSVladimir Sementsov-Ogievskiy size_t qiov_offset, 1267e75abedaSVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 126878a07294SKevin Wolf { 126978a07294SKevin Wolf BlockDriver *drv = bs->drv; 12703fb06697SKevin Wolf int64_t sector_num; 12713fb06697SKevin Wolf unsigned int nb_sectors; 1272ac850bf0SVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 127378a07294SKevin Wolf int ret; 127478a07294SKevin Wolf 127517abcbeeSVladimir 
Sementsov-Ogievskiy bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1276fa166538SEric Blake assert(!(flags & ~BDRV_REQ_MASK)); 1277fe0480d6SKevin Wolf assert(!(flags & BDRV_REQ_NO_FALLBACK)); 1278fa166538SEric Blake 1279d470ad42SMax Reitz if (!drv) { 1280d470ad42SMax Reitz return -ENOMEDIUM; 1281d470ad42SMax Reitz } 1282d470ad42SMax Reitz 1283ac850bf0SVladimir Sementsov-Ogievskiy if (drv->bdrv_co_pwritev_part) { 1284ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 1285ac850bf0SVladimir Sementsov-Ogievskiy flags & bs->supported_write_flags); 1286ac850bf0SVladimir Sementsov-Ogievskiy flags &= ~bs->supported_write_flags; 1287ac850bf0SVladimir Sementsov-Ogievskiy goto emulate_flags; 1288ac850bf0SVladimir Sementsov-Ogievskiy } 1289ac850bf0SVladimir Sementsov-Ogievskiy 1290ac850bf0SVladimir Sementsov-Ogievskiy if (qiov_offset > 0 || bytes != qiov->size) { 1291ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1292ac850bf0SVladimir Sementsov-Ogievskiy qiov = &local_qiov; 1293ac850bf0SVladimir Sementsov-Ogievskiy } 1294ac850bf0SVladimir Sementsov-Ogievskiy 12953fb06697SKevin Wolf if (drv->bdrv_co_pwritev) { 1296515c2f43SKevin Wolf ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, 1297515c2f43SKevin Wolf flags & bs->supported_write_flags); 1298515c2f43SKevin Wolf flags &= ~bs->supported_write_flags; 12993fb06697SKevin Wolf goto emulate_flags; 13003fb06697SKevin Wolf } 13013fb06697SKevin Wolf 1302edfab6a0SEric Blake if (drv->bdrv_aio_pwritev) { 130308844473SKevin Wolf BlockAIOCB *acb; 130408844473SKevin Wolf CoroutineIOCompletion co = { 130508844473SKevin Wolf .coroutine = qemu_coroutine_self(), 130608844473SKevin Wolf }; 130708844473SKevin Wolf 1308e31f6864SEric Blake acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, 1309e31f6864SEric Blake flags & bs->supported_write_flags, 131008844473SKevin Wolf bdrv_co_io_em_complete, &co); 1311e31f6864SEric Blake flags &= ~bs->supported_write_flags; 131208844473SKevin Wolf if (acb == NULL) { 13133fb06697SKevin Wolf ret = -EIO; 131408844473SKevin Wolf } else { 131508844473SKevin Wolf qemu_coroutine_yield(); 13163fb06697SKevin Wolf ret = co.ret; 131708844473SKevin Wolf } 1318edfab6a0SEric Blake goto emulate_flags; 1319edfab6a0SEric Blake } 1320edfab6a0SEric Blake 1321edfab6a0SEric Blake sector_num = offset >> BDRV_SECTOR_BITS; 1322edfab6a0SEric Blake nb_sectors = bytes >> BDRV_SECTOR_BITS; 1323edfab6a0SEric Blake 13241bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 13251bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 132641ae31e3SAlberto Garcia assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1327edfab6a0SEric Blake 1328e18a58b4SEric Blake assert(drv->bdrv_co_writev); 1329e18a58b4SEric Blake ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, 1330edfab6a0SEric Blake flags & bs->supported_write_flags); 1331edfab6a0SEric Blake flags &= ~bs->supported_write_flags; 133278a07294SKevin Wolf 13333fb06697SKevin Wolf emulate_flags: 13344df863f3SEric Blake if (ret == 0 && (flags & BDRV_REQ_FUA)) { 133578a07294SKevin Wolf ret = bdrv_co_flush(bs); 133678a07294SKevin Wolf } 133778a07294SKevin Wolf 1338ac850bf0SVladimir Sementsov-Ogievskiy if (qiov == &local_qiov) { 1339ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1340ac850bf0SVladimir Sementsov-Ogievskiy } 1341ac850bf0SVladimir Sementsov-Ogievskiy 134278a07294SKevin Wolf return ret; 134378a07294SKevin Wolf } 
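
/*
 * Illustrative sketch (not part of the original code) of the flag-masking
 * pattern used by bdrv_driver_pwritev() above: the bits a driver handles
 * natively are passed down and cleared, and whatever remains (in practice
 * BDRV_REQ_FUA) is emulated afterwards, here by a full flush.
 * sketch_pwritev() and drv_write() are assumed placeholder names.
 *
 *     static int coroutine_fn sketch_pwritev(BlockDriverState *bs,
 *                                            int64_t offset, int64_t bytes,
 *                                            QEMUIOVector *qiov, int flags)
 *     {
 *         int ret = drv_write(bs, offset, bytes, qiov,
 *                             flags & bs->supported_write_flags);
 *         flags &= ~bs->supported_write_flags; /* leftovers need emulation */
 *         if (ret == 0 && (flags & BDRV_REQ_FUA)) {
 *             ret = bdrv_co_flush(bs); /* emulate FUA with a flush */
 *         }
 *         return ret;
 *     }
 */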
134478a07294SKevin Wolf 134529a298afSPavel Butsykin static int coroutine_fn 134617abcbeeSVladimir Sementsov-Ogievskiy bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset, 134717abcbeeSVladimir Sementsov-Ogievskiy int64_t bytes, QEMUIOVector *qiov, 1348ac850bf0SVladimir Sementsov-Ogievskiy size_t qiov_offset) 134929a298afSPavel Butsykin { 135029a298afSPavel Butsykin BlockDriver *drv = bs->drv; 1351ac850bf0SVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 1352ac850bf0SVladimir Sementsov-Ogievskiy int ret; 135329a298afSPavel Butsykin 135417abcbeeSVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 135517abcbeeSVladimir Sementsov-Ogievskiy 1356d470ad42SMax Reitz if (!drv) { 1357d470ad42SMax Reitz return -ENOMEDIUM; 1358d470ad42SMax Reitz } 1359d470ad42SMax Reitz 1360ac850bf0SVladimir Sementsov-Ogievskiy if (!block_driver_can_compress(drv)) { 136129a298afSPavel Butsykin return -ENOTSUP; 136229a298afSPavel Butsykin } 136329a298afSPavel Butsykin 1364ac850bf0SVladimir Sementsov-Ogievskiy if (drv->bdrv_co_pwritev_compressed_part) { 1365ac850bf0SVladimir Sementsov-Ogievskiy return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes, 1366ac850bf0SVladimir Sementsov-Ogievskiy qiov, qiov_offset); 1367ac850bf0SVladimir Sementsov-Ogievskiy } 1368ac850bf0SVladimir Sementsov-Ogievskiy 1369ac850bf0SVladimir Sementsov-Ogievskiy if (qiov_offset == 0) { 137029a298afSPavel Butsykin return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov); 137129a298afSPavel Butsykin } 137229a298afSPavel Butsykin 1373ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1374ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov); 1375ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1376ac850bf0SVladimir Sementsov-Ogievskiy 1377ac850bf0SVladimir Sementsov-Ogievskiy return ret; 1378ac850bf0SVladimir Sementsov-Ogievskiy } 1379ac850bf0SVladimir Sementsov-Ogievskiy 138085c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 13819df5afbdSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, QEMUIOVector *qiov, 13821143ec5eSVladimir Sementsov-Ogievskiy size_t qiov_offset, int flags) 138361007b31SStefan Hajnoczi { 138485c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 138585c97ca7SKevin Wolf 138661007b31SStefan Hajnoczi /* Perform I/O through a temporary buffer so that users who scribble over 138761007b31SStefan Hajnoczi * their read buffer while the operation is in progress do not end up 138861007b31SStefan Hajnoczi * modifying the image file. This is critical for zero-copy guest I/O 138961007b31SStefan Hajnoczi * where anything might happen inside guest memory. 
139061007b31SStefan Hajnoczi */ 13912275cc90SVladimir Sementsov-Ogievskiy void *bounce_buffer = NULL; 139261007b31SStefan Hajnoczi 139361007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 1394244483e6SKevin Wolf int64_t cluster_offset; 13957cfd5275SEric Blake int64_t cluster_bytes; 13969df5afbdSVladimir Sementsov-Ogievskiy int64_t skip_bytes; 139761007b31SStefan Hajnoczi int ret; 1398cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1399cb2e2878SEric Blake BDRV_REQUEST_MAX_BYTES); 14009df5afbdSVladimir Sementsov-Ogievskiy int64_t progress = 0; 14018644476eSMax Reitz bool skip_write; 140261007b31SStefan Hajnoczi 14039df5afbdSVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 14049df5afbdSVladimir Sementsov-Ogievskiy 1405d470ad42SMax Reitz if (!drv) { 1406d470ad42SMax Reitz return -ENOMEDIUM; 1407d470ad42SMax Reitz } 1408d470ad42SMax Reitz 14098644476eSMax Reitz /* 14108644476eSMax Reitz * Do not write anything when the BDS is inactive. That is not 14118644476eSMax Reitz * allowed, and it would not help. 14128644476eSMax Reitz */ 14138644476eSMax Reitz skip_write = (bs->open_flags & BDRV_O_INACTIVE); 14148644476eSMax Reitz 14151bf03e66SKevin Wolf /* FIXME We cannot require callers to have write permissions when all they 14161bf03e66SKevin Wolf * are doing is a read request. If we did things right, write permissions 14171bf03e66SKevin Wolf * would be obtained anyway, but internally by the copy-on-read code. As 1418765d9df9SEric Blake * long as it is implemented here rather than in a separate filter driver, 14191bf03e66SKevin Wolf * the copy-on-read code doesn't have its own BdrvChild, however, for which 14201bf03e66SKevin Wolf * it could request permissions. Therefore we have to bypass the permission 14211bf03e66SKevin Wolf * system for the moment. */ 14221bf03e66SKevin Wolf // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1423afa4b293SKevin Wolf 142461007b31SStefan Hajnoczi /* Cover entire cluster so no additional backing file I/O is required when 1425cb2e2878SEric Blake * allocating cluster in the image file. Note that this value may exceed 1426cb2e2878SEric Blake * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which 1427cb2e2878SEric Blake * is one reason we loop rather than doing it all at once. 142861007b31SStefan Hajnoczi */ 1429244483e6SKevin Wolf bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1430cb2e2878SEric Blake skip_bytes = offset - cluster_offset; 143161007b31SStefan Hajnoczi 1432244483e6SKevin Wolf trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1433244483e6SKevin Wolf cluster_offset, cluster_bytes); 143461007b31SStefan Hajnoczi 1435cb2e2878SEric Blake while (cluster_bytes) { 1436cb2e2878SEric Blake int64_t pnum; 143761007b31SStefan Hajnoczi 14388644476eSMax Reitz if (skip_write) { 14398644476eSMax Reitz ret = 1; /* "already allocated", so nothing will be copied */ 14408644476eSMax Reitz pnum = MIN(cluster_bytes, max_transfer); 14418644476eSMax Reitz } else { 1442cb2e2878SEric Blake ret = bdrv_is_allocated(bs, cluster_offset, 1443cb2e2878SEric Blake MIN(cluster_bytes, max_transfer), &pnum); 1444cb2e2878SEric Blake if (ret < 0) { 14458644476eSMax Reitz /* 14468644476eSMax Reitz * Safe to treat errors in querying allocation as if 1447cb2e2878SEric Blake * unallocated; we'll probably fail again soon on the 1448cb2e2878SEric Blake * read, but at least that will set a decent errno. 
1449cb2e2878SEric Blake */ 1450cb2e2878SEric Blake pnum = MIN(cluster_bytes, max_transfer); 1451cb2e2878SEric Blake } 1452cb2e2878SEric Blake 1453b0ddcbbbSKevin Wolf /* Stop at EOF if the image ends in the middle of the cluster */ 1454b0ddcbbbSKevin Wolf if (ret == 0 && pnum == 0) { 1455b0ddcbbbSKevin Wolf assert(progress >= bytes); 1456b0ddcbbbSKevin Wolf break; 1457b0ddcbbbSKevin Wolf } 1458b0ddcbbbSKevin Wolf 1459cb2e2878SEric Blake assert(skip_bytes < pnum); 14608644476eSMax Reitz } 1461cb2e2878SEric Blake 1462cb2e2878SEric Blake if (ret <= 0) { 14631143ec5eSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 14641143ec5eSVladimir Sementsov-Ogievskiy 1465cb2e2878SEric Blake /* Must copy-on-read; use the bounce buffer */ 14660d93ed08SVladimir Sementsov-Ogievskiy pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 14672275cc90SVladimir Sementsov-Ogievskiy if (!bounce_buffer) { 14682275cc90SVladimir Sementsov-Ogievskiy int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); 14692275cc90SVladimir Sementsov-Ogievskiy int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); 14702275cc90SVladimir Sementsov-Ogievskiy int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); 14712275cc90SVladimir Sementsov-Ogievskiy 14722275cc90SVladimir Sementsov-Ogievskiy bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len); 14732275cc90SVladimir Sementsov-Ogievskiy if (!bounce_buffer) { 14742275cc90SVladimir Sementsov-Ogievskiy ret = -ENOMEM; 14752275cc90SVladimir Sementsov-Ogievskiy goto err; 14762275cc90SVladimir Sementsov-Ogievskiy } 14772275cc90SVladimir Sementsov-Ogievskiy } 14780d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); 1479cb2e2878SEric Blake 1480cb2e2878SEric Blake ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1481ac850bf0SVladimir Sementsov-Ogievskiy &local_qiov, 0, 0); 148261007b31SStefan Hajnoczi if (ret < 0) { 148361007b31SStefan Hajnoczi goto err; 148461007b31SStefan Hajnoczi } 148561007b31SStefan Hajnoczi 1486d855ebcdSEric Blake bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1487c1499a5eSEric Blake if (drv->bdrv_co_pwrite_zeroes && 1488cb2e2878SEric Blake buffer_is_zero(bounce_buffer, pnum)) { 1489a604fa2bSEric Blake /* FIXME: Should we (perhaps conditionally) be setting 1490a604fa2bSEric Blake * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1491a604fa2bSEric Blake * that still correctly reads as zero? */ 14927adcf59fSMax Reitz ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 14937adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 149461007b31SStefan Hajnoczi } else { 1495cb2e2878SEric Blake /* This does not change the data on the disk, it is not 1496cb2e2878SEric Blake * necessary to flush even in cache=writethrough mode. 149761007b31SStefan Hajnoczi */ 1498cb2e2878SEric Blake ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 1499ac850bf0SVladimir Sementsov-Ogievskiy &local_qiov, 0, 15007adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 150161007b31SStefan Hajnoczi } 150261007b31SStefan Hajnoczi 150361007b31SStefan Hajnoczi if (ret < 0) { 1504cb2e2878SEric Blake /* It might be okay to ignore write errors for guest 1505cb2e2878SEric Blake * requests. If this is a deliberate copy-on-read 1506cb2e2878SEric Blake * then we don't want to ignore the error. Simply 1507cb2e2878SEric Blake * report it in all cases. 
150861007b31SStefan Hajnoczi */ 150961007b31SStefan Hajnoczi goto err; 151061007b31SStefan Hajnoczi } 151161007b31SStefan Hajnoczi 15123299e5ecSVladimir Sementsov-Ogievskiy if (!(flags & BDRV_REQ_PREFETCH)) { 15131143ec5eSVladimir Sementsov-Ogievskiy qemu_iovec_from_buf(qiov, qiov_offset + progress, 15141143ec5eSVladimir Sementsov-Ogievskiy bounce_buffer + skip_bytes, 15154ab78b19SVladimir Sementsov-Ogievskiy MIN(pnum - skip_bytes, bytes - progress)); 15163299e5ecSVladimir Sementsov-Ogievskiy } 15173299e5ecSVladimir Sementsov-Ogievskiy } else if (!(flags & BDRV_REQ_PREFETCH)) { 1518cb2e2878SEric Blake /* Read directly into the destination */ 15191143ec5eSVladimir Sementsov-Ogievskiy ret = bdrv_driver_preadv(bs, offset + progress, 15201143ec5eSVladimir Sementsov-Ogievskiy MIN(pnum - skip_bytes, bytes - progress), 15211143ec5eSVladimir Sementsov-Ogievskiy qiov, qiov_offset + progress, 0); 1522cb2e2878SEric Blake if (ret < 0) { 1523cb2e2878SEric Blake goto err; 1524cb2e2878SEric Blake } 1525cb2e2878SEric Blake } 1526cb2e2878SEric Blake 1527cb2e2878SEric Blake cluster_offset += pnum; 1528cb2e2878SEric Blake cluster_bytes -= pnum; 1529cb2e2878SEric Blake progress += pnum - skip_bytes; 1530cb2e2878SEric Blake skip_bytes = 0; 1531cb2e2878SEric Blake } 1532cb2e2878SEric Blake ret = 0; 153361007b31SStefan Hajnoczi 153461007b31SStefan Hajnoczi err: 153561007b31SStefan Hajnoczi qemu_vfree(bounce_buffer); 153661007b31SStefan Hajnoczi return ret; 153761007b31SStefan Hajnoczi } 153861007b31SStefan Hajnoczi 153961007b31SStefan Hajnoczi /* 154061007b31SStefan Hajnoczi * Forwards an already correctly aligned request to the BlockDriver. This 15411a62d0acSEric Blake * handles copy on read, zeroing after EOF, and fragmentation of large 15421a62d0acSEric Blake * reads; any other features must be implemented by the caller. 154361007b31SStefan Hajnoczi */ 154485c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 15458b0c5d76SVladimir Sementsov-Ogievskiy BdrvTrackedRequest *req, int64_t offset, int64_t bytes, 154665cd4424SVladimir Sementsov-Ogievskiy int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 154761007b31SStefan Hajnoczi { 154885c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 1549c9d20029SKevin Wolf int64_t total_bytes, max_bytes; 15501a62d0acSEric Blake int ret = 0; 15518b0c5d76SVladimir Sementsov-Ogievskiy int64_t bytes_remaining = bytes; 15521a62d0acSEric Blake int max_transfer; 155361007b31SStefan Hajnoczi 15548b0c5d76SVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 155549c07526SKevin Wolf assert(is_power_of_2(align)); 155649c07526SKevin Wolf assert((offset & (align - 1)) == 0); 155749c07526SKevin Wolf assert((bytes & (align - 1)) == 0); 1558abb06c5aSDaniel P. Berrange assert((bs->open_flags & BDRV_O_NO_IO) == 0); 15591a62d0acSEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 15601a62d0acSEric Blake align); 1561a604fa2bSEric Blake 1562a604fa2bSEric Blake /* TODO: We would need a per-BDS .supported_read_flags and 1563a604fa2bSEric Blake * potential fallback support, if we ever implement any read flags 1564a604fa2bSEric Blake * to pass through to drivers. For now, there aren't any 1565a604fa2bSEric Blake * passthrough flags. 
*/ 1566c53cb427SPaolo Bonzini assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH))); 156761007b31SStefan Hajnoczi 156861007b31SStefan Hajnoczi /* Handle Copy on Read and associated serialisation */ 156961007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 157061007b31SStefan Hajnoczi /* If we touch the same cluster it counts as an overlap. This 157161007b31SStefan Hajnoczi * guarantees that allocating writes will be serialized and not race 157261007b31SStefan Hajnoczi * with each other for the same cluster. For example, in copy-on-read 157361007b31SStefan Hajnoczi * it ensures that the CoR read and write operations are atomic and 157461007b31SStefan Hajnoczi * guest writes cannot interleave between them. */ 15758ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); 157618fbd0deSPaolo Bonzini } else { 1577304d9d7fSMax Reitz bdrv_wait_serialising_requests(req); 157818fbd0deSPaolo Bonzini } 157961007b31SStefan Hajnoczi 158061007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 1581d6a644bbSEric Blake int64_t pnum; 158261007b31SStefan Hajnoczi 1583897dd0ecSAndrey Shinkevich /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */ 1584897dd0ecSAndrey Shinkevich flags &= ~BDRV_REQ_COPY_ON_READ; 1585897dd0ecSAndrey Shinkevich 158688e63df2SEric Blake ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 158761007b31SStefan Hajnoczi if (ret < 0) { 158861007b31SStefan Hajnoczi goto out; 158961007b31SStefan Hajnoczi } 159061007b31SStefan Hajnoczi 159188e63df2SEric Blake if (!ret || pnum != bytes) { 159265cd4424SVladimir Sementsov-Ogievskiy ret = bdrv_co_do_copy_on_readv(child, offset, bytes, 159365cd4424SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 15943299e5ecSVladimir Sementsov-Ogievskiy goto out; 15953299e5ecSVladimir Sementsov-Ogievskiy } else if (flags & BDRV_REQ_PREFETCH) { 159661007b31SStefan Hajnoczi goto out; 159761007b31SStefan Hajnoczi } 159861007b31SStefan Hajnoczi } 159961007b31SStefan Hajnoczi 16001a62d0acSEric Blake /* Forward the request to the BlockDriver, possibly fragmenting it */ 160149c07526SKevin Wolf total_bytes = bdrv_getlength(bs); 160249c07526SKevin Wolf if (total_bytes < 0) { 160349c07526SKevin Wolf ret = total_bytes; 160461007b31SStefan Hajnoczi goto out; 160561007b31SStefan Hajnoczi } 160661007b31SStefan Hajnoczi 1607897dd0ecSAndrey Shinkevich assert(!(flags & ~bs->supported_read_flags)); 1608897dd0ecSAndrey Shinkevich 160949c07526SKevin Wolf max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 16101a62d0acSEric Blake if (bytes <= max_bytes && bytes <= max_transfer) { 1611897dd0ecSAndrey Shinkevich ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags); 16121a62d0acSEric Blake goto out; 161361007b31SStefan Hajnoczi } 161461007b31SStefan Hajnoczi 16151a62d0acSEric Blake while (bytes_remaining) { 16168b0c5d76SVladimir Sementsov-Ogievskiy int64_t num; 16171a62d0acSEric Blake 16181a62d0acSEric Blake if (max_bytes) { 16191a62d0acSEric Blake num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 16201a62d0acSEric Blake assert(num); 16211a62d0acSEric Blake 16221a62d0acSEric Blake ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1623134b7decSMax Reitz num, qiov, 1624897dd0ecSAndrey Shinkevich qiov_offset + bytes - bytes_remaining, 1625897dd0ecSAndrey Shinkevich flags); 16261a62d0acSEric Blake max_bytes -= num; 16271a62d0acSEric Blake } else { 16281a62d0acSEric Blake num = bytes_remaining; 1629134b7decSMax Reitz ret = qemu_iovec_memset(qiov, qiov_offset + 
bytes - bytes_remaining,
1630134b7decSMax Reitz 0, bytes_remaining);
16311a62d0acSEric Blake }
16321a62d0acSEric Blake if (ret < 0) {
16331a62d0acSEric Blake goto out;
16341a62d0acSEric Blake }
16351a62d0acSEric Blake bytes_remaining -= num;
163661007b31SStefan Hajnoczi }
163761007b31SStefan Hajnoczi
163861007b31SStefan Hajnoczi out:
16391a62d0acSEric Blake return ret < 0 ? ret : 0;
164061007b31SStefan Hajnoczi }
164161007b31SStefan Hajnoczi
164261007b31SStefan Hajnoczi /*
16437a3f542fSVladimir Sementsov-Ogievskiy * Request padding
16447a3f542fSVladimir Sementsov-Ogievskiy *
16457a3f542fSVladimir Sementsov-Ogievskiy * |<---- align ----->| |<----- align ---->|
16467a3f542fSVladimir Sementsov-Ogievskiy * |<- head ->|<------------- bytes ------------->|<-- tail -->|
16477a3f542fSVladimir Sementsov-Ogievskiy * | | | | | |
16487a3f542fSVladimir Sementsov-Ogievskiy * -*----------$-------*-------- ... --------*-----$------------*---
16497a3f542fSVladimir Sementsov-Ogievskiy * | | | | | |
16507a3f542fSVladimir Sementsov-Ogievskiy * | offset | | end |
16517a3f542fSVladimir Sementsov-Ogievskiy * ALIGN_DOWN(offset) ALIGN_UP(offset) ALIGN_DOWN(end) ALIGN_UP(end)
16527a3f542fSVladimir Sementsov-Ogievskiy * [buf ... ) [tail_buf )
16537a3f542fSVladimir Sementsov-Ogievskiy *
16547a3f542fSVladimir Sementsov-Ogievskiy * @buf is an aligned allocation needed to store @head and @tail paddings. @head
16557a3f542fSVladimir Sementsov-Ogievskiy * is placed at the beginning of @buf and @tail at the end.
16567a3f542fSVladimir Sementsov-Ogievskiy *
16577a3f542fSVladimir Sementsov-Ogievskiy * @tail_buf is a pointer to the sub-buffer corresponding to the align-sized
16587a3f542fSVladimir Sementsov-Ogievskiy * chunk around the tail, if the tail exists.
16597a3f542fSVladimir Sementsov-Ogievskiy *
16607a3f542fSVladimir Sementsov-Ogievskiy * @merge_reads is true for small requests,
16617a3f542fSVladimir Sementsov-Ogievskiy * i.e. when @buf_len == @head + bytes + @tail. In this case it is possible that both
16627a3f542fSVladimir Sementsov-Ogievskiy * head and tail exist but @buf_len == align and @tail_buf == @buf.
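 *
 * Worked example (illustrative): with align = 512, offset = 700 and
 * bytes = 1024, head = 700 % 512 = 188 and tail = 512 - ((700 + 1024) % 512)
 * = 324. The sum 188 + 1024 + 324 = 1536 exceeds align while both paddings
 * exist, so @buf_len = 2 * align = 1024, @merge_reads is false and
 * @tail_buf = @buf + 512.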
166361007b31SStefan Hajnoczi */ 16647a3f542fSVladimir Sementsov-Ogievskiy typedef struct BdrvRequestPadding { 16657a3f542fSVladimir Sementsov-Ogievskiy uint8_t *buf; 16667a3f542fSVladimir Sementsov-Ogievskiy size_t buf_len; 16677a3f542fSVladimir Sementsov-Ogievskiy uint8_t *tail_buf; 16687a3f542fSVladimir Sementsov-Ogievskiy size_t head; 16697a3f542fSVladimir Sementsov-Ogievskiy size_t tail; 16707a3f542fSVladimir Sementsov-Ogievskiy bool merge_reads; 16717a3f542fSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 16727a3f542fSVladimir Sementsov-Ogievskiy } BdrvRequestPadding; 16737a3f542fSVladimir Sementsov-Ogievskiy 16747a3f542fSVladimir Sementsov-Ogievskiy static bool bdrv_init_padding(BlockDriverState *bs, 16757a3f542fSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, 16767a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad) 16777a3f542fSVladimir Sementsov-Ogievskiy { 1678a56ed80cSVladimir Sementsov-Ogievskiy int64_t align = bs->bl.request_alignment; 1679a56ed80cSVladimir Sementsov-Ogievskiy int64_t sum; 1680a56ed80cSVladimir Sementsov-Ogievskiy 1681a56ed80cSVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 1682a56ed80cSVladimir Sementsov-Ogievskiy assert(align <= INT_MAX); /* documented in block/block_int.h */ 1683a56ed80cSVladimir Sementsov-Ogievskiy assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */ 16847a3f542fSVladimir Sementsov-Ogievskiy 16857a3f542fSVladimir Sementsov-Ogievskiy memset(pad, 0, sizeof(*pad)); 16867a3f542fSVladimir Sementsov-Ogievskiy 16877a3f542fSVladimir Sementsov-Ogievskiy pad->head = offset & (align - 1); 16887a3f542fSVladimir Sementsov-Ogievskiy pad->tail = ((offset + bytes) & (align - 1)); 16897a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 16907a3f542fSVladimir Sementsov-Ogievskiy pad->tail = align - pad->tail; 16917a3f542fSVladimir Sementsov-Ogievskiy } 16927a3f542fSVladimir Sementsov-Ogievskiy 1693ac9d00bfSVladimir Sementsov-Ogievskiy if (!pad->head && !pad->tail) { 16947a3f542fSVladimir Sementsov-Ogievskiy return false; 16957a3f542fSVladimir Sementsov-Ogievskiy } 16967a3f542fSVladimir Sementsov-Ogievskiy 1697ac9d00bfSVladimir Sementsov-Ogievskiy assert(bytes); /* Nothing good in aligning zero-length requests */ 1698ac9d00bfSVladimir Sementsov-Ogievskiy 16997a3f542fSVladimir Sementsov-Ogievskiy sum = pad->head + bytes + pad->tail; 17007a3f542fSVladimir Sementsov-Ogievskiy pad->buf_len = (sum > align && pad->head && pad->tail) ? 
2 * align : align; 17017a3f542fSVladimir Sementsov-Ogievskiy pad->buf = qemu_blockalign(bs, pad->buf_len); 17027a3f542fSVladimir Sementsov-Ogievskiy pad->merge_reads = sum == pad->buf_len; 17037a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 17047a3f542fSVladimir Sementsov-Ogievskiy pad->tail_buf = pad->buf + pad->buf_len - align; 17057a3f542fSVladimir Sementsov-Ogievskiy } 17067a3f542fSVladimir Sementsov-Ogievskiy 17077a3f542fSVladimir Sementsov-Ogievskiy return true; 17087a3f542fSVladimir Sementsov-Ogievskiy } 17097a3f542fSVladimir Sementsov-Ogievskiy 17107a3f542fSVladimir Sementsov-Ogievskiy static int bdrv_padding_rmw_read(BdrvChild *child, 17117a3f542fSVladimir Sementsov-Ogievskiy BdrvTrackedRequest *req, 17127a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad, 17137a3f542fSVladimir Sementsov-Ogievskiy bool zero_middle) 17147a3f542fSVladimir Sementsov-Ogievskiy { 17157a3f542fSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 17167a3f542fSVladimir Sementsov-Ogievskiy BlockDriverState *bs = child->bs; 17177a3f542fSVladimir Sementsov-Ogievskiy uint64_t align = bs->bl.request_alignment; 17187a3f542fSVladimir Sementsov-Ogievskiy int ret; 17197a3f542fSVladimir Sementsov-Ogievskiy 17207a3f542fSVladimir Sementsov-Ogievskiy assert(req->serialising && pad->buf); 17217a3f542fSVladimir Sementsov-Ogievskiy 17227a3f542fSVladimir Sementsov-Ogievskiy if (pad->head || pad->merge_reads) { 17238b0c5d76SVladimir Sementsov-Ogievskiy int64_t bytes = pad->merge_reads ? pad->buf_len : align; 17247a3f542fSVladimir Sementsov-Ogievskiy 17257a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad->buf, bytes); 17267a3f542fSVladimir Sementsov-Ogievskiy 17277a3f542fSVladimir Sementsov-Ogievskiy if (pad->head) { 17287a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 17297a3f542fSVladimir Sementsov-Ogievskiy } 17307a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads && pad->tail) { 17317a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 17327a3f542fSVladimir Sementsov-Ogievskiy } 17337a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes, 173465cd4424SVladimir Sementsov-Ogievskiy align, &local_qiov, 0, 0); 17357a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0) { 17367a3f542fSVladimir Sementsov-Ogievskiy return ret; 17377a3f542fSVladimir Sementsov-Ogievskiy } 17387a3f542fSVladimir Sementsov-Ogievskiy if (pad->head) { 17397a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 17407a3f542fSVladimir Sementsov-Ogievskiy } 17417a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads && pad->tail) { 17427a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 17437a3f542fSVladimir Sementsov-Ogievskiy } 17447a3f542fSVladimir Sementsov-Ogievskiy 17457a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads) { 17467a3f542fSVladimir Sementsov-Ogievskiy goto zero_mem; 17477a3f542fSVladimir Sementsov-Ogievskiy } 17487a3f542fSVladimir Sementsov-Ogievskiy } 17497a3f542fSVladimir Sementsov-Ogievskiy 17507a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 17517a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align); 17527a3f542fSVladimir Sementsov-Ogievskiy 17537a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 17547a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv( 17557a3f542fSVladimir Sementsov-Ogievskiy child, req, 
17567a3f542fSVladimir Sementsov-Ogievskiy req->overlap_offset + req->overlap_bytes - align,
175765cd4424SVladimir Sementsov-Ogievskiy align, align, &local_qiov, 0, 0);
17587a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0) {
17597a3f542fSVladimir Sementsov-Ogievskiy return ret;
17607a3f542fSVladimir Sementsov-Ogievskiy }
17617a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
17627a3f542fSVladimir Sementsov-Ogievskiy }
17637a3f542fSVladimir Sementsov-Ogievskiy
17647a3f542fSVladimir Sementsov-Ogievskiy zero_mem:
17657a3f542fSVladimir Sementsov-Ogievskiy if (zero_middle) {
17667a3f542fSVladimir Sementsov-Ogievskiy memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
17677a3f542fSVladimir Sementsov-Ogievskiy }
17687a3f542fSVladimir Sementsov-Ogievskiy
17697a3f542fSVladimir Sementsov-Ogievskiy return 0;
17707a3f542fSVladimir Sementsov-Ogievskiy }
17717a3f542fSVladimir Sementsov-Ogievskiy
17727a3f542fSVladimir Sementsov-Ogievskiy static void bdrv_padding_destroy(BdrvRequestPadding *pad)
17737a3f542fSVladimir Sementsov-Ogievskiy {
17747a3f542fSVladimir Sementsov-Ogievskiy if (pad->buf) {
17757a3f542fSVladimir Sementsov-Ogievskiy qemu_vfree(pad->buf);
17767a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&pad->local_qiov);
17777a3f542fSVladimir Sementsov-Ogievskiy }
177898ca4549SVladimir Sementsov-Ogievskiy memset(pad, 0, sizeof(*pad));
17797a3f542fSVladimir Sementsov-Ogievskiy }
17807a3f542fSVladimir Sementsov-Ogievskiy
17817a3f542fSVladimir Sementsov-Ogievskiy /*
17827a3f542fSVladimir Sementsov-Ogievskiy * bdrv_pad_request
17837a3f542fSVladimir Sementsov-Ogievskiy *
17847a3f542fSVladimir Sementsov-Ogievskiy * Exchange request parameters with the padded request if needed. This does not
17857a3f542fSVladimir Sementsov-Ogievskiy * include the RMW read of the padding; bdrv_padding_rmw_read() should be
17867a3f542fSVladimir Sementsov-Ogievskiy * called separately if needed.
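 *
 * For example (illustrative): with bs->bl.request_alignment = 4096, a request
 * (*offset = 4000, *bytes = 200) becomes (*offset = 0, *bytes = 8192), with
 * *qiov pointing at pad->local_qiov and *qiov_offset reset to 0.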
17877a3f542fSVladimir Sementsov-Ogievskiy *
178898ca4549SVladimir Sementsov-Ogievskiy * Request parameters (@qiov, @qiov_offset, @offset, @bytes) are in-out:
178998ca4549SVladimir Sementsov-Ogievskiy * - on function entry they represent the original request
179098ca4549SVladimir Sementsov-Ogievskiy * - on failure or when padding is not needed they are unchanged
179198ca4549SVladimir Sementsov-Ogievskiy * - on success when padding is needed they represent the padded request
17927a3f542fSVladimir Sementsov-Ogievskiy */
179398ca4549SVladimir Sementsov-Ogievskiy static int bdrv_pad_request(BlockDriverState *bs,
17941acc3466SVladimir Sementsov-Ogievskiy QEMUIOVector **qiov, size_t *qiov_offset,
179537e9403eSVladimir Sementsov-Ogievskiy int64_t *offset, int64_t *bytes,
179698ca4549SVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad, bool *padded)
17977a3f542fSVladimir Sementsov-Ogievskiy {
17984c002cefSVladimir Sementsov-Ogievskiy int ret;
17994c002cefSVladimir Sementsov-Ogievskiy
180037e9403eSVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
180137e9403eSVladimir Sementsov-Ogievskiy
18027a3f542fSVladimir Sementsov-Ogievskiy if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
180398ca4549SVladimir Sementsov-Ogievskiy if (padded) {
180498ca4549SVladimir Sementsov-Ogievskiy *padded = false;
180598ca4549SVladimir Sementsov-Ogievskiy }
180698ca4549SVladimir Sementsov-Ogievskiy return 0;
18077a3f542fSVladimir Sementsov-Ogievskiy }
18087a3f542fSVladimir Sementsov-Ogievskiy
18094c002cefSVladimir Sementsov-Ogievskiy ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
18101acc3466SVladimir Sementsov-Ogievskiy *qiov, *qiov_offset, *bytes,
18114c002cefSVladimir Sementsov-Ogievskiy pad->buf + pad->buf_len - pad->tail,
18124c002cefSVladimir Sementsov-Ogievskiy pad->tail);
181398ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) {
181498ca4549SVladimir Sementsov-Ogievskiy bdrv_padding_destroy(pad);
181598ca4549SVladimir Sementsov-Ogievskiy return ret;
181698ca4549SVladimir Sementsov-Ogievskiy }
18177a3f542fSVladimir Sementsov-Ogievskiy *bytes += pad->head + pad->tail;
18187a3f542fSVladimir Sementsov-Ogievskiy *offset -= pad->head;
18197a3f542fSVladimir Sementsov-Ogievskiy *qiov = &pad->local_qiov;
18201acc3466SVladimir Sementsov-Ogievskiy *qiov_offset = 0;
182198ca4549SVladimir Sementsov-Ogievskiy if (padded) {
182298ca4549SVladimir Sementsov-Ogievskiy *padded = true;
182398ca4549SVladimir Sementsov-Ogievskiy }
18247a3f542fSVladimir Sementsov-Ogievskiy
182598ca4549SVladimir Sementsov-Ogievskiy return 0;
18267a3f542fSVladimir Sementsov-Ogievskiy }
18277a3f542fSVladimir Sementsov-Ogievskiy
1828a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1829e9e52efdSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, QEMUIOVector *qiov,
183061007b31SStefan Hajnoczi BdrvRequestFlags flags)
183161007b31SStefan Hajnoczi {
1832*967d7905SEmanuele Giuseppe Esposito IO_CODE();
18331acc3466SVladimir Sementsov-Ogievskiy return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
18341acc3466SVladimir Sementsov-Ogievskiy }
18351acc3466SVladimir Sementsov-Ogievskiy
18361acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
183737e9403eSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes,
18381acc3466SVladimir Sementsov-Ogievskiy QEMUIOVector *qiov, size_t qiov_offset,
18391acc3466SVladimir Sementsov-Ogievskiy BdrvRequestFlags flags)
18401acc3466SVladimir Sementsov-Ogievskiy {
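/*
 * Overview: validate the request, widen it to bs->bl.request_alignment
 * via bdrv_pad_request() if needed, then dispatch to bdrv_aligned_preadv()
 * inside a tracked request; any padding is torn down afterwards.
 */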
1841a03ef88fSKevin Wolf BlockDriverState *bs = child->bs;
184261007b31SStefan Hajnoczi BdrvTrackedRequest req;
18437a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad;
184461007b31SStefan Hajnoczi int ret;
1845*967d7905SEmanuele Giuseppe Esposito IO_CODE();
184661007b31SStefan Hajnoczi
184737e9403eSVladimir Sementsov-Ogievskiy trace_bdrv_co_preadv_part(bs, offset, bytes, flags);
184861007b31SStefan Hajnoczi
1849f4dad307SVladimir Sementsov-Ogievskiy if (!bdrv_is_inserted(bs)) {
1850f4dad307SVladimir Sementsov-Ogievskiy return -ENOMEDIUM;
1851f4dad307SVladimir Sementsov-Ogievskiy }
1852f4dad307SVladimir Sementsov-Ogievskiy
185363f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
185461007b31SStefan Hajnoczi if (ret < 0) {
185561007b31SStefan Hajnoczi return ret;
185661007b31SStefan Hajnoczi }
185761007b31SStefan Hajnoczi
1858ac9d00bfSVladimir Sementsov-Ogievskiy if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
1859ac9d00bfSVladimir Sementsov-Ogievskiy /*
1860ac9d00bfSVladimir Sementsov-Ogievskiy * Aligning a zero-length request is nonsense. Even if the driver gives
1861ac9d00bfSVladimir Sementsov-Ogievskiy * special meaning to zero-length requests (like qcow2_co_pwritev_compressed_part),
1862ac9d00bfSVladimir Sementsov-Ogievskiy * we can't pass them to the driver due to request_alignment.
1863ac9d00bfSVladimir Sementsov-Ogievskiy *
1864ac9d00bfSVladimir Sementsov-Ogievskiy * Still, there is no reason to return an error if someone does an
1865ac9d00bfSVladimir Sementsov-Ogievskiy * unaligned zero-length read occasionally.
1866ac9d00bfSVladimir Sementsov-Ogievskiy */
1867ac9d00bfSVladimir Sementsov-Ogievskiy return 0;
1868ac9d00bfSVladimir Sementsov-Ogievskiy }
1869ac9d00bfSVladimir Sementsov-Ogievskiy
187099723548SPaolo Bonzini bdrv_inc_in_flight(bs);
187199723548SPaolo Bonzini
18729568b511SWen Congyang /* Don't do copy-on-read if we read data before a write operation */
1873d73415a3SStefan Hajnoczi if (qatomic_read(&bs->copy_on_read)) {
187461007b31SStefan Hajnoczi flags |= BDRV_REQ_COPY_ON_READ;
187561007b31SStefan Hajnoczi }
187661007b31SStefan Hajnoczi
187798ca4549SVladimir Sementsov-Ogievskiy ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
187898ca4549SVladimir Sementsov-Ogievskiy NULL);
187998ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) {
188087ab8802SKevin Wolf goto fail;
188198ca4549SVladimir Sementsov-Ogievskiy }
188261007b31SStefan Hajnoczi
1883ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
18847a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv(child, &req, offset, bytes,
18857a3f542fSVladimir Sementsov-Ogievskiy bs->bl.request_alignment,
18861acc3466SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags);
188761007b31SStefan Hajnoczi tracked_request_end(&req);
18887a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad);
188961007b31SStefan Hajnoczi
189087ab8802SKevin Wolf fail:
189187ab8802SKevin Wolf bdrv_dec_in_flight(bs);
189287ab8802SKevin Wolf
189361007b31SStefan Hajnoczi return ret;
189461007b31SStefan Hajnoczi }
189561007b31SStefan Hajnoczi
1896d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
18975ae07b14SVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, BdrvRequestFlags flags)
189861007b31SStefan Hajnoczi {
189961007b31SStefan Hajnoczi BlockDriver *drv = bs->drv;
190061007b31SStefan Hajnoczi QEMUIOVector qiov;
19010d93ed08SVladimir Sementsov-Ogievskiy void *buf = NULL;
190261007b31SStefan Hajnoczi int
ret = 0; 1903465fe887SEric Blake bool need_flush = false; 1904443668caSDenis V. Lunev int head = 0; 1905443668caSDenis V. Lunev int tail = 0; 190661007b31SStefan Hajnoczi 19072aaa3f9bSVladimir Sementsov-Ogievskiy int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, 19082aaa3f9bSVladimir Sementsov-Ogievskiy INT64_MAX); 1909a5b8dd2cSEric Blake int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1910a5b8dd2cSEric Blake bs->bl.request_alignment); 1911cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1912cf081fcaSEric Blake 19135ae07b14SVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 19145ae07b14SVladimir Sementsov-Ogievskiy 1915d470ad42SMax Reitz if (!drv) { 1916d470ad42SMax Reitz return -ENOMEDIUM; 1917d470ad42SMax Reitz } 1918d470ad42SMax Reitz 1919fe0480d6SKevin Wolf if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) { 1920fe0480d6SKevin Wolf return -ENOTSUP; 1921fe0480d6SKevin Wolf } 1922fe0480d6SKevin Wolf 19230bc329fbSHanna Reitz /* Invalidate the cached block-status data range if this write overlaps */ 19240bc329fbSHanna Reitz bdrv_bsc_invalidate_range(bs, offset, bytes); 19250bc329fbSHanna Reitz 1926b8d0a980SEric Blake assert(alignment % bs->bl.request_alignment == 0); 1927b8d0a980SEric Blake head = offset % alignment; 1928f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % alignment; 1929b8d0a980SEric Blake max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1930b8d0a980SEric Blake assert(max_write_zeroes >= bs->bl.request_alignment); 193161007b31SStefan Hajnoczi 1932f5a5ca79SManos Pitsidianakis while (bytes > 0 && !ret) { 19335ae07b14SVladimir Sementsov-Ogievskiy int64_t num = bytes; 193461007b31SStefan Hajnoczi 193561007b31SStefan Hajnoczi /* Align request. Block drivers can expect the "bulk" of the request 1936443668caSDenis V. Lunev * to be aligned, and that unaligned requests do not cross cluster 1937443668caSDenis V. Lunev * boundaries. 193861007b31SStefan Hajnoczi */ 1939443668caSDenis V. Lunev if (head) { 1940b2f95feeSEric Blake /* Make a small request up to the first aligned sector. For 1941b2f95feeSEric Blake * convenience, limit this request to max_transfer even if 1942b2f95feeSEric Blake * we don't need to fall back to writes. */ 1943f5a5ca79SManos Pitsidianakis num = MIN(MIN(bytes, max_transfer), alignment - head); 1944b2f95feeSEric Blake head = (head + num) % alignment; 1945b2f95feeSEric Blake assert(num < max_write_zeroes); 1946d05aa8bbSEric Blake } else if (tail && num > alignment) { 1947443668caSDenis V. Lunev /* Shorten the request to the last aligned sector. */ 1948443668caSDenis V. 
Lunev num -= tail;
194961007b31SStefan Hajnoczi }
195061007b31SStefan Hajnoczi
195161007b31SStefan Hajnoczi /* limit request size */
195261007b31SStefan Hajnoczi if (num > max_write_zeroes) {
195361007b31SStefan Hajnoczi num = max_write_zeroes;
195461007b31SStefan Hajnoczi }
195561007b31SStefan Hajnoczi
195661007b31SStefan Hajnoczi ret = -ENOTSUP;
195761007b31SStefan Hajnoczi /* First try the efficient write zeroes operation */
1958d05aa8bbSEric Blake if (drv->bdrv_co_pwrite_zeroes) {
1959d05aa8bbSEric Blake ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1960d05aa8bbSEric Blake flags & bs->supported_zero_flags);
1961d05aa8bbSEric Blake if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1962d05aa8bbSEric Blake !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1963d05aa8bbSEric Blake need_flush = true;
1964d05aa8bbSEric Blake }
1965465fe887SEric Blake } else {
1966465fe887SEric Blake assert(!bs->supported_zero_flags);
196761007b31SStefan Hajnoczi }
196861007b31SStefan Hajnoczi
1969294682ccSAndrey Shinkevich if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
197061007b31SStefan Hajnoczi /* Fall back to bounce buffer if write zeroes is unsupported */
1971465fe887SEric Blake BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1972465fe887SEric Blake
1973465fe887SEric Blake if ((flags & BDRV_REQ_FUA) &&
1974465fe887SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1975465fe887SEric Blake /* No need for bdrv_driver_pwritev() to do a fallback
1976465fe887SEric Blake * flush on each chunk; use just one at the end */
1977465fe887SEric Blake write_flags &= ~BDRV_REQ_FUA;
1978465fe887SEric Blake need_flush = true;
1979465fe887SEric Blake }
19805def6b80SEric Blake num = MIN(num, max_transfer);
19810d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) {
19820d93ed08SVladimir Sementsov-Ogievskiy buf = qemu_try_blockalign0(bs, num);
19830d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) {
198461007b31SStefan Hajnoczi ret = -ENOMEM;
198561007b31SStefan Hajnoczi goto fail;
198661007b31SStefan Hajnoczi }
198761007b31SStefan Hajnoczi }
19880d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&qiov, buf, num);
198961007b31SStefan Hajnoczi
1990ac850bf0SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
199161007b31SStefan Hajnoczi
199261007b31SStefan Hajnoczi /* Keep bounce buffer around if it is big enough for
199361007b31SStefan Hajnoczi * all future requests.
199461007b31SStefan Hajnoczi */ 19955def6b80SEric Blake if (num < max_transfer) { 19960d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 19970d93ed08SVladimir Sementsov-Ogievskiy buf = NULL; 199861007b31SStefan Hajnoczi } 199961007b31SStefan Hajnoczi } 200061007b31SStefan Hajnoczi 2001d05aa8bbSEric Blake offset += num; 2002f5a5ca79SManos Pitsidianakis bytes -= num; 200361007b31SStefan Hajnoczi } 200461007b31SStefan Hajnoczi 200561007b31SStefan Hajnoczi fail: 2006465fe887SEric Blake if (ret == 0 && need_flush) { 2007465fe887SEric Blake ret = bdrv_co_flush(bs); 2008465fe887SEric Blake } 20090d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 201061007b31SStefan Hajnoczi return ret; 201161007b31SStefan Hajnoczi } 201261007b31SStefan Hajnoczi 201385fe2479SFam Zheng static inline int coroutine_fn 2014fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes, 201585fe2479SFam Zheng BdrvTrackedRequest *req, int flags) 201685fe2479SFam Zheng { 201785fe2479SFam Zheng BlockDriverState *bs = child->bs; 2018fcfd9adeSVladimir Sementsov-Ogievskiy 2019fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 202085fe2479SFam Zheng 2021307261b2SVladimir Sementsov-Ogievskiy if (bdrv_is_read_only(bs)) { 202285fe2479SFam Zheng return -EPERM; 202385fe2479SFam Zheng } 202485fe2479SFam Zheng 202585fe2479SFam Zheng assert(!(bs->open_flags & BDRV_O_INACTIVE)); 202685fe2479SFam Zheng assert((bs->open_flags & BDRV_O_NO_IO) == 0); 202785fe2479SFam Zheng assert(!(flags & ~BDRV_REQ_MASK)); 2028d1a764d1SVladimir Sementsov-Ogievskiy assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING))); 202985fe2479SFam Zheng 203085fe2479SFam Zheng if (flags & BDRV_REQ_SERIALISING) { 2031d1a764d1SVladimir Sementsov-Ogievskiy QEMU_LOCK_GUARD(&bs->reqs_lock); 2032d1a764d1SVladimir Sementsov-Ogievskiy 2033d1a764d1SVladimir Sementsov-Ogievskiy tracked_request_set_serialising(req, bdrv_get_cluster_size(bs)); 2034d1a764d1SVladimir Sementsov-Ogievskiy 2035d1a764d1SVladimir Sementsov-Ogievskiy if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) { 2036d1a764d1SVladimir Sementsov-Ogievskiy return -EBUSY; 2037d1a764d1SVladimir Sementsov-Ogievskiy } 2038d1a764d1SVladimir Sementsov-Ogievskiy 2039d1a764d1SVladimir Sementsov-Ogievskiy bdrv_wait_serialising_requests_locked(req); 204018fbd0deSPaolo Bonzini } else { 204118fbd0deSPaolo Bonzini bdrv_wait_serialising_requests(req); 204285fe2479SFam Zheng } 204385fe2479SFam Zheng 204485fe2479SFam Zheng assert(req->overlap_offset <= offset); 204585fe2479SFam Zheng assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 2046fcfd9adeSVladimir Sementsov-Ogievskiy assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE || 2047fcfd9adeSVladimir Sementsov-Ogievskiy child->perm & BLK_PERM_RESIZE); 204885fe2479SFam Zheng 2049cd47d792SFam Zheng switch (req->type) { 2050cd47d792SFam Zheng case BDRV_TRACKED_WRITE: 2051cd47d792SFam Zheng case BDRV_TRACKED_DISCARD: 205285fe2479SFam Zheng if (flags & BDRV_REQ_WRITE_UNCHANGED) { 205385fe2479SFam Zheng assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 205485fe2479SFam Zheng } else { 205585fe2479SFam Zheng assert(child->perm & BLK_PERM_WRITE); 205685fe2479SFam Zheng } 205794783301SVladimir Sementsov-Ogievskiy bdrv_write_threshold_check_write(bs, offset, bytes); 205894783301SVladimir Sementsov-Ogievskiy return 0; 2059cd47d792SFam Zheng case BDRV_TRACKED_TRUNCATE: 2060cd47d792SFam Zheng assert(child->perm & 
BLK_PERM_RESIZE); 2061cd47d792SFam Zheng return 0; 2062cd47d792SFam Zheng default: 2063cd47d792SFam Zheng abort(); 2064cd47d792SFam Zheng } 206585fe2479SFam Zheng } 206685fe2479SFam Zheng 206785fe2479SFam Zheng static inline void coroutine_fn 2068fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes, 206985fe2479SFam Zheng BdrvTrackedRequest *req, int ret) 207085fe2479SFam Zheng { 207185fe2479SFam Zheng int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 207285fe2479SFam Zheng BlockDriverState *bs = child->bs; 207385fe2479SFam Zheng 2074fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 2075fcfd9adeSVladimir Sementsov-Ogievskiy 2076d73415a3SStefan Hajnoczi qatomic_inc(&bs->write_gen); 207785fe2479SFam Zheng 207800695c27SFam Zheng /* 207900695c27SFam Zheng * Discard cannot extend the image, but in error handling cases, such as 208000695c27SFam Zheng * when reverting a qcow2 cluster allocation, the discarded range can pass 208100695c27SFam Zheng * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD 208200695c27SFam Zheng * here. Instead, just skip it, since semantically a discard request 208300695c27SFam Zheng * beyond EOF cannot expand the image anyway. 208400695c27SFam Zheng */ 20857f8f03efSFam Zheng if (ret == 0 && 2086cd47d792SFam Zheng (req->type == BDRV_TRACKED_TRUNCATE || 2087cd47d792SFam Zheng end_sector > bs->total_sectors) && 208800695c27SFam Zheng req->type != BDRV_TRACKED_DISCARD) { 20897f8f03efSFam Zheng bs->total_sectors = end_sector; 20907f8f03efSFam Zheng bdrv_parent_cb_resize(bs); 20917f8f03efSFam Zheng bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); 209285fe2479SFam Zheng } 209300695c27SFam Zheng if (req->bytes) { 209400695c27SFam Zheng switch (req->type) { 209500695c27SFam Zheng case BDRV_TRACKED_WRITE: 209600695c27SFam Zheng stat64_max(&bs->wr_highest_offset, offset + bytes); 209700695c27SFam Zheng /* fall through, to set dirty bits */ 209800695c27SFam Zheng case BDRV_TRACKED_DISCARD: 20997f8f03efSFam Zheng bdrv_set_dirty(bs, offset, bytes); 210000695c27SFam Zheng break; 210100695c27SFam Zheng default: 210200695c27SFam Zheng break; 210300695c27SFam Zheng } 210400695c27SFam Zheng } 210585fe2479SFam Zheng } 210685fe2479SFam Zheng 210761007b31SStefan Hajnoczi /* 210804ed95f4SEric Blake * Forwards an already correctly aligned write request to the BlockDriver, 210904ed95f4SEric Blake * after possibly fragmenting it. 
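 *
 * Fragmentation sketch (illustrative): with max_transfer = 64 KiB, a 150 KiB
 * write becomes three driver calls of 64 KiB, 64 KiB and 22 KiB. When
 * BDRV_REQ_FUA has to be emulated rather than handled natively, it is kept
 * only on the final fragment so that a single flush covers the whole request.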
211061007b31SStefan Hajnoczi */ 211185c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 2112fcfd9adeSVladimir Sementsov-Ogievskiy BdrvTrackedRequest *req, int64_t offset, int64_t bytes, 2113e75abedaSVladimir Sementsov-Ogievskiy int64_t align, QEMUIOVector *qiov, size_t qiov_offset, 2114e75abedaSVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 211561007b31SStefan Hajnoczi { 211685c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 211761007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 211861007b31SStefan Hajnoczi int ret; 211961007b31SStefan Hajnoczi 2120fcfd9adeSVladimir Sementsov-Ogievskiy int64_t bytes_remaining = bytes; 212104ed95f4SEric Blake int max_transfer; 212261007b31SStefan Hajnoczi 2123fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 2124fcfd9adeSVladimir Sementsov-Ogievskiy 2125d470ad42SMax Reitz if (!drv) { 2126d470ad42SMax Reitz return -ENOMEDIUM; 2127d470ad42SMax Reitz } 2128d470ad42SMax Reitz 2129d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 2130d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 2131d6883bc9SVladimir Sementsov-Ogievskiy } 2132d6883bc9SVladimir Sementsov-Ogievskiy 2133cff86b38SEric Blake assert(is_power_of_2(align)); 2134cff86b38SEric Blake assert((offset & (align - 1)) == 0); 2135cff86b38SEric Blake assert((bytes & (align - 1)) == 0); 213604ed95f4SEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 213704ed95f4SEric Blake align); 213861007b31SStefan Hajnoczi 213985fe2479SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 214061007b31SStefan Hajnoczi 214161007b31SStefan Hajnoczi if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 2142c1499a5eSEric Blake !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 214328c4da28SVladimir Sementsov-Ogievskiy qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 214461007b31SStefan Hajnoczi flags |= BDRV_REQ_ZERO_WRITE; 214561007b31SStefan Hajnoczi if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 214661007b31SStefan Hajnoczi flags |= BDRV_REQ_MAY_UNMAP; 214761007b31SStefan Hajnoczi } 214861007b31SStefan Hajnoczi } 214961007b31SStefan Hajnoczi 215061007b31SStefan Hajnoczi if (ret < 0) { 215161007b31SStefan Hajnoczi /* Do nothing, write notifier decided to fail this request */ 215261007b31SStefan Hajnoczi } else if (flags & BDRV_REQ_ZERO_WRITE) { 21539a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 21549896c876SKevin Wolf ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 21553ea1a091SPavel Butsykin } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 215628c4da28SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 215728c4da28SVladimir Sementsov-Ogievskiy qiov, qiov_offset); 215804ed95f4SEric Blake } else if (bytes <= max_transfer) { 21599a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV); 216028c4da28SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 216104ed95f4SEric Blake } else { 216204ed95f4SEric Blake bdrv_debug_event(bs, BLKDBG_PWRITEV); 216304ed95f4SEric Blake while (bytes_remaining) { 216404ed95f4SEric Blake int num = MIN(bytes_remaining, max_transfer); 216504ed95f4SEric Blake int local_flags = flags; 216604ed95f4SEric Blake 216704ed95f4SEric Blake assert(num); 216804ed95f4SEric Blake if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 216904ed95f4SEric Blake 
!(bs->supported_write_flags & BDRV_REQ_FUA)) { 217004ed95f4SEric Blake /* If FUA is going to be emulated by flush, we only 217104ed95f4SEric Blake * need to flush on the last iteration */ 217204ed95f4SEric Blake local_flags &= ~BDRV_REQ_FUA; 217304ed95f4SEric Blake } 217404ed95f4SEric Blake 217504ed95f4SEric Blake ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 2176134b7decSMax Reitz num, qiov, 2177134b7decSMax Reitz qiov_offset + bytes - bytes_remaining, 217828c4da28SVladimir Sementsov-Ogievskiy local_flags); 217904ed95f4SEric Blake if (ret < 0) { 218004ed95f4SEric Blake break; 218104ed95f4SEric Blake } 218204ed95f4SEric Blake bytes_remaining -= num; 218304ed95f4SEric Blake } 218461007b31SStefan Hajnoczi } 21859a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 218661007b31SStefan Hajnoczi 218761007b31SStefan Hajnoczi if (ret >= 0) { 218804ed95f4SEric Blake ret = 0; 218961007b31SStefan Hajnoczi } 219085fe2479SFam Zheng bdrv_co_write_req_finish(child, offset, bytes, req, ret); 219161007b31SStefan Hajnoczi 219261007b31SStefan Hajnoczi return ret; 219361007b31SStefan Hajnoczi } 219461007b31SStefan Hajnoczi 219585c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 21969eeb6dd1SFam Zheng int64_t offset, 219737e9403eSVladimir Sementsov-Ogievskiy int64_t bytes, 21989eeb6dd1SFam Zheng BdrvRequestFlags flags, 21999eeb6dd1SFam Zheng BdrvTrackedRequest *req) 22009eeb6dd1SFam Zheng { 220185c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 22029eeb6dd1SFam Zheng QEMUIOVector local_qiov; 2203a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 22049eeb6dd1SFam Zheng int ret = 0; 22057a3f542fSVladimir Sementsov-Ogievskiy bool padding; 22067a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 22079eeb6dd1SFam Zheng 22087a3f542fSVladimir Sementsov-Ogievskiy padding = bdrv_init_padding(bs, offset, bytes, &pad); 22097a3f542fSVladimir Sementsov-Ogievskiy if (padding) { 22108ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(req, align); 22119eeb6dd1SFam Zheng 22127a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_rmw_read(child, req, &pad, true); 22137a3f542fSVladimir Sementsov-Ogievskiy 22147a3f542fSVladimir Sementsov-Ogievskiy if (pad.head || pad.merge_reads) { 22157a3f542fSVladimir Sementsov-Ogievskiy int64_t aligned_offset = offset & ~(align - 1); 22167a3f542fSVladimir Sementsov-Ogievskiy int64_t write_bytes = pad.merge_reads ? pad.buf_len : align; 22177a3f542fSVladimir Sementsov-Ogievskiy 22187a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 22197a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 222028c4da28SVladimir Sementsov-Ogievskiy align, &local_qiov, 0, 22219eeb6dd1SFam Zheng flags & ~BDRV_REQ_ZERO_WRITE); 22227a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0 || pad.merge_reads) { 22237a3f542fSVladimir Sementsov-Ogievskiy /* Error or all work is done */ 22247a3f542fSVladimir Sementsov-Ogievskiy goto out; 22259eeb6dd1SFam Zheng } 22267a3f542fSVladimir Sementsov-Ogievskiy offset += write_bytes - pad.head; 22277a3f542fSVladimir Sementsov-Ogievskiy bytes -= write_bytes - pad.head; 22287a3f542fSVladimir Sementsov-Ogievskiy } 22299eeb6dd1SFam Zheng } 22309eeb6dd1SFam Zheng 22319eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 22329eeb6dd1SFam Zheng if (bytes >= align) { 22339eeb6dd1SFam Zheng /* Write the aligned part in the middle. 
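 * Editorial example, not in the original comment: assuming align = 4096,
 * a zero request covering bytes [5000, 15000) is carried out as a padded
 * read-modify-write of the head region [4096, 8192), a real zero write
 * of the aligned middle [8192, 12288), and a padded read-modify-write of
 * the tail region [12288, 16384).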
*/ 2234fcfd9adeSVladimir Sementsov-Ogievskiy int64_t aligned_bytes = bytes & ~(align - 1); 223585c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 223628c4da28SVladimir Sementsov-Ogievskiy NULL, 0, flags); 22379eeb6dd1SFam Zheng if (ret < 0) { 22387a3f542fSVladimir Sementsov-Ogievskiy goto out; 22399eeb6dd1SFam Zheng } 22409eeb6dd1SFam Zheng bytes -= aligned_bytes; 22419eeb6dd1SFam Zheng offset += aligned_bytes; 22429eeb6dd1SFam Zheng } 22439eeb6dd1SFam Zheng 22449eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 22459eeb6dd1SFam Zheng if (bytes) { 22467a3f542fSVladimir Sementsov-Ogievskiy assert(align == pad.tail + bytes); 22479eeb6dd1SFam Zheng 22487a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align); 224985c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, align, align, 225028c4da28SVladimir Sementsov-Ogievskiy &local_qiov, 0, 225128c4da28SVladimir Sementsov-Ogievskiy flags & ~BDRV_REQ_ZERO_WRITE); 22529eeb6dd1SFam Zheng } 22539eeb6dd1SFam Zheng 22547a3f542fSVladimir Sementsov-Ogievskiy out: 22557a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 22567a3f542fSVladimir Sementsov-Ogievskiy 22577a3f542fSVladimir Sementsov-Ogievskiy return ret; 22589eeb6dd1SFam Zheng } 22599eeb6dd1SFam Zheng 226061007b31SStefan Hajnoczi /* 226161007b31SStefan Hajnoczi * Handle a write request in coroutine context 226261007b31SStefan Hajnoczi */ 2263a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 2264e9e52efdSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, QEMUIOVector *qiov, 226561007b31SStefan Hajnoczi BdrvRequestFlags flags) 226661007b31SStefan Hajnoczi { 2267*967d7905SEmanuele Giuseppe Esposito IO_CODE(); 22681acc3466SVladimir Sementsov-Ogievskiy return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); 22691acc3466SVladimir Sementsov-Ogievskiy } 22701acc3466SVladimir Sementsov-Ogievskiy 22711acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, 227237e9403eSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, 22731acc3466SVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 22741acc3466SVladimir Sementsov-Ogievskiy { 2275a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 227661007b31SStefan Hajnoczi BdrvTrackedRequest req; 2277a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 22787a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 227961007b31SStefan Hajnoczi int ret; 2280f0deecffSVladimir Sementsov-Ogievskiy bool padded = false; 2281*967d7905SEmanuele Giuseppe Esposito IO_CODE(); 228261007b31SStefan Hajnoczi 228337e9403eSVladimir Sementsov-Ogievskiy trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags); 2284f42cf447SDaniel P. 
Berrange 2285f4dad307SVladimir Sementsov-Ogievskiy if (!bdrv_is_inserted(bs)) { 228661007b31SStefan Hajnoczi return -ENOMEDIUM; 228761007b31SStefan Hajnoczi } 228861007b31SStefan Hajnoczi 22892aaa3f9bSVladimir Sementsov-Ogievskiy if (flags & BDRV_REQ_ZERO_WRITE) { 22902aaa3f9bSVladimir Sementsov-Ogievskiy ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL); 22912aaa3f9bSVladimir Sementsov-Ogievskiy } else { 229263f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); 22932aaa3f9bSVladimir Sementsov-Ogievskiy } 229461007b31SStefan Hajnoczi if (ret < 0) { 229561007b31SStefan Hajnoczi return ret; 229661007b31SStefan Hajnoczi } 229761007b31SStefan Hajnoczi 2298f2208fdcSAlberto Garcia /* If the request is misaligned then we can't make it efficient */ 2299f2208fdcSAlberto Garcia if ((flags & BDRV_REQ_NO_FALLBACK) && 2300f2208fdcSAlberto Garcia !QEMU_IS_ALIGNED(offset | bytes, align)) 2301f2208fdcSAlberto Garcia { 2302f2208fdcSAlberto Garcia return -ENOTSUP; 2303f2208fdcSAlberto Garcia } 2304f2208fdcSAlberto Garcia 2305ac9d00bfSVladimir Sementsov-Ogievskiy if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 2306ac9d00bfSVladimir Sementsov-Ogievskiy /* 2307ac9d00bfSVladimir Sementsov-Ogievskiy * Aligning a zero-length request is nonsense. Even if the driver assigns 2308ac9d00bfSVladimir Sementsov-Ogievskiy * special meaning to zero length (as qcow2_co_pwritev_compressed_part does), 2309ac9d00bfSVladimir Sementsov-Ogievskiy * we can't pass the request to the driver due to request_alignment. 2310ac9d00bfSVladimir Sementsov-Ogievskiy * 2311ac9d00bfSVladimir Sementsov-Ogievskiy * Still, there is no reason to return an error if someone does an unaligned 2312ac9d00bfSVladimir Sementsov-Ogievskiy * zero-length write occasionally. 2313ac9d00bfSVladimir Sementsov-Ogievskiy */ 2314ac9d00bfSVladimir Sementsov-Ogievskiy return 0; 2315ac9d00bfSVladimir Sementsov-Ogievskiy } 2316ac9d00bfSVladimir Sementsov-Ogievskiy 2317f0deecffSVladimir Sementsov-Ogievskiy if (!(flags & BDRV_REQ_ZERO_WRITE)) { 231861007b31SStefan Hajnoczi /* 2319f0deecffSVladimir Sementsov-Ogievskiy * Pad request for following read-modify-write cycle. 2320f0deecffSVladimir Sementsov-Ogievskiy * bdrv_co_do_zero_pwritev() does the aligning by itself, so we do 2321f0deecffSVladimir Sementsov-Ogievskiy * alignment only if there is no ZERO flag. 232261007b31SStefan Hajnoczi */ 232398ca4549SVladimir Sementsov-Ogievskiy ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, 232498ca4549SVladimir Sementsov-Ogievskiy &padded); 232598ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) { 232698ca4549SVladimir Sementsov-Ogievskiy return ret; 232798ca4549SVladimir Sementsov-Ogievskiy } 2328f0deecffSVladimir Sementsov-Ogievskiy } 2329f0deecffSVladimir Sementsov-Ogievskiy 2330f0deecffSVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(bs); 2331ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 233261007b31SStefan Hajnoczi 233318a59f03SAnton Nefedov if (flags & BDRV_REQ_ZERO_WRITE) { 2334f0deecffSVladimir Sementsov-Ogievskiy assert(!padded); 233585c97ca7SKevin Wolf ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 23369eeb6dd1SFam Zheng goto out; 23379eeb6dd1SFam Zheng } 23389eeb6dd1SFam Zheng 2339f0deecffSVladimir Sementsov-Ogievskiy if (padded) { 2340f0deecffSVladimir Sementsov-Ogievskiy /* 2341f0deecffSVladimir Sementsov-Ogievskiy * Request was unaligned to request_alignment and therefore 2342f0deecffSVladimir Sementsov-Ogievskiy * padded.
We are going to do read-modify-write, and must 2343f0deecffSVladimir Sementsov-Ogievskiy * serialize the request to prevent interactions of the 2344f0deecffSVladimir Sementsov-Ogievskiy * widened region with other transactions. 2345f0deecffSVladimir Sementsov-Ogievskiy */ 23468ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(&req, align); 23477a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_rmw_read(child, &req, &pad, false); 234861007b31SStefan Hajnoczi } 234961007b31SStefan Hajnoczi 235085c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 23511acc3466SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 235261007b31SStefan Hajnoczi 23537a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 235461007b31SStefan Hajnoczi 23559eeb6dd1SFam Zheng out: 23569eeb6dd1SFam Zheng tracked_request_end(&req); 235799723548SPaolo Bonzini bdrv_dec_in_flight(bs); 23587a3f542fSVladimir Sementsov-Ogievskiy 235961007b31SStefan Hajnoczi return ret; 236061007b31SStefan Hajnoczi } 236161007b31SStefan Hajnoczi 2362a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2363e9e52efdSVladimir Sementsov-Ogievskiy int64_t bytes, BdrvRequestFlags flags) 236461007b31SStefan Hajnoczi { 2365384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 2366f5a5ca79SManos Pitsidianakis trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 236761007b31SStefan Hajnoczi 2368a03ef88fSKevin Wolf if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 236961007b31SStefan Hajnoczi flags &= ~BDRV_REQ_MAY_UNMAP; 237061007b31SStefan Hajnoczi } 237161007b31SStefan Hajnoczi 2372f5a5ca79SManos Pitsidianakis return bdrv_co_pwritev(child, offset, bytes, NULL, 237361007b31SStefan Hajnoczi BDRV_REQ_ZERO_WRITE | flags); 237461007b31SStefan Hajnoczi } 237561007b31SStefan Hajnoczi 23764085f5c7SJohn Snow /* 23774085f5c7SJohn Snow * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend. 23784085f5c7SJohn Snow */ 23794085f5c7SJohn Snow int bdrv_flush_all(void) 23804085f5c7SJohn Snow { 23814085f5c7SJohn Snow BdrvNextIterator it; 23824085f5c7SJohn Snow BlockDriverState *bs = NULL; 23834085f5c7SJohn Snow int result = 0; 23844085f5c7SJohn Snow 2385f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 2386f791bf7fSEmanuele Giuseppe Esposito 2387c8aa7895SPavel Dovgalyuk /* 2388c8aa7895SPavel Dovgalyuk * The bdrv queue is managed by record/replay; 2389c8aa7895SPavel Dovgalyuk * creating a new flush request while stopping 2390c8aa7895SPavel Dovgalyuk * the VM may break determinism. 2391c8aa7895SPavel Dovgalyuk */ 2392c8aa7895SPavel Dovgalyuk if (replay_events_enabled()) { 2393c8aa7895SPavel Dovgalyuk return result; 2394c8aa7895SPavel Dovgalyuk } 2395c8aa7895SPavel Dovgalyuk 23964085f5c7SJohn Snow for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 23974085f5c7SJohn Snow AioContext *aio_context = bdrv_get_aio_context(bs); 23984085f5c7SJohn Snow int ret; 23994085f5c7SJohn Snow 24004085f5c7SJohn Snow aio_context_acquire(aio_context); 24014085f5c7SJohn Snow ret = bdrv_flush(bs); 24024085f5c7SJohn Snow if (ret < 0 && !result) { 24034085f5c7SJohn Snow result = ret; 24044085f5c7SJohn Snow } 24054085f5c7SJohn Snow aio_context_release(aio_context); 24064085f5c7SJohn Snow } 24074085f5c7SJohn Snow 24084085f5c7SJohn Snow return result; 24094085f5c7SJohn Snow } 24104085f5c7SJohn Snow 241161007b31SStefan Hajnoczi /* 241261007b31SStefan Hajnoczi * Returns the allocation status of the specified sectors.
241361007b31SStefan Hajnoczi * Drivers not implementing the functionality are assumed to not support 241461007b31SStefan Hajnoczi * backing files, hence all their sectors are reported as allocated. 241561007b31SStefan Hajnoczi * 241686a3d5c6SEric Blake * If 'want_zero' is true, the caller is querying for mapping 241786a3d5c6SEric Blake * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 241886a3d5c6SEric Blake * _ZERO where possible; otherwise, the result favors larger 'pnum', 241986a3d5c6SEric Blake * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2420c9ce8c4dSEric Blake * 24212e8bc787SEric Blake * If 'offset' is beyond the end of the disk image the return value is 2422fb0d8654SEric Blake * BDRV_BLOCK_EOF and 'pnum' is set to 0. 242361007b31SStefan Hajnoczi * 24242e8bc787SEric Blake * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2425fb0d8654SEric Blake * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2426fb0d8654SEric Blake * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 242767a0fd2aSFam Zheng * 24282e8bc787SEric Blake * 'pnum' is set to the number of bytes (including and immediately 24292e8bc787SEric Blake * following the specified offset) that are easily known to be in the 24302e8bc787SEric Blake * same allocated/unallocated state. Note that a second call starting 24312e8bc787SEric Blake * at the original offset plus returned pnum may have the same status. 24322e8bc787SEric Blake * The returned value is non-zero on success except at end-of-file. 24332e8bc787SEric Blake * 24342e8bc787SEric Blake * Returns negative errno on failure. Otherwise, if the 24352e8bc787SEric Blake * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 24362e8bc787SEric Blake * set to the host mapping and BDS corresponding to the guest offset. 
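 * Editorial usage sketch, not part of the original comment; it uses only
 * the public bdrv_block_status() wrapper defined below, and 'off'/'len'
 * are hypothetical caller variables:
 *
 *     int64_t pnum, off = 0;
 *     while (off < len) {
 *         int s = bdrv_block_status(bs, off, len - off, &pnum, NULL, NULL);
 *         if (s < 0 || pnum == 0) {
 *             break;  // error, or off is at/past end-of-file
 *         }
 *         // (s & BDRV_BLOCK_ZERO): the first pnum bytes read as zeroes
 *         off += pnum;
 *     }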
243761007b31SStefan Hajnoczi */ 24382e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2439c9ce8c4dSEric Blake bool want_zero, 24402e8bc787SEric Blake int64_t offset, int64_t bytes, 24412e8bc787SEric Blake int64_t *pnum, int64_t *map, 244267a0fd2aSFam Zheng BlockDriverState **file) 244361007b31SStefan Hajnoczi { 24442e8bc787SEric Blake int64_t total_size; 24452e8bc787SEric Blake int64_t n; /* bytes */ 2446efa6e2edSEric Blake int ret; 24472e8bc787SEric Blake int64_t local_map = 0; 2448298a1665SEric Blake BlockDriverState *local_file = NULL; 2449efa6e2edSEric Blake int64_t aligned_offset, aligned_bytes; 2450efa6e2edSEric Blake uint32_t align; 2451549ec0d9SMax Reitz bool has_filtered_child; 245261007b31SStefan Hajnoczi 2453298a1665SEric Blake assert(pnum); 2454298a1665SEric Blake *pnum = 0; 24552e8bc787SEric Blake total_size = bdrv_getlength(bs); 24562e8bc787SEric Blake if (total_size < 0) { 24572e8bc787SEric Blake ret = total_size; 2458298a1665SEric Blake goto early_out; 245961007b31SStefan Hajnoczi } 246061007b31SStefan Hajnoczi 24612e8bc787SEric Blake if (offset >= total_size) { 2462298a1665SEric Blake ret = BDRV_BLOCK_EOF; 2463298a1665SEric Blake goto early_out; 246461007b31SStefan Hajnoczi } 24652e8bc787SEric Blake if (!bytes) { 2466298a1665SEric Blake ret = 0; 2467298a1665SEric Blake goto early_out; 24689cdcfd9fSEric Blake } 246961007b31SStefan Hajnoczi 24702e8bc787SEric Blake n = total_size - offset; 24712e8bc787SEric Blake if (n < bytes) { 24722e8bc787SEric Blake bytes = n; 247361007b31SStefan Hajnoczi } 247461007b31SStefan Hajnoczi 2475d470ad42SMax Reitz /* Must be non-NULL or bdrv_getlength() would have failed */ 2476d470ad42SMax Reitz assert(bs->drv); 2477549ec0d9SMax Reitz has_filtered_child = bdrv_filter_child(bs); 2478549ec0d9SMax Reitz if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { 24792e8bc787SEric Blake *pnum = bytes; 248061007b31SStefan Hajnoczi ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 24812e8bc787SEric Blake if (offset + bytes == total_size) { 2482fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2483fb0d8654SEric Blake } 248461007b31SStefan Hajnoczi if (bs->drv->protocol_name) { 24852e8bc787SEric Blake ret |= BDRV_BLOCK_OFFSET_VALID; 24862e8bc787SEric Blake local_map = offset; 2487298a1665SEric Blake local_file = bs; 248861007b31SStefan Hajnoczi } 2489298a1665SEric Blake goto early_out; 249061007b31SStefan Hajnoczi } 249161007b31SStefan Hajnoczi 249299723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2493efa6e2edSEric Blake 2494efa6e2edSEric Blake /* Round out to request_alignment boundaries */ 249586a3d5c6SEric Blake align = bs->bl.request_alignment; 2496efa6e2edSEric Blake aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2497efa6e2edSEric Blake aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2498efa6e2edSEric Blake 2499549ec0d9SMax Reitz if (bs->drv->bdrv_co_block_status) { 25000bc329fbSHanna Reitz /* 25010bc329fbSHanna Reitz * Use the block-status cache only for protocol nodes: Format 25020bc329fbSHanna Reitz * drivers are generally quick to inquire the status, but protocol 25030bc329fbSHanna Reitz * drivers often need to get information from outside of qemu, so 25040bc329fbSHanna Reitz * we do not have control over the actual implementation. There 25050bc329fbSHanna Reitz * have been cases where inquiring the status took an unreasonably 25060bc329fbSHanna Reitz * long time, and we can do nothing in qemu to fix it. 
25070bc329fbSHanna Reitz * This is especially problematic for images with large data areas, 25080bc329fbSHanna Reitz * because finding the few holes in them and giving them special 25090bc329fbSHanna Reitz * treatment does not gain much performance. Therefore, we try to 25100bc329fbSHanna Reitz * cache the last-identified data region. 25110bc329fbSHanna Reitz * 25120bc329fbSHanna Reitz * Second, limiting ourselves to protocol nodes allows us to assume 25130bc329fbSHanna Reitz * the block status for data regions to be DATA | OFFSET_VALID, and 25140bc329fbSHanna Reitz * that the host offset is the same as the guest offset. 25150bc329fbSHanna Reitz * 25160bc329fbSHanna Reitz * Note that it is possible that external writers zero parts of 25170bc329fbSHanna Reitz * the cached regions without the cache being invalidated, and so 25180bc329fbSHanna Reitz * we may report zeroes as data. This is not catastrophic, 25190bc329fbSHanna Reitz * however, because reporting zeroes as data is fine. 25200bc329fbSHanna Reitz */ 25210bc329fbSHanna Reitz if (QLIST_EMPTY(&bs->children) && 25220bc329fbSHanna Reitz bdrv_bsc_is_data(bs, aligned_offset, pnum)) 25230bc329fbSHanna Reitz { 25240bc329fbSHanna Reitz ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; 25250bc329fbSHanna Reitz local_file = bs; 25260bc329fbSHanna Reitz local_map = aligned_offset; 25270bc329fbSHanna Reitz } else { 252886a3d5c6SEric Blake ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 252986a3d5c6SEric Blake aligned_bytes, pnum, &local_map, 253086a3d5c6SEric Blake &local_file); 25310bc329fbSHanna Reitz 25320bc329fbSHanna Reitz /* 25330bc329fbSHanna Reitz * Note that checking QLIST_EMPTY(&bs->children) is also done when 25340bc329fbSHanna Reitz * the cache is queried above. Technically, we do not need to check 25350bc329fbSHanna Reitz * it here; the worst that can happen is that we fill the cache for 25360bc329fbSHanna Reitz * non-protocol nodes, and then it is never used. However, filling 25370bc329fbSHanna Reitz * the cache requires an RCU update, so double check here to avoid 25380bc329fbSHanna Reitz * such an update if possible. 2539113b727cSHanna Reitz * 2540113b727cSHanna Reitz * Check want_zero, because we only want to update the cache when we 2541113b727cSHanna Reitz * have accurate information about what is zero and what is data. 25420bc329fbSHanna Reitz */ 2543113b727cSHanna Reitz if (want_zero && 2544113b727cSHanna Reitz ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && 25450bc329fbSHanna Reitz QLIST_EMPTY(&bs->children)) 25460bc329fbSHanna Reitz { 25470bc329fbSHanna Reitz /* 25480bc329fbSHanna Reitz * When a protocol driver reports BLOCK_OFFSET_VALID, the 25490bc329fbSHanna Reitz * returned local_map value must be the same as the offset we 25500bc329fbSHanna Reitz * have passed (aligned_offset), and local_bs must be the node 25510bc329fbSHanna Reitz * itself. 25520bc329fbSHanna Reitz * Assert this, because we follow this rule when reading from 25530bc329fbSHanna Reitz * the cache (see the `local_file = bs` and 25540bc329fbSHanna Reitz * `local_map = aligned_offset` assignments above), and the 25550bc329fbSHanna Reitz * result the cache delivers must be the same as the driver 25560bc329fbSHanna Reitz * would deliver. 
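 * Editorial cross-reference, not in the original comment: discards that
 * go through this node invalidate the affected cache range themselves;
 * see bdrv_co_pdiscard() below, which calls bdrv_bsc_invalidate_range()
 * before fragmenting the request.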
25570bc329fbSHanna Reitz */ 25580bc329fbSHanna Reitz assert(local_file == bs); 25590bc329fbSHanna Reitz assert(local_map == aligned_offset); 25600bc329fbSHanna Reitz bdrv_bsc_fill(bs, aligned_offset, *pnum); 25610bc329fbSHanna Reitz } 25620bc329fbSHanna Reitz } 2563549ec0d9SMax Reitz } else { 2564549ec0d9SMax Reitz /* Default code for filters */ 2565549ec0d9SMax Reitz 2566549ec0d9SMax Reitz local_file = bdrv_filter_bs(bs); 2567549ec0d9SMax Reitz assert(local_file); 2568549ec0d9SMax Reitz 2569549ec0d9SMax Reitz *pnum = aligned_bytes; 2570549ec0d9SMax Reitz local_map = aligned_offset; 2571549ec0d9SMax Reitz ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2572549ec0d9SMax Reitz } 257386a3d5c6SEric Blake if (ret < 0) { 257486a3d5c6SEric Blake *pnum = 0; 257586a3d5c6SEric Blake goto out; 257686a3d5c6SEric Blake } 2577efa6e2edSEric Blake 2578efa6e2edSEric Blake /* 2579636cb512SEric Blake * The driver's result must be a non-zero multiple of request_alignment. 2580efa6e2edSEric Blake * Clamp pnum and adjust map to original request. 2581efa6e2edSEric Blake */ 2582636cb512SEric Blake assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2583636cb512SEric Blake align > offset - aligned_offset); 258469f47505SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_RECURSE) { 258569f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_DATA); 258669f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_OFFSET_VALID); 258769f47505SVladimir Sementsov-Ogievskiy assert(!(ret & BDRV_BLOCK_ZERO)); 258869f47505SVladimir Sementsov-Ogievskiy } 258969f47505SVladimir Sementsov-Ogievskiy 2590efa6e2edSEric Blake *pnum -= offset - aligned_offset; 2591efa6e2edSEric Blake if (*pnum > bytes) { 2592efa6e2edSEric Blake *pnum = bytes; 2593efa6e2edSEric Blake } 2594efa6e2edSEric Blake if (ret & BDRV_BLOCK_OFFSET_VALID) { 2595efa6e2edSEric Blake local_map += offset - aligned_offset; 2596efa6e2edSEric Blake } 259761007b31SStefan Hajnoczi 259861007b31SStefan Hajnoczi if (ret & BDRV_BLOCK_RAW) { 2599298a1665SEric Blake assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 26002e8bc787SEric Blake ret = bdrv_co_block_status(local_file, want_zero, local_map, 26012e8bc787SEric Blake *pnum, pnum, &local_map, &local_file); 260299723548SPaolo Bonzini goto out; 260361007b31SStefan Hajnoczi } 260461007b31SStefan Hajnoczi 260561007b31SStefan Hajnoczi if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 260661007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ALLOCATED; 2607d40f4a56SAlberto Garcia } else if (bs->drv->supports_backing) { 2608cb850315SMax Reitz BlockDriverState *cow_bs = bdrv_cow_bs(bs); 2609cb850315SMax Reitz 2610d40f4a56SAlberto Garcia if (!cow_bs) { 2611d40f4a56SAlberto Garcia ret |= BDRV_BLOCK_ZERO; 2612d40f4a56SAlberto Garcia } else if (want_zero) { 2613cb850315SMax Reitz int64_t size2 = bdrv_getlength(cow_bs); 2614c9ce8c4dSEric Blake 26152e8bc787SEric Blake if (size2 >= 0 && offset >= size2) { 261661007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 261761007b31SStefan Hajnoczi } 26187b1efe99SVladimir Sementsov-Ogievskiy } 261961007b31SStefan Hajnoczi } 262061007b31SStefan Hajnoczi 262169f47505SVladimir Sementsov-Ogievskiy if (want_zero && ret & BDRV_BLOCK_RECURSE && 262269f47505SVladimir Sementsov-Ogievskiy local_file && local_file != bs && 262361007b31SStefan Hajnoczi (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 262461007b31SStefan Hajnoczi (ret & BDRV_BLOCK_OFFSET_VALID)) { 26252e8bc787SEric Blake int64_t file_pnum; 26262e8bc787SEric Blake int ret2; 262761007b31SStefan Hajnoczi 26282e8bc787SEric Blake ret2 = 
bdrv_co_block_status(local_file, want_zero, local_map, 26292e8bc787SEric Blake *pnum, &file_pnum, NULL, NULL); 263061007b31SStefan Hajnoczi if (ret2 >= 0) { 263161007b31SStefan Hajnoczi /* Ignore errors. This is just providing extra information, it 263261007b31SStefan Hajnoczi * is useful but not necessary. 263361007b31SStefan Hajnoczi */ 2634c61e684eSEric Blake if (ret2 & BDRV_BLOCK_EOF && 2635c61e684eSEric Blake (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2636c61e684eSEric Blake /* 2637c61e684eSEric Blake * It is valid for the format block driver to read 2638c61e684eSEric Blake * beyond the end of the underlying file's current 2639c61e684eSEric Blake * size; such areas read as zero. 2640c61e684eSEric Blake */ 264161007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 264261007b31SStefan Hajnoczi } else { 264361007b31SStefan Hajnoczi /* Limit request to the range reported by the protocol driver */ 264461007b31SStefan Hajnoczi *pnum = file_pnum; 264561007b31SStefan Hajnoczi ret |= (ret2 & BDRV_BLOCK_ZERO); 264661007b31SStefan Hajnoczi } 264761007b31SStefan Hajnoczi } 264861007b31SStefan Hajnoczi } 264961007b31SStefan Hajnoczi 265099723548SPaolo Bonzini out: 265199723548SPaolo Bonzini bdrv_dec_in_flight(bs); 26522e8bc787SEric Blake if (ret >= 0 && offset + *pnum == total_size) { 2653fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2654fb0d8654SEric Blake } 2655298a1665SEric Blake early_out: 2656298a1665SEric Blake if (file) { 2657298a1665SEric Blake *file = local_file; 2658298a1665SEric Blake } 26592e8bc787SEric Blake if (map) { 26602e8bc787SEric Blake *map = local_map; 26612e8bc787SEric Blake } 266261007b31SStefan Hajnoczi return ret; 266361007b31SStefan Hajnoczi } 266461007b31SStefan Hajnoczi 266521c2283eSVladimir Sementsov-Ogievskiy int coroutine_fn 2666f9e694cbSVladimir Sementsov-Ogievskiy bdrv_co_common_block_status_above(BlockDriverState *bs, 2667ba3f0e25SFam Zheng BlockDriverState *base, 26683555a432SVladimir Sementsov-Ogievskiy bool include_base, 2669c9ce8c4dSEric Blake bool want_zero, 26705b648c67SEric Blake int64_t offset, 26715b648c67SEric Blake int64_t bytes, 26725b648c67SEric Blake int64_t *pnum, 26735b648c67SEric Blake int64_t *map, 2674a92b1b06SEric Blake BlockDriverState **file, 2675a92b1b06SEric Blake int *depth) 2676ba3f0e25SFam Zheng { 267767c095c8SVladimir Sementsov-Ogievskiy int ret; 2678ba3f0e25SFam Zheng BlockDriverState *p; 267967c095c8SVladimir Sementsov-Ogievskiy int64_t eof = 0; 2680a92b1b06SEric Blake int dummy; 2681ba3f0e25SFam Zheng 26823555a432SVladimir Sementsov-Ogievskiy assert(!include_base || base); /* Can't include NULL base */ 268367c095c8SVladimir Sementsov-Ogievskiy 2684a92b1b06SEric Blake if (!depth) { 2685a92b1b06SEric Blake depth = &dummy; 2686a92b1b06SEric Blake } 2687a92b1b06SEric Blake *depth = 0; 2688a92b1b06SEric Blake 2689624f27bbSVladimir Sementsov-Ogievskiy if (!include_base && bs == base) { 2690624f27bbSVladimir Sementsov-Ogievskiy *pnum = bytes; 2691624f27bbSVladimir Sementsov-Ogievskiy return 0; 2692624f27bbSVladimir Sementsov-Ogievskiy } 2693624f27bbSVladimir Sementsov-Ogievskiy 269467c095c8SVladimir Sementsov-Ogievskiy ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file); 2695a92b1b06SEric Blake ++*depth; 26963555a432SVladimir Sementsov-Ogievskiy if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) { 269767c095c8SVladimir Sementsov-Ogievskiy return ret; 269867c095c8SVladimir Sementsov-Ogievskiy } 269967c095c8SVladimir Sementsov-Ogievskiy 270067c095c8SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_EOF) { 
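            /* Editorial note, not in the original source: remember where
             * this layer reported EOF; BDRV_BLOCK_EOF is re-added at the
             * end of the function only if the final range
             * [offset, offset + *pnum) really ends there. */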
270167c095c8SVladimir Sementsov-Ogievskiy eof = offset + *pnum; 270267c095c8SVladimir Sementsov-Ogievskiy } 270367c095c8SVladimir Sementsov-Ogievskiy 270467c095c8SVladimir Sementsov-Ogievskiy assert(*pnum <= bytes); 270567c095c8SVladimir Sementsov-Ogievskiy bytes = *pnum; 270667c095c8SVladimir Sementsov-Ogievskiy 27073555a432SVladimir Sementsov-Ogievskiy for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base; 270867c095c8SVladimir Sementsov-Ogievskiy p = bdrv_filter_or_cow_bs(p)) 270967c095c8SVladimir Sementsov-Ogievskiy { 27105b648c67SEric Blake ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 27115b648c67SEric Blake file); 2712a92b1b06SEric Blake ++*depth; 2713c61e684eSEric Blake if (ret < 0) { 271467c095c8SVladimir Sementsov-Ogievskiy return ret; 2715c61e684eSEric Blake } 271667c095c8SVladimir Sementsov-Ogievskiy if (*pnum == 0) { 2717c61e684eSEric Blake /* 271867c095c8SVladimir Sementsov-Ogievskiy * The top layer deferred to this layer, and because this layer is 271967c095c8SVladimir Sementsov-Ogievskiy * short, any zeroes that we synthesize beyond EOF behave as if they 272067c095c8SVladimir Sementsov-Ogievskiy * were allocated at this layer. 272167c095c8SVladimir Sementsov-Ogievskiy * 272267c095c8SVladimir Sementsov-Ogievskiy * We don't include BDRV_BLOCK_EOF into ret, as upper layer may be 272367c095c8SVladimir Sementsov-Ogievskiy * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 272467c095c8SVladimir Sementsov-Ogievskiy * below. 2725c61e684eSEric Blake */ 272667c095c8SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_EOF); 27275b648c67SEric Blake *pnum = bytes; 272867c095c8SVladimir Sementsov-Ogievskiy if (file) { 272967c095c8SVladimir Sementsov-Ogievskiy *file = p; 2730c61e684eSEric Blake } 273167c095c8SVladimir Sementsov-Ogievskiy ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED; 2732ba3f0e25SFam Zheng break; 2733ba3f0e25SFam Zheng } 273467c095c8SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_ALLOCATED) { 273567c095c8SVladimir Sementsov-Ogievskiy /* 273667c095c8SVladimir Sementsov-Ogievskiy * We've found the node and the status, we must break. 273767c095c8SVladimir Sementsov-Ogievskiy * 273867c095c8SVladimir Sementsov-Ogievskiy * Drop BDRV_BLOCK_EOF, as it's not for upper layer, which may be 273967c095c8SVladimir Sementsov-Ogievskiy * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 274067c095c8SVladimir Sementsov-Ogievskiy * below. 274167c095c8SVladimir Sementsov-Ogievskiy */ 274267c095c8SVladimir Sementsov-Ogievskiy ret &= ~BDRV_BLOCK_EOF; 274367c095c8SVladimir Sementsov-Ogievskiy break; 2744ba3f0e25SFam Zheng } 274567c095c8SVladimir Sementsov-Ogievskiy 27463555a432SVladimir Sementsov-Ogievskiy if (p == base) { 27473555a432SVladimir Sementsov-Ogievskiy assert(include_base); 27483555a432SVladimir Sementsov-Ogievskiy break; 27493555a432SVladimir Sementsov-Ogievskiy } 27503555a432SVladimir Sementsov-Ogievskiy 275167c095c8SVladimir Sementsov-Ogievskiy /* 275267c095c8SVladimir Sementsov-Ogievskiy * OK, [offset, offset + *pnum) region is unallocated on this layer, 275367c095c8SVladimir Sementsov-Ogievskiy * let's continue the diving. 
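 * Editorial example, not in the original comment: for a chain
 * top -> base where top reports its first 1 MiB as unallocated, bytes is
 * clamped to that 1 MiB before base is queried, so the combined answer
 * never exceeds the span the top layer has vouched for.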
275467c095c8SVladimir Sementsov-Ogievskiy */ 275567c095c8SVladimir Sementsov-Ogievskiy assert(*pnum <= bytes); 275667c095c8SVladimir Sementsov-Ogievskiy bytes = *pnum; 275767c095c8SVladimir Sementsov-Ogievskiy } 275867c095c8SVladimir Sementsov-Ogievskiy 275967c095c8SVladimir Sementsov-Ogievskiy if (offset + *pnum == eof) { 276067c095c8SVladimir Sementsov-Ogievskiy ret |= BDRV_BLOCK_EOF; 276167c095c8SVladimir Sementsov-Ogievskiy } 276267c095c8SVladimir Sementsov-Ogievskiy 2763ba3f0e25SFam Zheng return ret; 2764ba3f0e25SFam Zheng } 2765ba3f0e25SFam Zheng 276631826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 276731826642SEric Blake int64_t offset, int64_t bytes, int64_t *pnum, 276831826642SEric Blake int64_t *map, BlockDriverState **file) 2769c9ce8c4dSEric Blake { 2770384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 27713555a432SVladimir Sementsov-Ogievskiy return bdrv_common_block_status_above(bs, base, false, true, offset, bytes, 2772a92b1b06SEric Blake pnum, map, file, NULL); 2773c9ce8c4dSEric Blake } 2774c9ce8c4dSEric Blake 2775237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2776237d78f8SEric Blake int64_t *pnum, int64_t *map, BlockDriverState **file) 2777ba3f0e25SFam Zheng { 2778384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 2779cb850315SMax Reitz return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), 278031826642SEric Blake offset, bytes, pnum, map, file); 2781ba3f0e25SFam Zheng } 2782ba3f0e25SFam Zheng 278346cd1e8aSAlberto Garcia /* 278446cd1e8aSAlberto Garcia * Check @bs (and its backing chain) to see if the range defined 278546cd1e8aSAlberto Garcia * by @offset and @bytes is known to read as zeroes. 278646cd1e8aSAlberto Garcia * Return 1 if that is the case, 0 otherwise and -errno on error. 278746cd1e8aSAlberto Garcia * This test is meant to be fast rather than accurate so returning 0 278846cd1e8aSAlberto Garcia * does not guarantee non-zero data. 278946cd1e8aSAlberto Garcia */ 279046cd1e8aSAlberto Garcia int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, 279146cd1e8aSAlberto Garcia int64_t bytes) 279246cd1e8aSAlberto Garcia { 279346cd1e8aSAlberto Garcia int ret; 279446cd1e8aSAlberto Garcia int64_t pnum = bytes; 2795384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 279646cd1e8aSAlberto Garcia 279746cd1e8aSAlberto Garcia if (!bytes) { 279846cd1e8aSAlberto Garcia return 1; 279946cd1e8aSAlberto Garcia } 280046cd1e8aSAlberto Garcia 280146cd1e8aSAlberto Garcia ret = bdrv_common_block_status_above(bs, NULL, false, false, offset, 2802a92b1b06SEric Blake bytes, &pnum, NULL, NULL, NULL); 280346cd1e8aSAlberto Garcia 280446cd1e8aSAlberto Garcia if (ret < 0) { 280546cd1e8aSAlberto Garcia return ret; 280646cd1e8aSAlberto Garcia } 280746cd1e8aSAlberto Garcia 280846cd1e8aSAlberto Garcia return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); 280946cd1e8aSAlberto Garcia } 281046cd1e8aSAlberto Garcia 2811d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2812d6a644bbSEric Blake int64_t bytes, int64_t *pnum) 281361007b31SStefan Hajnoczi { 28147ddb99b9SEric Blake int ret; 28157ddb99b9SEric Blake int64_t dummy; 2816384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 2817d6a644bbSEric Blake 28183555a432SVladimir Sementsov-Ogievskiy ret = bdrv_common_block_status_above(bs, bs, true, false, offset, 28193555a432SVladimir Sementsov-Ogievskiy bytes, pnum ? 
pnum : &dummy, NULL, 2820a92b1b06SEric Blake NULL, NULL); 282161007b31SStefan Hajnoczi if (ret < 0) { 282261007b31SStefan Hajnoczi return ret; 282361007b31SStefan Hajnoczi } 282461007b31SStefan Hajnoczi return !!(ret & BDRV_BLOCK_ALLOCATED); 282561007b31SStefan Hajnoczi } 282661007b31SStefan Hajnoczi 282761007b31SStefan Hajnoczi /* 282861007b31SStefan Hajnoczi * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 282961007b31SStefan Hajnoczi * 2830a92b1b06SEric Blake * Return a positive depth if (a prefix of) the given range is allocated 2831a92b1b06SEric Blake * in any image between BASE and TOP (BASE is only included if include_base 2832a92b1b06SEric Blake * is set). Depth 1 is TOP, 2 is the first backing layer, and so forth. 2833170d3bd3SAndrey Shinkevich * BASE can be NULL to check if the given offset is allocated in any 2834170d3bd3SAndrey Shinkevich * image of the chain. Return 0 otherwise, or negative errno on 2835170d3bd3SAndrey Shinkevich * failure. 283661007b31SStefan Hajnoczi * 283751b0a488SEric Blake * 'pnum' is set to the number of bytes (including and immediately 283851b0a488SEric Blake * following the specified offset) that are known to be in the same 283951b0a488SEric Blake * allocated/unallocated state. Note that a subsequent call starting 284051b0a488SEric Blake * at 'offset + *pnum' may return the same allocation status (in other 284151b0a488SEric Blake * words, the result is not necessarily the maximum possible range); 284251b0a488SEric Blake * but 'pnum' will only be 0 when end of file is reached. 284361007b31SStefan Hajnoczi */ 284461007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top, 284561007b31SStefan Hajnoczi BlockDriverState *base, 2846170d3bd3SAndrey Shinkevich bool include_base, int64_t offset, 2847170d3bd3SAndrey Shinkevich int64_t bytes, int64_t *pnum) 284861007b31SStefan Hajnoczi { 2849a92b1b06SEric Blake int depth; 28507e7e5100SVladimir Sementsov-Ogievskiy int ret = bdrv_common_block_status_above(top, base, include_base, false, 2851a92b1b06SEric Blake offset, bytes, pnum, NULL, NULL, 2852a92b1b06SEric Blake &depth); 2853384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 285461007b31SStefan Hajnoczi if (ret < 0) { 285561007b31SStefan Hajnoczi return ret; 2856d6a644bbSEric Blake } 285761007b31SStefan Hajnoczi 2858a92b1b06SEric Blake if (ret & BDRV_BLOCK_ALLOCATED) { 2859a92b1b06SEric Blake return depth; 2860a92b1b06SEric Blake } 2861a92b1b06SEric Blake return 0; 286261007b31SStefan Hajnoczi } 286361007b31SStefan Hajnoczi 286421c2283eSVladimir Sementsov-Ogievskiy int coroutine_fn 2865b33b354fSVladimir Sementsov-Ogievskiy bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 28661a8ae822SKevin Wolf { 28671a8ae822SKevin Wolf BlockDriver *drv = bs->drv; 2868c4db2e25SMax Reitz BlockDriverState *child_bs = bdrv_primary_bs(bs); 2869b984b296SVladimir Sementsov-Ogievskiy int ret; 2870b984b296SVladimir Sementsov-Ogievskiy 2871b984b296SVladimir Sementsov-Ogievskiy ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); 2872b984b296SVladimir Sementsov-Ogievskiy if (ret < 0) { 2873b984b296SVladimir Sementsov-Ogievskiy return ret; 2874b984b296SVladimir Sementsov-Ogievskiy } 2875dc88a467SStefan Hajnoczi 2876b33b354fSVladimir Sementsov-Ogievskiy if (!drv) { 2877b33b354fSVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 2878b33b354fSVladimir Sementsov-Ogievskiy } 2879b33b354fSVladimir Sementsov-Ogievskiy 2880dc88a467SStefan Hajnoczi bdrv_inc_in_flight(bs); 28811a8ae822SKevin Wolf 2882b33b354fSVladimir 
Sementsov-Ogievskiy if (drv->bdrv_load_vmstate) { 2883dc88a467SStefan Hajnoczi ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2884c4db2e25SMax Reitz } else if (child_bs) { 2885b33b354fSVladimir Sementsov-Ogievskiy ret = bdrv_co_readv_vmstate(child_bs, qiov, pos); 2886b984b296SVladimir Sementsov-Ogievskiy } else { 2887b984b296SVladimir Sementsov-Ogievskiy ret = -ENOTSUP; 28881a8ae822SKevin Wolf } 28891a8ae822SKevin Wolf 2890dc88a467SStefan Hajnoczi bdrv_dec_in_flight(bs); 2891b33b354fSVladimir Sementsov-Ogievskiy 2892b33b354fSVladimir Sementsov-Ogievskiy return ret; 2893b33b354fSVladimir Sementsov-Ogievskiy } 2894b33b354fSVladimir Sementsov-Ogievskiy 2895b33b354fSVladimir Sementsov-Ogievskiy int coroutine_fn 2896b33b354fSVladimir Sementsov-Ogievskiy bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2897b33b354fSVladimir Sementsov-Ogievskiy { 2898b33b354fSVladimir Sementsov-Ogievskiy BlockDriver *drv = bs->drv; 2899b33b354fSVladimir Sementsov-Ogievskiy BlockDriverState *child_bs = bdrv_primary_bs(bs); 2900b984b296SVladimir Sementsov-Ogievskiy int ret; 2901b984b296SVladimir Sementsov-Ogievskiy 2902b984b296SVladimir Sementsov-Ogievskiy ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); 2903b984b296SVladimir Sementsov-Ogievskiy if (ret < 0) { 2904b984b296SVladimir Sementsov-Ogievskiy return ret; 2905b984b296SVladimir Sementsov-Ogievskiy } 2906b33b354fSVladimir Sementsov-Ogievskiy 2907b33b354fSVladimir Sementsov-Ogievskiy if (!drv) { 2908b33b354fSVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 2909b33b354fSVladimir Sementsov-Ogievskiy } 2910b33b354fSVladimir Sementsov-Ogievskiy 2911b33b354fSVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(bs); 2912b33b354fSVladimir Sementsov-Ogievskiy 2913b33b354fSVladimir Sementsov-Ogievskiy if (drv->bdrv_save_vmstate) { 2914b33b354fSVladimir Sementsov-Ogievskiy ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2915b33b354fSVladimir Sementsov-Ogievskiy } else if (child_bs) { 2916b33b354fSVladimir Sementsov-Ogievskiy ret = bdrv_co_writev_vmstate(child_bs, qiov, pos); 2917b984b296SVladimir Sementsov-Ogievskiy } else { 2918b984b296SVladimir Sementsov-Ogievskiy ret = -ENOTSUP; 2919b33b354fSVladimir Sementsov-Ogievskiy } 2920b33b354fSVladimir Sementsov-Ogievskiy 2921b33b354fSVladimir Sementsov-Ogievskiy bdrv_dec_in_flight(bs); 2922b33b354fSVladimir Sementsov-Ogievskiy 2923dc88a467SStefan Hajnoczi return ret; 29241a8ae822SKevin Wolf } 29251a8ae822SKevin Wolf 292661007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 292761007b31SStefan Hajnoczi int64_t pos, int size) 292861007b31SStefan Hajnoczi { 29290d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2930b33b354fSVladimir Sementsov-Ogievskiy int ret = bdrv_writev_vmstate(bs, &qiov, pos); 2931384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 293261007b31SStefan Hajnoczi 2933b33b354fSVladimir Sementsov-Ogievskiy return ret < 0 ? ret : size; 293461007b31SStefan Hajnoczi } 293561007b31SStefan Hajnoczi 293661007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 293761007b31SStefan Hajnoczi int64_t pos, int size) 293861007b31SStefan Hajnoczi { 29390d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2940b33b354fSVladimir Sementsov-Ogievskiy int ret = bdrv_readv_vmstate(bs, &qiov, pos); 2941384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 29425ddda0b8SKevin Wolf 2943b33b354fSVladimir Sementsov-Ogievskiy return ret < 0 ? 
ret : size; 294461007b31SStefan Hajnoczi } 294561007b31SStefan Hajnoczi 294661007b31SStefan Hajnoczi /**************************************************************/ 294761007b31SStefan Hajnoczi /* async I/Os */ 294861007b31SStefan Hajnoczi 294961007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb) 295061007b31SStefan Hajnoczi { 2951384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 295261007b31SStefan Hajnoczi qemu_aio_ref(acb); 295361007b31SStefan Hajnoczi bdrv_aio_cancel_async(acb); 295461007b31SStefan Hajnoczi while (acb->refcnt > 1) { 295561007b31SStefan Hajnoczi if (acb->aiocb_info->get_aio_context) { 295661007b31SStefan Hajnoczi aio_poll(acb->aiocb_info->get_aio_context(acb), true); 295761007b31SStefan Hajnoczi } else if (acb->bs) { 29582f47da5fSPaolo Bonzini /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 29592f47da5fSPaolo Bonzini * assert that we're not using an I/O thread. Thread-safe 29602f47da5fSPaolo Bonzini * code should use bdrv_aio_cancel_async exclusively. 29612f47da5fSPaolo Bonzini */ 29622f47da5fSPaolo Bonzini assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 296361007b31SStefan Hajnoczi aio_poll(bdrv_get_aio_context(acb->bs), true); 296461007b31SStefan Hajnoczi } else { 296561007b31SStefan Hajnoczi abort(); 296661007b31SStefan Hajnoczi } 296761007b31SStefan Hajnoczi } 296861007b31SStefan Hajnoczi qemu_aio_unref(acb); 296961007b31SStefan Hajnoczi } 297061007b31SStefan Hajnoczi 297161007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements 297261007b31SStefan Hajnoczi * cancel_async, otherwise we do nothing and let the request normally complete. 297361007b31SStefan Hajnoczi * In either case the completion callback must be called. */ 297461007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb) 297561007b31SStefan Hajnoczi { 2976384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 297761007b31SStefan Hajnoczi if (acb->aiocb_info->cancel_async) { 297861007b31SStefan Hajnoczi acb->aiocb_info->cancel_async(acb); 297961007b31SStefan Hajnoczi } 298061007b31SStefan Hajnoczi } 298161007b31SStefan Hajnoczi 298261007b31SStefan Hajnoczi /**************************************************************/ 298361007b31SStefan Hajnoczi /* Coroutine block device emulation */ 298461007b31SStefan Hajnoczi 298561007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 298661007b31SStefan Hajnoczi { 2987883833e2SMax Reitz BdrvChild *primary_child = bdrv_primary_child(bs); 2988883833e2SMax Reitz BdrvChild *child; 298949ca6259SFam Zheng int current_gen; 299049ca6259SFam Zheng int ret = 0; 2991384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 299261007b31SStefan Hajnoczi 299399723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2994c32b82afSPavel Dovgalyuk 2995e914404eSFam Zheng if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 299649ca6259SFam Zheng bdrv_is_sg(bs)) { 299749ca6259SFam Zheng goto early_exit; 299849ca6259SFam Zheng } 299949ca6259SFam Zheng 30003783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 3001d73415a3SStefan Hajnoczi current_gen = qatomic_read(&bs->write_gen); 30023ff2f67aSEvgeny Yakovlev 30033ff2f67aSEvgeny Yakovlev /* Wait until any previous flushes are completed */ 300499723548SPaolo Bonzini while (bs->active_flush_req) { 30053783fa3dSPaolo Bonzini qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 30063ff2f67aSEvgeny Yakovlev } 30073ff2f67aSEvgeny Yakovlev 30083783fa3dSPaolo Bonzini /* Flushes reach this point in nondecreasing current_gen order. 
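 * Editorial illustration, not in the original comment: if this flush
 * samples current_gen == 7, any later flush samples a generation >= 7;
 * once this one completes successfully, flushed_gen == 7 lets a later
 * flush with the same generation skip the disk flush entirely.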
*/ 300999723548SPaolo Bonzini bs->active_flush_req = true; 30103783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 30113ff2f67aSEvgeny Yakovlev 3012c32b82afSPavel Dovgalyuk /* Write back all layers by calling one driver function */ 3013c32b82afSPavel Dovgalyuk if (bs->drv->bdrv_co_flush) { 3014c32b82afSPavel Dovgalyuk ret = bs->drv->bdrv_co_flush(bs); 3015c32b82afSPavel Dovgalyuk goto out; 3016c32b82afSPavel Dovgalyuk } 3017c32b82afSPavel Dovgalyuk 301861007b31SStefan Hajnoczi /* Write back cached data to the OS even with cache=unsafe */ 3019883833e2SMax Reitz BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); 302061007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_os) { 302161007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_os(bs); 302261007b31SStefan Hajnoczi if (ret < 0) { 3023cdb5e315SFam Zheng goto out; 302461007b31SStefan Hajnoczi } 302561007b31SStefan Hajnoczi } 302661007b31SStefan Hajnoczi 302761007b31SStefan Hajnoczi /* But don't actually force it to the disk with cache=unsafe */ 302861007b31SStefan Hajnoczi if (bs->open_flags & BDRV_O_NO_FLUSH) { 3029883833e2SMax Reitz goto flush_children; 303061007b31SStefan Hajnoczi } 303161007b31SStefan Hajnoczi 30323ff2f67aSEvgeny Yakovlev /* Check if we really need to flush anything */ 30333ff2f67aSEvgeny Yakovlev if (bs->flushed_gen == current_gen) { 3034883833e2SMax Reitz goto flush_children; 30353ff2f67aSEvgeny Yakovlev } 30363ff2f67aSEvgeny Yakovlev 3037883833e2SMax Reitz BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); 3038d470ad42SMax Reitz if (!bs->drv) { 3039d470ad42SMax Reitz /* bs->drv->bdrv_co_flush() might have ejected the BDS 3040d470ad42SMax Reitz * (even in case of apparent success) */ 3041d470ad42SMax Reitz ret = -ENOMEDIUM; 3042d470ad42SMax Reitz goto out; 3043d470ad42SMax Reitz } 304461007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_disk) { 304561007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_disk(bs); 304661007b31SStefan Hajnoczi } else if (bs->drv->bdrv_aio_flush) { 304761007b31SStefan Hajnoczi BlockAIOCB *acb; 304861007b31SStefan Hajnoczi CoroutineIOCompletion co = { 304961007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 305061007b31SStefan Hajnoczi }; 305161007b31SStefan Hajnoczi 305261007b31SStefan Hajnoczi acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 305361007b31SStefan Hajnoczi if (acb == NULL) { 305461007b31SStefan Hajnoczi ret = -EIO; 305561007b31SStefan Hajnoczi } else { 305661007b31SStefan Hajnoczi qemu_coroutine_yield(); 305761007b31SStefan Hajnoczi ret = co.ret; 305861007b31SStefan Hajnoczi } 305961007b31SStefan Hajnoczi } else { 306061007b31SStefan Hajnoczi /* 306161007b31SStefan Hajnoczi * Some block drivers always operate in either writethrough or unsafe 306261007b31SStefan Hajnoczi * mode and don't support bdrv_flush therefore. Usually qemu doesn't 306361007b31SStefan Hajnoczi * know how the server works (because the behaviour is hardcoded or 306461007b31SStefan Hajnoczi * depends on server-side configuration), so we can't ensure that 306561007b31SStefan Hajnoczi * everything is safe on disk. Returning an error doesn't work because 306661007b31SStefan Hajnoczi * that would break guests even if the server operates in writethrough 306761007b31SStefan Hajnoczi * mode. 306861007b31SStefan Hajnoczi * 306961007b31SStefan Hajnoczi * Let's hope the user knows what he's doing. 
307061007b31SStefan Hajnoczi */ 307161007b31SStefan Hajnoczi ret = 0; 307261007b31SStefan Hajnoczi } 30733ff2f67aSEvgeny Yakovlev 307461007b31SStefan Hajnoczi if (ret < 0) { 3075cdb5e315SFam Zheng goto out; 307661007b31SStefan Hajnoczi } 307761007b31SStefan Hajnoczi 307861007b31SStefan Hajnoczi /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 307961007b31SStefan Hajnoczi * in the case of cache=unsafe, so there are no useless flushes. 308061007b31SStefan Hajnoczi */ 3081883833e2SMax Reitz flush_children: 3082883833e2SMax Reitz ret = 0; 3083883833e2SMax Reitz QLIST_FOREACH(child, &bs->children, next) { 3084883833e2SMax Reitz if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { 3085883833e2SMax Reitz int this_child_ret = bdrv_co_flush(child->bs); 3086883833e2SMax Reitz if (!ret) { 3087883833e2SMax Reitz ret = this_child_ret; 3088883833e2SMax Reitz } 3089883833e2SMax Reitz } 3090883833e2SMax Reitz } 3091883833e2SMax Reitz 3092cdb5e315SFam Zheng out: 30933ff2f67aSEvgeny Yakovlev /* Notify any pending flushes that we have completed */ 3094e6af1e08SKevin Wolf if (ret == 0) { 30953ff2f67aSEvgeny Yakovlev bs->flushed_gen = current_gen; 3096e6af1e08SKevin Wolf } 30973783fa3dSPaolo Bonzini 30983783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 309999723548SPaolo Bonzini bs->active_flush_req = false; 3100156af3acSDenis V. Lunev /* Return value is ignored - it's ok if wait queue is empty */ 3101156af3acSDenis V. Lunev qemu_co_queue_next(&bs->flush_queue); 31023783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 31033ff2f67aSEvgeny Yakovlev 310449ca6259SFam Zheng early_exit: 310599723548SPaolo Bonzini bdrv_dec_in_flight(bs); 3106cdb5e315SFam Zheng return ret; 310761007b31SStefan Hajnoczi } 310861007b31SStefan Hajnoczi 3109d93e5726SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 3110d93e5726SVladimir Sementsov-Ogievskiy int64_t bytes) 311161007b31SStefan Hajnoczi { 3112b1066c87SFam Zheng BdrvTrackedRequest req; 311339af49c0SVladimir Sementsov-Ogievskiy int ret; 311439af49c0SVladimir Sementsov-Ogievskiy int64_t max_pdiscard; 31153482b9bcSEric Blake int head, tail, align; 31160b9fd3f4SFam Zheng BlockDriverState *bs = child->bs; 3117384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 311861007b31SStefan Hajnoczi 3119d93e5726SVladimir Sementsov-Ogievskiy if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 312061007b31SStefan Hajnoczi return -ENOMEDIUM; 312161007b31SStefan Hajnoczi } 312261007b31SStefan Hajnoczi 3123d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 3124d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 3125d6883bc9SVladimir Sementsov-Ogievskiy } 3126d6883bc9SVladimir Sementsov-Ogievskiy 312769b55e03SVladimir Sementsov-Ogievskiy ret = bdrv_check_request(offset, bytes, NULL); 31288b117001SVladimir Sementsov-Ogievskiy if (ret < 0) { 31298b117001SVladimir Sementsov-Ogievskiy return ret; 313061007b31SStefan Hajnoczi } 313161007b31SStefan Hajnoczi 313261007b31SStefan Hajnoczi /* Do nothing if disabled. 
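 * Editorial note, not in the original comment: discard is advisory, so
 * completing the request without touching the image is a valid way to
 * honour it when BDRV_O_UNMAP is not set.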
*/ 313361007b31SStefan Hajnoczi if (!(bs->open_flags & BDRV_O_UNMAP)) { 313461007b31SStefan Hajnoczi return 0; 313561007b31SStefan Hajnoczi } 313661007b31SStefan Hajnoczi 313702aefe43SEric Blake if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 313861007b31SStefan Hajnoczi return 0; 313961007b31SStefan Hajnoczi } 314061007b31SStefan Hajnoczi 31410bc329fbSHanna Reitz /* Invalidate the cached block-status data range if this discard overlaps */ 31420bc329fbSHanna Reitz bdrv_bsc_invalidate_range(bs, offset, bytes); 31430bc329fbSHanna Reitz 31443482b9bcSEric Blake /* Discard is advisory, but some devices track and coalesce 31453482b9bcSEric Blake * unaligned requests, so we must pass everything down rather than 31463482b9bcSEric Blake * round here. Still, most devices will just silently ignore 31473482b9bcSEric Blake * unaligned requests (by returning -ENOTSUP), so we must fragment 31483482b9bcSEric Blake * the request accordingly. */ 314902aefe43SEric Blake align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 3150b8d0a980SEric Blake assert(align % bs->bl.request_alignment == 0); 3151b8d0a980SEric Blake head = offset % align; 3152f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % align; 31539f1963b3SEric Blake 315499723548SPaolo Bonzini bdrv_inc_in_flight(bs); 3155f5a5ca79SManos Pitsidianakis tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 315650824995SFam Zheng 315700695c27SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 3158ec050f77SDenis V. Lunev if (ret < 0) { 3159ec050f77SDenis V. Lunev goto out; 3160ec050f77SDenis V. Lunev } 3161ec050f77SDenis V. Lunev 31626a8f3dbbSVladimir Sementsov-Ogievskiy max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX), 31639f1963b3SEric Blake align); 31643482b9bcSEric Blake assert(max_pdiscard >= bs->bl.request_alignment); 31659f1963b3SEric Blake 3166f5a5ca79SManos Pitsidianakis while (bytes > 0) { 3167d93e5726SVladimir Sementsov-Ogievskiy int64_t num = bytes; 31683482b9bcSEric Blake 31693482b9bcSEric Blake if (head) { 31703482b9bcSEric Blake /* Make small requests to get to alignment boundaries. */ 3171f5a5ca79SManos Pitsidianakis num = MIN(bytes, align - head); 31723482b9bcSEric Blake if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 31733482b9bcSEric Blake num %= bs->bl.request_alignment; 31743482b9bcSEric Blake } 31753482b9bcSEric Blake head = (head + num) % align; 31763482b9bcSEric Blake assert(num < max_pdiscard); 31773482b9bcSEric Blake } else if (tail) { 31783482b9bcSEric Blake if (num > align) { 31793482b9bcSEric Blake /* Shorten the request to the last aligned cluster. 
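 * Editorial worked example for the surrounding loop, not in the original
 * comment: assuming a 64 KiB discard alignment, a 512-byte
 * request_alignment and a large max_pdiscard, a discard of
 * [100 KiB, 300 KiB) is issued as three driver calls: [100 KiB, 128 KiB)
 * to reach alignment, [128 KiB, 256 KiB) for the aligned middle, and
 * [256 KiB, 300 KiB) for the unaligned tail.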
*/ 31803482b9bcSEric Blake num -= tail; 31813482b9bcSEric Blake } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 31823482b9bcSEric Blake tail > bs->bl.request_alignment) { 31833482b9bcSEric Blake tail %= bs->bl.request_alignment; 31843482b9bcSEric Blake num -= tail; 31853482b9bcSEric Blake } 31863482b9bcSEric Blake } 31873482b9bcSEric Blake /* limit request size */ 31883482b9bcSEric Blake if (num > max_pdiscard) { 31893482b9bcSEric Blake num = max_pdiscard; 31903482b9bcSEric Blake } 319161007b31SStefan Hajnoczi 3192d470ad42SMax Reitz if (!bs->drv) { 3193d470ad42SMax Reitz ret = -ENOMEDIUM; 3194d470ad42SMax Reitz goto out; 3195d470ad42SMax Reitz } 319647a5486dSEric Blake if (bs->drv->bdrv_co_pdiscard) { 319747a5486dSEric Blake ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 319861007b31SStefan Hajnoczi } else { 319961007b31SStefan Hajnoczi BlockAIOCB *acb; 320061007b31SStefan Hajnoczi CoroutineIOCompletion co = { 320161007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 320261007b31SStefan Hajnoczi }; 320361007b31SStefan Hajnoczi 32044da444a0SEric Blake acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 320561007b31SStefan Hajnoczi bdrv_co_io_em_complete, &co); 320661007b31SStefan Hajnoczi if (acb == NULL) { 3207b1066c87SFam Zheng ret = -EIO; 3208b1066c87SFam Zheng goto out; 320961007b31SStefan Hajnoczi } else { 321061007b31SStefan Hajnoczi qemu_coroutine_yield(); 321161007b31SStefan Hajnoczi ret = co.ret; 321261007b31SStefan Hajnoczi } 321361007b31SStefan Hajnoczi } 321461007b31SStefan Hajnoczi if (ret && ret != -ENOTSUP) { 3215b1066c87SFam Zheng goto out; 321661007b31SStefan Hajnoczi } 321761007b31SStefan Hajnoczi 32189f1963b3SEric Blake offset += num; 3219f5a5ca79SManos Pitsidianakis bytes -= num; 322061007b31SStefan Hajnoczi } 3221b1066c87SFam Zheng ret = 0; 3222b1066c87SFam Zheng out: 322300695c27SFam Zheng bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3224b1066c87SFam Zheng tracked_request_end(&req); 322599723548SPaolo Bonzini bdrv_dec_in_flight(bs); 3226b1066c87SFam Zheng return ret; 322761007b31SStefan Hajnoczi } 322861007b31SStefan Hajnoczi 322948af776aSKevin Wolf int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 323061007b31SStefan Hajnoczi { 323161007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 32325c5ae76aSFam Zheng CoroutineIOCompletion co = { 32335c5ae76aSFam Zheng .coroutine = qemu_coroutine_self(), 32345c5ae76aSFam Zheng }; 32355c5ae76aSFam Zheng BlockAIOCB *acb; 3236384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 323761007b31SStefan Hajnoczi 323899723548SPaolo Bonzini bdrv_inc_in_flight(bs); 323916a389dcSKevin Wolf if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 32405c5ae76aSFam Zheng co.ret = -ENOTSUP; 32415c5ae76aSFam Zheng goto out; 32425c5ae76aSFam Zheng } 32435c5ae76aSFam Zheng 324416a389dcSKevin Wolf if (drv->bdrv_co_ioctl) { 324516a389dcSKevin Wolf co.ret = drv->bdrv_co_ioctl(bs, req, buf); 324616a389dcSKevin Wolf } else { 32475c5ae76aSFam Zheng acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 32485c5ae76aSFam Zheng if (!acb) { 3249c8a9fd80SFam Zheng co.ret = -ENOTSUP; 3250c8a9fd80SFam Zheng goto out; 32515c5ae76aSFam Zheng } 32525c5ae76aSFam Zheng qemu_coroutine_yield(); 325316a389dcSKevin Wolf } 32545c5ae76aSFam Zheng out: 325599723548SPaolo Bonzini bdrv_dec_in_flight(bs); 32565c5ae76aSFam Zheng return co.ret; 32575c5ae76aSFam Zheng } 32585c5ae76aSFam Zheng 325961007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size) 326061007b31SStefan Hajnoczi { 

void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    IO_CODE();
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    IO_CODE();
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);
    IO_CODE();

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);
    IO_CODE();

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}
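
/*
 * Usage sketch (hypothetical helper): within this family, the variants
 * differ only in zeroing and error handling.  qemu_blockalign() aborts on
 * allocation failure, while qemu_try_blockalign() returns NULL instead;
 * buffers are released with qemu_vfree().
 */
static uint8_t *try_alloc_bounce_buffer(BlockDriverState *bs, size_t size)
{
    /* Start address honours bdrv_opt_mem_align(bs), so O_DIRECT I/O works */
    uint8_t *buf = qemu_try_blockalign0(bs, size);

    if (!buf) {
        return NULL;    /* caller falls back to an unaligned slow path */
    }
    return buf;
}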

/*
 * Check if all memory in this vector is aligned to the minimum memory
 * alignment of the given BlockDriverState.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);
    IO_CODE();

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;
    IO_CODE();

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;
    IO_CODE();

    assert(bs->io_plugged);
    if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}
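
/*
 * Usage sketch: plug/unplug is a pure batching hint.  Requests submitted
 * between the two calls may be queued by drivers (e.g. the Linux AIO
 * backend) and submitted in one batch on the final unplug.  The helper
 * and its callback are hypothetical; blk_io_plug()/blk_io_unplug() are
 * the BlockBackend wrappers around the functions above.
 */
static void submit_read_batch(BlockBackend *blk, int64_t *offsets,
                              QEMUIOVector *qiovs, int n,
                              BlockCompletionFunc *cb)
{
    int i;

    blk_io_plug(blk);                   /* plugs the whole node tree */
    for (i = 0; i < n; i++) {
        blk_aio_preadv(blk, offsets[i], &qiovs[i], 0, cb, NULL);
    }
    blk_io_unplug(blk);                 /* counter hits 0 -> drivers flush */
}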

void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    BdrvChild *child;

    GLOBAL_STATE_CODE();
    if (bs->drv && bs->drv->bdrv_register_buf) {
        bs->drv->bdrv_register_buf(bs, host, size);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_register_buf(child->bs, host, size);
    }
}

void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
    BdrvChild *child;

    GLOBAL_STATE_CODE();
    if (bs->drv && bs->drv->bdrv_unregister_buf) {
        bs->drv->bdrv_unregister_buf(bs, host);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_unregister_buf(child->bs, host);
    }
}

static int coroutine_fn bdrv_co_copy_range_internal(
        BdrvChild *src, int64_t src_offset, BdrvChild *dst,
        int64_t dst_offset, int64_t bytes,
        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
        bool recurse_src)
{
    BdrvTrackedRequest req;
    int ret;

    /* TODO We can support BDRV_REQ_NO_FALLBACK here */
    assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
    assert(!(write_flags & BDRV_REQ_NO_FALLBACK));

    if (!dst || !dst->bs || !bdrv_is_inserted(dst->bs)) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_request32(dst_offset, bytes, NULL, 0);
    if (ret) {
        return ret;
    }
    if (write_flags & BDRV_REQ_ZERO_WRITE) {
        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
    }

    if (!src || !src->bs || !bdrv_is_inserted(src->bs)) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_request32(src_offset, bytes, NULL, 0);
    if (ret) {
        return ret;
    }

    if (!src->bs->drv->bdrv_co_copy_range_from
        || !dst->bs->drv->bdrv_co_copy_range_to
        || src->bs->encrypted || dst->bs->encrypted) {
        return -ENOTSUP;
    }

    if (recurse_src) {
        bdrv_inc_in_flight(src->bs);
        tracked_request_begin(&req, src->bs, src_offset, bytes,
                              BDRV_TRACKED_READ);

        /* BDRV_REQ_SERIALISING is only for write operation */
        assert(!(read_flags & BDRV_REQ_SERIALISING));
        bdrv_wait_serialising_requests(&req);

        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                    src, src_offset,
                                                    dst, dst_offset,
                                                    bytes,
                                                    read_flags, write_flags);

        tracked_request_end(&req);
        bdrv_dec_in_flight(src->bs);
    } else {
        bdrv_inc_in_flight(dst->bs);
        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
                              BDRV_TRACKED_WRITE);
        ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
                                        write_flags);
        if (!ret) {
            ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
                                                      src, src_offset,
                                                      dst, dst_offset,
                                                      bytes,
                                                      read_flags, write_flags);
        }
        bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
        tracked_request_end(&req);
        bdrv_dec_in_flight(dst->bs);
    }

    return ret;
}

/* Copy range from @src to @dst.
 *
 * See the comment of bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
                                         BdrvChild *dst, int64_t dst_offset,
                                         int64_t bytes,
                                         BdrvRequestFlags read_flags,
                                         BdrvRequestFlags write_flags)
{
    IO_CODE();
    trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
                                  read_flags, write_flags);
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, read_flags, write_flags, true);
}

/* Copy range from @src to @dst.
 *
 * See the comment of bdrv_co_copy_range for the parameter and return value
 * semantics. */
int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
                                       BdrvChild *dst, int64_t dst_offset,
                                       int64_t bytes,
                                       BdrvRequestFlags read_flags,
                                       BdrvRequestFlags write_flags)
{
    IO_CODE();
    trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
                                read_flags, write_flags);
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, read_flags, write_flags, false);
}

int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
                                    BdrvChild *dst, int64_t dst_offset,
                                    int64_t bytes, BdrvRequestFlags read_flags,
                                    BdrvRequestFlags write_flags)
{
    IO_CODE();
    return bdrv_co_copy_range_from(src, src_offset,
                                   dst, dst_offset,
                                   bytes, read_flags, write_flags);
}
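
/*
 * Usage sketch (hypothetical coroutine): offload is best-effort.  When
 * either driver lacks copy_range support, the call fails with -ENOTSUP
 * and the caller is expected to bounce the data itself, roughly:
 */
static int coroutine_fn copy_range_with_fallback(BdrvChild *src,
                                                 BdrvChild *dst,
                                                 int64_t offset, int64_t bytes)
{
    void *buf;
    int ret = bdrv_co_copy_range(src, offset, dst, offset, bytes, 0, 0);

    if (ret != -ENOTSUP) {
        return ret;
    }

    /* Fall back to an aligned bounce buffer (assumes bytes fits in memory) */
    buf = qemu_blockalign(src->bs, bytes);
    ret = bdrv_co_pread(src, offset, bytes, buf, 0);
    if (ret >= 0) {
        ret = bdrv_co_pwrite(dst, offset, bytes, buf, 0);
    }
    qemu_vfree(buf);
    return ret;
}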

static void bdrv_parent_cb_resize(BlockDriverState *bs)
{
    BdrvChild *c;
    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->klass->resize) {
            c->klass->resize(c);
        }
    }
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 *
 * If 'exact' is true, the file must be resized to exactly the given
 * 'offset'.  Otherwise, it is sufficient for the node to be at least
 * 'offset' bytes in length.
 */
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
                                  PreallocMode prealloc, BdrvRequestFlags flags,
                                  Error **errp)
{
    BlockDriverState *bs = child->bs;
    BdrvChild *filtered, *backing;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int64_t old_size, new_bytes;
    int ret;
    IO_CODE();

    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        error_setg(errp, "No medium inserted");
        return -ENOMEDIUM;
    }
    if (offset < 0) {
        error_setg(errp, "Image size cannot be negative");
        return -EINVAL;
    }

    ret = bdrv_check_request(offset, 0, errp);
    if (ret < 0) {
        return ret;
    }

    old_size = bdrv_getlength(bs);
    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get old image size");
        return old_size;
    }

    if (bdrv_is_read_only(bs)) {
        error_setg(errp, "Image is read-only");
        return -EACCES;
    }

    if (offset > old_size) {
        new_bytes = offset - old_size;
    } else {
        new_bytes = 0;
    }

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
                          BDRV_TRACKED_TRUNCATE);

    /* If we are growing the image and potentially using preallocation for the
     * new area, we need to make sure that no write requests are made to it
     * concurrently or they might be overwritten by preallocation. */
    if (new_bytes) {
        bdrv_make_request_serialising(&req, 1);
    }
    ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
                                    0);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Failed to prepare request for truncation");
        goto out;
    }

    filtered = bdrv_filter_child(bs);
    backing = bdrv_cow_child(bs);

    /*
     * If the image has a backing file that is large enough that it would
     * provide data for the new area, we cannot leave it unallocated because
     * then the backing file content would become visible.  Instead, zero-fill
     * the new area.
     *
     * Note that if the image has a backing file, but was opened without the
     * backing file, taking care of keeping things consistent with that backing
     * file is the user's responsibility.
     */
    if (new_bytes && backing) {
        int64_t backing_len;

        backing_len = bdrv_getlength(backing->bs);
        if (backing_len < 0) {
            ret = backing_len;
            error_setg_errno(errp, -ret, "Could not get backing file size");
            goto out;
        }

        if (backing_len > old_size) {
            flags |= BDRV_REQ_ZERO_WRITE;
        }
    }

    if (drv->bdrv_co_truncate) {
        if (flags & ~bs->supported_truncate_flags) {
            error_setg(errp, "Block driver does not support requested flags");
            ret = -ENOTSUP;
            goto out;
        }
        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
    } else if (filtered) {
        ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
    } else {
        error_setg(errp, "Image format driver does not support resize");
        ret = -ENOTSUP;
        goto out;
    }
    if (ret < 0) {
        goto out;
    }

    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
    } else {
        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    }
    /* It's possible that truncation succeeded but refresh_total_sectors
     * failed, but the latter doesn't affect how we should finish the request.
     * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
    bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}
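
/*
 * Usage sketch (hypothetical helper): grow a node to at least @new_size
 * bytes without preallocating.  exact=false lets drivers round up to
 * their allocation granularity; PREALLOC_MODE_OFF keeps the new area
 * sparse where possible.
 */
static int coroutine_fn grow_node(BdrvChild *child, int64_t new_size,
                                  Error **errp)
{
    return bdrv_co_truncate(child, new_size, false, PREALLOC_MODE_OFF, 0,
                            errp);
}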

void bdrv_cancel_in_flight(BlockDriverState *bs)
{
    GLOBAL_STATE_CODE();
    if (!bs || !bs->drv) {
        return;
    }

    if (bs->drv->bdrv_cancel_in_flight) {
        bs->drv->bdrv_cancel_in_flight(bs);
    }
}
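
/*
 * Usage sketch (hypothetical caller): force pending requests to finish
 * early before a drain, so the drain cannot hang on an unresponsive
 * backend.  The NBD driver implements .bdrv_cancel_in_flight for this
 * case, e.g. when a mirror job is force-cancelled.
 */
static void force_quiesce(BlockDriverState *bs)
{
    bdrv_cancel_in_flight(bs);          /* pending requests complete early */
    bdrv_drained_begin(bs);             /* now guaranteed to make progress */
    bdrv_drained_end(bs);
}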