/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "block/coroutines.h"
#include "block/dirty-bitmap.h"
#include "block/write-threshold.h"
#include "qemu/cutils.h"
#include "qemu/memalign.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        bdrv_parent_drained_begin_single(c);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    IO_OR_GS_CODE();

    assert(c->quiesced_parent);
    c->quiesced_parent = false;

    if (c->klass->drained_end) {
        c->klass->drained_end(c);
    }
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore) {
            continue;
        }
        bdrv_parent_drained_end_single(c);
    }
}

bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c)
{
    IO_OR_GS_CODE();

    assert(!c->quiesced_parent);
    c->quiesced_parent = true;

    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
                                  src->pdiscard_alignment);
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
                                        src->max_hw_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
    dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov);
}
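
/*
 * Illustrative note (hypothetical values, not from the original file): a
 * parent reporting no max_transfer limit (0) merged with a child limited
 * to 64 KiB yields MIN_NON_ZERO(0, 65536) == 65536, while real limits of
 * 64 KiB and 128 KiB merge to the stricter 64 KiB. Alignments use MAX()
 * instead, because the merged node must satisfy the strictest alignment
 * of any child.
 */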

typedef struct BdrvRefreshLimitsState {
    BlockDriverState *bs;
    BlockLimits old_bl;
} BdrvRefreshLimitsState;

static void bdrv_refresh_limits_abort(void *opaque)
{
    BdrvRefreshLimitsState *s = opaque;

    s->bs->bl = s->old_bl;
}

static TransactionActionDrv bdrv_refresh_limits_drv = {
    .abort = bdrv_refresh_limits_abort,
    .clean = g_free,
};

/* @tran is allowed to be NULL, in this case no rollback is possible. */
void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
{
    ERRP_GUARD();
    BlockDriver *drv = bs->drv;
    BdrvChild *c;
    bool have_limits;

    GLOBAL_STATE_CODE();

    if (tran) {
        BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
        *s = (BdrvRefreshLimitsState) {
            .bs = bs,
            .old_bl = bs->bl,
        };
        tran_add(tran, &bdrv_refresh_limits_drv, s);
    }

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    have_limits = false;
    QLIST_FOREACH(c, &bs->children, next) {
        if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
        {
            bdrv_merge_limits(&bs->bl, &c->bs->bl);
            have_limits = true;
        }
    }

    if (!have_limits) {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
        if (*errp) {
            return;
        }
    }

    if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
        error_setg(errp, "Driver requires too large request alignment");
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = qatomic_fetch_dec(&bs->copy_on_read);
    IO_CODE();
    assert(old >= 1);
}
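
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * every user pairs its own enable/disable calls, and the feature stays
 * active until the last user disables it:
 *
 *     bdrv_enable_copy_on_read(bs);    // refcount 0 -> 1, enabled
 *     bdrv_enable_copy_on_read(bs);    // refcount 1 -> 2, still enabled
 *     bdrv_disable_copy_on_read(bs);   // refcount 2 -> 1, still enabled
 *     bdrv_disable_copy_on_read(bs);   // refcount 1 -> 0, disabled again
 */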

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool poll;
    BdrvChild *parent;
} BdrvCoDrainData;

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
                     bool ignore_bds_parents)
{
    IO_OR_GS_CODE();

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (qatomic_read(&bs->in_flight)) {
        return true;
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        aio_context_acquire(ctx);
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->parent, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->parent);
        }
        aio_context_release(ctx);
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin,
                                                BdrvChild *parent,
                                                bool poll)
{
    BdrvCoDrainData data;
    Coroutine *self = qemu_coroutine_self();
    AioContext *ctx = bdrv_get_aio_context(bs);
    AioContext *co_ctx = qemu_coroutine_get_aio_context(self);

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = self,
        .bs = bs,
        .done = false,
        .begin = begin,
        .parent = parent,
        .poll = poll,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }

    /*
     * Temporarily drop the lock across yield or we would get deadlocks.
     * bdrv_co_drain_bh_cb() reacquires the lock as needed.
     *
     * When we yield below, the lock for the current context will be
     * released, so if this is actually the lock that protects bs, don't drop
     * it a second time.
     */
    if (ctx != co_ctx) {
        aio_context_release(ctx);
    }
    replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);

    /* Reacquire the AioContext of bs if we dropped it */
    if (ctx != co_ctx) {
        aio_context_acquire(ctx);
    }
}
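
/*
 * Summary of the handoff above: the coroutine schedules
 * bdrv_co_drain_bh_cb() in the node's AioContext and yields; the BH
 * performs the actual drained begin/end with the AioContext lock held,
 * then wakes the coroutine with aio_co_wake(). The BH detour is what lets
 * coroutines already queued by aio_co_enter() run before the drain logic
 * proceeds.
 */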

static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
                                  bool poll)
{
    IO_OR_GS_CODE();

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, parent, poll);
        return;
    }

    /* Stop things in parent-to-child order */
    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
        bdrv_parent_drained_begin(bs, parent);
        if (bs->drv && bs->drv->bdrv_drain_begin) {
            bs->drv->bdrv_drain_begin(bs);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
    }
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent)
{
    bdrv_do_drained_begin(bs, parent, false);
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_do_drained_begin(bs, NULL, true);
}
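
/*
 * A minimal sketch of how a drained section is typically used by callers
 * (hypothetical caller code, not part of this file):
 *
 *     bdrv_drained_begin(bs);
 *     ... modify the graph or other state that in-flight requests
 *         must not race with ...
 *     bdrv_drained_end(bs);
 *
 * Every begin must be paired with an end; sections nest correctly because
 * bs->quiesce_counter is a counter, not a flag.
 */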

/**
 * This function does not poll, nor must any of its recursively called
 * functions.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
{
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, parent, false);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        if (bs->drv && bs->drv->bdrv_drain_end) {
            bs->drv->bdrv_drain_end(bs);
        }
        bdrv_parent_drained_end(bs, parent);
        aio_enable_external(bdrv_get_aio_context(bs));
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_do_drained_end(bs, NULL);
}

void bdrv_drain(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(qatomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;
    GLOBAL_STATE_CODE();

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin_nopoll(void)
{
    BlockDriverState *bs = NULL;
    GLOBAL_STATE_CODE();

    /*
     * The bdrv queue is managed by record/replay; waiting for the
     * in-flight I/O requests to finish may never terminate.
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, NULL, false);
        aio_context_release(aio_context);
    }
}

void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, NULL, true);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay; waiting for the
     * in-flight I/O requests to finish may never terminate.
     */
    if (replay_events_enabled()) {
        return;
    }

    bdrv_drain_all_begin_nopoll();

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
{
    GLOBAL_STATE_CODE();

    g_assert(bs->quiesce_counter > 0);
    g_assert(!bs->refcnt);

    while (bs->quiesce_counter) {
        bdrv_do_drained_end(bs, NULL);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    GLOBAL_STATE_CODE();

    /*
     * The bdrv queue is managed by record/replay; waiting for the
     * in-flight I/O requests to finish may never terminate.
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, NULL);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    GLOBAL_STATE_CODE();
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
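
/*
 * bdrv_drain_all() above is simply the paired form; a caller that needs
 * to do work while everything is quiescent uses the split variant
 * (hypothetical caller code, not part of this file):
 *
 *     bdrv_drain_all_begin();
 *     ... all nodes are drained, safe for global reconfiguration ...
 *     bdrv_drain_all_end();
 */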

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void coroutine_fn tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        qatomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void coroutine_fn tracked_request_begin(BdrvTrackedRequest *req,
                                               BlockDriverState *bs,
                                               int64_t offset,
                                               int64_t bytes,
                                               enum BdrvTrackedRequestType type)
{
    bdrv_check_request(offset, bytes, &error_abort);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, int64_t bytes)
{
    bdrv_check_request(offset, bytes, &error_abort);

    /* aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
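
/*
 * Worked example (hypothetical values): for a request covering [0, 4096),
 * a query for [4096, 8192) does not overlap because 4096 >= 0 + 4096,
 * while a query for [4095, 4097) overlaps since neither early return
 * above fires. The ranges are half-open, so merely adjacent requests
 * never conflict.
 */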

/* Called with self->bs->reqs_lock held */
static coroutine_fn BdrvTrackedRequest *
bdrv_find_conflicting_request(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;

    QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
        if (req == self || (!req->serialising && !self->serialising)) {
            continue;
        }
        if (tracked_request_overlaps(req, self->overlap_offset,
                                     self->overlap_bytes))
        {
            /*
             * Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests. This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            /*
             * If the request is already (indirectly) waiting for us, or
             * will wait for us as soon as it wakes up, then just go on
             * (instead of producing a deadlock in the former case).
             */
            if (!req->waiting_for) {
                return req;
            }
        }
    }

    return NULL;
}

/* Called with self->bs->reqs_lock held */
static void coroutine_fn
bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;

    while ((req = bdrv_find_conflicting_request(self))) {
        self->waiting_for = req;
        qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
        self->waiting_for = NULL;
    }
}

/* Called with req->bs->reqs_lock held */
static void tracked_request_set_serialising(BdrvTrackedRequest *req,
                                            uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    int64_t overlap_bytes =
        ROUND_UP(req->offset + req->bytes, align) - overlap_offset;

    bdrv_check_request(req->offset, req->bytes, &error_abort);

    if (!req->serialising) {
        qatomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
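
/*
 * Worked example (hypothetical values) for the rounding above: with
 * align = 4096, a request at offset 5000 with 100 bytes gets
 * overlap_offset = 5000 & ~4095 = 4096 and
 * overlap_bytes = ROUND_UP(5100, 4096) - 4096 = 4096, i.e. the
 * serialised range grows to the containing 4 KiB cluster [4096, 8192).
 */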

/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();
    IO_CODE();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void coroutine_fn bdrv_round_to_clusters(BlockDriverState *bs,
                                         int64_t offset, int64_t bytes,
                                         int64_t *cluster_offset,
                                         int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;
    IO_CODE();
    if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
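
/*
 * Worked example (hypothetical values): with a 4096-byte cluster size,
 * offset = 5000 and bytes = 100 give cluster_offset =
 * QEMU_ALIGN_DOWN(5000, 4096) = 4096 and cluster_bytes =
 * QEMU_ALIGN_UP(5000 - 4096 + 100, 4096) = 4096, so the single cluster
 * [4096, 8192) fully covers the original request.
 */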

static coroutine_fn int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_co_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    IO_CODE();
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static void coroutine_fn
bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;

    if (!qatomic_read(&bs->serialising_in_flight)) {
        return;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bdrv_wait_serialising_requests_locked(self);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
                                                uint64_t align)
{
    IO_CODE();

    qemu_co_mutex_lock(&req->bs->reqs_lock);

    tracked_request_set_serialising(req, align);
    bdrv_wait_serialising_requests_locked(req);

    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
                            QEMUIOVector *qiov, size_t qiov_offset,
                            Error **errp)
{
    /*
     * Check generic offset/bytes correctness
     */

    if (offset < 0) {
        error_setg(errp, "offset is negative: %" PRIi64, offset);
        return -EIO;
    }

    if (bytes < 0) {
        error_setg(errp, "bytes is negative: %" PRIi64, bytes);
        return -EIO;
    }

    if (bytes > BDRV_MAX_LENGTH) {
        error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   bytes, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH) {
        error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   offset, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH - bytes) {
        error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
                   "exceeds maximum(%" PRIi64 ")", offset, bytes,
                   BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (!qiov) {
        return 0;
    }

    /*
     * Check qiov and qiov_offset
     */

    if (qiov_offset > qiov->size) {
        error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
                   qiov_offset, qiov->size);
        return -EIO;
    }

    if (bytes > qiov->size - qiov_offset) {
        error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
                   "vector size(%zu)", bytes, qiov_offset, qiov->size);
        return -EIO;
    }

    return 0;
}

int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
{
    return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
}

static int bdrv_check_request32(int64_t offset, int64_t bytes,
                                QEMUIOVector *qiov, size_t qiov_offset)
{
    int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
    if (ret < 0) {
        return ret;
    }

    if (bytes > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    return 0;
}
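
/*
 * Example rejections (hypothetical values): a negative offset, or
 * offset + bytes exceeding BDRV_MAX_LENGTH, fails with -EIO before any
 * driver code runs; so does bytes > qiov->size - qiov_offset, since the
 * I/O vector could not hold the transfer.
 */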

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;
    IO_CODE();

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset,
                                     int64_t bytes, const void *buf,
                                     BdrvRequestFlags flags)
{
    int ret;
    IO_CODE();

    ret = bdrv_co_pwrite(child, offset, bytes, buf, flags);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_co_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
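
/*
 * The CoroutineIOCompletion pattern above bridges callback-style AIO
 * drivers and coroutine callers: the caller stores its own coroutine in
 * the struct, submits the request with bdrv_co_io_em_complete() as the
 * completion callback and yields; the callback records the return value
 * and wakes the coroutine, which then reads co.ret. See its use in
 * bdrv_driver_preadv() below.
 */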

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           int64_t offset, int64_t bytes,
                                           QEMUIOVector *qiov,
                                           size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
    assert(!(flags & ~bs->supported_read_flags));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv_part) {
        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_preadv) {
        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
        goto out;
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
            goto out;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
            goto out;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            int64_t offset, int64_t bytes,
                                            QEMUIOVector *qiov,
                                            size_t qiov_offset,
                                            BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    bool emulate_fua = false;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if ((flags & BDRV_REQ_FUA) &&
        (~bs->supported_write_flags & BDRV_REQ_FUA)) {
        flags &= ~BDRV_REQ_FUA;
        emulate_fua = true;
    }

    flags &= bs->supported_write_flags;

    if (drv->bdrv_co_pwritev_part) {
        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
        goto emulate_flags;
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags);
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags,
                                    bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags);

emulate_flags:
    if (ret == 0 && emulate_fua) {
        ret = bdrv_co_flush(bs);
    }

    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}
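
/*
 * Note on the FUA handling above: if the driver cannot honour
 * BDRV_REQ_FUA natively, the flag is stripped, emulate_fua is set, and a
 * bdrv_co_flush() is issued after a successful write. The durability
 * guarantee is the same, at the cost of a potentially more expensive
 * full flush.
 */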
static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset,
                               int64_t bytes, QEMUIOVector *qiov,
                               size_t qiov_offset)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector local_qiov;
    int ret;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!block_driver_can_compress(drv)) {
        return -ENOTSUP;
    }

    if (drv->bdrv_co_pwritev_compressed_part) {
        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
                                                    qiov, qiov_offset);
    }

    if (qiov_offset == 0) {
        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
    }

    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}
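/*
 * The slicing fallback above can be pictured like this (sketch; assumes
 * qemu_iovec_init_slice() builds a shallow view into the source vector, so
 * no guest data is copied):
 */
static G_GNUC_UNUSED void compressed_write_slice_sketch(QEMUIOVector *qiov)
{
    QEMUIOVector slice;

    /* View of 4 KiB starting 512 bytes into @qiov */
    qemu_iovec_init_slice(&slice, qiov, 512, 4096);

    /* ... hand &slice to a callback that takes no qiov_offset ... */

    qemu_iovec_destroy(&slice);
}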
static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
    size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file. This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer = NULL;

    BlockDriver *drv = bs->drv;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    int64_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    int64_t progress = 0;
    bool skip_write;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);

    if (!drv) {
        return -ENOMEDIUM;
    }

    /*
     * Do not write anything when the BDS is inactive. That is not
     * allowed, and it would not help.
     */
    skip_write = (bs->open_flags & BDRV_O_INACTIVE);

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * however, the copy-on-read code doesn't have its own BdrvChild for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file. Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    while (cluster_bytes) {
        int64_t pnum;

        if (skip_write) {
            ret = 1; /* "already allocated", so nothing will be copied */
            pnum = MIN(cluster_bytes, max_transfer);
        } else {
            ret = bdrv_is_allocated(bs, cluster_offset,
                                    MIN(cluster_bytes, max_transfer), &pnum);
            if (ret < 0) {
                /*
                 * Safe to treat errors in querying allocation as if
                 * unallocated; we'll probably fail again soon on the
                 * read, but at least that will set a decent errno.
                 */
                pnum = MIN(cluster_bytes, max_transfer);
            }

            /* Stop at EOF if the image ends in the middle of the cluster */
            if (ret == 0 && pnum == 0) {
                assert(progress >= bytes);
                break;
            }

            assert(skip_bytes < pnum);
        }

        if (ret <= 0) {
            QEMUIOVector local_qiov;

            /* Must copy-on-read; use the bounce buffer */
            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            if (!bounce_buffer) {
                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);

                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
                if (!bounce_buffer) {
                    ret = -ENOMEM;
                    goto err;
                }
            }
            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests. If this is a deliberate copy-on-read
                 * then we don't want to ignore the error. Simply
                 * report it in all cases.
                 */
                goto err;
            }

            if (!(flags & BDRV_REQ_PREFETCH)) {
                qemu_iovec_from_buf(qiov, qiov_offset + progress,
                                    bounce_buffer + skip_bytes,
                                    MIN(pnum - skip_bytes, bytes - progress));
            }
        } else if (!(flags & BDRV_REQ_PREFETCH)) {
            /* Read directly into the destination */
            ret = bdrv_driver_preadv(bs, offset + progress,
                                     MIN(pnum - skip_bytes, bytes - progress),
                                     qiov, qiov_offset + progress, 0);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}
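/*
 * Worked example of the cluster rounding used by the copy-on-read loop
 * above (hypothetical numbers; bdrv_round_to_clusters() derives the real
 * cluster size from the driver, a 64 KiB cluster is assumed here):
 */
static G_GNUC_UNUSED void cor_cluster_rounding_example(void)
{
    const int64_t cluster_size = 64 * 1024;
    int64_t offset = 70000, bytes = 1000;

    int64_t cluster_offset = QEMU_ALIGN_DOWN(offset, cluster_size);
    int64_t cluster_bytes =
        QEMU_ALIGN_UP(offset + bytes, cluster_size) - cluster_offset;
    int64_t skip_bytes = offset - cluster_offset;

    /* The guest's 1000-byte read widens to one full 64 KiB cluster */
    assert(cluster_offset == 65536);
    assert(cluster_bytes == 65536);
    assert(skip_bytes == 4464);
}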
/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, int64_t bytes,
    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    int64_t bytes_remaining = bytes;
    int max_transfer;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /*
     * TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers. For now, there aren't any
     * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint.
     */
    assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH |
                       BDRV_REQ_REGISTERED_BUF)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap. This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster. For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them.
         */
        bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */
        flags &= ~BDRV_REQ_COPY_ON_READ;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
                                           qiov, qiov_offset, flags);
            goto out;
        } else if (flags & BDRV_REQ_PREFETCH) {
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF)));

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags);
        goto out;
    }

    while (bytes_remaining) {
        int64_t num;

        if (max_bytes) {
            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, qiov,
                                     qiov_offset + bytes - bytes_remaining,
                                     flags);
            max_bytes -= num;
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
                                    0, bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}
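/*
 * Worked example of the zero-fill-after-EOF logic above (hypothetical
 * numbers): for an image of 1000 bytes and request_alignment 512, an
 * aligned read of 1024 bytes at offset 512 gets its first 512 bytes from
 * the driver and the remaining 512 bytes memset to zero.
 */
static G_GNUC_UNUSED void preadv_beyond_eof_example(void)
{
    const int64_t align = 512;
    int64_t total_bytes = 1000;
    int64_t offset = 512, bytes = 1024;

    int64_t max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);

    assert(max_bytes == 512);          /* driver supplies [512, 1024) */
    assert(bytes - max_bytes == 512);  /* [1024, 1536) reads as zeroes */
}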
/*
 * Request padding
 *
 *  |<---- align ----->|                     |<----- align ---->|
 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
 *  |          |       |                     |     |            |
 * -*----------$-------*-------- ... --------*-----$------------*---
 *  |          |       |                     |     |            |
 *  |          offset  |                     |     end          |
 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
 *  [buf ... )                               [tail_buf          )
 *
 * @buf is an aligned allocation needed to store @head and @tail paddings.
 * @head is placed at the beginning of @buf and @tail at the end.
 *
 * @tail_buf is a pointer to the sub-buffer corresponding to the align-sized
 * chunk around the tail, if a tail exists.
 *
 * @merge_reads is true for small requests,
 * if @buf_len == @head + bytes + @tail. In this case it is possible that both
 * head and tail exist but @buf_len == align and @tail_buf == @buf.
 */
typedef struct BdrvRequestPadding {
    uint8_t *buf;
    size_t buf_len;
    uint8_t *tail_buf;
    size_t head;
    size_t tail;
    bool merge_reads;
    QEMUIOVector local_qiov;
} BdrvRequestPadding;

static bool bdrv_init_padding(BlockDriverState *bs,
                              int64_t offset, int64_t bytes,
                              BdrvRequestPadding *pad)
{
    int64_t align = bs->bl.request_alignment;
    int64_t sum;

    bdrv_check_request(offset, bytes, &error_abort);
    assert(align <= INT_MAX); /* documented in block/block_int.h */
    assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */

    memset(pad, 0, sizeof(*pad));

    pad->head = offset & (align - 1);
    pad->tail = ((offset + bytes) & (align - 1));
    if (pad->tail) {
        pad->tail = align - pad->tail;
    }

    if (!pad->head && !pad->tail) {
        return false;
    }

    assert(bytes); /* Nothing good in aligning zero-length requests */

    sum = pad->head + bytes + pad->tail;
    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
    pad->buf = qemu_blockalign(bs, pad->buf_len);
    pad->merge_reads = sum == pad->buf_len;
    if (pad->tail) {
        pad->tail_buf = pad->buf + pad->buf_len - align;
    }

    return true;
}
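/*
 * Worked example (hypothetical numbers) for bdrv_init_padding() above, with
 * request_alignment 4096: offset 5000, bytes 3000 gives head 904 and
 * tail 192; head + bytes + tail == 4096 fits a single align-sized buffer,
 * so merge_reads is true and one RMW read covers both paddings.  A larger
 * request (bytes 8000) spills past 4096, so two align-sized chunks are
 * allocated and head and tail are read separately.
 */
static G_GNUC_UNUSED void init_padding_example(void)
{
    const int64_t align = 4096;
    int64_t head = 5000 & (align - 1);                    /* 904 */
    int64_t tail = align - ((5000 + 3000) & (align - 1)); /* 192 */

    /* Small request: everything fits into one align-sized buffer */
    int64_t sum = head + 3000 + tail;                     /* 4096 */
    assert((sum > align && head && tail ? 2 * align : align) == align);

    /* Large request: head and tail need separate align-sized chunks */
    tail = align - ((5000 + 8000) & (align - 1));         /* 3384 */
    sum = head + 8000 + tail;                             /* 12288 */
    assert((sum > align && head && tail ? 2 * align : align) == 2 * align);
}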
static coroutine_fn int bdrv_padding_rmw_read(BdrvChild *child,
                                              BdrvTrackedRequest *req,
                                              BdrvRequestPadding *pad,
                                              bool zero_middle)
{
    QEMUIOVector local_qiov;
    BlockDriverState *bs = child->bs;
    uint64_t align = bs->bl.request_alignment;
    int ret;

    assert(req->serialising && pad->buf);

    if (pad->head || pad->merge_reads) {
        int64_t bytes = pad->merge_reads ? pad->buf_len : align;

        qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);

        if (pad->head) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        }
        if (pad->merge_reads && pad->tail) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        }
        ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
                                  align, &local_qiov, 0, 0);
        if (ret < 0) {
            return ret;
        }
        if (pad->head) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
        }
        if (pad->merge_reads && pad->tail) {
            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
        }

        if (pad->merge_reads) {
            goto zero_mem;
        }
    }

    if (pad->tail) {
        qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(
                child, req,
                req->overlap_offset + req->overlap_bytes - align,
                align, align, &local_qiov, 0, 0);
        if (ret < 0) {
            return ret;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
    }

zero_mem:
    if (zero_middle) {
        memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
    }

    return 0;
}

static void bdrv_padding_destroy(BdrvRequestPadding *pad)
{
    if (pad->buf) {
        qemu_vfree(pad->buf);
        qemu_iovec_destroy(&pad->local_qiov);
    }
    memset(pad, 0, sizeof(*pad));
}
/*
 * bdrv_pad_request
 *
 * Exchange request parameters with padded request if needed. Don't include
 * the RMW read of padding; bdrv_padding_rmw_read() should be called
 * separately if needed.
 *
 * Request parameters (@qiov, @qiov_offset, @offset, @bytes) are in-out:
 * - on function start they represent original request
 * - on failure or when padding is not needed they are unchanged
 * - on success when padding is needed they represent padded request
 */
static int bdrv_pad_request(BlockDriverState *bs,
                            QEMUIOVector **qiov, size_t *qiov_offset,
                            int64_t *offset, int64_t *bytes,
                            BdrvRequestPadding *pad, bool *padded,
                            BdrvRequestFlags *flags)
{
    int ret;

    bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);

    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
        if (padded) {
            *padded = false;
        }
        return 0;
    }

    ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
                                   *qiov, *qiov_offset, *bytes,
                                   pad->buf + pad->buf_len - pad->tail,
                                   pad->tail);
    if (ret < 0) {
        bdrv_padding_destroy(pad);
        return ret;
    }

    *bytes += pad->head + pad->tail;
    *offset -= pad->head;
    *qiov = &pad->local_qiov;
    *qiov_offset = 0;
    if (padded) {
        *padded = true;
    }
    if (flags) {
        /* Can't use optimization hint with bounce buffer */
        *flags &= ~BDRV_REQ_REGISTERED_BUF;
    }

    return 0;
}
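/*
 * Worked example (hypothetical numbers) of the in-out exchange performed by
 * bdrv_pad_request() above: with request_alignment 4096, a request at
 * offset 5000 for 3000 bytes is widened to the aligned request [4096, 8192);
 * the caller's qiov is replaced by pad->local_qiov, which chains the head
 * padding, the original buffers and the tail padding.
 */
static G_GNUC_UNUSED void pad_request_exchange_example(void)
{
    const int64_t align = 4096;
    int64_t offset = 5000, bytes = 3000;
    int64_t head = offset & (align - 1);
    int64_t tail = (align - ((offset + bytes) & (align - 1))) & (align - 1);

    /* This is what bdrv_pad_request() does to the caller's variables: */
    bytes += head + tail;   /* 4096 */
    offset -= head;         /* 4096 */

    assert(QEMU_IS_ALIGNED(offset, align));
    assert(QEMU_IS_ALIGNED(bytes, align));
}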
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    IO_CODE();
    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
}

int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
    int64_t offset, int64_t bytes,
    QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    BdrvRequestPadding pad;
    int ret;
    IO_CODE();

    trace_bdrv_co_preadv_part(bs, offset, bytes, flags);

    if (!bdrv_co_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
    if (ret < 0) {
        return ret;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning a zero request is nonsense. Even if a driver assigns
         * special meaning to zero-length requests (like
         * qcow2_co_pwritev_compressed_part does), we can't pass such a
         * request to the driver due to request_alignment.
         *
         * Still, there is no reason to return an error if someone does an
         * unaligned zero-length read occasionally.
         */
        return 0;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before a write operation */
    if (qatomic_read(&bs->copy_on_read)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
                           NULL, &flags);
    if (ret < 0) {
        goto fail;
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
                              bs->bl.request_alignment,
                              qiov, qiov_offset, flags);
    tracked_request_end(&req);
    bdrv_padding_destroy(&pad);

fail:
    bdrv_dec_in_flight(bs);

    return ret;
}
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    void *buf = NULL;
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes,
                                            INT64_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    bdrv_check_request(offset, bytes, &error_abort);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
        return -ENOTSUP;
    }

    /* By definition there is no user buffer so this flag doesn't make sense */
    if (flags & BDRV_REQ_REGISTERED_BUF) {
        return -EINVAL;
    }

    /* Invalidate the cached block-status data range if this write overlaps */
    bdrv_bsc_invalidate_range(bs, offset, bytes);

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int64_t num = bytes;

        /* Align request. Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes. */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector. */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            if (buf == NULL) {
                buf = qemu_try_blockalign0(bs, num);
                if (buf == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }
            qemu_iovec_init_buf(&qiov, buf, num);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(buf);
                buf = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(buf);
    return ret;
}
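/*
 * Worked example of the head/bulk/tail split above (hypothetical numbers,
 * alignment 4096 and a large max_transfer): zeroing offset 1000,
 * bytes 66000 takes three driver calls: 3096 bytes up to the first aligned
 * boundary, a 61440-byte aligned bulk, and a 1464-byte unaligned tail.
 */
static G_GNUC_UNUSED void pwrite_zeroes_split_example(void)
{
    const int64_t alignment = 4096;
    int64_t offset = 1000, bytes = 66000;
    int64_t head = offset % alignment;                 /* 1000 */
    int64_t tail = (offset + bytes) % alignment;       /* 1464 */

    int64_t first = MIN(bytes, alignment - head);      /* 3096 */
    int64_t middle = bytes - first - tail;             /* 61440 */

    assert(first + middle + tail == bytes);
    assert((offset + first) % alignment == 0);  /* bulk starts aligned */
    assert(middle % alignment == 0);            /* bulk stays aligned */
}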
static inline int coroutine_fn
bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
                          BdrvTrackedRequest *req, int flags)
{
    BlockDriverState *bs = child->bs;

    bdrv_check_request(offset, bytes, &error_abort);

    if (bdrv_is_read_only(bs)) {
        return -EPERM;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING)));

    if (flags & BDRV_REQ_SERIALISING) {
        QEMU_LOCK_GUARD(&bs->reqs_lock);

        tracked_request_set_serialising(req, bdrv_get_cluster_size(bs));

        if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) {
            return -EBUSY;
        }

        bdrv_wait_serialising_requests_locked(req);
    } else {
        bdrv_wait_serialising_requests(req);
    }

    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE ||
           child->perm & BLK_PERM_RESIZE);

    switch (req->type) {
    case BDRV_TRACKED_WRITE:
    case BDRV_TRACKED_DISCARD:
        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
        } else {
            assert(child->perm & BLK_PERM_WRITE);
        }
        bdrv_write_threshold_check_write(bs, offset, bytes);
        return 0;
    case BDRV_TRACKED_TRUNCATE:
        assert(child->perm & BLK_PERM_RESIZE);
        return 0;
    default:
        abort();
    }
}

static inline void coroutine_fn
bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes,
                         BdrvTrackedRequest *req, int ret)
{
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    BlockDriverState *bs = child->bs;

    bdrv_check_request(offset, bytes, &error_abort);

    qatomic_inc(&bs->write_gen);

    /*
     * Discard cannot extend the image, but in error handling cases, such as
     * when reverting a qcow2 cluster allocation, the discarded range can pass
     * the end of the image file, so we cannot assert about
     * BDRV_TRACKED_DISCARD here. Instead, just skip it, since semantically a
     * discard request beyond EOF cannot expand the image anyway.
     */
    if (ret == 0 &&
        (req->type == BDRV_TRACKED_TRUNCATE ||
         end_sector > bs->total_sectors) &&
        req->type != BDRV_TRACKED_DISCARD) {
        bs->total_sectors = end_sector;
        bdrv_parent_cb_resize(bs);
        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
    }
    if (req->bytes) {
        switch (req->type) {
        case BDRV_TRACKED_WRITE:
            stat64_max(&bs->wr_highest_offset, offset + bytes);
            /* fall through, to set dirty bits */
        case BDRV_TRACKED_DISCARD:
            bdrv_set_dirty(bs, offset, bytes);
            break;
        default:
            break;
        }
    }
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, int64_t bytes,
    int64_t align, QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    int ret;

    int64_t bytes_remaining = bytes;
    int max_transfer;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
                                             qiov, qiov_offset);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, qiov,
                                      qiov_offset + bytes - bytes_remaining,
                                      local_flags);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret >= 0) {
        ret = 0;
    }
    bdrv_co_write_req_finish(child, offset, bytes, req, ret);

    return ret;
}
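/*
 * Sketch of the FUA handling in the fragmentation loop above (hypothetical
 * sizes): a 3 MiB write with BDRV_REQ_FUA against a 1 MiB max_transfer is
 * split into three driver calls, and when the driver cannot do FUA natively
 * only the last fragment keeps the flag, so the emulating flush happens
 * exactly once.
 */
static G_GNUC_UNUSED void fua_last_fragment_example(void)
{
    int64_t bytes_remaining = 3 * 1024 * 1024;
    const int64_t max_transfer = 1024 * 1024;
    int fua_fragments = 0;

    while (bytes_remaining) {
        int64_t num = MIN(bytes_remaining, max_transfer);

        if (num == bytes_remaining) {
            fua_fragments++;   /* FUA (i.e. the flush) kept here only */
        }
        bytes_remaining -= num;
    }

    assert(fua_fragments == 1);
}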
static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                int64_t bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    BlockDriverState *bs = child->bs;
    QEMUIOVector local_qiov;
    uint64_t align = bs->bl.request_alignment;
    int ret = 0;
    bool padding;
    BdrvRequestPadding pad;

    /* This flag doesn't make sense for padding or zero writes */
    flags &= ~BDRV_REQ_REGISTERED_BUF;

    padding = bdrv_init_padding(bs, offset, bytes, &pad);
    if (padding) {
        assert(!(flags & BDRV_REQ_NO_WAIT));
        bdrv_make_request_serialising(req, align);

        bdrv_padding_rmw_read(child, req, &pad, true);

        if (pad.head || pad.merge_reads) {
            int64_t aligned_offset = offset & ~(align - 1);
            int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;

            qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
            ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
                                       align, &local_qiov, 0,
                                       flags & ~BDRV_REQ_ZERO_WRITE);
            if (ret < 0 || pad.merge_reads) {
                /* Error or all work is done */
                goto out;
            }
            offset += write_bytes - pad.head;
            bytes -= write_bytes - pad.head;
        }
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        int64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, 0, flags);
        if (ret < 0) {
            goto out;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == pad.tail + bytes);

        qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, 0,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
    }

out:
    bdrv_padding_destroy(&pad);

    return ret;
}
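/*
 * Worked example of the decomposition above (hypothetical numbers,
 * request_alignment 4096): a zero write at offset 5000 for 10000 bytes
 * becomes an RMW head write of [4096, 8192), an efficient zero write of
 * [8192, 12288), and an RMW tail write of [12288, 16384).
 */
static G_GNUC_UNUSED void zero_pwritev_decomposition_example(void)
{
    const int64_t align = 4096;
    int64_t offset = 5000, bytes = 10000;
    int64_t head = offset & (align - 1);                          /* 904 */
    int64_t tail_pad = align - ((offset + bytes) & (align - 1));  /* 1384 */

    /* Head: one aligned RMW write consumes align - head payload bytes */
    offset += align - head;                 /* 8192 */
    bytes -= align - head;                  /* 6808 */

    /* Middle: the aligned bulk is written as real zeroes */
    int64_t aligned_bytes = bytes & ~(align - 1);                 /* 4096 */
    offset += aligned_bytes;                /* 12288 */
    bytes -= aligned_bytes;                 /* 2712 */

    /* Tail: the remainder plus tail padding fills one aligned chunk */
    assert(align == tail_pad + bytes);
}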
/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    IO_CODE();
    return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
}

int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
    int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    BdrvRequestPadding pad;
    int ret;
    bool padded = false;
    IO_CODE();

    trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags);

    if (!bdrv_co_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
    } else {
        ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
    }
    if (ret < 0) {
        return ret;
    }

    /* If the request is misaligned then we can't make it efficient */
    if ((flags & BDRV_REQ_NO_FALLBACK) &&
        !QEMU_IS_ALIGNED(offset | bytes, align))
    {
        return -ENOTSUP;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning a zero request is nonsense. Even if a driver assigns
         * special meaning to zero-length requests (like
         * qcow2_co_pwritev_compressed_part does), we can't pass such a
         * request to the driver due to request_alignment.
         *
         * Still, there is no reason to return an error if someone does an
         * unaligned zero-length write occasionally.
         */
        return 0;
    }

    if (!(flags & BDRV_REQ_ZERO_WRITE)) {
        /*
         * Pad request for following read-modify-write cycle.
         * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
         * alignment only if there is no ZERO flag.
         */
        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
                               &padded, &flags);
        if (ret < 0) {
            return ret;
        }
    }

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        assert(!padded);
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (padded) {
        /*
         * Request was unaligned to request_alignment and therefore
         * padded. We are going to do read-modify-write, and must
         * serialize the request to prevent interactions of the
         * widened region with other transactions.
         */
        assert(!(flags & BDRV_REQ_NO_WAIT));
        bdrv_make_request_serialising(&req, align);
        bdrv_padding_rmw_read(child, &req, &pad, false);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               qiov, qiov_offset, flags);

    bdrv_padding_destroy(&pad);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}

int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int64_t bytes, BdrvRequestFlags flags)
{
    IO_CODE();
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}
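/*
 * Typical use of bdrv_co_pwrite_zeroes() above (sketch, coroutine context;
 * error handling omitted): request that the region be discarded if the node
 * was opened with discard=unmap, otherwise literal zeroes are written.
 */
static G_GNUC_UNUSED int coroutine_fn
zero_region_example(BdrvChild *child, int64_t offset, int64_t bytes)
{
    /* BDRV_REQ_MAY_UNMAP is silently dropped unless BDRV_O_UNMAP is set */
    return bdrv_co_pwrite_zeroes(child, offset, bytes, BDRV_REQ_MAY_UNMAP);
}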
21644085f5c7SJohn Snow */ 21654085f5c7SJohn Snow int bdrv_flush_all(void) 21664085f5c7SJohn Snow { 21674085f5c7SJohn Snow BdrvNextIterator it; 21684085f5c7SJohn Snow BlockDriverState *bs = NULL; 21694085f5c7SJohn Snow int result = 0; 21704085f5c7SJohn Snow 2171f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 2172f791bf7fSEmanuele Giuseppe Esposito 2173c8aa7895SPavel Dovgalyuk /* 2174c8aa7895SPavel Dovgalyuk * The bdrv queue is managed by record/replay; 2175c8aa7895SPavel Dovgalyuk * creating a new flush request for stopping 2176c8aa7895SPavel Dovgalyuk * the VM may break determinism 2177c8aa7895SPavel Dovgalyuk */ 2178c8aa7895SPavel Dovgalyuk if (replay_events_enabled()) { 2179c8aa7895SPavel Dovgalyuk return result; 2180c8aa7895SPavel Dovgalyuk } 2181c8aa7895SPavel Dovgalyuk 21824085f5c7SJohn Snow for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 21834085f5c7SJohn Snow AioContext *aio_context = bdrv_get_aio_context(bs); 21844085f5c7SJohn Snow int ret; 21854085f5c7SJohn Snow 21864085f5c7SJohn Snow aio_context_acquire(aio_context); 21874085f5c7SJohn Snow ret = bdrv_flush(bs); 21884085f5c7SJohn Snow if (ret < 0 && !result) { 21894085f5c7SJohn Snow result = ret; 21904085f5c7SJohn Snow } 21914085f5c7SJohn Snow aio_context_release(aio_context); 21924085f5c7SJohn Snow } 21934085f5c7SJohn Snow 21944085f5c7SJohn Snow return result; 21954085f5c7SJohn Snow } 21964085f5c7SJohn Snow 219761007b31SStefan Hajnoczi /* 219861007b31SStefan Hajnoczi * Returns the allocation status of the specified byte range. 219961007b31SStefan Hajnoczi * Drivers not implementing the functionality are assumed not to support 220061007b31SStefan Hajnoczi * backing files, hence all their bytes are reported as allocated. 220161007b31SStefan Hajnoczi * 220286a3d5c6SEric Blake * If 'want_zero' is true, the caller is querying for mapping 220386a3d5c6SEric Blake * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 220486a3d5c6SEric Blake * _ZERO where possible; otherwise, the result favors larger 'pnum', 220586a3d5c6SEric Blake * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2206c9ce8c4dSEric Blake * 22072e8bc787SEric Blake * If 'offset' is beyond the end of the disk image the return value is 2208fb0d8654SEric Blake * BDRV_BLOCK_EOF and 'pnum' is set to 0. 220961007b31SStefan Hajnoczi * 22102e8bc787SEric Blake * 'bytes' is the max value 'pnum' should be set to. If 'bytes' goes 2211fb0d8654SEric Blake * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2212fb0d8654SEric Blake * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 221367a0fd2aSFam Zheng * 22142e8bc787SEric Blake * 'pnum' is set to the number of bytes (including and immediately 22152e8bc787SEric Blake * following the specified offset) that are easily known to be in the 22162e8bc787SEric Blake * same allocated/unallocated state. Note that a second call starting 22172e8bc787SEric Blake * at the original offset plus returned pnum may have the same status. 22182e8bc787SEric Blake * The returned value is non-zero on success except at end-of-file. 22192e8bc787SEric Blake * 22202e8bc787SEric Blake * Returns negative errno on failure. Otherwise, if the 22212e8bc787SEric Blake * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 22222e8bc787SEric Blake * set to the host mapping and BDS corresponding to the guest offset.
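 *
 * Illustrative caller loop (a sketch, not code from this file) walking an
 * image of length 'len' in extents of uniform status via the public
 * wrapper:
 *
 *     int64_t offset = 0, pnum, map;
 *     BlockDriverState *file;
 *     while (offset < len) {
 *         int ret = bdrv_block_status(bs, offset, len - offset,
 *                                     &pnum, &map, &file);
 *         if (ret < 0) {
 *             break;
 *         }
 *         // [offset, offset + pnum) is one extent of uniform status
 *         offset += pnum;
 *     }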
222361007b31SStefan Hajnoczi */ 22242e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2225c9ce8c4dSEric Blake bool want_zero, 22262e8bc787SEric Blake int64_t offset, int64_t bytes, 22272e8bc787SEric Blake int64_t *pnum, int64_t *map, 222867a0fd2aSFam Zheng BlockDriverState **file) 222961007b31SStefan Hajnoczi { 22302e8bc787SEric Blake int64_t total_size; 22312e8bc787SEric Blake int64_t n; /* bytes */ 2232efa6e2edSEric Blake int ret; 22332e8bc787SEric Blake int64_t local_map = 0; 2234298a1665SEric Blake BlockDriverState *local_file = NULL; 2235efa6e2edSEric Blake int64_t aligned_offset, aligned_bytes; 2236efa6e2edSEric Blake uint32_t align; 2237549ec0d9SMax Reitz bool has_filtered_child; 223861007b31SStefan Hajnoczi 2239298a1665SEric Blake assert(pnum); 2240298a1665SEric Blake *pnum = 0; 22412e8bc787SEric Blake total_size = bdrv_getlength(bs); 22422e8bc787SEric Blake if (total_size < 0) { 22432e8bc787SEric Blake ret = total_size; 2244298a1665SEric Blake goto early_out; 224561007b31SStefan Hajnoczi } 224661007b31SStefan Hajnoczi 22472e8bc787SEric Blake if (offset >= total_size) { 2248298a1665SEric Blake ret = BDRV_BLOCK_EOF; 2249298a1665SEric Blake goto early_out; 225061007b31SStefan Hajnoczi } 22512e8bc787SEric Blake if (!bytes) { 2252298a1665SEric Blake ret = 0; 2253298a1665SEric Blake goto early_out; 22549cdcfd9fSEric Blake } 225561007b31SStefan Hajnoczi 22562e8bc787SEric Blake n = total_size - offset; 22572e8bc787SEric Blake if (n < bytes) { 22582e8bc787SEric Blake bytes = n; 225961007b31SStefan Hajnoczi } 226061007b31SStefan Hajnoczi 2261d470ad42SMax Reitz /* Must be non-NULL or bdrv_getlength() would have failed */ 2262d470ad42SMax Reitz assert(bs->drv); 2263549ec0d9SMax Reitz has_filtered_child = bdrv_filter_child(bs); 2264549ec0d9SMax Reitz if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { 22652e8bc787SEric Blake *pnum = bytes; 226661007b31SStefan Hajnoczi ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 22672e8bc787SEric Blake if (offset + bytes == total_size) { 2268fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2269fb0d8654SEric Blake } 227061007b31SStefan Hajnoczi if (bs->drv->protocol_name) { 22712e8bc787SEric Blake ret |= BDRV_BLOCK_OFFSET_VALID; 22722e8bc787SEric Blake local_map = offset; 2273298a1665SEric Blake local_file = bs; 227461007b31SStefan Hajnoczi } 2275298a1665SEric Blake goto early_out; 227661007b31SStefan Hajnoczi } 227761007b31SStefan Hajnoczi 227899723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2279efa6e2edSEric Blake 2280efa6e2edSEric Blake /* Round out to request_alignment boundaries */ 228186a3d5c6SEric Blake align = bs->bl.request_alignment; 2282efa6e2edSEric Blake aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2283efa6e2edSEric Blake aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2284efa6e2edSEric Blake 2285549ec0d9SMax Reitz if (bs->drv->bdrv_co_block_status) { 22860bc329fbSHanna Reitz /* 22870bc329fbSHanna Reitz * Use the block-status cache only for protocol nodes: Format 22880bc329fbSHanna Reitz * drivers are generally quick to inquire the status, but protocol 22890bc329fbSHanna Reitz * drivers often need to get information from outside of qemu, so 22900bc329fbSHanna Reitz * we do not have control over the actual implementation. There 22910bc329fbSHanna Reitz * have been cases where inquiring the status took an unreasonably 22920bc329fbSHanna Reitz * long time, and we can do nothing in qemu to fix it. 
22930bc329fbSHanna Reitz * This is especially problematic for images with large data areas, 22940bc329fbSHanna Reitz * because finding the few holes in them and giving them special 22950bc329fbSHanna Reitz * treatment does not gain much performance. Therefore, we try to 22960bc329fbSHanna Reitz * cache the last-identified data region. 22970bc329fbSHanna Reitz * 22980bc329fbSHanna Reitz * Second, limiting ourselves to protocol nodes allows us to assume 22990bc329fbSHanna Reitz * the block status for data regions to be DATA | OFFSET_VALID, and 23000bc329fbSHanna Reitz * that the host offset is the same as the guest offset. 23010bc329fbSHanna Reitz * 23020bc329fbSHanna Reitz * Note that it is possible that external writers zero parts of 23030bc329fbSHanna Reitz * the cached regions without the cache being invalidated, and so 23040bc329fbSHanna Reitz * we may report zeroes as data. This is not catastrophic, 23050bc329fbSHanna Reitz * however, because reporting zeroes as data is fine. 23060bc329fbSHanna Reitz */ 23070bc329fbSHanna Reitz if (QLIST_EMPTY(&bs->children) && 23080bc329fbSHanna Reitz bdrv_bsc_is_data(bs, aligned_offset, pnum)) 23090bc329fbSHanna Reitz { 23100bc329fbSHanna Reitz ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; 23110bc329fbSHanna Reitz local_file = bs; 23120bc329fbSHanna Reitz local_map = aligned_offset; 23130bc329fbSHanna Reitz } else { 231486a3d5c6SEric Blake ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 231586a3d5c6SEric Blake aligned_bytes, pnum, &local_map, 231686a3d5c6SEric Blake &local_file); 23170bc329fbSHanna Reitz 23180bc329fbSHanna Reitz /* 23190bc329fbSHanna Reitz * Note that checking QLIST_EMPTY(&bs->children) is also done when 23200bc329fbSHanna Reitz * the cache is queried above. Technically, we do not need to check 23210bc329fbSHanna Reitz * it here; the worst that can happen is that we fill the cache for 23220bc329fbSHanna Reitz * non-protocol nodes, and then it is never used. However, filling 23230bc329fbSHanna Reitz * the cache requires an RCU update, so double check here to avoid 23240bc329fbSHanna Reitz * such an update if possible. 2325113b727cSHanna Reitz * 2326113b727cSHanna Reitz * Check want_zero, because we only want to update the cache when we 2327113b727cSHanna Reitz * have accurate information about what is zero and what is data. 23280bc329fbSHanna Reitz */ 2329113b727cSHanna Reitz if (want_zero && 2330113b727cSHanna Reitz ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && 23310bc329fbSHanna Reitz QLIST_EMPTY(&bs->children)) 23320bc329fbSHanna Reitz { 23330bc329fbSHanna Reitz /* 23340bc329fbSHanna Reitz * When a protocol driver reports BLOCK_OFFSET_VALID, the 23350bc329fbSHanna Reitz * returned local_map value must be the same as the offset we 23360bc329fbSHanna Reitz * have passed (aligned_offset), and local_bs must be the node 23370bc329fbSHanna Reitz * itself. 23380bc329fbSHanna Reitz * Assert this, because we follow this rule when reading from 23390bc329fbSHanna Reitz * the cache (see the `local_file = bs` and 23400bc329fbSHanna Reitz * `local_map = aligned_offset` assignments above), and the 23410bc329fbSHanna Reitz * result the cache delivers must be the same as the driver 23420bc329fbSHanna Reitz * would deliver. 
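 *
 * (Illustration, assuming a plain file-posix protocol node: a data
 * extent at guest offset 4 MiB is reported as DATA | OFFSET_VALID with
 * *map == 4 MiB and *file == bs itself, which is exactly the answer a
 * later cache hit reproduces.)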
23430bc329fbSHanna Reitz */ 23440bc329fbSHanna Reitz assert(local_file == bs); 23450bc329fbSHanna Reitz assert(local_map == aligned_offset); 23460bc329fbSHanna Reitz bdrv_bsc_fill(bs, aligned_offset, *pnum); 23470bc329fbSHanna Reitz } 23480bc329fbSHanna Reitz } 2349549ec0d9SMax Reitz } else { 2350549ec0d9SMax Reitz /* Default code for filters */ 2351549ec0d9SMax Reitz 2352549ec0d9SMax Reitz local_file = bdrv_filter_bs(bs); 2353549ec0d9SMax Reitz assert(local_file); 2354549ec0d9SMax Reitz 2355549ec0d9SMax Reitz *pnum = aligned_bytes; 2356549ec0d9SMax Reitz local_map = aligned_offset; 2357549ec0d9SMax Reitz ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2358549ec0d9SMax Reitz } 235986a3d5c6SEric Blake if (ret < 0) { 236086a3d5c6SEric Blake *pnum = 0; 236186a3d5c6SEric Blake goto out; 236286a3d5c6SEric Blake } 2363efa6e2edSEric Blake 2364efa6e2edSEric Blake /* 2365636cb512SEric Blake * The driver's result must be a non-zero multiple of request_alignment. 2366efa6e2edSEric Blake * Clamp pnum and adjust map to original request. 2367efa6e2edSEric Blake */ 2368636cb512SEric Blake assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2369636cb512SEric Blake align > offset - aligned_offset); 237069f47505SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_RECURSE) { 237169f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_DATA); 237269f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_OFFSET_VALID); 237369f47505SVladimir Sementsov-Ogievskiy assert(!(ret & BDRV_BLOCK_ZERO)); 237469f47505SVladimir Sementsov-Ogievskiy } 237569f47505SVladimir Sementsov-Ogievskiy 2376efa6e2edSEric Blake *pnum -= offset - aligned_offset; 2377efa6e2edSEric Blake if (*pnum > bytes) { 2378efa6e2edSEric Blake *pnum = bytes; 2379efa6e2edSEric Blake } 2380efa6e2edSEric Blake if (ret & BDRV_BLOCK_OFFSET_VALID) { 2381efa6e2edSEric Blake local_map += offset - aligned_offset; 2382efa6e2edSEric Blake } 238361007b31SStefan Hajnoczi 238461007b31SStefan Hajnoczi if (ret & BDRV_BLOCK_RAW) { 2385298a1665SEric Blake assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 23862e8bc787SEric Blake ret = bdrv_co_block_status(local_file, want_zero, local_map, 23872e8bc787SEric Blake *pnum, pnum, &local_map, &local_file); 238899723548SPaolo Bonzini goto out; 238961007b31SStefan Hajnoczi } 239061007b31SStefan Hajnoczi 239161007b31SStefan Hajnoczi if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 239261007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ALLOCATED; 2393d40f4a56SAlberto Garcia } else if (bs->drv->supports_backing) { 2394cb850315SMax Reitz BlockDriverState *cow_bs = bdrv_cow_bs(bs); 2395cb850315SMax Reitz 2396d40f4a56SAlberto Garcia if (!cow_bs) { 2397d40f4a56SAlberto Garcia ret |= BDRV_BLOCK_ZERO; 2398d40f4a56SAlberto Garcia } else if (want_zero) { 2399cb850315SMax Reitz int64_t size2 = bdrv_getlength(cow_bs); 2400c9ce8c4dSEric Blake 24012e8bc787SEric Blake if (size2 >= 0 && offset >= size2) { 240261007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 240361007b31SStefan Hajnoczi } 24047b1efe99SVladimir Sementsov-Ogievskiy } 240561007b31SStefan Hajnoczi } 240661007b31SStefan Hajnoczi 240769f47505SVladimir Sementsov-Ogievskiy if (want_zero && ret & BDRV_BLOCK_RECURSE && 240869f47505SVladimir Sementsov-Ogievskiy local_file && local_file != bs && 240961007b31SStefan Hajnoczi (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 241061007b31SStefan Hajnoczi (ret & BDRV_BLOCK_OFFSET_VALID)) { 24112e8bc787SEric Blake int64_t file_pnum; 24122e8bc787SEric Blake int ret2; 241361007b31SStefan Hajnoczi 24142e8bc787SEric Blake ret2 = 
bdrv_co_block_status(local_file, want_zero, local_map, 24152e8bc787SEric Blake *pnum, &file_pnum, NULL, NULL); 241661007b31SStefan Hajnoczi if (ret2 >= 0) { 241761007b31SStefan Hajnoczi /* Ignore errors. This is just providing extra information; it 241861007b31SStefan Hajnoczi * is useful but not necessary. 241961007b31SStefan Hajnoczi */ 2420c61e684eSEric Blake if (ret2 & BDRV_BLOCK_EOF && 2421c61e684eSEric Blake (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2422c61e684eSEric Blake /* 2423c61e684eSEric Blake * It is valid for the format block driver to read 2424c61e684eSEric Blake * beyond the end of the underlying file's current 2425c61e684eSEric Blake * size; such areas read as zero. 2426c61e684eSEric Blake */ 242761007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 242861007b31SStefan Hajnoczi } else { 242961007b31SStefan Hajnoczi /* Limit request to the range reported by the protocol driver */ 243061007b31SStefan Hajnoczi *pnum = file_pnum; 243161007b31SStefan Hajnoczi ret |= (ret2 & BDRV_BLOCK_ZERO); 243261007b31SStefan Hajnoczi } 243361007b31SStefan Hajnoczi } 243461007b31SStefan Hajnoczi } 243561007b31SStefan Hajnoczi 243699723548SPaolo Bonzini out: 243799723548SPaolo Bonzini bdrv_dec_in_flight(bs); 24382e8bc787SEric Blake if (ret >= 0 && offset + *pnum == total_size) { 2439fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2440fb0d8654SEric Blake } 2441298a1665SEric Blake early_out: 2442298a1665SEric Blake if (file) { 2443298a1665SEric Blake *file = local_file; 2444298a1665SEric Blake } 24452e8bc787SEric Blake if (map) { 24462e8bc787SEric Blake *map = local_map; 24472e8bc787SEric Blake } 244861007b31SStefan Hajnoczi return ret; 244961007b31SStefan Hajnoczi } 245061007b31SStefan Hajnoczi 245121c2283eSVladimir Sementsov-Ogievskiy int coroutine_fn 2452f9e694cbSVladimir Sementsov-Ogievskiy bdrv_co_common_block_status_above(BlockDriverState *bs, 2453ba3f0e25SFam Zheng BlockDriverState *base, 24543555a432SVladimir Sementsov-Ogievskiy bool include_base, 2455c9ce8c4dSEric Blake bool want_zero, 24565b648c67SEric Blake int64_t offset, 24575b648c67SEric Blake int64_t bytes, 24585b648c67SEric Blake int64_t *pnum, 24595b648c67SEric Blake int64_t *map, 2460a92b1b06SEric Blake BlockDriverState **file, 2461a92b1b06SEric Blake int *depth) 2462ba3f0e25SFam Zheng { 246367c095c8SVladimir Sementsov-Ogievskiy int ret; 2464ba3f0e25SFam Zheng BlockDriverState *p; 246567c095c8SVladimir Sementsov-Ogievskiy int64_t eof = 0; 2466a92b1b06SEric Blake int dummy; 24671581a70dSEmanuele Giuseppe Esposito IO_CODE(); 2468ba3f0e25SFam Zheng 24693555a432SVladimir Sementsov-Ogievskiy assert(!include_base || base); /* Can't include NULL base */ 247067c095c8SVladimir Sementsov-Ogievskiy 2471a92b1b06SEric Blake if (!depth) { 2472a92b1b06SEric Blake depth = &dummy; 2473a92b1b06SEric Blake } 2474a92b1b06SEric Blake *depth = 0; 2475a92b1b06SEric Blake 2476624f27bbSVladimir Sementsov-Ogievskiy if (!include_base && bs == base) { 2477624f27bbSVladimir Sementsov-Ogievskiy *pnum = bytes; 2478624f27bbSVladimir Sementsov-Ogievskiy return 0; 2479624f27bbSVladimir Sementsov-Ogievskiy } 2480624f27bbSVladimir Sementsov-Ogievskiy 248167c095c8SVladimir Sementsov-Ogievskiy ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file); 2482a92b1b06SEric Blake ++*depth; 24833555a432SVladimir Sementsov-Ogievskiy if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) { 248467c095c8SVladimir Sementsov-Ogievskiy return ret; 248567c095c8SVladimir Sementsov-Ogievskiy } 248667c095c8SVladimir Sementsov-Ogievskiy 248767c095c8SVladimir
Sementsov-Ogievskiy if (ret & BDRV_BLOCK_EOF) { 248867c095c8SVladimir Sementsov-Ogievskiy eof = offset + *pnum; 248967c095c8SVladimir Sementsov-Ogievskiy } 249067c095c8SVladimir Sementsov-Ogievskiy 249167c095c8SVladimir Sementsov-Ogievskiy assert(*pnum <= bytes); 249267c095c8SVladimir Sementsov-Ogievskiy bytes = *pnum; 249367c095c8SVladimir Sementsov-Ogievskiy 24943555a432SVladimir Sementsov-Ogievskiy for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base; 249567c095c8SVladimir Sementsov-Ogievskiy p = bdrv_filter_or_cow_bs(p)) 249667c095c8SVladimir Sementsov-Ogievskiy { 24975b648c67SEric Blake ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 24985b648c67SEric Blake file); 2499a92b1b06SEric Blake ++*depth; 2500c61e684eSEric Blake if (ret < 0) { 250167c095c8SVladimir Sementsov-Ogievskiy return ret; 2502c61e684eSEric Blake } 250367c095c8SVladimir Sementsov-Ogievskiy if (*pnum == 0) { 2504c61e684eSEric Blake /* 250567c095c8SVladimir Sementsov-Ogievskiy * The top layer deferred to this layer, and because this layer is 250667c095c8SVladimir Sementsov-Ogievskiy * short, any zeroes that we synthesize beyond EOF behave as if they 250767c095c8SVladimir Sementsov-Ogievskiy * were allocated at this layer. 250867c095c8SVladimir Sementsov-Ogievskiy * 250967c095c8SVladimir Sementsov-Ogievskiy * We don't include BDRV_BLOCK_EOF into ret, as the upper layer may be 251067c095c8SVladimir Sementsov-Ogievskiy * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 251167c095c8SVladimir Sementsov-Ogievskiy * below. 2512c61e684eSEric Blake */ 251367c095c8SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_EOF); 25145b648c67SEric Blake *pnum = bytes; 251567c095c8SVladimir Sementsov-Ogievskiy if (file) { 251667c095c8SVladimir Sementsov-Ogievskiy *file = p; 2517c61e684eSEric Blake } 251867c095c8SVladimir Sementsov-Ogievskiy ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED; 2519ba3f0e25SFam Zheng break; 2520ba3f0e25SFam Zheng } 252167c095c8SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_ALLOCATED) { 252267c095c8SVladimir Sementsov-Ogievskiy /* 252367c095c8SVladimir Sementsov-Ogievskiy * We've found the node and the status, so we must break. 252467c095c8SVladimir Sementsov-Ogievskiy * 252567c095c8SVladimir Sementsov-Ogievskiy * Drop BDRV_BLOCK_EOF, as it's not for the upper layer, which may be 252667c095c8SVladimir Sementsov-Ogievskiy * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 252767c095c8SVladimir Sementsov-Ogievskiy * below. 252867c095c8SVladimir Sementsov-Ogievskiy */ 252967c095c8SVladimir Sementsov-Ogievskiy ret &= ~BDRV_BLOCK_EOF; 253067c095c8SVladimir Sementsov-Ogievskiy break; 2531ba3f0e25SFam Zheng } 253267c095c8SVladimir Sementsov-Ogievskiy 25333555a432SVladimir Sementsov-Ogievskiy if (p == base) { 25343555a432SVladimir Sementsov-Ogievskiy assert(include_base); 25353555a432SVladimir Sementsov-Ogievskiy break; 25363555a432SVladimir Sementsov-Ogievskiy } 25373555a432SVladimir Sementsov-Ogievskiy 253867c095c8SVladimir Sementsov-Ogievskiy /* 253967c095c8SVladimir Sementsov-Ogievskiy * OK, the [offset, offset + *pnum) region is unallocated on this layer, 254067c095c8SVladimir Sementsov-Ogievskiy * so let's continue diving down the chain.
254167c095c8SVladimir Sementsov-Ogievskiy */ 254267c095c8SVladimir Sementsov-Ogievskiy assert(*pnum <= bytes); 254367c095c8SVladimir Sementsov-Ogievskiy bytes = *pnum; 254467c095c8SVladimir Sementsov-Ogievskiy } 254567c095c8SVladimir Sementsov-Ogievskiy 254667c095c8SVladimir Sementsov-Ogievskiy if (offset + *pnum == eof) { 254767c095c8SVladimir Sementsov-Ogievskiy ret |= BDRV_BLOCK_EOF; 254867c095c8SVladimir Sementsov-Ogievskiy } 254967c095c8SVladimir Sementsov-Ogievskiy 2550ba3f0e25SFam Zheng return ret; 2551ba3f0e25SFam Zheng } 2552ba3f0e25SFam Zheng 25537b52a921SEmanuele Giuseppe Esposito int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 25547b52a921SEmanuele Giuseppe Esposito BlockDriverState *base, 25557b52a921SEmanuele Giuseppe Esposito int64_t offset, int64_t bytes, 25567b52a921SEmanuele Giuseppe Esposito int64_t *pnum, int64_t *map, 25577b52a921SEmanuele Giuseppe Esposito BlockDriverState **file) 25587b52a921SEmanuele Giuseppe Esposito { 25597b52a921SEmanuele Giuseppe Esposito IO_CODE(); 25607b52a921SEmanuele Giuseppe Esposito return bdrv_co_common_block_status_above(bs, base, false, true, offset, 25617b52a921SEmanuele Giuseppe Esposito bytes, pnum, map, file, NULL); 25627b52a921SEmanuele Giuseppe Esposito } 25637b52a921SEmanuele Giuseppe Esposito 256431826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 256531826642SEric Blake int64_t offset, int64_t bytes, int64_t *pnum, 256631826642SEric Blake int64_t *map, BlockDriverState **file) 2567c9ce8c4dSEric Blake { 2568384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 25693555a432SVladimir Sementsov-Ogievskiy return bdrv_common_block_status_above(bs, base, false, true, offset, bytes, 2570a92b1b06SEric Blake pnum, map, file, NULL); 2571c9ce8c4dSEric Blake } 2572c9ce8c4dSEric Blake 2573237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2574237d78f8SEric Blake int64_t *pnum, int64_t *map, BlockDriverState **file) 2575ba3f0e25SFam Zheng { 2576384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 2577cb850315SMax Reitz return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), 257831826642SEric Blake offset, bytes, pnum, map, file); 2579ba3f0e25SFam Zheng } 2580ba3f0e25SFam Zheng 258146cd1e8aSAlberto Garcia /* 258246cd1e8aSAlberto Garcia * Check @bs (and its backing chain) to see if the range defined 258346cd1e8aSAlberto Garcia * by @offset and @bytes is known to read as zeroes. 258446cd1e8aSAlberto Garcia * Return 1 if that is the case, 0 otherwise, and -errno on error. 258546cd1e8aSAlberto Garcia * This test is meant to be fast rather than accurate, so returning 0 258646cd1e8aSAlberto Garcia * does not guarantee non-zero data.
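 *
 * Hypothetical usage sketch (not from this file):
 *
 *     if (bdrv_co_is_zero_fast(bs, offset, bytes) == 1) {
 *         // the whole range reads as zeroes, e.g. copying it to a
 *         // destination that is already zero can be skipped
 *     }
 *
 * A result of 0 only means "not cheaply known to be zero".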
258746cd1e8aSAlberto Garcia */ 258846cd1e8aSAlberto Garcia int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, 258946cd1e8aSAlberto Garcia int64_t bytes) 259046cd1e8aSAlberto Garcia { 259146cd1e8aSAlberto Garcia int ret; 259246cd1e8aSAlberto Garcia int64_t pnum = bytes; 2593384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 259446cd1e8aSAlberto Garcia 259546cd1e8aSAlberto Garcia if (!bytes) { 259646cd1e8aSAlberto Garcia return 1; 259746cd1e8aSAlberto Garcia } 259846cd1e8aSAlberto Garcia 2599ce47ff20SAlberto Faria ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset, 2600a92b1b06SEric Blake bytes, &pnum, NULL, NULL, NULL); 260146cd1e8aSAlberto Garcia 260246cd1e8aSAlberto Garcia if (ret < 0) { 260346cd1e8aSAlberto Garcia return ret; 260446cd1e8aSAlberto Garcia } 260546cd1e8aSAlberto Garcia 260646cd1e8aSAlberto Garcia return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); 260746cd1e8aSAlberto Garcia } 260846cd1e8aSAlberto Garcia 26097b52a921SEmanuele Giuseppe Esposito int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, 26107b52a921SEmanuele Giuseppe Esposito int64_t bytes, int64_t *pnum) 26117b52a921SEmanuele Giuseppe Esposito { 26127b52a921SEmanuele Giuseppe Esposito int ret; 26137b52a921SEmanuele Giuseppe Esposito int64_t dummy; 26147b52a921SEmanuele Giuseppe Esposito IO_CODE(); 26157b52a921SEmanuele Giuseppe Esposito 26167b52a921SEmanuele Giuseppe Esposito ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset, 26177b52a921SEmanuele Giuseppe Esposito bytes, pnum ? pnum : &dummy, NULL, 26187b52a921SEmanuele Giuseppe Esposito NULL, NULL); 26197b52a921SEmanuele Giuseppe Esposito if (ret < 0) { 26207b52a921SEmanuele Giuseppe Esposito return ret; 26217b52a921SEmanuele Giuseppe Esposito } 26227b52a921SEmanuele Giuseppe Esposito return !!(ret & BDRV_BLOCK_ALLOCATED); 26237b52a921SEmanuele Giuseppe Esposito } 26247b52a921SEmanuele Giuseppe Esposito 26257c85803cSAlberto Faria int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, 26267c85803cSAlberto Faria int64_t *pnum) 262761007b31SStefan Hajnoczi { 26287ddb99b9SEric Blake int ret; 26297ddb99b9SEric Blake int64_t dummy; 2630384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 2631d6a644bbSEric Blake 26323555a432SVladimir Sementsov-Ogievskiy ret = bdrv_common_block_status_above(bs, bs, true, false, offset, 26333555a432SVladimir Sementsov-Ogievskiy bytes, pnum ? 
pnum : &dummy, NULL, 2634a92b1b06SEric Blake NULL, NULL); 263561007b31SStefan Hajnoczi if (ret < 0) { 263661007b31SStefan Hajnoczi return ret; 263761007b31SStefan Hajnoczi } 263861007b31SStefan Hajnoczi return !!(ret & BDRV_BLOCK_ALLOCATED); 263961007b31SStefan Hajnoczi } 264061007b31SStefan Hajnoczi 26417b52a921SEmanuele Giuseppe Esposito /* See bdrv_is_allocated_above for documentation */ 26427b52a921SEmanuele Giuseppe Esposito int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, 26437b52a921SEmanuele Giuseppe Esposito BlockDriverState *base, 26447b52a921SEmanuele Giuseppe Esposito bool include_base, int64_t offset, 26457b52a921SEmanuele Giuseppe Esposito int64_t bytes, int64_t *pnum) 26467b52a921SEmanuele Giuseppe Esposito { 26477b52a921SEmanuele Giuseppe Esposito int depth; 26487b52a921SEmanuele Giuseppe Esposito int ret; 26497b52a921SEmanuele Giuseppe Esposito IO_CODE(); 26507b52a921SEmanuele Giuseppe Esposito 26517b52a921SEmanuele Giuseppe Esposito ret = bdrv_co_common_block_status_above(top, base, include_base, false, 26527b52a921SEmanuele Giuseppe Esposito offset, bytes, pnum, NULL, NULL, 26537b52a921SEmanuele Giuseppe Esposito &depth); 26547b52a921SEmanuele Giuseppe Esposito if (ret < 0) { 26557b52a921SEmanuele Giuseppe Esposito return ret; 26567b52a921SEmanuele Giuseppe Esposito } 26577b52a921SEmanuele Giuseppe Esposito 26587b52a921SEmanuele Giuseppe Esposito if (ret & BDRV_BLOCK_ALLOCATED) { 26597b52a921SEmanuele Giuseppe Esposito return depth; 26607b52a921SEmanuele Giuseppe Esposito } 26617b52a921SEmanuele Giuseppe Esposito return 0; 26627b52a921SEmanuele Giuseppe Esposito } 26637b52a921SEmanuele Giuseppe Esposito 266461007b31SStefan Hajnoczi /* 266561007b31SStefan Hajnoczi * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 266661007b31SStefan Hajnoczi * 2667a92b1b06SEric Blake * Return a positive depth if (a prefix of) the given range is allocated 2668a92b1b06SEric Blake * in any image between BASE and TOP (BASE is only included if include_base 2669a92b1b06SEric Blake * is set). Depth 1 is TOP, 2 is the first backing layer, and so forth. 2670170d3bd3SAndrey Shinkevich * BASE can be NULL to check if the given offset is allocated in any 2671170d3bd3SAndrey Shinkevich * image of the chain. Return 0 otherwise, or negative errno on 2672170d3bd3SAndrey Shinkevich * failure. 267361007b31SStefan Hajnoczi * 267451b0a488SEric Blake * 'pnum' is set to the number of bytes (including and immediately 267551b0a488SEric Blake * following the specified offset) that are known to be in the same 267651b0a488SEric Blake * allocated/unallocated state. Note that a subsequent call starting 267751b0a488SEric Blake * at 'offset + *pnum' may return the same allocation status (in other 267851b0a488SEric Blake * words, the result is not necessarily the maximum possible range); 267951b0a488SEric Blake * but 'pnum' will only be 0 when end of file is reached. 
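 *
 * (Worked example on a hypothetical chain BASE <- INTER1 <- TOP with a
 * cluster allocated only in INTER1: with include_base false the call
 * returns 2, since depth 1 (TOP) is unallocated and depth 2 (INTER1) is
 * allocated; a cluster allocated nowhere in TOP..INTER1 yields 0.)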
268061007b31SStefan Hajnoczi */ 268161007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top, 268261007b31SStefan Hajnoczi BlockDriverState *base, 2683170d3bd3SAndrey Shinkevich bool include_base, int64_t offset, 2684170d3bd3SAndrey Shinkevich int64_t bytes, int64_t *pnum) 268561007b31SStefan Hajnoczi { 2686a92b1b06SEric Blake int depth; 26877b52a921SEmanuele Giuseppe Esposito int ret; 26887b52a921SEmanuele Giuseppe Esposito IO_CODE(); 26897b52a921SEmanuele Giuseppe Esposito 26907b52a921SEmanuele Giuseppe Esposito ret = bdrv_common_block_status_above(top, base, include_base, false, 2691a92b1b06SEric Blake offset, bytes, pnum, NULL, NULL, 2692a92b1b06SEric Blake &depth); 269361007b31SStefan Hajnoczi if (ret < 0) { 269461007b31SStefan Hajnoczi return ret; 2695d6a644bbSEric Blake } 269661007b31SStefan Hajnoczi 2697a92b1b06SEric Blake if (ret & BDRV_BLOCK_ALLOCATED) { 2698a92b1b06SEric Blake return depth; 2699a92b1b06SEric Blake } 2700a92b1b06SEric Blake return 0; 270161007b31SStefan Hajnoczi } 270261007b31SStefan Hajnoczi 270321c2283eSVladimir Sementsov-Ogievskiy int coroutine_fn 2704b33b354fSVladimir Sementsov-Ogievskiy bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 27051a8ae822SKevin Wolf { 27061a8ae822SKevin Wolf BlockDriver *drv = bs->drv; 2707c4db2e25SMax Reitz BlockDriverState *child_bs = bdrv_primary_bs(bs); 2708b984b296SVladimir Sementsov-Ogievskiy int ret; 27091581a70dSEmanuele Giuseppe Esposito IO_CODE(); 27101b3ff9feSKevin Wolf assert_bdrv_graph_readable(); 2711b984b296SVladimir Sementsov-Ogievskiy 2712b984b296SVladimir Sementsov-Ogievskiy ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); 2713b984b296SVladimir Sementsov-Ogievskiy if (ret < 0) { 2714b984b296SVladimir Sementsov-Ogievskiy return ret; 2715b984b296SVladimir Sementsov-Ogievskiy } 2716dc88a467SStefan Hajnoczi 2717b33b354fSVladimir Sementsov-Ogievskiy if (!drv) { 2718b33b354fSVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 2719b33b354fSVladimir Sementsov-Ogievskiy } 2720b33b354fSVladimir Sementsov-Ogievskiy 2721dc88a467SStefan Hajnoczi bdrv_inc_in_flight(bs); 27221a8ae822SKevin Wolf 2723b33b354fSVladimir Sementsov-Ogievskiy if (drv->bdrv_load_vmstate) { 2724dc88a467SStefan Hajnoczi ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2725c4db2e25SMax Reitz } else if (child_bs) { 2726b33b354fSVladimir Sementsov-Ogievskiy ret = bdrv_co_readv_vmstate(child_bs, qiov, pos); 2727b984b296SVladimir Sementsov-Ogievskiy } else { 2728b984b296SVladimir Sementsov-Ogievskiy ret = -ENOTSUP; 27291a8ae822SKevin Wolf } 27301a8ae822SKevin Wolf 2731dc88a467SStefan Hajnoczi bdrv_dec_in_flight(bs); 2732b33b354fSVladimir Sementsov-Ogievskiy 2733b33b354fSVladimir Sementsov-Ogievskiy return ret; 2734b33b354fSVladimir Sementsov-Ogievskiy } 2735b33b354fSVladimir Sementsov-Ogievskiy 2736b33b354fSVladimir Sementsov-Ogievskiy int coroutine_fn 2737b33b354fSVladimir Sementsov-Ogievskiy bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2738b33b354fSVladimir Sementsov-Ogievskiy { 2739b33b354fSVladimir Sementsov-Ogievskiy BlockDriver *drv = bs->drv; 2740b33b354fSVladimir Sementsov-Ogievskiy BlockDriverState *child_bs = bdrv_primary_bs(bs); 2741b984b296SVladimir Sementsov-Ogievskiy int ret; 27421581a70dSEmanuele Giuseppe Esposito IO_CODE(); 27431b3ff9feSKevin Wolf assert_bdrv_graph_readable(); 2744b984b296SVladimir Sementsov-Ogievskiy 2745b984b296SVladimir Sementsov-Ogievskiy ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); 2746b984b296SVladimir Sementsov-Ogievskiy 
if (ret < 0) { 2747b984b296SVladimir Sementsov-Ogievskiy return ret; 2748b984b296SVladimir Sementsov-Ogievskiy } 2749b33b354fSVladimir Sementsov-Ogievskiy 2750b33b354fSVladimir Sementsov-Ogievskiy if (!drv) { 2751b33b354fSVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 2752b33b354fSVladimir Sementsov-Ogievskiy } 2753b33b354fSVladimir Sementsov-Ogievskiy 2754b33b354fSVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(bs); 2755b33b354fSVladimir Sementsov-Ogievskiy 2756b33b354fSVladimir Sementsov-Ogievskiy if (drv->bdrv_save_vmstate) { 2757b33b354fSVladimir Sementsov-Ogievskiy ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2758b33b354fSVladimir Sementsov-Ogievskiy } else if (child_bs) { 2759b33b354fSVladimir Sementsov-Ogievskiy ret = bdrv_co_writev_vmstate(child_bs, qiov, pos); 2760b984b296SVladimir Sementsov-Ogievskiy } else { 2761b984b296SVladimir Sementsov-Ogievskiy ret = -ENOTSUP; 2762b33b354fSVladimir Sementsov-Ogievskiy } 2763b33b354fSVladimir Sementsov-Ogievskiy 2764b33b354fSVladimir Sementsov-Ogievskiy bdrv_dec_in_flight(bs); 2765b33b354fSVladimir Sementsov-Ogievskiy 2766dc88a467SStefan Hajnoczi return ret; 27671a8ae822SKevin Wolf } 27681a8ae822SKevin Wolf 276961007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 277061007b31SStefan Hajnoczi int64_t pos, int size) 277161007b31SStefan Hajnoczi { 27720d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2773b33b354fSVladimir Sementsov-Ogievskiy int ret = bdrv_writev_vmstate(bs, &qiov, pos); 2774384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 277561007b31SStefan Hajnoczi 2776b33b354fSVladimir Sementsov-Ogievskiy return ret < 0 ? ret : size; 277761007b31SStefan Hajnoczi } 277861007b31SStefan Hajnoczi 277961007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 278061007b31SStefan Hajnoczi int64_t pos, int size) 278161007b31SStefan Hajnoczi { 27820d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2783b33b354fSVladimir Sementsov-Ogievskiy int ret = bdrv_readv_vmstate(bs, &qiov, pos); 2784384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 27855ddda0b8SKevin Wolf 2786b33b354fSVladimir Sementsov-Ogievskiy return ret < 0 ? ret : size; 278761007b31SStefan Hajnoczi } 278861007b31SStefan Hajnoczi 278961007b31SStefan Hajnoczi /**************************************************************/ 279061007b31SStefan Hajnoczi /* async I/Os */ 279161007b31SStefan Hajnoczi 279261007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb) 279361007b31SStefan Hajnoczi { 2794384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 279561007b31SStefan Hajnoczi qemu_aio_ref(acb); 279661007b31SStefan Hajnoczi bdrv_aio_cancel_async(acb); 279761007b31SStefan Hajnoczi while (acb->refcnt > 1) { 279861007b31SStefan Hajnoczi if (acb->aiocb_info->get_aio_context) { 279961007b31SStefan Hajnoczi aio_poll(acb->aiocb_info->get_aio_context(acb), true); 280061007b31SStefan Hajnoczi } else if (acb->bs) { 28012f47da5fSPaolo Bonzini /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 28022f47da5fSPaolo Bonzini * assert that we're not using an I/O thread. Thread-safe 28032f47da5fSPaolo Bonzini * code should use bdrv_aio_cancel_async exclusively. 
28042f47da5fSPaolo Bonzini */ 28052f47da5fSPaolo Bonzini assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 280661007b31SStefan Hajnoczi aio_poll(bdrv_get_aio_context(acb->bs), true); 280761007b31SStefan Hajnoczi } else { 280861007b31SStefan Hajnoczi abort(); 280961007b31SStefan Hajnoczi } 281061007b31SStefan Hajnoczi } 281161007b31SStefan Hajnoczi qemu_aio_unref(acb); 281261007b31SStefan Hajnoczi } 281361007b31SStefan Hajnoczi 281461007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements 281561007b31SStefan Hajnoczi * cancel_async; otherwise we do nothing and let the request complete normally. 281661007b31SStefan Hajnoczi * In either case the completion callback must be called. */ 281761007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb) 281861007b31SStefan Hajnoczi { 2819384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 282061007b31SStefan Hajnoczi if (acb->aiocb_info->cancel_async) { 282161007b31SStefan Hajnoczi acb->aiocb_info->cancel_async(acb); 282261007b31SStefan Hajnoczi } 282361007b31SStefan Hajnoczi } 282461007b31SStefan Hajnoczi 282561007b31SStefan Hajnoczi /**************************************************************/ 282661007b31SStefan Hajnoczi /* Coroutine block device emulation */ 282761007b31SStefan Hajnoczi 282861007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 282961007b31SStefan Hajnoczi { 2830883833e2SMax Reitz BdrvChild *primary_child = bdrv_primary_child(bs); 2831883833e2SMax Reitz BdrvChild *child; 283249ca6259SFam Zheng int current_gen; 283349ca6259SFam Zheng int ret = 0; 2834384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 283561007b31SStefan Hajnoczi 283699723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2837c32b82afSPavel Dovgalyuk 28381e97be91SEmanuele Giuseppe Esposito if (!bdrv_co_is_inserted(bs) || bdrv_is_read_only(bs) || 283949ca6259SFam Zheng bdrv_is_sg(bs)) { 284049ca6259SFam Zheng goto early_exit; 284149ca6259SFam Zheng } 284249ca6259SFam Zheng 28433783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 2844d73415a3SStefan Hajnoczi current_gen = qatomic_read(&bs->write_gen); 28453ff2f67aSEvgeny Yakovlev 28463ff2f67aSEvgeny Yakovlev /* Wait until any previous flushes are completed */ 284799723548SPaolo Bonzini while (bs->active_flush_req) { 28483783fa3dSPaolo Bonzini qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 28493ff2f67aSEvgeny Yakovlev } 28503ff2f67aSEvgeny Yakovlev 28513783fa3dSPaolo Bonzini /* Flushes reach this point in nondecreasing current_gen order.
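 * (Hypothetical interleaving as illustration: writes bump write_gen to 5;
 * flush A samples current_gen == 5, flushes, and records flushed_gen = 5;
 * flush B, which also sampled generation 5 before waiting here, then
 * sees flushed_gen == current_gen below and can skip the disk flush.)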
*/ 285299723548SPaolo Bonzini bs->active_flush_req = true; 28533783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 28543ff2f67aSEvgeny Yakovlev 2855c32b82afSPavel Dovgalyuk /* Write back all layers by calling one driver function */ 2856c32b82afSPavel Dovgalyuk if (bs->drv->bdrv_co_flush) { 2857c32b82afSPavel Dovgalyuk ret = bs->drv->bdrv_co_flush(bs); 2858c32b82afSPavel Dovgalyuk goto out; 2859c32b82afSPavel Dovgalyuk } 2860c32b82afSPavel Dovgalyuk 286161007b31SStefan Hajnoczi /* Write back cached data to the OS even with cache=unsafe */ 2862883833e2SMax Reitz BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); 286361007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_os) { 286461007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_os(bs); 286561007b31SStefan Hajnoczi if (ret < 0) { 2866cdb5e315SFam Zheng goto out; 286761007b31SStefan Hajnoczi } 286861007b31SStefan Hajnoczi } 286961007b31SStefan Hajnoczi 287061007b31SStefan Hajnoczi /* But don't actually force it to the disk with cache=unsafe */ 287161007b31SStefan Hajnoczi if (bs->open_flags & BDRV_O_NO_FLUSH) { 2872883833e2SMax Reitz goto flush_children; 287361007b31SStefan Hajnoczi } 287461007b31SStefan Hajnoczi 28753ff2f67aSEvgeny Yakovlev /* Check if we really need to flush anything */ 28763ff2f67aSEvgeny Yakovlev if (bs->flushed_gen == current_gen) { 2877883833e2SMax Reitz goto flush_children; 28783ff2f67aSEvgeny Yakovlev } 28793ff2f67aSEvgeny Yakovlev 2880883833e2SMax Reitz BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); 2881d470ad42SMax Reitz if (!bs->drv) { 2882d470ad42SMax Reitz /* bs->drv->bdrv_co_flush() might have ejected the BDS 2883d470ad42SMax Reitz * (even in case of apparent success) */ 2884d470ad42SMax Reitz ret = -ENOMEDIUM; 2885d470ad42SMax Reitz goto out; 2886d470ad42SMax Reitz } 288761007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_disk) { 288861007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_disk(bs); 288961007b31SStefan Hajnoczi } else if (bs->drv->bdrv_aio_flush) { 289061007b31SStefan Hajnoczi BlockAIOCB *acb; 289161007b31SStefan Hajnoczi CoroutineIOCompletion co = { 289261007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 289361007b31SStefan Hajnoczi }; 289461007b31SStefan Hajnoczi 289561007b31SStefan Hajnoczi acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 289661007b31SStefan Hajnoczi if (acb == NULL) { 289761007b31SStefan Hajnoczi ret = -EIO; 289861007b31SStefan Hajnoczi } else { 289961007b31SStefan Hajnoczi qemu_coroutine_yield(); 290061007b31SStefan Hajnoczi ret = co.ret; 290161007b31SStefan Hajnoczi } 290261007b31SStefan Hajnoczi } else { 290361007b31SStefan Hajnoczi /* 290461007b31SStefan Hajnoczi * Some block drivers always operate in either writethrough or unsafe 290561007b31SStefan Hajnoczi * mode and therefore don't support bdrv_flush. Usually qemu doesn't 290661007b31SStefan Hajnoczi * know how the server works (because the behaviour is hardcoded or 290761007b31SStefan Hajnoczi * depends on server-side configuration), so we can't ensure that 290861007b31SStefan Hajnoczi * everything is safe on disk. Returning an error doesn't work because 290961007b31SStefan Hajnoczi * that would break guests even if the server operates in writethrough 291061007b31SStefan Hajnoczi * mode. 291161007b31SStefan Hajnoczi * 291261007b31SStefan Hajnoczi * Let's hope the user knows what they're doing.
291361007b31SStefan Hajnoczi */ 291461007b31SStefan Hajnoczi ret = 0; 291561007b31SStefan Hajnoczi } 29163ff2f67aSEvgeny Yakovlev 291761007b31SStefan Hajnoczi if (ret < 0) { 2918cdb5e315SFam Zheng goto out; 291961007b31SStefan Hajnoczi } 292061007b31SStefan Hajnoczi 292161007b31SStefan Hajnoczi /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 292261007b31SStefan Hajnoczi * in the case of cache=unsafe, so there are no useless flushes. 292361007b31SStefan Hajnoczi */ 2924883833e2SMax Reitz flush_children: 2925883833e2SMax Reitz ret = 0; 2926883833e2SMax Reitz QLIST_FOREACH(child, &bs->children, next) { 2927883833e2SMax Reitz if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { 2928883833e2SMax Reitz int this_child_ret = bdrv_co_flush(child->bs); 2929883833e2SMax Reitz if (!ret) { 2930883833e2SMax Reitz ret = this_child_ret; 2931883833e2SMax Reitz } 2932883833e2SMax Reitz } 2933883833e2SMax Reitz } 2934883833e2SMax Reitz 2935cdb5e315SFam Zheng out: 29363ff2f67aSEvgeny Yakovlev /* Notify any pending flushes that we have completed */ 2937e6af1e08SKevin Wolf if (ret == 0) { 29383ff2f67aSEvgeny Yakovlev bs->flushed_gen = current_gen; 2939e6af1e08SKevin Wolf } 29403783fa3dSPaolo Bonzini 29413783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 294299723548SPaolo Bonzini bs->active_flush_req = false; 2943156af3acSDenis V. Lunev /* Return value is ignored - it's ok if wait queue is empty */ 2944156af3acSDenis V. Lunev qemu_co_queue_next(&bs->flush_queue); 29453783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 29463ff2f67aSEvgeny Yakovlev 294749ca6259SFam Zheng early_exit: 294899723548SPaolo Bonzini bdrv_dec_in_flight(bs); 2949cdb5e315SFam Zheng return ret; 295061007b31SStefan Hajnoczi } 295161007b31SStefan Hajnoczi 2952d93e5726SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2953d93e5726SVladimir Sementsov-Ogievskiy int64_t bytes) 295461007b31SStefan Hajnoczi { 2955b1066c87SFam Zheng BdrvTrackedRequest req; 295639af49c0SVladimir Sementsov-Ogievskiy int ret; 295739af49c0SVladimir Sementsov-Ogievskiy int64_t max_pdiscard; 29583482b9bcSEric Blake int head, tail, align; 29590b9fd3f4SFam Zheng BlockDriverState *bs = child->bs; 2960384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 296161007b31SStefan Hajnoczi 29621e97be91SEmanuele Giuseppe Esposito if (!bs || !bs->drv || !bdrv_co_is_inserted(bs)) { 296361007b31SStefan Hajnoczi return -ENOMEDIUM; 296461007b31SStefan Hajnoczi } 296561007b31SStefan Hajnoczi 2966d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 2967d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 2968d6883bc9SVladimir Sementsov-Ogievskiy } 2969d6883bc9SVladimir Sementsov-Ogievskiy 297069b55e03SVladimir Sementsov-Ogievskiy ret = bdrv_check_request(offset, bytes, NULL); 29718b117001SVladimir Sementsov-Ogievskiy if (ret < 0) { 29728b117001SVladimir Sementsov-Ogievskiy return ret; 297361007b31SStefan Hajnoczi } 297461007b31SStefan Hajnoczi 297561007b31SStefan Hajnoczi /* Do nothing if disabled. 
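 * (BDRV_O_UNMAP is the per-node switch behind e.g. -drive discard=unmap;
 * with the default discard=ignore, guest discard requests are silently
 * dropped right here.)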
*/ 297661007b31SStefan Hajnoczi if (!(bs->open_flags & BDRV_O_UNMAP)) { 297761007b31SStefan Hajnoczi return 0; 297861007b31SStefan Hajnoczi } 297961007b31SStefan Hajnoczi 298002aefe43SEric Blake if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 298161007b31SStefan Hajnoczi return 0; 298261007b31SStefan Hajnoczi } 298361007b31SStefan Hajnoczi 29840bc329fbSHanna Reitz /* Invalidate the cached block-status data range if this discard overlaps */ 29850bc329fbSHanna Reitz bdrv_bsc_invalidate_range(bs, offset, bytes); 29860bc329fbSHanna Reitz 29873482b9bcSEric Blake /* Discard is advisory, but some devices track and coalesce 29883482b9bcSEric Blake * unaligned requests, so we must pass everything down rather than 29893482b9bcSEric Blake * round here. Still, most devices will just silently ignore 29903482b9bcSEric Blake * unaligned requests (by returning -ENOTSUP), so we must fragment 29913482b9bcSEric Blake * the request accordingly. */ 299202aefe43SEric Blake align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2993b8d0a980SEric Blake assert(align % bs->bl.request_alignment == 0); 2994b8d0a980SEric Blake head = offset % align; 2995f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % align; 29969f1963b3SEric Blake 299799723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2998f5a5ca79SManos Pitsidianakis tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 299950824995SFam Zheng 300000695c27SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 3001ec050f77SDenis V. Lunev if (ret < 0) { 3002ec050f77SDenis V. Lunev goto out; 3003ec050f77SDenis V. Lunev } 3004ec050f77SDenis V. Lunev 30056a8f3dbbSVladimir Sementsov-Ogievskiy max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX), 30069f1963b3SEric Blake align); 30073482b9bcSEric Blake assert(max_pdiscard >= bs->bl.request_alignment); 30089f1963b3SEric Blake 3009f5a5ca79SManos Pitsidianakis while (bytes > 0) { 3010d93e5726SVladimir Sementsov-Ogievskiy int64_t num = bytes; 30113482b9bcSEric Blake 30123482b9bcSEric Blake if (head) { 30133482b9bcSEric Blake /* Make small requests to get to alignment boundaries. */ 3014f5a5ca79SManos Pitsidianakis num = MIN(bytes, align - head); 30153482b9bcSEric Blake if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 30163482b9bcSEric Blake num %= bs->bl.request_alignment; 30173482b9bcSEric Blake } 30183482b9bcSEric Blake head = (head + num) % align; 30193482b9bcSEric Blake assert(num < max_pdiscard); 30203482b9bcSEric Blake } else if (tail) { 30213482b9bcSEric Blake if (num > align) { 30223482b9bcSEric Blake /* Shorten the request to the last aligned cluster. 
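 * (Worked example under hypothetical limits pdiscard_alignment == 4096
 * and request_alignment == 512: a discard of 8192 bytes at offset 512
 * goes out as three fragments: a 3584-byte head up to the first 4 KiB
 * boundary, one aligned 4096-byte chunk, and a 512-byte tail.)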
*/ 30233482b9bcSEric Blake num -= tail; 30243482b9bcSEric Blake } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 30253482b9bcSEric Blake tail > bs->bl.request_alignment) { 30263482b9bcSEric Blake tail %= bs->bl.request_alignment; 30273482b9bcSEric Blake num -= tail; 30283482b9bcSEric Blake } 30293482b9bcSEric Blake } 30303482b9bcSEric Blake /* limit request size */ 30313482b9bcSEric Blake if (num > max_pdiscard) { 30323482b9bcSEric Blake num = max_pdiscard; 30333482b9bcSEric Blake } 303461007b31SStefan Hajnoczi 3035d470ad42SMax Reitz if (!bs->drv) { 3036d470ad42SMax Reitz ret = -ENOMEDIUM; 3037d470ad42SMax Reitz goto out; 3038d470ad42SMax Reitz } 303947a5486dSEric Blake if (bs->drv->bdrv_co_pdiscard) { 304047a5486dSEric Blake ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 304161007b31SStefan Hajnoczi } else { 304261007b31SStefan Hajnoczi BlockAIOCB *acb; 304361007b31SStefan Hajnoczi CoroutineIOCompletion co = { 304461007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 304561007b31SStefan Hajnoczi }; 304661007b31SStefan Hajnoczi 30474da444a0SEric Blake acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 304861007b31SStefan Hajnoczi bdrv_co_io_em_complete, &co); 304961007b31SStefan Hajnoczi if (acb == NULL) { 3050b1066c87SFam Zheng ret = -EIO; 3051b1066c87SFam Zheng goto out; 305261007b31SStefan Hajnoczi } else { 305361007b31SStefan Hajnoczi qemu_coroutine_yield(); 305461007b31SStefan Hajnoczi ret = co.ret; 305561007b31SStefan Hajnoczi } 305661007b31SStefan Hajnoczi } 305761007b31SStefan Hajnoczi if (ret && ret != -ENOTSUP) { 3058b1066c87SFam Zheng goto out; 305961007b31SStefan Hajnoczi } 306061007b31SStefan Hajnoczi 30619f1963b3SEric Blake offset += num; 3062f5a5ca79SManos Pitsidianakis bytes -= num; 306361007b31SStefan Hajnoczi } 3064b1066c87SFam Zheng ret = 0; 3065b1066c87SFam Zheng out: 306600695c27SFam Zheng bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3067b1066c87SFam Zheng tracked_request_end(&req); 306899723548SPaolo Bonzini bdrv_dec_in_flight(bs); 3069b1066c87SFam Zheng return ret; 307061007b31SStefan Hajnoczi } 307161007b31SStefan Hajnoczi 3072881a4c55SPaolo Bonzini int coroutine_fn bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 307361007b31SStefan Hajnoczi { 307461007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 30755c5ae76aSFam Zheng CoroutineIOCompletion co = { 30765c5ae76aSFam Zheng .coroutine = qemu_coroutine_self(), 30775c5ae76aSFam Zheng }; 30785c5ae76aSFam Zheng BlockAIOCB *acb; 3079384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 308061007b31SStefan Hajnoczi 308199723548SPaolo Bonzini bdrv_inc_in_flight(bs); 308216a389dcSKevin Wolf if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 30835c5ae76aSFam Zheng co.ret = -ENOTSUP; 30845c5ae76aSFam Zheng goto out; 30855c5ae76aSFam Zheng } 30865c5ae76aSFam Zheng 308716a389dcSKevin Wolf if (drv->bdrv_co_ioctl) { 308816a389dcSKevin Wolf co.ret = drv->bdrv_co_ioctl(bs, req, buf); 308916a389dcSKevin Wolf } else { 30905c5ae76aSFam Zheng acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 30915c5ae76aSFam Zheng if (!acb) { 3092c8a9fd80SFam Zheng co.ret = -ENOTSUP; 3093c8a9fd80SFam Zheng goto out; 30945c5ae76aSFam Zheng } 30955c5ae76aSFam Zheng qemu_coroutine_yield(); 309616a389dcSKevin Wolf } 30975c5ae76aSFam Zheng out: 309899723548SPaolo Bonzini bdrv_dec_in_flight(bs); 30995c5ae76aSFam Zheng return co.ret; 31005c5ae76aSFam Zheng } 31015c5ae76aSFam Zheng 310261007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size) 310361007b31SStefan 
Hajnoczi { 3104384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 310561007b31SStefan Hajnoczi return qemu_memalign(bdrv_opt_mem_align(bs), size); 310661007b31SStefan Hajnoczi } 310761007b31SStefan Hajnoczi 310861007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size) 310961007b31SStefan Hajnoczi { 3110384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 311161007b31SStefan Hajnoczi return memset(qemu_blockalign(bs, size), 0, size); 311261007b31SStefan Hajnoczi } 311361007b31SStefan Hajnoczi 311461007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 311561007b31SStefan Hajnoczi { 311661007b31SStefan Hajnoczi size_t align = bdrv_opt_mem_align(bs); 3117384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 311861007b31SStefan Hajnoczi 311961007b31SStefan Hajnoczi /* Ensure that NULL is never returned on success */ 312061007b31SStefan Hajnoczi assert(align > 0); 312161007b31SStefan Hajnoczi if (size == 0) { 312261007b31SStefan Hajnoczi size = align; 312361007b31SStefan Hajnoczi } 312461007b31SStefan Hajnoczi 312561007b31SStefan Hajnoczi return qemu_try_memalign(align, size); 312661007b31SStefan Hajnoczi } 312761007b31SStefan Hajnoczi 312861007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 312961007b31SStefan Hajnoczi { 313061007b31SStefan Hajnoczi void *mem = qemu_try_blockalign(bs, size); 3131384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 313261007b31SStefan Hajnoczi 313361007b31SStefan Hajnoczi if (mem) { 313461007b31SStefan Hajnoczi memset(mem, 0, size); 313561007b31SStefan Hajnoczi } 313661007b31SStefan Hajnoczi 313761007b31SStefan Hajnoczi return mem; 313861007b31SStefan Hajnoczi } 313961007b31SStefan Hajnoczi 31408f497454SEmanuele Giuseppe Esposito void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs) 314161007b31SStefan Hajnoczi { 31426b98bd64SPaolo Bonzini BdrvChild *child; 3143384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 31446b98bd64SPaolo Bonzini 31456b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 31468f497454SEmanuele Giuseppe Esposito bdrv_co_io_plug(child->bs); 31476b98bd64SPaolo Bonzini } 31486b98bd64SPaolo Bonzini 3149d73415a3SStefan Hajnoczi if (qatomic_fetch_inc(&bs->io_plugged) == 0) { 315061007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 31518f497454SEmanuele Giuseppe Esposito if (drv && drv->bdrv_co_io_plug) { 31528f497454SEmanuele Giuseppe Esposito drv->bdrv_co_io_plug(bs); 31536b98bd64SPaolo Bonzini } 315461007b31SStefan Hajnoczi } 315561007b31SStefan Hajnoczi } 315661007b31SStefan Hajnoczi 315709d9fc97SEmanuele Giuseppe Esposito void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs) 315861007b31SStefan Hajnoczi { 31596b98bd64SPaolo Bonzini BdrvChild *child; 3160384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 31616b98bd64SPaolo Bonzini 31626b98bd64SPaolo Bonzini assert(bs->io_plugged); 3163d73415a3SStefan Hajnoczi if (qatomic_fetch_dec(&bs->io_plugged) == 1) { 316461007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 316509d9fc97SEmanuele Giuseppe Esposito if (drv && drv->bdrv_co_io_unplug) { 316609d9fc97SEmanuele Giuseppe Esposito drv->bdrv_co_io_unplug(bs); 316761007b31SStefan Hajnoczi } 316861007b31SStefan Hajnoczi } 316961007b31SStefan Hajnoczi 31706b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 317109d9fc97SEmanuele Giuseppe Esposito bdrv_co_io_unplug(child->bs); 31726b98bd64SPaolo Bonzini } 31736b98bd64SPaolo Bonzini } 317423d0ba93SFam Zheng 3175f4ec04baSStefan Hajnoczi /* Helper that undoes bdrv_register_buf() when it fails partway through */ 
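 * (making bdrv_register_buf() below effectively all-or-nothing: either
 * the buffer ends up registered with every node in the subtree, or with
 * none of them). Hypothetical usage sketch, not from this file:
 *
 *     Error *err = NULL;
 *     if (!bdrv_register_buf(bs, host, size, &err)) {
 *         error_report_err(err);
 *     }
 */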
3176f4ec04baSStefan Hajnoczi static void bdrv_register_buf_rollback(BlockDriverState *bs, 3177f4ec04baSStefan Hajnoczi void *host, 3178f4ec04baSStefan Hajnoczi size_t size, 3179f4ec04baSStefan Hajnoczi BdrvChild *final_child) 3180f4ec04baSStefan Hajnoczi { 3181f4ec04baSStefan Hajnoczi BdrvChild *child; 3182f4ec04baSStefan Hajnoczi 3183f4ec04baSStefan Hajnoczi QLIST_FOREACH(child, &bs->children, next) { 3184f4ec04baSStefan Hajnoczi if (child == final_child) { 3185f4ec04baSStefan Hajnoczi break; 3186f4ec04baSStefan Hajnoczi } 3187f4ec04baSStefan Hajnoczi 3188f4ec04baSStefan Hajnoczi bdrv_unregister_buf(child->bs, host, size); 3189f4ec04baSStefan Hajnoczi } 3190f4ec04baSStefan Hajnoczi 3191f4ec04baSStefan Hajnoczi if (bs->drv && bs->drv->bdrv_unregister_buf) { 3192f4ec04baSStefan Hajnoczi bs->drv->bdrv_unregister_buf(bs, host, size); 3193f4ec04baSStefan Hajnoczi } 3194f4ec04baSStefan Hajnoczi } 3195f4ec04baSStefan Hajnoczi 3196f4ec04baSStefan Hajnoczi bool bdrv_register_buf(BlockDriverState *bs, void *host, size_t size, 3197f4ec04baSStefan Hajnoczi Error **errp) 319823d0ba93SFam Zheng { 319923d0ba93SFam Zheng BdrvChild *child; 320023d0ba93SFam Zheng 3201f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 320223d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_register_buf) { 3203f4ec04baSStefan Hajnoczi if (!bs->drv->bdrv_register_buf(bs, host, size, errp)) { 3204f4ec04baSStefan Hajnoczi return false; 3205f4ec04baSStefan Hajnoczi } 320623d0ba93SFam Zheng } 320723d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 3208f4ec04baSStefan Hajnoczi if (!bdrv_register_buf(child->bs, host, size, errp)) { 3209f4ec04baSStefan Hajnoczi bdrv_register_buf_rollback(bs, host, size, child); 3210f4ec04baSStefan Hajnoczi return false; 321123d0ba93SFam Zheng } 321223d0ba93SFam Zheng } 3213f4ec04baSStefan Hajnoczi return true; 3214f4ec04baSStefan Hajnoczi } 321523d0ba93SFam Zheng 32164f384011SStefan Hajnoczi void bdrv_unregister_buf(BlockDriverState *bs, void *host, size_t size) 321723d0ba93SFam Zheng { 321823d0ba93SFam Zheng BdrvChild *child; 321923d0ba93SFam Zheng 3220f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 322123d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_unregister_buf) { 32224f384011SStefan Hajnoczi bs->drv->bdrv_unregister_buf(bs, host, size); 322323d0ba93SFam Zheng } 322423d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 32254f384011SStefan Hajnoczi bdrv_unregister_buf(child->bs, host, size); 322623d0ba93SFam Zheng } 322723d0ba93SFam Zheng } 3228fcc67678SFam Zheng 322967b51fb9SVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_co_copy_range_internal( 3230a5215b8fSVladimir Sementsov-Ogievskiy BdrvChild *src, int64_t src_offset, BdrvChild *dst, 3231a5215b8fSVladimir Sementsov-Ogievskiy int64_t dst_offset, int64_t bytes, 323267b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3233fcc67678SFam Zheng bool recurse_src) 3234fcc67678SFam Zheng { 3235999658a0SVladimir Sementsov-Ogievskiy BdrvTrackedRequest req; 3236fcc67678SFam Zheng int ret; 3237fcc67678SFam Zheng 3238fe0480d6SKevin Wolf /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3239fe0480d6SKevin Wolf assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3240fe0480d6SKevin Wolf assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 324145e62b46SVladimir Sementsov-Ogievskiy assert(!(read_flags & BDRV_REQ_NO_WAIT)); 324245e62b46SVladimir Sementsov-Ogievskiy assert(!(write_flags & BDRV_REQ_NO_WAIT)); 3243fe0480d6SKevin Wolf 32441e97be91SEmanuele Giuseppe Esposito if (!dst || 
3228fcc67678SFam Zheng 
322967b51fb9SVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_co_copy_range_internal(
3230a5215b8fSVladimir Sementsov-Ogievskiy         BdrvChild *src, int64_t src_offset, BdrvChild *dst,
3231a5215b8fSVladimir Sementsov-Ogievskiy         int64_t dst_offset, int64_t bytes,
323267b51fb9SVladimir Sementsov-Ogievskiy         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
3233fcc67678SFam Zheng         bool recurse_src)
3234fcc67678SFam Zheng {
3235999658a0SVladimir Sementsov-Ogievskiy     BdrvTrackedRequest req;
3236fcc67678SFam Zheng     int ret;
3237fcc67678SFam Zheng 
3238fe0480d6SKevin Wolf     /* TODO We can support BDRV_REQ_NO_FALLBACK here */
3239fe0480d6SKevin Wolf     assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
3240fe0480d6SKevin Wolf     assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
324145e62b46SVladimir Sementsov-Ogievskiy     assert(!(read_flags & BDRV_REQ_NO_WAIT));
324245e62b46SVladimir Sementsov-Ogievskiy     assert(!(write_flags & BDRV_REQ_NO_WAIT));
3243fe0480d6SKevin Wolf 
32441e97be91SEmanuele Giuseppe Esposito     if (!dst || !dst->bs || !bdrv_co_is_inserted(dst->bs)) {
3245fcc67678SFam Zheng         return -ENOMEDIUM;
3246fcc67678SFam Zheng     }
324763f4ad11SVladimir Sementsov-Ogievskiy     ret = bdrv_check_request32(dst_offset, bytes, NULL, 0);
3248fcc67678SFam Zheng     if (ret) {
3249fcc67678SFam Zheng         return ret;
3250fcc67678SFam Zheng     }
325167b51fb9SVladimir Sementsov-Ogievskiy     if (write_flags & BDRV_REQ_ZERO_WRITE) {
325267b51fb9SVladimir Sementsov-Ogievskiy         return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3253fcc67678SFam Zheng     }
3254fcc67678SFam Zheng 
32551e97be91SEmanuele Giuseppe Esposito     if (!src || !src->bs || !bdrv_co_is_inserted(src->bs)) {
3256d4d3e5a0SFam Zheng         return -ENOMEDIUM;
3257d4d3e5a0SFam Zheng     }
325863f4ad11SVladimir Sementsov-Ogievskiy     ret = bdrv_check_request32(src_offset, bytes, NULL, 0);
3259d4d3e5a0SFam Zheng     if (ret) {
3260d4d3e5a0SFam Zheng         return ret;
3261d4d3e5a0SFam Zheng     }
3262d4d3e5a0SFam Zheng 
3263fcc67678SFam Zheng     if (!src->bs->drv->bdrv_co_copy_range_from
3264fcc67678SFam Zheng         || !dst->bs->drv->bdrv_co_copy_range_to
3265fcc67678SFam Zheng         || src->bs->encrypted || dst->bs->encrypted) {
3266fcc67678SFam Zheng         return -ENOTSUP;
3267fcc67678SFam Zheng     }
3268999658a0SVladimir Sementsov-Ogievskiy 
3269999658a0SVladimir Sementsov-Ogievskiy     if (recurse_src) {
3270d4d3e5a0SFam Zheng         bdrv_inc_in_flight(src->bs);
3271999658a0SVladimir Sementsov-Ogievskiy         tracked_request_begin(&req, src->bs, src_offset, bytes,
3272999658a0SVladimir Sementsov-Ogievskiy                               BDRV_TRACKED_READ);
327337aec7d7SFam Zheng 
327409d2f948SVladimir Sementsov-Ogievskiy         /* BDRV_REQ_SERIALISING is only for write operations */
327509d2f948SVladimir Sementsov-Ogievskiy         assert(!(read_flags & BDRV_REQ_SERIALISING));
3276304d9d7fSMax Reitz         bdrv_wait_serialising_requests(&req);
3277999658a0SVladimir Sementsov-Ogievskiy 
327837aec7d7SFam Zheng         ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3279fcc67678SFam Zheng                                                     src, src_offset,
3280fcc67678SFam Zheng                                                     dst, dst_offset,
328167b51fb9SVladimir Sementsov-Ogievskiy                                                     bytes,
328267b51fb9SVladimir Sementsov-Ogievskiy                                                     read_flags, write_flags);
3283999658a0SVladimir Sementsov-Ogievskiy 
3284999658a0SVladimir Sementsov-Ogievskiy         tracked_request_end(&req);
3285999658a0SVladimir Sementsov-Ogievskiy         bdrv_dec_in_flight(src->bs);
3286fcc67678SFam Zheng     } else {
3287999658a0SVladimir Sementsov-Ogievskiy         bdrv_inc_in_flight(dst->bs);
3288999658a0SVladimir Sementsov-Ogievskiy         tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3289999658a0SVladimir Sementsov-Ogievskiy                               BDRV_TRACKED_WRITE);
32900eb1e891SFam Zheng         ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
32910eb1e891SFam Zheng                                         write_flags);
32920eb1e891SFam Zheng         if (!ret) {
329337aec7d7SFam Zheng             ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3294fcc67678SFam Zheng                                                       src, src_offset,
3295fcc67678SFam Zheng                                                       dst, dst_offset,
329667b51fb9SVladimir Sementsov-Ogievskiy                                                       bytes,
329767b51fb9SVladimir Sementsov-Ogievskiy                                                       read_flags, write_flags);
32980eb1e891SFam Zheng         }
32990eb1e891SFam Zheng         bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3300999658a0SVladimir Sementsov-Ogievskiy         tracked_request_end(&req);
3301d4d3e5a0SFam Zheng         bdrv_dec_in_flight(dst->bs);
3302999658a0SVladimir Sementsov-Ogievskiy     }
3303999658a0SVladimir Sementsov-Ogievskiy 
330437aec7d7SFam Zheng     return ret;
3305fcc67678SFam Zheng }
3306fcc67678SFam Zheng 
3307fcc67678SFam Zheng /* Copy range from @src to @dst.
3308fcc67678SFam Zheng  *
3309fcc67678SFam Zheng  * See the comment on bdrv_co_copy_range for the parameter and return value
3310fcc67678SFam Zheng  * semantics. */
3311a5215b8fSVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
3312a5215b8fSVladimir Sementsov-Ogievskiy                                          BdrvChild *dst, int64_t dst_offset,
3313a5215b8fSVladimir Sementsov-Ogievskiy                                          int64_t bytes,
331467b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags read_flags,
331567b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags write_flags)
3316fcc67678SFam Zheng {
3317967d7905SEmanuele Giuseppe Esposito     IO_CODE();
3318ecc983a5SFam Zheng     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3319ecc983a5SFam Zheng                                   read_flags, write_flags);
3320fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
332167b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, true);
3322fcc67678SFam Zheng }
3323fcc67678SFam Zheng 
3324fcc67678SFam Zheng /* Copy range from @src to @dst.
3325fcc67678SFam Zheng  *
3326fcc67678SFam Zheng  * See the comment on bdrv_co_copy_range for the parameter and return value
3327fcc67678SFam Zheng  * semantics. */
3328a5215b8fSVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
3329a5215b8fSVladimir Sementsov-Ogievskiy                                        BdrvChild *dst, int64_t dst_offset,
3330a5215b8fSVladimir Sementsov-Ogievskiy                                        int64_t bytes,
333167b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags read_flags,
333267b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags write_flags)
3333fcc67678SFam Zheng {
3334967d7905SEmanuele Giuseppe Esposito     IO_CODE();
3335ecc983a5SFam Zheng     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3336ecc983a5SFam Zheng                                 read_flags, write_flags);
3337fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
333867b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, false);
3339fcc67678SFam Zheng }
3340fcc67678SFam Zheng 
3341a5215b8fSVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
3342a5215b8fSVladimir Sementsov-Ogievskiy                                     BdrvChild *dst, int64_t dst_offset,
3343a5215b8fSVladimir Sementsov-Ogievskiy                                     int64_t bytes, BdrvRequestFlags read_flags,
334467b51fb9SVladimir Sementsov-Ogievskiy                                     BdrvRequestFlags write_flags)
3345fcc67678SFam Zheng {
3346384a48fbSEmanuele Giuseppe Esposito     IO_CODE();
334737aec7d7SFam Zheng     return bdrv_co_copy_range_from(src, src_offset,
3348fcc67678SFam Zheng                                    dst, dst_offset,
334967b51fb9SVladimir Sementsov-Ogievskiy                                    bytes, read_flags, write_flags);
3350fcc67678SFam Zheng }
33513d9f2d2aSKevin Wolf 
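/*
 * A minimal usage sketch (hypothetical helper): try to offload a copy of the
 * first 'bytes' bytes between two attached children.  When either driver
 * lacks copy offload, the call fails with -ENOTSUP and a real caller would
 * fall back to an ordinary read into a bounce buffer followed by a write.
 */
static int coroutine_fn example_co_clone_prefix(BdrvChild *src, BdrvChild *dst,
                                                int64_t bytes)
{
    return bdrv_co_copy_range(src, 0, dst, 0, bytes, 0, 0);
}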
33523d9f2d2aSKevin Wolf static void bdrv_parent_cb_resize(BlockDriverState *bs)
33533d9f2d2aSKevin Wolf {
33543d9f2d2aSKevin Wolf     BdrvChild *c;
33553d9f2d2aSKevin Wolf     QLIST_FOREACH(c, &bs->parents, next_parent) {
3356bd86fb99SMax Reitz         if (c->klass->resize) {
3357bd86fb99SMax Reitz             c->klass->resize(c);
33583d9f2d2aSKevin Wolf         }
33593d9f2d2aSKevin Wolf     }
33603d9f2d2aSKevin Wolf }
33613d9f2d2aSKevin Wolf 
33623d9f2d2aSKevin Wolf /**
33633d9f2d2aSKevin Wolf  * Truncate file to 'offset' bytes (needed only for file protocols)
3364c80d8b06SMax Reitz  *
3365c80d8b06SMax Reitz  * If 'exact' is true, the file must be resized to exactly the given
3366c80d8b06SMax Reitz  * 'offset'. Otherwise, it is sufficient for the node to be at least
3367c80d8b06SMax Reitz  * 'offset' bytes in length.
33683d9f2d2aSKevin Wolf  */
3369c80d8b06SMax Reitz int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
33707b8e4857SKevin Wolf                                   PreallocMode prealloc, BdrvRequestFlags flags,
33717b8e4857SKevin Wolf                                   Error **errp)
33723d9f2d2aSKevin Wolf {
33733d9f2d2aSKevin Wolf     BlockDriverState *bs = child->bs;
337423b93525SMax Reitz     BdrvChild *filtered, *backing;
33753d9f2d2aSKevin Wolf     BlockDriver *drv = bs->drv;
33761bc5f09fSKevin Wolf     BdrvTrackedRequest req;
33771bc5f09fSKevin Wolf     int64_t old_size, new_bytes;
33783d9f2d2aSKevin Wolf     int ret;
3379384a48fbSEmanuele Giuseppe Esposito     IO_CODE();
33803d9f2d2aSKevin Wolf 
33813d9f2d2aSKevin Wolf     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
33823d9f2d2aSKevin Wolf     if (!drv) {
33833d9f2d2aSKevin Wolf         error_setg(errp, "No medium inserted");
33843d9f2d2aSKevin Wolf         return -ENOMEDIUM;
33853d9f2d2aSKevin Wolf     }
33863d9f2d2aSKevin Wolf     if (offset < 0) {
33873d9f2d2aSKevin Wolf         error_setg(errp, "Image size cannot be negative");
33883d9f2d2aSKevin Wolf         return -EINVAL;
33893d9f2d2aSKevin Wolf     }
33903d9f2d2aSKevin Wolf 
339169b55e03SVladimir Sementsov-Ogievskiy     ret = bdrv_check_request(offset, 0, errp);
33928b117001SVladimir Sementsov-Ogievskiy     if (ret < 0) {
33938b117001SVladimir Sementsov-Ogievskiy         return ret;
33948b117001SVladimir Sementsov-Ogievskiy     }
33958b117001SVladimir Sementsov-Ogievskiy 
33961bc5f09fSKevin Wolf     old_size = bdrv_getlength(bs);
33971bc5f09fSKevin Wolf     if (old_size < 0) {
33981bc5f09fSKevin Wolf         error_setg_errno(errp, -old_size, "Failed to get old image size");
33991bc5f09fSKevin Wolf         return old_size;
34001bc5f09fSKevin Wolf     }
34011bc5f09fSKevin Wolf 
340297efa869SEric Blake     if (bdrv_is_read_only(bs)) {
340397efa869SEric Blake         error_setg(errp, "Image is read-only");
340497efa869SEric Blake         return -EACCES;
340597efa869SEric Blake     }
340697efa869SEric Blake 
34071bc5f09fSKevin Wolf     if (offset > old_size) {
34081bc5f09fSKevin Wolf         new_bytes = offset - old_size;
34091bc5f09fSKevin Wolf     } else {
34101bc5f09fSKevin Wolf         new_bytes = 0;
34111bc5f09fSKevin Wolf     }
34121bc5f09fSKevin Wolf 
34133d9f2d2aSKevin Wolf     bdrv_inc_in_flight(bs);
34145416a11eSFam Zheng     tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
34155416a11eSFam Zheng                           BDRV_TRACKED_TRUNCATE);
34161bc5f09fSKevin Wolf 
34171bc5f09fSKevin Wolf     /* If we are growing the image and potentially using preallocation for the
34181bc5f09fSKevin Wolf      * new area, we need to make sure that no write requests are made to it
34191bc5f09fSKevin Wolf      * concurrently or they might be overwritten by preallocation. */
34201bc5f09fSKevin Wolf     if (new_bytes) {
34218ac5aab2SVladimir Sementsov-Ogievskiy         bdrv_make_request_serialising(&req, 1);
3422cd47d792SFam Zheng     }
3423cd47d792SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3424cd47d792SFam Zheng                                     0);
3425cd47d792SFam Zheng     if (ret < 0) {
3426cd47d792SFam Zheng         error_setg_errno(errp, -ret,
3427cd47d792SFam Zheng                          "Failed to prepare request for truncation");
3428cd47d792SFam Zheng         goto out;
34291bc5f09fSKevin Wolf     }
34303d9f2d2aSKevin Wolf 
343193393e69SMax Reitz     filtered = bdrv_filter_child(bs);
343223b93525SMax Reitz     backing = bdrv_cow_child(bs);
343393393e69SMax Reitz 
3434955c7d66SKevin Wolf     /*
3435955c7d66SKevin Wolf      * If the image has a backing file that is large enough that it would
3436955c7d66SKevin Wolf      * provide data for the new area, we cannot leave it unallocated because
3437955c7d66SKevin Wolf      * then the backing file content would become visible. Instead, zero-fill
3438955c7d66SKevin Wolf      * the new area.
3439955c7d66SKevin Wolf      *
3440955c7d66SKevin Wolf      * Note that if the image has a backing file, but was opened without the
3441955c7d66SKevin Wolf      * backing file, taking care of keeping things consistent with that backing
3442955c7d66SKevin Wolf      * file is the user's responsibility.
3443955c7d66SKevin Wolf      */
344423b93525SMax Reitz     if (new_bytes && backing) {
3445955c7d66SKevin Wolf         int64_t backing_len;
3446955c7d66SKevin Wolf 
3447bd53086eSEmanuele Giuseppe Esposito         backing_len = bdrv_co_getlength(backing->bs);
3448955c7d66SKevin Wolf         if (backing_len < 0) {
3449955c7d66SKevin Wolf             ret = backing_len;
3450955c7d66SKevin Wolf             error_setg_errno(errp, -ret, "Could not get backing file size");
3451955c7d66SKevin Wolf             goto out;
3452955c7d66SKevin Wolf         }
3453955c7d66SKevin Wolf 
3454955c7d66SKevin Wolf         if (backing_len > old_size) {
3455955c7d66SKevin Wolf             flags |= BDRV_REQ_ZERO_WRITE;
3456955c7d66SKevin Wolf         }
3457955c7d66SKevin Wolf     }
3458955c7d66SKevin Wolf 
34596b7e8f8bSMax Reitz     if (drv->bdrv_co_truncate) {
346092b92799SKevin Wolf         if (flags & ~bs->supported_truncate_flags) {
346192b92799SKevin Wolf             error_setg(errp, "Block driver does not support requested flags");
346292b92799SKevin Wolf             ret = -ENOTSUP;
346392b92799SKevin Wolf             goto out;
346492b92799SKevin Wolf         }
346592b92799SKevin Wolf         ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
346693393e69SMax Reitz     } else if (filtered) {
346793393e69SMax Reitz         ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
34686b7e8f8bSMax Reitz     } else {
34693d9f2d2aSKevin Wolf         error_setg(errp, "Image format driver does not support resize");
34703d9f2d2aSKevin Wolf         ret = -ENOTSUP;
34713d9f2d2aSKevin Wolf         goto out;
34723d9f2d2aSKevin Wolf     }
34733d9f2d2aSKevin Wolf     if (ret < 0) {
34743d9f2d2aSKevin Wolf         goto out;
34753d9f2d2aSKevin Wolf     }
34766b7e8f8bSMax Reitz 
3477bd53086eSEmanuele Giuseppe Esposito     ret = bdrv_co_refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
34783d9f2d2aSKevin Wolf     if (ret < 0) {
34793d9f2d2aSKevin Wolf         error_setg_errno(errp, -ret, "Could not refresh total sector count");
34803d9f2d2aSKevin Wolf     } else {
34813d9f2d2aSKevin Wolf         offset = bs->total_sectors * BDRV_SECTOR_SIZE;
34823d9f2d2aSKevin Wolf     }
3483c057960cSEmanuele Giuseppe Esposito     /*
3484c057960cSEmanuele Giuseppe Esposito      * It's possible that truncation succeeded while bdrv_refresh_total_sectors
3485cd47d792SFam Zheng      * failed; the latter doesn't affect how we should finish the request.
3486c057960cSEmanuele Giuseppe Esposito      * Pass 0 as the last parameter so that dirty bitmaps etc. are handled.
3487c057960cSEmanuele Giuseppe Esposito      */
3488cd47d792SFam Zheng     bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
34893d9f2d2aSKevin Wolf 
34903d9f2d2aSKevin Wolf out:
34911bc5f09fSKevin Wolf     tracked_request_end(&req);
34923d9f2d2aSKevin Wolf     bdrv_dec_in_flight(bs);
34931bc5f09fSKevin Wolf 
34943d9f2d2aSKevin Wolf     return ret;
34953d9f2d2aSKevin Wolf }
3496bd54669aSVladimir Sementsov-Ogievskiy 
3497bd54669aSVladimir Sementsov-Ogievskiy void bdrv_cancel_in_flight(BlockDriverState *bs)
3498bd54669aSVladimir Sementsov-Ogievskiy {
3499f791bf7fSEmanuele Giuseppe Esposito     GLOBAL_STATE_CODE();
3500bd54669aSVladimir Sementsov-Ogievskiy     if (!bs || !bs->drv) {
3501bd54669aSVladimir Sementsov-Ogievskiy         return;
3502bd54669aSVladimir Sementsov-Ogievskiy     }
3503bd54669aSVladimir Sementsov-Ogievskiy 
3504bd54669aSVladimir Sementsov-Ogievskiy     if (bs->drv->bdrv_cancel_in_flight) {
3505bd54669aSVladimir Sementsov-Ogievskiy         bs->drv->bdrv_cancel_in_flight(bs);
3506bd54669aSVladimir Sementsov-Ogievskiy     }
3507bd54669aSVladimir Sementsov-Ogievskiy }
3508ce14f3b4SVladimir Sementsov-Ogievskiy 
3509ce14f3b4SVladimir Sementsov-Ogievskiy int coroutine_fn
3510ce14f3b4SVladimir Sementsov-Ogievskiy bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
3511ce14f3b4SVladimir Sementsov-Ogievskiy                         QEMUIOVector *qiov, size_t qiov_offset)
3512ce14f3b4SVladimir Sementsov-Ogievskiy {
3513ce14f3b4SVladimir Sementsov-Ogievskiy     BlockDriverState *bs = child->bs;
3514ce14f3b4SVladimir Sementsov-Ogievskiy     BlockDriver *drv = bs->drv;
3515ce14f3b4SVladimir Sementsov-Ogievskiy     int ret;
3516ce14f3b4SVladimir Sementsov-Ogievskiy     IO_CODE();
3517ce14f3b4SVladimir Sementsov-Ogievskiy 
3518ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv) {
3519ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOMEDIUM;
3520ce14f3b4SVladimir Sementsov-Ogievskiy     }
3521ce14f3b4SVladimir Sementsov-Ogievskiy 
3522ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv->bdrv_co_preadv_snapshot) {
3523ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOTSUP;
3524ce14f3b4SVladimir Sementsov-Ogievskiy     }
3525ce14f3b4SVladimir Sementsov-Ogievskiy 
3526ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_inc_in_flight(bs);
3527ce14f3b4SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_preadv_snapshot(bs, offset, bytes, qiov, qiov_offset);
3528ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_dec_in_flight(bs);
3529ce14f3b4SVladimir Sementsov-Ogievskiy 
3530ce14f3b4SVladimir Sementsov-Ogievskiy     return ret;
3531ce14f3b4SVladimir Sementsov-Ogievskiy }
3532ce14f3b4SVladimir Sementsov-Ogievskiy 
3533ce14f3b4SVladimir Sementsov-Ogievskiy int coroutine_fn
3534ce14f3b4SVladimir Sementsov-Ogievskiy bdrv_co_snapshot_block_status(BlockDriverState *bs,
3535ce14f3b4SVladimir Sementsov-Ogievskiy                               bool want_zero, int64_t offset, int64_t bytes,
3536ce14f3b4SVladimir Sementsov-Ogievskiy                               int64_t *pnum, int64_t *map,
3537ce14f3b4SVladimir Sementsov-Ogievskiy                               BlockDriverState **file)
3538ce14f3b4SVladimir Sementsov-Ogievskiy {
3539ce14f3b4SVladimir Sementsov-Ogievskiy     BlockDriver *drv = bs->drv;
3540ce14f3b4SVladimir Sementsov-Ogievskiy     int ret;
3541ce14f3b4SVladimir Sementsov-Ogievskiy     IO_CODE();
3542ce14f3b4SVladimir Sementsov-Ogievskiy 
3543ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv) {
3544ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOMEDIUM;
3545ce14f3b4SVladimir Sementsov-Ogievskiy     }
3546ce14f3b4SVladimir Sementsov-Ogievskiy 
3547ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv->bdrv_co_snapshot_block_status) {
3548ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOTSUP;
3549ce14f3b4SVladimir Sementsov-Ogievskiy     }
3550ce14f3b4SVladimir Sementsov-Ogievskiy 
3551ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_inc_in_flight(bs);
3552ce14f3b4SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
3553ce14f3b4SVladimir Sementsov-Ogievskiy                                              pnum, map, file);
3554ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_dec_in_flight(bs);
3555ce14f3b4SVladimir Sementsov-Ogievskiy 
3556ce14f3b4SVladimir Sementsov-Ogievskiy     return ret;
3557ce14f3b4SVladimir Sementsov-Ogievskiy }
3558ce14f3b4SVladimir Sementsov-Ogievskiy 
3559ce14f3b4SVladimir Sementsov-Ogievskiy int coroutine_fn
3560ce14f3b4SVladimir Sementsov-Ogievskiy bdrv_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
3561ce14f3b4SVladimir Sementsov-Ogievskiy {
3562ce14f3b4SVladimir Sementsov-Ogievskiy     BlockDriver *drv = bs->drv;
3563ce14f3b4SVladimir Sementsov-Ogievskiy     int ret;
3564ce14f3b4SVladimir Sementsov-Ogievskiy     IO_CODE();
3565ce14f3b4SVladimir Sementsov-Ogievskiy 
3566ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv) {
3567ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOMEDIUM;
3568ce14f3b4SVladimir Sementsov-Ogievskiy     }
3569ce14f3b4SVladimir Sementsov-Ogievskiy 
3570ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv->bdrv_co_pdiscard_snapshot) {
3571ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOTSUP;
3572ce14f3b4SVladimir Sementsov-Ogievskiy     }
3573ce14f3b4SVladimir Sementsov-Ogievskiy 
3574ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_inc_in_flight(bs);
3575ce14f3b4SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_pdiscard_snapshot(bs, offset, bytes);
3576ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_dec_in_flight(bs);
3577ce14f3b4SVladimir Sementsov-Ogievskiy 
3578ce14f3b4SVladimir Sementsov-Ogievskiy     return ret;
3579ce14f3b4SVladimir Sementsov-Ogievskiy }
3580
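/*
 * Two closing sketches, both hypothetical rather than users that exist in
 * the tree.  First, a caller of bdrv_co_truncate() above: grow an image to
 * at least 'new_size' bytes, with exact=false so the driver may round the
 * size up, and request that the new tail reads as zeroes; drivers that
 * cannot honour the flag fail with -ENOTSUP via supported_truncate_flags.
 */
static int coroutine_fn example_co_grow(BdrvChild *child, int64_t new_size,
                                        Error **errp)
{
    return bdrv_co_truncate(child, new_size, false, PREALLOC_MODE_OFF,
                            BDRV_REQ_ZERO_WRITE, errp);
}

/*
 * Second, the driver side of the snapshot-access wrappers above: a driver
 * opts in simply by filling in the corresponding BlockDriver callbacks;
 * each wrapper returns -ENOTSUP while its callback is left NULL.
 */
static int coroutine_fn
example_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset,
                             int64_t bytes)
{
    return 0;   /* toy driver: nothing to discard */
}

static BlockDriver bdrv_example_snapshot = {
    .format_name               = "example-snapshot",
    .bdrv_co_pdiscard_snapshot = example_co_pdiscard_snapshot,
};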