/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "block/coroutines.h"
#include "block/write-threshold.h"
#include "qemu/cutils.h"
#include "qemu/memalign.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}
void bdrv_parent_drained_end_single(BdrvChild *c)
{
    IO_OR_GS_CODE();

    assert(c->quiesced_parent);
    c->quiesced_parent = false;

    if (c->klass->drained_end) {
        c->klass->drained_end(c);
    }
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore) {
            continue;
        }
        bdrv_parent_drained_end_single(c);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    AioContext *ctx = bdrv_child_get_parent_aio_context(c);
    IO_OR_GS_CODE();

    assert(!c->quiesced_parent);
    c->quiesced_parent = true;

    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
    if (poll) {
        AIO_WAIT_WHILE(ctx, bdrv_parent_drained_poll_single(c));
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
                                  src->pdiscard_alignment);
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
                                        src->max_hw_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
    dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov);
}
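/*
 * Illustrative note (not part of the original source): limits that are
 * hard caps (max_transfer, max_iov, ...) are merged with MIN_NON_ZERO so
 * that 0 keeps meaning "no limit", while alignment requirements are
 * merged with MAX. For example, assuming hypothetical values:
 *
 *   dst->max_transfer = 0 (unlimited), src->max_transfer = 65536
 *     -> merged max_transfer = 65536 (the only real cap wins)
 *   dst->min_mem_alignment = 512, src->min_mem_alignment = 4096
 *     -> merged min_mem_alignment = 4096 (the stricter requirement wins)
 */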
typedef struct BdrvRefreshLimitsState {
    BlockDriverState *bs;
    BlockLimits old_bl;
} BdrvRefreshLimitsState;

static void bdrv_refresh_limits_abort(void *opaque)
{
    BdrvRefreshLimitsState *s = opaque;

    s->bs->bl = s->old_bl;
}

static TransactionActionDrv bdrv_refresh_limits_drv = {
    .abort = bdrv_refresh_limits_abort,
    .clean = g_free,
};

/* @tran is allowed to be NULL, in this case no rollback is possible. */
void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
{
    ERRP_GUARD();
    BlockDriver *drv = bs->drv;
    BdrvChild *c;
    bool have_limits;

    GLOBAL_STATE_CODE();

    if (tran) {
        BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
        *s = (BdrvRefreshLimitsState) {
            .bs = bs,
            .old_bl = bs->bl,
        };
        tran_add(tran, &bdrv_refresh_limits_drv, s);
    }

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    have_limits = false;
    QLIST_FOREACH(c, &bs->children, next) {
        if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
        {
            bdrv_merge_limits(&bs->bl, &c->bs->bl);
            have_limits = true;
        }
    }

    if (!have_limits) {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
        if (*errp) {
            return;
        }
    }

    if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
        error_setg(errp, "Driver requires too large request alignment");
    }
}
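/*
 * Illustrative sketch (not part of the original source): how a caller can
 * use the @tran parameter above so that the old limits are restored if a
 * larger transactional update fails partway through:
 *
 *   Transaction *tran = tran_new();
 *   bdrv_refresh_limits(bs, tran, &local_err);
 *   ... more transactional actions ...
 *   if (local_err) {
 *       tran_abort(tran);   // bdrv_refresh_limits_abort() restores bs->bl
 *   } else {
 *       tran_commit(tran);
 *   }
 */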
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = qatomic_fetch_dec(&bs->copy_on_read);
    IO_CODE();
    assert(old >= 1);
}
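/*
 * Illustrative usage (not part of the original source), assuming the
 * count starts at 0: every enable must be paired with exactly one
 * disable, and copy-on-read only really turns off once the count drops
 * back to its base value:
 *
 *   bdrv_enable_copy_on_read(bs);    // count 0 -> 1, COR active
 *   bdrv_enable_copy_on_read(bs);    // count 1 -> 2, still active
 *   bdrv_disable_copy_on_read(bs);   // count 2 -> 1, still active
 *   bdrv_disable_copy_on_read(bs);   // count 1 -> 0, COR off again
 */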
typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool poll;
    BdrvChild *parent;
} BdrvCoDrainData;

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, BdrvChild *ignore_parent,
                     bool ignore_bds_parents)
{
    IO_OR_GS_CODE();

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (qatomic_read(&bs->in_flight)) {
        return true;
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        aio_context_acquire(ctx);
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->parent, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->parent);
        }
        aio_context_release(ctx);
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin,
                                                BdrvChild *parent,
                                                bool poll)
{
    BdrvCoDrainData data;
    Coroutine *self = qemu_coroutine_self();
    AioContext *ctx = bdrv_get_aio_context(bs);
    AioContext *co_ctx = qemu_coroutine_get_aio_context(self);

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = self,
        .bs = bs,
        .done = false,
        .begin = begin,
        .parent = parent,
        .poll = poll,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }

    /*
     * Temporarily drop the lock across yield or we would get deadlocks.
     * bdrv_co_drain_bh_cb() reacquires the lock as needed.
     *
     * When we yield below, the lock for the current context will be
     * released, so if this is actually the lock that protects bs, don't drop
     * it a second time.
     */
    if (ctx != co_ctx) {
        aio_context_release(ctx);
    }
    replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);

    /* Reacquire the AioContext of bs if we dropped it */
    if (ctx != co_ctx) {
        aio_context_acquire(ctx);
    }
}
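/*
 * Illustrative summary (not part of the original source) of the pattern
 * used above to move work out of coroutine context: schedule a bottom
 * half, yield, and let the BH wake the coroutine when it is done.
 * my_bh_cb and data are hypothetical names for this sketch:
 *
 *   data.done = false;
 *   aio_bh_schedule_oneshot(ctx, my_bh_cb, &data);  // BH runs outside
 *   qemu_coroutine_yield();                         // coroutine sleeps
 *   // my_bh_cb() ends with: data->done = true; aio_co_wake(data->co);
 *   assert(data.done);
 */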
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, BdrvChild *parent)
{
    IO_OR_GS_CODE();
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
        bdrv_parent_drained_begin(bs, parent);
        if (bs->drv && bs->drv->bdrv_drain_begin) {
            bs->drv->bdrv_drain_begin(bs);
        }
    }
}

static void bdrv_do_drained_begin(BlockDriverState *bs, BdrvChild *parent,
                                  bool poll)
{
    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, parent, poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent);

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_do_drained_begin(bs, NULL, true);
}
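/*
 * Illustrative usage (not part of the original source): the canonical
 * drained section. While it is open, parents submit no new requests and
 * all in-flight requests on @bs have completed:
 *
 *   bdrv_drained_begin(bs);
 *   ... modify the graph or other state that must not race with I/O ...
 *   bdrv_drained_end(bs);
 */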
/**
 * This function does not poll, nor must any of its recursively called
 * functions.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent)
{
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, parent, false);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        if (bs->drv && bs->drv->bdrv_drain_end) {
            bs->drv->bdrv_drain_end(bs);
        }
        bdrv_parent_drained_end(bs, parent);
        aio_enable_external(bdrv_get_aio_context(bs));
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_do_drained_end(bs, NULL);
}

void bdrv_drain(BlockDriverState *bs)
{
    IO_OR_GS_CODE();
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(qatomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;
    GLOBAL_STATE_CODE();

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}
It must 469c0778f66SAlberto Garcia * be paired with bdrv_drain_all_end(). 470c0778f66SAlberto Garcia * 471c0778f66SAlberto Garcia * NOTE: no new block jobs or BlockDriverStates can be created between 472c0778f66SAlberto Garcia * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls. 47361007b31SStefan Hajnoczi */ 474c0778f66SAlberto Garcia void bdrv_drain_all_begin(void) 47561007b31SStefan Hajnoczi { 4760f12264eSKevin Wolf BlockDriverState *bs = NULL; 477f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 47861007b31SStefan Hajnoczi 479c8ca33d0SKevin Wolf if (qemu_in_coroutine()) { 480*a82a3bd1SKevin Wolf bdrv_co_yield_to_drain(NULL, true, NULL, true); 481c8ca33d0SKevin Wolf return; 482c8ca33d0SKevin Wolf } 483c8ca33d0SKevin Wolf 484c8aa7895SPavel Dovgalyuk /* 485c8aa7895SPavel Dovgalyuk * bdrv queue is managed by record/replay, 486c8aa7895SPavel Dovgalyuk * waiting for finishing the I/O requests may 487c8aa7895SPavel Dovgalyuk * be infinite 488c8aa7895SPavel Dovgalyuk */ 489c8aa7895SPavel Dovgalyuk if (replay_events_enabled()) { 490c8aa7895SPavel Dovgalyuk return; 491c8aa7895SPavel Dovgalyuk } 492c8aa7895SPavel Dovgalyuk 4930f12264eSKevin Wolf /* AIO_WAIT_WHILE() with a NULL context can only be called from the main 4940f12264eSKevin Wolf * loop AioContext, so make sure we're in the main context. */ 4959a7e86c8SKevin Wolf assert(qemu_get_current_aio_context() == qemu_get_aio_context()); 4960f12264eSKevin Wolf assert(bdrv_drain_all_count < INT_MAX); 4970f12264eSKevin Wolf bdrv_drain_all_count++; 4989a7e86c8SKevin Wolf 4990f12264eSKevin Wolf /* Quiesce all nodes, without polling in-flight requests yet. The graph 5000f12264eSKevin Wolf * cannot change during this loop. */ 5010f12264eSKevin Wolf while ((bs = bdrv_next_all_states(bs))) { 50261007b31SStefan Hajnoczi AioContext *aio_context = bdrv_get_aio_context(bs); 50361007b31SStefan Hajnoczi 50461007b31SStefan Hajnoczi aio_context_acquire(aio_context); 505*a82a3bd1SKevin Wolf bdrv_do_drained_begin(bs, NULL, false); 50661007b31SStefan Hajnoczi aio_context_release(aio_context); 50761007b31SStefan Hajnoczi } 50861007b31SStefan Hajnoczi 5090f12264eSKevin Wolf /* Now poll the in-flight requests */ 510cfe29d82SKevin Wolf AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll()); 5110f12264eSKevin Wolf 5120f12264eSKevin Wolf while ((bs = bdrv_next_all_states(bs))) { 513c13ad59fSKevin Wolf bdrv_drain_assert_idle(bs); 514f406c03cSAlexander Yarygin } 515f406c03cSAlexander Yarygin } 516c0778f66SAlberto Garcia 5171a6d3bd2SGreg Kurz void bdrv_drain_all_end_quiesce(BlockDriverState *bs) 5181a6d3bd2SGreg Kurz { 519b4ad82aaSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 5201a6d3bd2SGreg Kurz 5211a6d3bd2SGreg Kurz g_assert(bs->quiesce_counter > 0); 5221a6d3bd2SGreg Kurz g_assert(!bs->refcnt); 5231a6d3bd2SGreg Kurz 5241a6d3bd2SGreg Kurz while (bs->quiesce_counter) { 525*a82a3bd1SKevin Wolf bdrv_do_drained_end(bs, NULL); 5261a6d3bd2SGreg Kurz } 5271a6d3bd2SGreg Kurz } 5281a6d3bd2SGreg Kurz 529c0778f66SAlberto Garcia void bdrv_drain_all_end(void) 530c0778f66SAlberto Garcia { 5310f12264eSKevin Wolf BlockDriverState *bs = NULL; 532f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 533c0778f66SAlberto Garcia 534c8aa7895SPavel Dovgalyuk /* 535c8aa7895SPavel Dovgalyuk * bdrv queue is managed by record/replay, 536c8aa7895SPavel Dovgalyuk * waiting for finishing the I/O requests may 537c8aa7895SPavel Dovgalyuk * be endless 538c8aa7895SPavel Dovgalyuk */ 539c8aa7895SPavel Dovgalyuk if (replay_events_enabled()) { 540c8aa7895SPavel Dovgalyuk return; 541c8aa7895SPavel 
void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    GLOBAL_STATE_CODE();

    /*
     * The bdrv queue is managed by record/replay; waiting for the I/O
     * requests to finish could take forever
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, NULL);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    GLOBAL_STATE_CODE();
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void coroutine_fn tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        qatomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void coroutine_fn tracked_request_begin(BdrvTrackedRequest *req,
                                               BlockDriverState *bs,
                                               int64_t offset,
                                               int64_t bytes,
                                               enum BdrvTrackedRequestType type)
{
    bdrv_check_request(offset, bytes, &error_abort);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, int64_t bytes)
{
    bdrv_check_request(offset, bytes, &error_abort);

    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
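/*
 * Worked example (not part of the original source): with
 * req->overlap_offset = 4096 and req->overlap_bytes = 4096, the request
 * covers [4096, 8192). A query for offset = 8192, bytes = 512 starts
 * exactly at the end of that range, so the first check fires and there
 * is no overlap; a query for offset = 8191, bytes = 2 shares byte 8191
 * with the request and overlaps.
 */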
/* Called with self->bs->reqs_lock held */
static coroutine_fn BdrvTrackedRequest *
bdrv_find_conflicting_request(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;

    QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
        if (req == self || (!req->serialising && !self->serialising)) {
            continue;
        }
        if (tracked_request_overlaps(req, self->overlap_offset,
                                     self->overlap_bytes))
        {
            /*
             * Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests. This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            /*
             * If the request is already (indirectly) waiting for us, or
             * will wait for us as soon as it wakes up, then just go on
             * (instead of producing a deadlock in the former case).
             */
            if (!req->waiting_for) {
                return req;
            }
        }
    }

    return NULL;
}
/* Called with self->bs->reqs_lock held */
static void coroutine_fn
bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;

    while ((req = bdrv_find_conflicting_request(self))) {
        self->waiting_for = req;
        qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
        self->waiting_for = NULL;
    }
}

/* Called with req->bs->reqs_lock held */
static void tracked_request_set_serialising(BdrvTrackedRequest *req,
                                            uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    int64_t overlap_bytes =
        ROUND_UP(req->offset + req->bytes, align) - overlap_offset;

    bdrv_check_request(req->offset, req->bytes, &error_abort);

    if (!req->serialising) {
        qatomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
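/*
 * Worked example (not part of the original source): for req->offset =
 * 4097, req->bytes = 512 and align = 4096 (a power of two):
 *
 *   overlap_offset = 4097 & ~4095                 = 4096
 *   overlap_bytes  = ROUND_UP(4609, 4096) - 4096  = 8192 - 4096 = 4096
 *
 * i.e. the serialising range is widened to the containing aligned window
 * [4096, 8192), and it can only ever grow across repeated calls.
 */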
/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();
    IO_CODE();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;
    IO_CODE();
    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
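/*
 * Worked example (not part of the original source): with a 65536-byte
 * cluster size, offset = 65540 and bytes = 100 round out to
 *
 *   *cluster_offset = QEMU_ALIGN_DOWN(65540, 65536)  = 65536
 *   *cluster_bytes  = QEMU_ALIGN_UP(4 + 100, 65536)  = 65536
 *
 * i.e. the region [65540, 65640) grows to the single containing cluster
 * [65536, 131072).
 */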
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    IO_CODE();
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    IO_CODE();
    qatomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static void coroutine_fn
bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;

    if (!qatomic_read(&bs->serialising_in_flight)) {
        return;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bdrv_wait_serialising_requests_locked(self);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

void coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
                                                uint64_t align)
{
    IO_CODE();

    qemu_co_mutex_lock(&req->bs->reqs_lock);

    tracked_request_set_serialising(req, align);
    bdrv_wait_serialising_requests_locked(req);

    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
                            QEMUIOVector *qiov, size_t qiov_offset,
                            Error **errp)
{
    /*
     * Check generic offset/bytes correctness
     */

    if (offset < 0) {
        error_setg(errp, "offset is negative: %" PRIi64, offset);
        return -EIO;
    }

    if (bytes < 0) {
        error_setg(errp, "bytes is negative: %" PRIi64, bytes);
        return -EIO;
    }

    if (bytes > BDRV_MAX_LENGTH) {
        error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   bytes, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH) {
        error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   offset, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH - bytes) {
        error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
                   "exceeds maximum(%" PRIi64 ")", offset, bytes,
                   BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (!qiov) {
        return 0;
    }

    /*
     * Check qiov and qiov_offset
     */

    if (qiov_offset > qiov->size) {
        error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
                   qiov_offset, qiov->size);
        return -EIO;
    }

    if (bytes > qiov->size - qiov_offset) {
        error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
                   "vector size(%zu)", bytes, qiov_offset, qiov->size);
        return -EIO;
    }

    return 0;
}
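/*
 * Illustrative results (not part of the original source), assuming a
 * @qiov of total size 4096:
 *
 *   bdrv_check_qiov_request(-1,  512, qiov,    0, errp) -> -EIO (negative
 *                                                          offset)
 *   bdrv_check_qiov_request( 0, 4096, qiov, 1024, errp) -> -EIO (overflows
 *                                                          the io vector)
 *   bdrv_check_qiov_request( 0, 4096, qiov,    0, errp) -> 0
 */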
int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
{
    return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
}

static int bdrv_check_request32(int64_t offset, int64_t bytes,
                                QEMUIOVector *qiov, size_t qiov_offset)
{
    int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
    if (ret < 0) {
        return ret;
    }

    if (bytes > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    return 0;
}
/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;
    IO_CODE();

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}
/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int coroutine_fn bdrv_co_pwrite_sync(BdrvChild *child, int64_t offset,
                                     int64_t bytes, const void *buf,
                                     BdrvRequestFlags flags)
{
    int ret;
    IO_CODE();

    ret = bdrv_co_pwrite(child, offset, bytes, buf, flags);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_co_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
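/*
 * Illustrative pattern (not part of the original source) for how the
 * callers below bridge a callback-based .bdrv_aio_*() driver interface
 * into a coroutine: submit the request with bdrv_co_io_em_complete() as
 * the completion callback, yield, and read the result once woken up:
 *
 *   CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self() };
 *   acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
 *                              bdrv_co_io_em_complete, &co);
 *   if (acb == NULL) {
 *       return -EIO;            // submission failed
 *   }
 *   qemu_coroutine_yield();     // woken by bdrv_co_io_em_complete()
 *   return co.ret;
 */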
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           int64_t offset, int64_t bytes,
                                           QEMUIOVector *qiov,
                                           size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
    assert(!(flags & ~bs->supported_read_flags));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv_part) {
        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_preadv) {
        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
        goto out;
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
            goto out;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
            goto out;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}
static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            int64_t offset, int64_t bytes,
                                            QEMUIOVector *qiov,
                                            size_t qiov_offset,
                                            BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    bool emulate_fua = false;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if ((flags & BDRV_REQ_FUA) &&
        (~bs->supported_write_flags & BDRV_REQ_FUA)) {
        flags &= ~BDRV_REQ_FUA;
        emulate_fua = true;
    }

    flags &= bs->supported_write_flags;

    if (drv->bdrv_co_pwritev_part) {
        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
        goto emulate_flags;
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags);
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags,
                                    bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags);

emulate_flags:
    if (ret == 0 && emulate_fua) {
        ret = bdrv_co_flush(bs);
    }

    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset,
                               int64_t bytes, QEMUIOVector *qiov,
                               size_t qiov_offset)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector local_qiov;
    int ret;

    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!block_driver_can_compress(drv)) {
        return -ENOTSUP;
    }

    if (drv->bdrv_co_pwritev_compressed_part) {
        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
                                                    qiov, qiov_offset);
    }

    if (qiov_offset == 0) {
        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
    }

    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
    qemu_iovec_destroy(&local_qiov);

    return ret;
}
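/*
 * Illustrative note (not part of the original source) on the slicing
 * fallback shared by the three driver wrappers above: when a driver lacks
 * a "..._part" callback that understands @qiov_offset, a temporary slice
 * of the caller's vector is built and must be destroyed again afterwards:
 *
 *   QEMUIOVector local_qiov;
 *   qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
 *   ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
 *   qemu_iovec_destroy(&local_qiov);
 */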
Butsykin } 112829a298afSPavel Butsykin 1129ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1130ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov); 1131ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1132ac850bf0SVladimir Sementsov-Ogievskiy 1133ac850bf0SVladimir Sementsov-Ogievskiy return ret; 1134ac850bf0SVladimir Sementsov-Ogievskiy } 1135ac850bf0SVladimir Sementsov-Ogievskiy 113685c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 11379df5afbdSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, QEMUIOVector *qiov, 11381143ec5eSVladimir Sementsov-Ogievskiy size_t qiov_offset, int flags) 113961007b31SStefan Hajnoczi { 114085c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 114185c97ca7SKevin Wolf 114261007b31SStefan Hajnoczi /* Perform I/O through a temporary buffer so that users who scribble over 114361007b31SStefan Hajnoczi * their read buffer while the operation is in progress do not end up 114461007b31SStefan Hajnoczi * modifying the image file. This is critical for zero-copy guest I/O 114561007b31SStefan Hajnoczi * where anything might happen inside guest memory. 114661007b31SStefan Hajnoczi */ 11472275cc90SVladimir Sementsov-Ogievskiy void *bounce_buffer = NULL; 114861007b31SStefan Hajnoczi 114961007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 1150244483e6SKevin Wolf int64_t cluster_offset; 11517cfd5275SEric Blake int64_t cluster_bytes; 11529df5afbdSVladimir Sementsov-Ogievskiy int64_t skip_bytes; 115361007b31SStefan Hajnoczi int ret; 1154cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1155cb2e2878SEric Blake BDRV_REQUEST_MAX_BYTES); 11569df5afbdSVladimir Sementsov-Ogievskiy int64_t progress = 0; 11578644476eSMax Reitz bool skip_write; 115861007b31SStefan Hajnoczi 11599df5afbdSVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 11609df5afbdSVladimir Sementsov-Ogievskiy 1161d470ad42SMax Reitz if (!drv) { 1162d470ad42SMax Reitz return -ENOMEDIUM; 1163d470ad42SMax Reitz } 1164d470ad42SMax Reitz 11658644476eSMax Reitz /* 11668644476eSMax Reitz * Do not write anything when the BDS is inactive. That is not 11678644476eSMax Reitz * allowed, and it would not help. 11688644476eSMax Reitz */ 11698644476eSMax Reitz skip_write = (bs->open_flags & BDRV_O_INACTIVE); 11708644476eSMax Reitz 11711bf03e66SKevin Wolf /* FIXME We cannot require callers to have write permissions when all they 11721bf03e66SKevin Wolf * are doing is a read request. If we did things right, write permissions 11731bf03e66SKevin Wolf * would be obtained anyway, but internally by the copy-on-read code. However, 1174765d9df9SEric Blake * as long as it is implemented here rather than in a separate filter driver, 11751bf03e66SKevin Wolf * the copy-on-read code doesn't have its own BdrvChild for which 11761bf03e66SKevin Wolf * it could request permissions. Therefore we have to bypass the permission 11771bf03e66SKevin Wolf * system for the moment. */ 11781bf03e66SKevin Wolf // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1179afa4b293SKevin Wolf 118061007b31SStefan Hajnoczi /* Cover the entire cluster so no additional backing file I/O is required when 1181cb2e2878SEric Blake * allocating a cluster in the image file. 
Note that this value may exceed 1182cb2e2878SEric Blake * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which 1183cb2e2878SEric Blake * is one reason we loop rather than doing it all at once. 118461007b31SStefan Hajnoczi */ 1185244483e6SKevin Wolf bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1186cb2e2878SEric Blake skip_bytes = offset - cluster_offset; 118761007b31SStefan Hajnoczi 1188244483e6SKevin Wolf trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1189244483e6SKevin Wolf cluster_offset, cluster_bytes); 119061007b31SStefan Hajnoczi 1191cb2e2878SEric Blake while (cluster_bytes) { 1192cb2e2878SEric Blake int64_t pnum; 119361007b31SStefan Hajnoczi 11948644476eSMax Reitz if (skip_write) { 11958644476eSMax Reitz ret = 1; /* "already allocated", so nothing will be copied */ 11968644476eSMax Reitz pnum = MIN(cluster_bytes, max_transfer); 11978644476eSMax Reitz } else { 1198cb2e2878SEric Blake ret = bdrv_is_allocated(bs, cluster_offset, 1199cb2e2878SEric Blake MIN(cluster_bytes, max_transfer), &pnum); 1200cb2e2878SEric Blake if (ret < 0) { 12018644476eSMax Reitz /* 12028644476eSMax Reitz * Safe to treat errors in querying allocation as if 1203cb2e2878SEric Blake * unallocated; we'll probably fail again soon on the 1204cb2e2878SEric Blake * read, but at least that will set a decent errno. 1205cb2e2878SEric Blake */ 1206cb2e2878SEric Blake pnum = MIN(cluster_bytes, max_transfer); 1207cb2e2878SEric Blake } 1208cb2e2878SEric Blake 1209b0ddcbbbSKevin Wolf /* Stop at EOF if the image ends in the middle of the cluster */ 1210b0ddcbbbSKevin Wolf if (ret == 0 && pnum == 0) { 1211b0ddcbbbSKevin Wolf assert(progress >= bytes); 1212b0ddcbbbSKevin Wolf break; 1213b0ddcbbbSKevin Wolf } 1214b0ddcbbbSKevin Wolf 1215cb2e2878SEric Blake assert(skip_bytes < pnum); 12168644476eSMax Reitz } 1217cb2e2878SEric Blake 1218cb2e2878SEric Blake if (ret <= 0) { 12191143ec5eSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 12201143ec5eSVladimir Sementsov-Ogievskiy 1221cb2e2878SEric Blake /* Must copy-on-read; use the bounce buffer */ 12220d93ed08SVladimir Sementsov-Ogievskiy pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 12232275cc90SVladimir Sementsov-Ogievskiy if (!bounce_buffer) { 12242275cc90SVladimir Sementsov-Ogievskiy int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); 12252275cc90SVladimir Sementsov-Ogievskiy int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); 12262275cc90SVladimir Sementsov-Ogievskiy int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); 12272275cc90SVladimir Sementsov-Ogievskiy 12282275cc90SVladimir Sementsov-Ogievskiy bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len); 12292275cc90SVladimir Sementsov-Ogievskiy if (!bounce_buffer) { 12302275cc90SVladimir Sementsov-Ogievskiy ret = -ENOMEM; 12312275cc90SVladimir Sementsov-Ogievskiy goto err; 12322275cc90SVladimir Sementsov-Ogievskiy } 12332275cc90SVladimir Sementsov-Ogievskiy } 12340d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); 1235cb2e2878SEric Blake 1236cb2e2878SEric Blake ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1237ac850bf0SVladimir Sementsov-Ogievskiy &local_qiov, 0, 0); 123861007b31SStefan Hajnoczi if (ret < 0) { 123961007b31SStefan Hajnoczi goto err; 124061007b31SStefan Hajnoczi } 124161007b31SStefan Hajnoczi 1242d855ebcdSEric Blake bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1243c1499a5eSEric Blake if (drv->bdrv_co_pwrite_zeroes && 1244cb2e2878SEric Blake buffer_is_zero(bounce_buffer, pnum)) { 1245a604fa2bSEric Blake /* 
FIXME: Should we (perhaps conditionally) be setting 1246a604fa2bSEric Blake * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1247a604fa2bSEric Blake * that still correctly reads as zero? */ 12487adcf59fSMax Reitz ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 12497adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 125061007b31SStefan Hajnoczi } else { 1251cb2e2878SEric Blake /* This does not change the data on the disk, so it is not 1252cb2e2878SEric Blake * necessary to flush even in cache=writethrough mode. 125361007b31SStefan Hajnoczi */ 1254cb2e2878SEric Blake ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 1255ac850bf0SVladimir Sementsov-Ogievskiy &local_qiov, 0, 12567adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 125761007b31SStefan Hajnoczi } 125861007b31SStefan Hajnoczi 125961007b31SStefan Hajnoczi if (ret < 0) { 1260cb2e2878SEric Blake /* It might be okay to ignore write errors for guest 1261cb2e2878SEric Blake * requests. If this is a deliberate copy-on-read 1262cb2e2878SEric Blake * then we don't want to ignore the error. Simply 1263cb2e2878SEric Blake * report it in all cases. 126461007b31SStefan Hajnoczi */ 126561007b31SStefan Hajnoczi goto err; 126661007b31SStefan Hajnoczi } 126761007b31SStefan Hajnoczi 12683299e5ecSVladimir Sementsov-Ogievskiy if (!(flags & BDRV_REQ_PREFETCH)) { 12691143ec5eSVladimir Sementsov-Ogievskiy qemu_iovec_from_buf(qiov, qiov_offset + progress, 12701143ec5eSVladimir Sementsov-Ogievskiy bounce_buffer + skip_bytes, 12714ab78b19SVladimir Sementsov-Ogievskiy MIN(pnum - skip_bytes, bytes - progress)); 12723299e5ecSVladimir Sementsov-Ogievskiy } 12733299e5ecSVladimir Sementsov-Ogievskiy } else if (!(flags & BDRV_REQ_PREFETCH)) { 1274cb2e2878SEric Blake /* Read directly into the destination */ 12751143ec5eSVladimir Sementsov-Ogievskiy ret = bdrv_driver_preadv(bs, offset + progress, 12761143ec5eSVladimir Sementsov-Ogievskiy MIN(pnum - skip_bytes, bytes - progress), 12771143ec5eSVladimir Sementsov-Ogievskiy qiov, qiov_offset + progress, 0); 1278cb2e2878SEric Blake if (ret < 0) { 1279cb2e2878SEric Blake goto err; 1280cb2e2878SEric Blake } 1281cb2e2878SEric Blake } 1282cb2e2878SEric Blake 1283cb2e2878SEric Blake cluster_offset += pnum; 1284cb2e2878SEric Blake cluster_bytes -= pnum; 1285cb2e2878SEric Blake progress += pnum - skip_bytes; 1286cb2e2878SEric Blake skip_bytes = 0; 1287cb2e2878SEric Blake } 1288cb2e2878SEric Blake ret = 0; 128961007b31SStefan Hajnoczi 129061007b31SStefan Hajnoczi err: 129161007b31SStefan Hajnoczi qemu_vfree(bounce_buffer); 129261007b31SStefan Hajnoczi return ret; 129361007b31SStefan Hajnoczi } 129461007b31SStefan Hajnoczi 129561007b31SStefan Hajnoczi /* 129661007b31SStefan Hajnoczi * Forwards an already correctly aligned request to the BlockDriver. This 12971a62d0acSEric Blake * handles copy on read, zeroing after EOF, and fragmentation of large 12981a62d0acSEric Blake * reads; any other features must be implemented by the caller. 
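 *
 * Illustrative example (hypothetical limits, not from the original comment):
 * with bs->bl.max_transfer == 64 KiB, an aligned 200 KiB read is forwarded
 * to the driver as 64 + 64 + 64 + 8 KiB pieces, and any part of the request
 * that lies past end-of-file is zeroed directly in the destination qiov
 * instead of being sent to the driver.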
129961007b31SStefan Hajnoczi */ 130085c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 13018b0c5d76SVladimir Sementsov-Ogievskiy BdrvTrackedRequest *req, int64_t offset, int64_t bytes, 130265cd4424SVladimir Sementsov-Ogievskiy int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 130361007b31SStefan Hajnoczi { 130485c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 1305c9d20029SKevin Wolf int64_t total_bytes, max_bytes; 13061a62d0acSEric Blake int ret = 0; 13078b0c5d76SVladimir Sementsov-Ogievskiy int64_t bytes_remaining = bytes; 13081a62d0acSEric Blake int max_transfer; 130961007b31SStefan Hajnoczi 13108b0c5d76SVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 131149c07526SKevin Wolf assert(is_power_of_2(align)); 131249c07526SKevin Wolf assert((offset & (align - 1)) == 0); 131349c07526SKevin Wolf assert((bytes & (align - 1)) == 0); 1314abb06c5aSDaniel P. Berrange assert((bs->open_flags & BDRV_O_NO_IO) == 0); 13151a62d0acSEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 13161a62d0acSEric Blake align); 1317a604fa2bSEric Blake 1318e8b65355SStefan Hajnoczi /* 1319e8b65355SStefan Hajnoczi * TODO: We would need a per-BDS .supported_read_flags and 1320a604fa2bSEric Blake * potential fallback support, if we ever implement any read flags 1321a604fa2bSEric Blake * to pass through to drivers. For now, there aren't any 1322e8b65355SStefan Hajnoczi * passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint. 1323e8b65355SStefan Hajnoczi */ 1324e8b65355SStefan Hajnoczi assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH | 1325e8b65355SStefan Hajnoczi BDRV_REQ_REGISTERED_BUF))); 132661007b31SStefan Hajnoczi 132761007b31SStefan Hajnoczi /* Handle Copy on Read and associated serialisation */ 132861007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 132961007b31SStefan Hajnoczi /* If we touch the same cluster it counts as an overlap. This 133061007b31SStefan Hajnoczi * guarantees that allocating writes will be serialized and not race 133161007b31SStefan Hajnoczi * with each other for the same cluster. For example, in copy-on-read 133261007b31SStefan Hajnoczi * it ensures that the CoR read and write operations are atomic and 133361007b31SStefan Hajnoczi * guest writes cannot interleave between them. 
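 *
 * For example (illustrative cluster size): with 64 KiB clusters, two 4 KiB
 * copy-on-read requests landing in the same cluster are both widened to
 * that cluster by bdrv_make_request_serialising() below, so the second one
 * waits until the first has finished its read and write-back.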
*/ 13348ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); 133518fbd0deSPaolo Bonzini } else { 1336304d9d7fSMax Reitz bdrv_wait_serialising_requests(req); 133718fbd0deSPaolo Bonzini } 133861007b31SStefan Hajnoczi 133961007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 1340d6a644bbSEric Blake int64_t pnum; 134161007b31SStefan Hajnoczi 1342897dd0ecSAndrey Shinkevich /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */ 1343897dd0ecSAndrey Shinkevich flags &= ~BDRV_REQ_COPY_ON_READ; 1344897dd0ecSAndrey Shinkevich 134588e63df2SEric Blake ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 134661007b31SStefan Hajnoczi if (ret < 0) { 134761007b31SStefan Hajnoczi goto out; 134861007b31SStefan Hajnoczi } 134961007b31SStefan Hajnoczi 135088e63df2SEric Blake if (!ret || pnum != bytes) { 135165cd4424SVladimir Sementsov-Ogievskiy ret = bdrv_co_do_copy_on_readv(child, offset, bytes, 135265cd4424SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 13533299e5ecSVladimir Sementsov-Ogievskiy goto out; 13543299e5ecSVladimir Sementsov-Ogievskiy } else if (flags & BDRV_REQ_PREFETCH) { 135561007b31SStefan Hajnoczi goto out; 135661007b31SStefan Hajnoczi } 135761007b31SStefan Hajnoczi } 135861007b31SStefan Hajnoczi 13591a62d0acSEric Blake /* Forward the request to the BlockDriver, possibly fragmenting it */ 136049c07526SKevin Wolf total_bytes = bdrv_getlength(bs); 136149c07526SKevin Wolf if (total_bytes < 0) { 136249c07526SKevin Wolf ret = total_bytes; 136361007b31SStefan Hajnoczi goto out; 136461007b31SStefan Hajnoczi } 136561007b31SStefan Hajnoczi 1366e8b65355SStefan Hajnoczi assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF))); 1367897dd0ecSAndrey Shinkevich 136849c07526SKevin Wolf max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 13691a62d0acSEric Blake if (bytes <= max_bytes && bytes <= max_transfer) { 1370897dd0ecSAndrey Shinkevich ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags); 13711a62d0acSEric Blake goto out; 137261007b31SStefan Hajnoczi } 137361007b31SStefan Hajnoczi 13741a62d0acSEric Blake while (bytes_remaining) { 13758b0c5d76SVladimir Sementsov-Ogievskiy int64_t num; 13761a62d0acSEric Blake 13771a62d0acSEric Blake if (max_bytes) { 13781a62d0acSEric Blake num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 13791a62d0acSEric Blake assert(num); 13801a62d0acSEric Blake 13811a62d0acSEric Blake ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1382134b7decSMax Reitz num, qiov, 1383897dd0ecSAndrey Shinkevich qiov_offset + bytes - bytes_remaining, 1384897dd0ecSAndrey Shinkevich flags); 13851a62d0acSEric Blake max_bytes -= num; 13861a62d0acSEric Blake } else { 13871a62d0acSEric Blake num = bytes_remaining; 1388134b7decSMax Reitz ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining, 1389134b7decSMax Reitz 0, bytes_remaining); 13901a62d0acSEric Blake } 13911a62d0acSEric Blake if (ret < 0) { 13921a62d0acSEric Blake goto out; 13931a62d0acSEric Blake } 13941a62d0acSEric Blake bytes_remaining -= num; 139561007b31SStefan Hajnoczi } 139661007b31SStefan Hajnoczi 139761007b31SStefan Hajnoczi out: 13981a62d0acSEric Blake return ret < 0 ? 
ret : 0; 139961007b31SStefan Hajnoczi } 140061007b31SStefan Hajnoczi 140161007b31SStefan Hajnoczi /* 14027a3f542fSVladimir Sementsov-Ogievskiy * Request padding 14037a3f542fSVladimir Sementsov-Ogievskiy * 14047a3f542fSVladimir Sementsov-Ogievskiy * |<---- align ----->| |<----- align ---->| 14057a3f542fSVladimir Sementsov-Ogievskiy * |<- head ->|<------------- bytes ------------->|<-- tail -->| 14067a3f542fSVladimir Sementsov-Ogievskiy * | | | | | | 14077a3f542fSVladimir Sementsov-Ogievskiy * -*----------$-------*-------- ... --------*-----$------------*--- 14087a3f542fSVladimir Sementsov-Ogievskiy * | | | | | | 14097a3f542fSVladimir Sementsov-Ogievskiy * | offset | | end | 14107a3f542fSVladimir Sementsov-Ogievskiy * ALIGN_DOWN(offset) ALIGN_UP(offset) ALIGN_DOWN(end) ALIGN_UP(end) 14117a3f542fSVladimir Sementsov-Ogievskiy * [buf ... ) [tail_buf ) 14127a3f542fSVladimir Sementsov-Ogievskiy * 14137a3f542fSVladimir Sementsov-Ogievskiy * @buf is an aligned allocation needed to store @head and @tail paddings. @head 14147a3f542fSVladimir Sementsov-Ogievskiy * is placed at the beginning of @buf and @tail at the end. 14157a3f542fSVladimir Sementsov-Ogievskiy * 14167a3f542fSVladimir Sementsov-Ogievskiy * @tail_buf is a pointer to the sub-buffer corresponding to the align-sized 14177a3f542fSVladimir Sementsov-Ogievskiy * chunk around the tail, if a tail exists. 14187a3f542fSVladimir Sementsov-Ogievskiy * 14197a3f542fSVladimir Sementsov-Ogievskiy * @merge_reads is true for small requests, 14207a3f542fSVladimir Sementsov-Ogievskiy * i.e. when @buf_len == @head + bytes + @tail. In this case it is possible that both 14217a3f542fSVladimir Sementsov-Ogievskiy * head and tail exist but @buf_len == align and @tail_buf == @buf. 142261007b31SStefan Hajnoczi */ 14237a3f542fSVladimir Sementsov-Ogievskiy typedef struct BdrvRequestPadding { 14247a3f542fSVladimir Sementsov-Ogievskiy uint8_t *buf; 14257a3f542fSVladimir Sementsov-Ogievskiy size_t buf_len; 14267a3f542fSVladimir Sementsov-Ogievskiy uint8_t *tail_buf; 14277a3f542fSVladimir Sementsov-Ogievskiy size_t head; 14287a3f542fSVladimir Sementsov-Ogievskiy size_t tail; 14297a3f542fSVladimir Sementsov-Ogievskiy bool merge_reads; 14307a3f542fSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 14317a3f542fSVladimir Sementsov-Ogievskiy } BdrvRequestPadding; 14327a3f542fSVladimir Sementsov-Ogievskiy 14337a3f542fSVladimir Sementsov-Ogievskiy static bool bdrv_init_padding(BlockDriverState *bs, 14347a3f542fSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, 14357a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad) 14367a3f542fSVladimir Sementsov-Ogievskiy { 1437a56ed80cSVladimir Sementsov-Ogievskiy int64_t align = bs->bl.request_alignment; 1438a56ed80cSVladimir Sementsov-Ogievskiy int64_t sum; 1439a56ed80cSVladimir Sementsov-Ogievskiy 1440a56ed80cSVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 1441a56ed80cSVladimir Sementsov-Ogievskiy assert(align <= INT_MAX); /* documented in block/block_int.h */ 1442a56ed80cSVladimir Sementsov-Ogievskiy assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */ 14437a3f542fSVladimir Sementsov-Ogievskiy 14447a3f542fSVladimir Sementsov-Ogievskiy memset(pad, 0, sizeof(*pad)); 14457a3f542fSVladimir Sementsov-Ogievskiy 14467a3f542fSVladimir Sementsov-Ogievskiy pad->head = offset & (align - 1); 14477a3f542fSVladimir Sementsov-Ogievskiy pad->tail = ((offset + bytes) & (align - 1)); 14487a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 14497a3f542fSVladimir Sementsov-Ogievskiy pad->tail = align - 
pad->tail; 14507a3f542fSVladimir Sementsov-Ogievskiy } 14517a3f542fSVladimir Sementsov-Ogievskiy 1452ac9d00bfSVladimir Sementsov-Ogievskiy if (!pad->head && !pad->tail) { 14537a3f542fSVladimir Sementsov-Ogievskiy return false; 14547a3f542fSVladimir Sementsov-Ogievskiy } 14557a3f542fSVladimir Sementsov-Ogievskiy 1456ac9d00bfSVladimir Sementsov-Ogievskiy assert(bytes); /* Nothing good in aligning zero-length requests */ 1457ac9d00bfSVladimir Sementsov-Ogievskiy 14587a3f542fSVladimir Sementsov-Ogievskiy sum = pad->head + bytes + pad->tail; 14597a3f542fSVladimir Sementsov-Ogievskiy pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align; 14607a3f542fSVladimir Sementsov-Ogievskiy pad->buf = qemu_blockalign(bs, pad->buf_len); 14617a3f542fSVladimir Sementsov-Ogievskiy pad->merge_reads = sum == pad->buf_len; 14627a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 14637a3f542fSVladimir Sementsov-Ogievskiy pad->tail_buf = pad->buf + pad->buf_len - align; 14647a3f542fSVladimir Sementsov-Ogievskiy } 14657a3f542fSVladimir Sementsov-Ogievskiy 14667a3f542fSVladimir Sementsov-Ogievskiy return true; 14677a3f542fSVladimir Sementsov-Ogievskiy } 14687a3f542fSVladimir Sementsov-Ogievskiy 1469881a4c55SPaolo Bonzini static coroutine_fn int bdrv_padding_rmw_read(BdrvChild *child, 14707a3f542fSVladimir Sementsov-Ogievskiy BdrvTrackedRequest *req, 14717a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad, 14727a3f542fSVladimir Sementsov-Ogievskiy bool zero_middle) 14737a3f542fSVladimir Sementsov-Ogievskiy { 14747a3f542fSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 14757a3f542fSVladimir Sementsov-Ogievskiy BlockDriverState *bs = child->bs; 14767a3f542fSVladimir Sementsov-Ogievskiy uint64_t align = bs->bl.request_alignment; 14777a3f542fSVladimir Sementsov-Ogievskiy int ret; 14787a3f542fSVladimir Sementsov-Ogievskiy 14797a3f542fSVladimir Sementsov-Ogievskiy assert(req->serialising && pad->buf); 14807a3f542fSVladimir Sementsov-Ogievskiy 14817a3f542fSVladimir Sementsov-Ogievskiy if (pad->head || pad->merge_reads) { 14828b0c5d76SVladimir Sementsov-Ogievskiy int64_t bytes = pad->merge_reads ? 
pad->buf_len : align; 14837a3f542fSVladimir Sementsov-Ogievskiy 14847a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad->buf, bytes); 14857a3f542fSVladimir Sementsov-Ogievskiy 14867a3f542fSVladimir Sementsov-Ogievskiy if (pad->head) { 14877a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 14887a3f542fSVladimir Sementsov-Ogievskiy } 14897a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads && pad->tail) { 14907a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 14917a3f542fSVladimir Sementsov-Ogievskiy } 14927a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes, 149365cd4424SVladimir Sementsov-Ogievskiy align, &local_qiov, 0, 0); 14947a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0) { 14957a3f542fSVladimir Sementsov-Ogievskiy return ret; 14967a3f542fSVladimir Sementsov-Ogievskiy } 14977a3f542fSVladimir Sementsov-Ogievskiy if (pad->head) { 14987a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 14997a3f542fSVladimir Sementsov-Ogievskiy } 15007a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads && pad->tail) { 15017a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 15027a3f542fSVladimir Sementsov-Ogievskiy } 15037a3f542fSVladimir Sementsov-Ogievskiy 15047a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads) { 15057a3f542fSVladimir Sementsov-Ogievskiy goto zero_mem; 15067a3f542fSVladimir Sementsov-Ogievskiy } 15077a3f542fSVladimir Sementsov-Ogievskiy } 15087a3f542fSVladimir Sementsov-Ogievskiy 15097a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 15107a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align); 15117a3f542fSVladimir Sementsov-Ogievskiy 15127a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 15137a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv( 15147a3f542fSVladimir Sementsov-Ogievskiy child, req, 15157a3f542fSVladimir Sementsov-Ogievskiy req->overlap_offset + req->overlap_bytes - align, 151665cd4424SVladimir Sementsov-Ogievskiy align, align, &local_qiov, 0, 0); 15177a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0) { 15187a3f542fSVladimir Sementsov-Ogievskiy return ret; 15197a3f542fSVladimir Sementsov-Ogievskiy } 15207a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 15217a3f542fSVladimir Sementsov-Ogievskiy } 15227a3f542fSVladimir Sementsov-Ogievskiy 15237a3f542fSVladimir Sementsov-Ogievskiy zero_mem: 15247a3f542fSVladimir Sementsov-Ogievskiy if (zero_middle) { 15257a3f542fSVladimir Sementsov-Ogievskiy memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail); 15267a3f542fSVladimir Sementsov-Ogievskiy } 15277a3f542fSVladimir Sementsov-Ogievskiy 15287a3f542fSVladimir Sementsov-Ogievskiy return 0; 15297a3f542fSVladimir Sementsov-Ogievskiy } 15307a3f542fSVladimir Sementsov-Ogievskiy 15317a3f542fSVladimir Sementsov-Ogievskiy static void bdrv_padding_destroy(BdrvRequestPadding *pad) 15327a3f542fSVladimir Sementsov-Ogievskiy { 15337a3f542fSVladimir Sementsov-Ogievskiy if (pad->buf) { 15347a3f542fSVladimir Sementsov-Ogievskiy qemu_vfree(pad->buf); 15357a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&pad->local_qiov); 15367a3f542fSVladimir Sementsov-Ogievskiy } 153798ca4549SVladimir Sementsov-Ogievskiy memset(pad, 0, sizeof(*pad)); 15387a3f542fSVladimir Sementsov-Ogievskiy } 15397a3f542fSVladimir 
Sementsov-Ogievskiy 15407a3f542fSVladimir Sementsov-Ogievskiy /* 15417a3f542fSVladimir Sementsov-Ogievskiy * bdrv_pad_request 15427a3f542fSVladimir Sementsov-Ogievskiy * 15437a3f542fSVladimir Sementsov-Ogievskiy * Exchange request parameters with the padded request if needed. This does not 15447a3f542fSVladimir Sementsov-Ogievskiy * include the RMW read of padding; bdrv_padding_rmw_read() should be called 15457a3f542fSVladimir Sementsov-Ogievskiy * separately if needed. 15467a3f542fSVladimir Sementsov-Ogievskiy * 154798ca4549SVladimir Sementsov-Ogievskiy * Request parameters (@qiov, @qiov_offset, @offset, @bytes) are in-out: 154898ca4549SVladimir Sementsov-Ogievskiy * - on function start they represent the original request 154998ca4549SVladimir Sementsov-Ogievskiy * - on failure or when padding is not needed they are unchanged 155098ca4549SVladimir Sementsov-Ogievskiy * - on success when padding is needed they represent the padded request 15517a3f542fSVladimir Sementsov-Ogievskiy */ 155298ca4549SVladimir Sementsov-Ogievskiy static int bdrv_pad_request(BlockDriverState *bs, 15531acc3466SVladimir Sementsov-Ogievskiy QEMUIOVector **qiov, size_t *qiov_offset, 155437e9403eSVladimir Sementsov-Ogievskiy int64_t *offset, int64_t *bytes, 1555e8b65355SStefan Hajnoczi BdrvRequestPadding *pad, bool *padded, 1556e8b65355SStefan Hajnoczi BdrvRequestFlags *flags) 15577a3f542fSVladimir Sementsov-Ogievskiy { 15584c002cefSVladimir Sementsov-Ogievskiy int ret; 15594c002cefSVladimir Sementsov-Ogievskiy 156037e9403eSVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort); 156137e9403eSVladimir Sementsov-Ogievskiy 15627a3f542fSVladimir Sementsov-Ogievskiy if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { 156398ca4549SVladimir Sementsov-Ogievskiy if (padded) { 156498ca4549SVladimir Sementsov-Ogievskiy *padded = false; 156598ca4549SVladimir Sementsov-Ogievskiy } 156698ca4549SVladimir Sementsov-Ogievskiy return 0; 15677a3f542fSVladimir Sementsov-Ogievskiy } 15687a3f542fSVladimir Sementsov-Ogievskiy 15694c002cefSVladimir Sementsov-Ogievskiy ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, 15701acc3466SVladimir Sementsov-Ogievskiy *qiov, *qiov_offset, *bytes, 15714c002cefSVladimir Sementsov-Ogievskiy pad->buf + pad->buf_len - pad->tail, 15724c002cefSVladimir Sementsov-Ogievskiy pad->tail); 157398ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) { 157498ca4549SVladimir Sementsov-Ogievskiy bdrv_padding_destroy(pad); 157598ca4549SVladimir Sementsov-Ogievskiy return ret; 157698ca4549SVladimir Sementsov-Ogievskiy } 15777a3f542fSVladimir Sementsov-Ogievskiy *bytes += pad->head + pad->tail; 15787a3f542fSVladimir Sementsov-Ogievskiy *offset -= pad->head; 15797a3f542fSVladimir Sementsov-Ogievskiy *qiov = &pad->local_qiov; 15801acc3466SVladimir Sementsov-Ogievskiy *qiov_offset = 0; 158198ca4549SVladimir Sementsov-Ogievskiy if (padded) { 158298ca4549SVladimir Sementsov-Ogievskiy *padded = true; 158398ca4549SVladimir Sementsov-Ogievskiy } 1584e8b65355SStefan Hajnoczi if (flags) { 1585e8b65355SStefan Hajnoczi /* Can't use optimization hint with bounce buffer */ 1586e8b65355SStefan Hajnoczi *flags &= ~BDRV_REQ_REGISTERED_BUF; 1587e8b65355SStefan Hajnoczi } 15887a3f542fSVladimir Sementsov-Ogievskiy 158998ca4549SVladimir Sementsov-Ogievskiy return 0; 15907a3f542fSVladimir Sementsov-Ogievskiy } 15917a3f542fSVladimir Sementsov-Ogievskiy 1592a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child, 1593e9e52efdSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, 
QEMUIOVector *qiov, 159461007b31SStefan Hajnoczi BdrvRequestFlags flags) 159561007b31SStefan Hajnoczi { 1596967d7905SEmanuele Giuseppe Esposito IO_CODE(); 15971acc3466SVladimir Sementsov-Ogievskiy return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags); 15981acc3466SVladimir Sementsov-Ogievskiy } 15991acc3466SVladimir Sementsov-Ogievskiy 16001acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, 160137e9403eSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, 16021acc3466SVladimir Sementsov-Ogievskiy QEMUIOVector *qiov, size_t qiov_offset, 16031acc3466SVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 16041acc3466SVladimir Sementsov-Ogievskiy { 1605a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 160661007b31SStefan Hajnoczi BdrvTrackedRequest req; 16077a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 160861007b31SStefan Hajnoczi int ret; 1609967d7905SEmanuele Giuseppe Esposito IO_CODE(); 161061007b31SStefan Hajnoczi 161137e9403eSVladimir Sementsov-Ogievskiy trace_bdrv_co_preadv_part(bs, offset, bytes, flags); 161261007b31SStefan Hajnoczi 1613f4dad307SVladimir Sementsov-Ogievskiy if (!bdrv_is_inserted(bs)) { 1614f4dad307SVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 1615f4dad307SVladimir Sementsov-Ogievskiy } 1616f4dad307SVladimir Sementsov-Ogievskiy 161763f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); 161861007b31SStefan Hajnoczi if (ret < 0) { 161961007b31SStefan Hajnoczi return ret; 162061007b31SStefan Hajnoczi } 162161007b31SStefan Hajnoczi 1622ac9d00bfSVladimir Sementsov-Ogievskiy if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 1623ac9d00bfSVladimir Sementsov-Ogievskiy /* 1624ac9d00bfSVladimir Sementsov-Ogievskiy * Aligning a zero-length request is nonsense. Even if the driver assigns 1625ac9d00bfSVladimir Sementsov-Ogievskiy * special meaning to zero-length requests (like qcow2_co_pwritev_compressed_part), 1626ac9d00bfSVladimir Sementsov-Ogievskiy * we can't pass such a request to the driver due to request_alignment. 1627ac9d00bfSVladimir Sementsov-Ogievskiy * 1628ac9d00bfSVladimir Sementsov-Ogievskiy * Still, there is no reason to return an error if someone does an unaligned 1629ac9d00bfSVladimir Sementsov-Ogievskiy * zero-length read occasionally. 
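 *
 * (Hypothetical example: a 0-byte read at offset 1 on a node with
 * request_alignment == 512 simply returns 0 here instead of being
 * padded into a 512-byte driver request.)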
1630ac9d00bfSVladimir Sementsov-Ogievskiy */ 1631ac9d00bfSVladimir Sementsov-Ogievskiy return 0; 1632ac9d00bfSVladimir Sementsov-Ogievskiy } 1633ac9d00bfSVladimir Sementsov-Ogievskiy 163499723548SPaolo Bonzini bdrv_inc_in_flight(bs); 163599723548SPaolo Bonzini 16369568b511SWen Congyang /* Don't do copy-on-read if we read data before a write operation */ 1637d73415a3SStefan Hajnoczi if (qatomic_read(&bs->copy_on_read)) { 163861007b31SStefan Hajnoczi flags |= BDRV_REQ_COPY_ON_READ; 163961007b31SStefan Hajnoczi } 164061007b31SStefan Hajnoczi 164198ca4549SVladimir Sementsov-Ogievskiy ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, 1642e8b65355SStefan Hajnoczi NULL, &flags); 164398ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) { 164487ab8802SKevin Wolf goto fail; 164598ca4549SVladimir Sementsov-Ogievskiy } 164661007b31SStefan Hajnoczi 1647ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 16487a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv(child, &req, offset, bytes, 16497a3f542fSVladimir Sementsov-Ogievskiy bs->bl.request_alignment, 16501acc3466SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 165161007b31SStefan Hajnoczi tracked_request_end(&req); 16527a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 165361007b31SStefan Hajnoczi 165487ab8802SKevin Wolf fail: 165587ab8802SKevin Wolf bdrv_dec_in_flight(bs); 165687ab8802SKevin Wolf 165761007b31SStefan Hajnoczi return ret; 165861007b31SStefan Hajnoczi } 165961007b31SStefan Hajnoczi 1660d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 16615ae07b14SVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, BdrvRequestFlags flags) 166261007b31SStefan Hajnoczi { 166361007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 166461007b31SStefan Hajnoczi QEMUIOVector qiov; 16650d93ed08SVladimir Sementsov-Ogievskiy void *buf = NULL; 166661007b31SStefan Hajnoczi int ret = 0; 1667465fe887SEric Blake bool need_flush = false; 1668443668caSDenis V. Lunev int head = 0; 1669443668caSDenis V. 
Lunev int tail = 0; 167061007b31SStefan Hajnoczi 16712aaa3f9bSVladimir Sementsov-Ogievskiy int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, 16722aaa3f9bSVladimir Sementsov-Ogievskiy INT64_MAX); 1673a5b8dd2cSEric Blake int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1674a5b8dd2cSEric Blake bs->bl.request_alignment); 1675cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1676cf081fcaSEric Blake 16775ae07b14SVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 16785ae07b14SVladimir Sementsov-Ogievskiy 1679d470ad42SMax Reitz if (!drv) { 1680d470ad42SMax Reitz return -ENOMEDIUM; 1681d470ad42SMax Reitz } 1682d470ad42SMax Reitz 1683fe0480d6SKevin Wolf if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) { 1684fe0480d6SKevin Wolf return -ENOTSUP; 1685fe0480d6SKevin Wolf } 1686fe0480d6SKevin Wolf 1687e8b65355SStefan Hajnoczi /* By definition there is no user buffer so this flag doesn't make sense */ 1688e8b65355SStefan Hajnoczi if (flags & BDRV_REQ_REGISTERED_BUF) { 1689e8b65355SStefan Hajnoczi return -EINVAL; 1690e8b65355SStefan Hajnoczi } 1691e8b65355SStefan Hajnoczi 16920bc329fbSHanna Reitz /* Invalidate the cached block-status data range if this write overlaps */ 16930bc329fbSHanna Reitz bdrv_bsc_invalidate_range(bs, offset, bytes); 16940bc329fbSHanna Reitz 1695b8d0a980SEric Blake assert(alignment % bs->bl.request_alignment == 0); 1696b8d0a980SEric Blake head = offset % alignment; 1697f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % alignment; 1698b8d0a980SEric Blake max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1699b8d0a980SEric Blake assert(max_write_zeroes >= bs->bl.request_alignment); 170061007b31SStefan Hajnoczi 1701f5a5ca79SManos Pitsidianakis while (bytes > 0 && !ret) { 17025ae07b14SVladimir Sementsov-Ogievskiy int64_t num = bytes; 170361007b31SStefan Hajnoczi 170461007b31SStefan Hajnoczi /* Align request. Block drivers can expect the "bulk" of the request 1705443668caSDenis V. Lunev * to be aligned, and that unaligned requests do not cross cluster 1706443668caSDenis V. Lunev * boundaries. 170761007b31SStefan Hajnoczi */ 1708443668caSDenis V. Lunev if (head) { 1709b2f95feeSEric Blake /* Make a small request up to the first aligned sector. For 1710b2f95feeSEric Blake * convenience, limit this request to max_transfer even if 1711b2f95feeSEric Blake * we don't need to fall back to writes. */ 1712f5a5ca79SManos Pitsidianakis num = MIN(MIN(bytes, max_transfer), alignment - head); 1713b2f95feeSEric Blake head = (head + num) % alignment; 1714b2f95feeSEric Blake assert(num < max_write_zeroes); 1715d05aa8bbSEric Blake } else if (tail && num > alignment) { 1716443668caSDenis V. Lunev /* Shorten the request to the last aligned sector. */ 1717443668caSDenis V. 
Lunev num -= tail; 171861007b31SStefan Hajnoczi } 171961007b31SStefan Hajnoczi 172061007b31SStefan Hajnoczi /* limit request size */ 172161007b31SStefan Hajnoczi if (num > max_write_zeroes) { 172261007b31SStefan Hajnoczi num = max_write_zeroes; 172361007b31SStefan Hajnoczi } 172461007b31SStefan Hajnoczi 172561007b31SStefan Hajnoczi ret = -ENOTSUP; 172661007b31SStefan Hajnoczi /* First try the efficient write zeroes operation */ 1727d05aa8bbSEric Blake if (drv->bdrv_co_pwrite_zeroes) { 1728d05aa8bbSEric Blake ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, 1729d05aa8bbSEric Blake flags & bs->supported_zero_flags); 1730d05aa8bbSEric Blake if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && 1731d05aa8bbSEric Blake !(bs->supported_zero_flags & BDRV_REQ_FUA)) { 1732d05aa8bbSEric Blake need_flush = true; 1733d05aa8bbSEric Blake } 1734465fe887SEric Blake } else { 1735465fe887SEric Blake assert(!bs->supported_zero_flags); 173661007b31SStefan Hajnoczi } 173761007b31SStefan Hajnoczi 1738294682ccSAndrey Shinkevich if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) { 173961007b31SStefan Hajnoczi /* Fall back to bounce buffer if write zeroes is unsupported */ 1740465fe887SEric Blake BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; 1741465fe887SEric Blake 1742465fe887SEric Blake if ((flags & BDRV_REQ_FUA) && 1743465fe887SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1744465fe887SEric Blake /* No need for bdrv_driver_pwritev() to do a fallback 1745465fe887SEric Blake * flush on each chunk; use just one at the end */ 1746465fe887SEric Blake write_flags &= ~BDRV_REQ_FUA; 1747465fe887SEric Blake need_flush = true; 1748465fe887SEric Blake } 17495def6b80SEric Blake num = MIN(num, max_transfer); 17500d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) { 17510d93ed08SVladimir Sementsov-Ogievskiy buf = qemu_try_blockalign0(bs, num); 17520d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) { 175361007b31SStefan Hajnoczi ret = -ENOMEM; 175461007b31SStefan Hajnoczi goto fail; 175561007b31SStefan Hajnoczi } 175661007b31SStefan Hajnoczi } 17570d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&qiov, buf, num); 175861007b31SStefan Hajnoczi 1759ac850bf0SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags); 176061007b31SStefan Hajnoczi 176161007b31SStefan Hajnoczi /* Keep bounce buffer around if it is big enough for 176261007b31SStefan Hajnoczi * all future requests. 
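 *
 * (Illustrative: if the first chunk allocated a max_transfer-sized
 * buffer, each iteration with num == max_transfer keeps it; a smaller
 * final chunk frees it below because num < max_transfer.)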
176361007b31SStefan Hajnoczi */ 17645def6b80SEric Blake if (num < max_transfer) { 17650d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 17660d93ed08SVladimir Sementsov-Ogievskiy buf = NULL; 176761007b31SStefan Hajnoczi } 176861007b31SStefan Hajnoczi } 176961007b31SStefan Hajnoczi 1770d05aa8bbSEric Blake offset += num; 1771f5a5ca79SManos Pitsidianakis bytes -= num; 177261007b31SStefan Hajnoczi } 177361007b31SStefan Hajnoczi 177461007b31SStefan Hajnoczi fail: 1775465fe887SEric Blake if (ret == 0 && need_flush) { 1776465fe887SEric Blake ret = bdrv_co_flush(bs); 1777465fe887SEric Blake } 17780d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 177961007b31SStefan Hajnoczi return ret; 178061007b31SStefan Hajnoczi } 178161007b31SStefan Hajnoczi 178285fe2479SFam Zheng static inline int coroutine_fn 1783fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes, 178485fe2479SFam Zheng BdrvTrackedRequest *req, int flags) 178585fe2479SFam Zheng { 178685fe2479SFam Zheng BlockDriverState *bs = child->bs; 1787fcfd9adeSVladimir Sementsov-Ogievskiy 1788fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 178985fe2479SFam Zheng 1790307261b2SVladimir Sementsov-Ogievskiy if (bdrv_is_read_only(bs)) { 179185fe2479SFam Zheng return -EPERM; 179285fe2479SFam Zheng } 179385fe2479SFam Zheng 179485fe2479SFam Zheng assert(!(bs->open_flags & BDRV_O_INACTIVE)); 179585fe2479SFam Zheng assert((bs->open_flags & BDRV_O_NO_IO) == 0); 179685fe2479SFam Zheng assert(!(flags & ~BDRV_REQ_MASK)); 1797d1a764d1SVladimir Sementsov-Ogievskiy assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING))); 179885fe2479SFam Zheng 179985fe2479SFam Zheng if (flags & BDRV_REQ_SERIALISING) { 1800d1a764d1SVladimir Sementsov-Ogievskiy QEMU_LOCK_GUARD(&bs->reqs_lock); 1801d1a764d1SVladimir Sementsov-Ogievskiy 1802d1a764d1SVladimir Sementsov-Ogievskiy tracked_request_set_serialising(req, bdrv_get_cluster_size(bs)); 1803d1a764d1SVladimir Sementsov-Ogievskiy 1804d1a764d1SVladimir Sementsov-Ogievskiy if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) { 1805d1a764d1SVladimir Sementsov-Ogievskiy return -EBUSY; 1806d1a764d1SVladimir Sementsov-Ogievskiy } 1807d1a764d1SVladimir Sementsov-Ogievskiy 1808d1a764d1SVladimir Sementsov-Ogievskiy bdrv_wait_serialising_requests_locked(req); 180918fbd0deSPaolo Bonzini } else { 181018fbd0deSPaolo Bonzini bdrv_wait_serialising_requests(req); 181185fe2479SFam Zheng } 181285fe2479SFam Zheng 181385fe2479SFam Zheng assert(req->overlap_offset <= offset); 181485fe2479SFam Zheng assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1815fcfd9adeSVladimir Sementsov-Ogievskiy assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE || 1816fcfd9adeSVladimir Sementsov-Ogievskiy child->perm & BLK_PERM_RESIZE); 181785fe2479SFam Zheng 1818cd47d792SFam Zheng switch (req->type) { 1819cd47d792SFam Zheng case BDRV_TRACKED_WRITE: 1820cd47d792SFam Zheng case BDRV_TRACKED_DISCARD: 182185fe2479SFam Zheng if (flags & BDRV_REQ_WRITE_UNCHANGED) { 182285fe2479SFam Zheng assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 182385fe2479SFam Zheng } else { 182485fe2479SFam Zheng assert(child->perm & BLK_PERM_WRITE); 182585fe2479SFam Zheng } 182694783301SVladimir Sementsov-Ogievskiy bdrv_write_threshold_check_write(bs, offset, bytes); 182794783301SVladimir Sementsov-Ogievskiy return 0; 1828cd47d792SFam Zheng case BDRV_TRACKED_TRUNCATE: 1829cd47d792SFam Zheng assert(child->perm & 
BLK_PERM_RESIZE); 1830cd47d792SFam Zheng return 0; 1831cd47d792SFam Zheng default: 1832cd47d792SFam Zheng abort(); 1833cd47d792SFam Zheng } 183485fe2479SFam Zheng } 183585fe2479SFam Zheng 183685fe2479SFam Zheng static inline void coroutine_fn 1837fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes, 183885fe2479SFam Zheng BdrvTrackedRequest *req, int ret) 183985fe2479SFam Zheng { 184085fe2479SFam Zheng int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 184185fe2479SFam Zheng BlockDriverState *bs = child->bs; 184285fe2479SFam Zheng 1843fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 1844fcfd9adeSVladimir Sementsov-Ogievskiy 1845d73415a3SStefan Hajnoczi qatomic_inc(&bs->write_gen); 184685fe2479SFam Zheng 184700695c27SFam Zheng /* 184800695c27SFam Zheng * Discard cannot extend the image, but in error handling cases, such as 184900695c27SFam Zheng * when reverting a qcow2 cluster allocation, the discarded range can pass 185000695c27SFam Zheng * the end of the image file, so we cannot assert about BDRV_TRACKED_DISCARD 185100695c27SFam Zheng * here. Instead, just skip it, since semantically a discard request 185200695c27SFam Zheng * beyond EOF cannot expand the image anyway. 185300695c27SFam Zheng */ 18547f8f03efSFam Zheng if (ret == 0 && 1855cd47d792SFam Zheng (req->type == BDRV_TRACKED_TRUNCATE || 1856cd47d792SFam Zheng end_sector > bs->total_sectors) && 185700695c27SFam Zheng req->type != BDRV_TRACKED_DISCARD) { 18587f8f03efSFam Zheng bs->total_sectors = end_sector; 18597f8f03efSFam Zheng bdrv_parent_cb_resize(bs); 18607f8f03efSFam Zheng bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); 186185fe2479SFam Zheng } 186200695c27SFam Zheng if (req->bytes) { 186300695c27SFam Zheng switch (req->type) { 186400695c27SFam Zheng case BDRV_TRACKED_WRITE: 186500695c27SFam Zheng stat64_max(&bs->wr_highest_offset, offset + bytes); 186600695c27SFam Zheng /* fall through, to set dirty bits */ 186700695c27SFam Zheng case BDRV_TRACKED_DISCARD: 18687f8f03efSFam Zheng bdrv_set_dirty(bs, offset, bytes); 186900695c27SFam Zheng break; 187000695c27SFam Zheng default: 187100695c27SFam Zheng break; 187200695c27SFam Zheng } 187300695c27SFam Zheng } 187485fe2479SFam Zheng } 187585fe2479SFam Zheng 187661007b31SStefan Hajnoczi /* 187704ed95f4SEric Blake * Forwards an already correctly aligned write request to the BlockDriver, 187804ed95f4SEric Blake * after possibly fragmenting it. 
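 *
 * For illustration (hypothetical numbers): with max_transfer == 64 KiB, an
 * aligned 150 KiB write reaches the driver as 64 + 64 + 22 KiB chunks, and
 * if FUA has to be emulated by a flush, BDRV_REQ_FUA is dropped from every
 * chunk except the final one.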
187961007b31SStefan Hajnoczi */ 188085c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 1881fcfd9adeSVladimir Sementsov-Ogievskiy BdrvTrackedRequest *req, int64_t offset, int64_t bytes, 1882e75abedaSVladimir Sementsov-Ogievskiy int64_t align, QEMUIOVector *qiov, size_t qiov_offset, 1883e75abedaSVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 188461007b31SStefan Hajnoczi { 188585c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 188661007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 188761007b31SStefan Hajnoczi int ret; 188861007b31SStefan Hajnoczi 1889fcfd9adeSVladimir Sementsov-Ogievskiy int64_t bytes_remaining = bytes; 189004ed95f4SEric Blake int max_transfer; 189161007b31SStefan Hajnoczi 1892fcfd9adeSVladimir Sementsov-Ogievskiy bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort); 1893fcfd9adeSVladimir Sementsov-Ogievskiy 1894d470ad42SMax Reitz if (!drv) { 1895d470ad42SMax Reitz return -ENOMEDIUM; 1896d470ad42SMax Reitz } 1897d470ad42SMax Reitz 1898d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 1899d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 1900d6883bc9SVladimir Sementsov-Ogievskiy } 1901d6883bc9SVladimir Sementsov-Ogievskiy 1902cff86b38SEric Blake assert(is_power_of_2(align)); 1903cff86b38SEric Blake assert((offset & (align - 1)) == 0); 1904cff86b38SEric Blake assert((bytes & (align - 1)) == 0); 190504ed95f4SEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 190604ed95f4SEric Blake align); 190761007b31SStefan Hajnoczi 190885fe2479SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 190961007b31SStefan Hajnoczi 191061007b31SStefan Hajnoczi if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1911c1499a5eSEric Blake !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 191228c4da28SVladimir Sementsov-Ogievskiy qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 191361007b31SStefan Hajnoczi flags |= BDRV_REQ_ZERO_WRITE; 191461007b31SStefan Hajnoczi if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 191561007b31SStefan Hajnoczi flags |= BDRV_REQ_MAY_UNMAP; 191661007b31SStefan Hajnoczi } 191761007b31SStefan Hajnoczi } 191861007b31SStefan Hajnoczi 191961007b31SStefan Hajnoczi if (ret < 0) { 192061007b31SStefan Hajnoczi /* Do nothing, write notifier decided to fail this request */ 192161007b31SStefan Hajnoczi } else if (flags & BDRV_REQ_ZERO_WRITE) { 19229a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 19239896c876SKevin Wolf ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 19243ea1a091SPavel Butsykin } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 192528c4da28SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 192628c4da28SVladimir Sementsov-Ogievskiy qiov, qiov_offset); 192704ed95f4SEric Blake } else if (bytes <= max_transfer) { 19289a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV); 192928c4da28SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 193004ed95f4SEric Blake } else { 193104ed95f4SEric Blake bdrv_debug_event(bs, BLKDBG_PWRITEV); 193204ed95f4SEric Blake while (bytes_remaining) { 193304ed95f4SEric Blake int num = MIN(bytes_remaining, max_transfer); 193404ed95f4SEric Blake int local_flags = flags; 193504ed95f4SEric Blake 193604ed95f4SEric Blake assert(num); 193704ed95f4SEric Blake if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 193804ed95f4SEric Blake 
!(bs->supported_write_flags & BDRV_REQ_FUA)) { 193904ed95f4SEric Blake /* If FUA is going to be emulated by flush, we only 194004ed95f4SEric Blake * need to flush on the last iteration */ 194104ed95f4SEric Blake local_flags &= ~BDRV_REQ_FUA; 194204ed95f4SEric Blake } 194304ed95f4SEric Blake 194404ed95f4SEric Blake ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 1945134b7decSMax Reitz num, qiov, 1946134b7decSMax Reitz qiov_offset + bytes - bytes_remaining, 194728c4da28SVladimir Sementsov-Ogievskiy local_flags); 194804ed95f4SEric Blake if (ret < 0) { 194904ed95f4SEric Blake break; 195004ed95f4SEric Blake } 195104ed95f4SEric Blake bytes_remaining -= num; 195204ed95f4SEric Blake } 195361007b31SStefan Hajnoczi } 19549a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 195561007b31SStefan Hajnoczi 195661007b31SStefan Hajnoczi if (ret >= 0) { 195704ed95f4SEric Blake ret = 0; 195861007b31SStefan Hajnoczi } 195985fe2479SFam Zheng bdrv_co_write_req_finish(child, offset, bytes, req, ret); 196061007b31SStefan Hajnoczi 196161007b31SStefan Hajnoczi return ret; 196261007b31SStefan Hajnoczi } 196361007b31SStefan Hajnoczi 196485c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 19659eeb6dd1SFam Zheng int64_t offset, 196637e9403eSVladimir Sementsov-Ogievskiy int64_t bytes, 19679eeb6dd1SFam Zheng BdrvRequestFlags flags, 19689eeb6dd1SFam Zheng BdrvTrackedRequest *req) 19699eeb6dd1SFam Zheng { 197085c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 19719eeb6dd1SFam Zheng QEMUIOVector local_qiov; 1972a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 19739eeb6dd1SFam Zheng int ret = 0; 19747a3f542fSVladimir Sementsov-Ogievskiy bool padding; 19757a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 19769eeb6dd1SFam Zheng 1977e8b65355SStefan Hajnoczi /* This flag doesn't make sense for padding or zero writes */ 1978e8b65355SStefan Hajnoczi flags &= ~BDRV_REQ_REGISTERED_BUF; 1979e8b65355SStefan Hajnoczi 19807a3f542fSVladimir Sementsov-Ogievskiy padding = bdrv_init_padding(bs, offset, bytes, &pad); 19817a3f542fSVladimir Sementsov-Ogievskiy if (padding) { 198245e62b46SVladimir Sementsov-Ogievskiy assert(!(flags & BDRV_REQ_NO_WAIT)); 19838ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(req, align); 19849eeb6dd1SFam Zheng 19857a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_rmw_read(child, req, &pad, true); 19867a3f542fSVladimir Sementsov-Ogievskiy 19877a3f542fSVladimir Sementsov-Ogievskiy if (pad.head || pad.merge_reads) { 19887a3f542fSVladimir Sementsov-Ogievskiy int64_t aligned_offset = offset & ~(align - 1); 19897a3f542fSVladimir Sementsov-Ogievskiy int64_t write_bytes = pad.merge_reads ? 
pad.buf_len : align; 19907a3f542fSVladimir Sementsov-Ogievskiy 19917a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 19927a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 199328c4da28SVladimir Sementsov-Ogievskiy align, &local_qiov, 0, 19949eeb6dd1SFam Zheng flags & ~BDRV_REQ_ZERO_WRITE); 19957a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0 || pad.merge_reads) { 19967a3f542fSVladimir Sementsov-Ogievskiy /* Error or all work is done */ 19977a3f542fSVladimir Sementsov-Ogievskiy goto out; 19989eeb6dd1SFam Zheng } 19997a3f542fSVladimir Sementsov-Ogievskiy offset += write_bytes - pad.head; 20007a3f542fSVladimir Sementsov-Ogievskiy bytes -= write_bytes - pad.head; 20017a3f542fSVladimir Sementsov-Ogievskiy } 20029eeb6dd1SFam Zheng } 20039eeb6dd1SFam Zheng 20049eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 20059eeb6dd1SFam Zheng if (bytes >= align) { 20069eeb6dd1SFam Zheng /* Write the aligned part in the middle. */ 2007fcfd9adeSVladimir Sementsov-Ogievskiy int64_t aligned_bytes = bytes & ~(align - 1); 200885c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 200928c4da28SVladimir Sementsov-Ogievskiy NULL, 0, flags); 20109eeb6dd1SFam Zheng if (ret < 0) { 20117a3f542fSVladimir Sementsov-Ogievskiy goto out; 20129eeb6dd1SFam Zheng } 20139eeb6dd1SFam Zheng bytes -= aligned_bytes; 20149eeb6dd1SFam Zheng offset += aligned_bytes; 20159eeb6dd1SFam Zheng } 20169eeb6dd1SFam Zheng 20179eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 20189eeb6dd1SFam Zheng if (bytes) { 20197a3f542fSVladimir Sementsov-Ogievskiy assert(align == pad.tail + bytes); 20209eeb6dd1SFam Zheng 20217a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align); 202285c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, align, align, 202328c4da28SVladimir Sementsov-Ogievskiy &local_qiov, 0, 202428c4da28SVladimir Sementsov-Ogievskiy flags & ~BDRV_REQ_ZERO_WRITE); 20259eeb6dd1SFam Zheng } 20269eeb6dd1SFam Zheng 20277a3f542fSVladimir Sementsov-Ogievskiy out: 20287a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 20297a3f542fSVladimir Sementsov-Ogievskiy 20307a3f542fSVladimir Sementsov-Ogievskiy return ret; 20319eeb6dd1SFam Zheng } 20329eeb6dd1SFam Zheng 203361007b31SStefan Hajnoczi /* 203461007b31SStefan Hajnoczi * Handle a write request in coroutine context 203561007b31SStefan Hajnoczi */ 2036a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 2037e9e52efdSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, QEMUIOVector *qiov, 203861007b31SStefan Hajnoczi BdrvRequestFlags flags) 203961007b31SStefan Hajnoczi { 2040967d7905SEmanuele Giuseppe Esposito IO_CODE(); 20411acc3466SVladimir Sementsov-Ogievskiy return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); 20421acc3466SVladimir Sementsov-Ogievskiy } 20431acc3466SVladimir Sementsov-Ogievskiy 20441acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, 204537e9403eSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, 20461acc3466SVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 20471acc3466SVladimir Sementsov-Ogievskiy { 2048a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 204961007b31SStefan Hajnoczi BdrvTrackedRequest req; 2050a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 20517a3f542fSVladimir Sementsov-Ogievskiy 
BdrvRequestPadding pad; 205261007b31SStefan Hajnoczi int ret; 2053f0deecffSVladimir Sementsov-Ogievskiy bool padded = false; 2054967d7905SEmanuele Giuseppe Esposito IO_CODE(); 205561007b31SStefan Hajnoczi 205637e9403eSVladimir Sementsov-Ogievskiy trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags); 2057f42cf447SDaniel P. Berrange 2058f4dad307SVladimir Sementsov-Ogievskiy if (!bdrv_is_inserted(bs)) { 205961007b31SStefan Hajnoczi return -ENOMEDIUM; 206061007b31SStefan Hajnoczi } 206161007b31SStefan Hajnoczi 20622aaa3f9bSVladimir Sementsov-Ogievskiy if (flags & BDRV_REQ_ZERO_WRITE) { 20632aaa3f9bSVladimir Sementsov-Ogievskiy ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL); 20642aaa3f9bSVladimir Sementsov-Ogievskiy } else { 206563f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); 20662aaa3f9bSVladimir Sementsov-Ogievskiy } 206761007b31SStefan Hajnoczi if (ret < 0) { 206861007b31SStefan Hajnoczi return ret; 206961007b31SStefan Hajnoczi } 207061007b31SStefan Hajnoczi 2071f2208fdcSAlberto Garcia /* If the request is misaligned then we can't make it efficient */ 2072f2208fdcSAlberto Garcia if ((flags & BDRV_REQ_NO_FALLBACK) && 2073f2208fdcSAlberto Garcia !QEMU_IS_ALIGNED(offset | bytes, align)) 2074f2208fdcSAlberto Garcia { 2075f2208fdcSAlberto Garcia return -ENOTSUP; 2076f2208fdcSAlberto Garcia } 2077f2208fdcSAlberto Garcia 2078ac9d00bfSVladimir Sementsov-Ogievskiy if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 2079ac9d00bfSVladimir Sementsov-Ogievskiy /* 2080ac9d00bfSVladimir Sementsov-Ogievskiy * Aligning a zero-length request is nonsense. Even if the driver assigns 2081ac9d00bfSVladimir Sementsov-Ogievskiy * special meaning to zero-length requests (like qcow2_co_pwritev_compressed_part), 2082ac9d00bfSVladimir Sementsov-Ogievskiy * we can't pass such a request to the driver due to request_alignment. 2083ac9d00bfSVladimir Sementsov-Ogievskiy * 2084ac9d00bfSVladimir Sementsov-Ogievskiy * Still, there is no reason to return an error if someone does an unaligned 2085ac9d00bfSVladimir Sementsov-Ogievskiy * zero-length write occasionally. 2086ac9d00bfSVladimir Sementsov-Ogievskiy */ 2087ac9d00bfSVladimir Sementsov-Ogievskiy return 0; 2088ac9d00bfSVladimir Sementsov-Ogievskiy } 2089ac9d00bfSVladimir Sementsov-Ogievskiy 2090f0deecffSVladimir Sementsov-Ogievskiy if (!(flags & BDRV_REQ_ZERO_WRITE)) { 209161007b31SStefan Hajnoczi /* 2092f0deecffSVladimir Sementsov-Ogievskiy * Pad the request for the following read-modify-write cycle. 2093f0deecffSVladimir Sementsov-Ogievskiy * bdrv_co_do_zero_pwritev() does its own aligning, so we do 2094f0deecffSVladimir Sementsov-Ogievskiy * alignment here only if there is no ZERO flag. 
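 *
 * Worked example (hypothetical values): with request_alignment == 512, a
 * write of 1024 bytes at offset 700 gives head == 188 and tail == 324, so
 * bdrv_pad_request() below widens it to offset 512, bytes 1536, and
 * bdrv_padding_rmw_read() later fills in the padded head and tail bytes.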
209561007b31SStefan Hajnoczi */ 209698ca4549SVladimir Sementsov-Ogievskiy ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, 2097e8b65355SStefan Hajnoczi &padded, &flags); 209898ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) { 209998ca4549SVladimir Sementsov-Ogievskiy return ret; 210098ca4549SVladimir Sementsov-Ogievskiy } 2101f0deecffSVladimir Sementsov-Ogievskiy } 2102f0deecffSVladimir Sementsov-Ogievskiy 2103f0deecffSVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(bs); 2104ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 210561007b31SStefan Hajnoczi 210618a59f03SAnton Nefedov if (flags & BDRV_REQ_ZERO_WRITE) { 2107f0deecffSVladimir Sementsov-Ogievskiy assert(!padded); 210885c97ca7SKevin Wolf ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 21099eeb6dd1SFam Zheng goto out; 21109eeb6dd1SFam Zheng } 21119eeb6dd1SFam Zheng 2112f0deecffSVladimir Sementsov-Ogievskiy if (padded) { 2113f0deecffSVladimir Sementsov-Ogievskiy /* 2114f0deecffSVladimir Sementsov-Ogievskiy * Request was unaligned to request_alignment and therefore 2115f0deecffSVladimir Sementsov-Ogievskiy * padded. We are going to do read-modify-write, and must 2116f0deecffSVladimir Sementsov-Ogievskiy * serialize the request to prevent interactions of the 2117f0deecffSVladimir Sementsov-Ogievskiy * widened region with other transactions. 2118f0deecffSVladimir Sementsov-Ogievskiy */ 211945e62b46SVladimir Sementsov-Ogievskiy assert(!(flags & BDRV_REQ_NO_WAIT)); 21208ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(&req, align); 21217a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_rmw_read(child, &req, &pad, false); 212261007b31SStefan Hajnoczi } 212361007b31SStefan Hajnoczi 212485c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 21251acc3466SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 212661007b31SStefan Hajnoczi 21277a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 212861007b31SStefan Hajnoczi 21299eeb6dd1SFam Zheng out: 21309eeb6dd1SFam Zheng tracked_request_end(&req); 213199723548SPaolo Bonzini bdrv_dec_in_flight(bs); 21327a3f542fSVladimir Sementsov-Ogievskiy 213361007b31SStefan Hajnoczi return ret; 213461007b31SStefan Hajnoczi } 213561007b31SStefan Hajnoczi 2136a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2137e9e52efdSVladimir Sementsov-Ogievskiy int64_t bytes, BdrvRequestFlags flags) 213861007b31SStefan Hajnoczi { 2139384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 2140f5a5ca79SManos Pitsidianakis trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 214161007b31SStefan Hajnoczi 2142a03ef88fSKevin Wolf if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 214361007b31SStefan Hajnoczi flags &= ~BDRV_REQ_MAY_UNMAP; 214461007b31SStefan Hajnoczi } 214561007b31SStefan Hajnoczi 2146f5a5ca79SManos Pitsidianakis return bdrv_co_pwritev(child, offset, bytes, NULL, 214761007b31SStefan Hajnoczi BDRV_REQ_ZERO_WRITE | flags); 214861007b31SStefan Hajnoczi } 214961007b31SStefan Hajnoczi 21504085f5c7SJohn Snow /* 21514085f5c7SJohn Snow * Flush ALL BDSes regardless of whether they are reachable via a BlkBackend or not. 
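 *
 * (Usage sketch: a caller that is about to stop the VM can use this to push
 * completed guest writes out of host caches; per-node flush errors are
 * folded into the first failing return code.)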
21524085f5c7SJohn Snow */ 21534085f5c7SJohn Snow int bdrv_flush_all(void) 21544085f5c7SJohn Snow { 21554085f5c7SJohn Snow BdrvNextIterator it; 21564085f5c7SJohn Snow BlockDriverState *bs = NULL; 21574085f5c7SJohn Snow int result = 0; 21584085f5c7SJohn Snow 2159f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 2160f791bf7fSEmanuele Giuseppe Esposito 2161c8aa7895SPavel Dovgalyuk /* 2162c8aa7895SPavel Dovgalyuk * bdrv queue is managed by record/replay, 2163c8aa7895SPavel Dovgalyuk * creating new flush request for stopping 2164c8aa7895SPavel Dovgalyuk * the VM may break the determinism 2165c8aa7895SPavel Dovgalyuk */ 2166c8aa7895SPavel Dovgalyuk if (replay_events_enabled()) { 2167c8aa7895SPavel Dovgalyuk return result; 2168c8aa7895SPavel Dovgalyuk } 2169c8aa7895SPavel Dovgalyuk 21704085f5c7SJohn Snow for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 21714085f5c7SJohn Snow AioContext *aio_context = bdrv_get_aio_context(bs); 21724085f5c7SJohn Snow int ret; 21734085f5c7SJohn Snow 21744085f5c7SJohn Snow aio_context_acquire(aio_context); 21754085f5c7SJohn Snow ret = bdrv_flush(bs); 21764085f5c7SJohn Snow if (ret < 0 && !result) { 21774085f5c7SJohn Snow result = ret; 21784085f5c7SJohn Snow } 21794085f5c7SJohn Snow aio_context_release(aio_context); 21804085f5c7SJohn Snow } 21814085f5c7SJohn Snow 21824085f5c7SJohn Snow return result; 21834085f5c7SJohn Snow } 21844085f5c7SJohn Snow 218561007b31SStefan Hajnoczi /* 218661007b31SStefan Hajnoczi * Returns the allocation status of the specified sectors. 218761007b31SStefan Hajnoczi * Drivers not implementing the functionality are assumed to not support 218861007b31SStefan Hajnoczi * backing files, hence all their sectors are reported as allocated. 218961007b31SStefan Hajnoczi * 219086a3d5c6SEric Blake * If 'want_zero' is true, the caller is querying for mapping 219186a3d5c6SEric Blake * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 219286a3d5c6SEric Blake * _ZERO where possible; otherwise, the result favors larger 'pnum', 219386a3d5c6SEric Blake * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2194c9ce8c4dSEric Blake * 21952e8bc787SEric Blake * If 'offset' is beyond the end of the disk image the return value is 2196fb0d8654SEric Blake * BDRV_BLOCK_EOF and 'pnum' is set to 0. 219761007b31SStefan Hajnoczi * 21982e8bc787SEric Blake * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2199fb0d8654SEric Blake * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2200fb0d8654SEric Blake * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 220167a0fd2aSFam Zheng * 22022e8bc787SEric Blake * 'pnum' is set to the number of bytes (including and immediately 22032e8bc787SEric Blake * following the specified offset) that are easily known to be in the 22042e8bc787SEric Blake * same allocated/unallocated state. Note that a second call starting 22052e8bc787SEric Blake * at the original offset plus returned pnum may have the same status. 22062e8bc787SEric Blake * The returned value is non-zero on success except at end-of-file. 22072e8bc787SEric Blake * 22082e8bc787SEric Blake * Returns negative errno on failure. Otherwise, if the 22092e8bc787SEric Blake * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 22102e8bc787SEric Blake * set to the host mapping and BDS corresponding to the guest offset. 
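 *
 * Hypothetical example (all values invented for illustration): a query at
 * offset=4096, bytes=65536 on a format node might return
 * BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID with *pnum = 16384,
 * *map = 1049600 and *file pointing at the protocol node, i.e. the first
 * 16384 bytes are data stored at host offset 1049600; the caller then
 * repeats the query at offset + *pnum for the rest of the range.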
221161007b31SStefan Hajnoczi */ 22122e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2213c9ce8c4dSEric Blake bool want_zero, 22142e8bc787SEric Blake int64_t offset, int64_t bytes, 22152e8bc787SEric Blake int64_t *pnum, int64_t *map, 221667a0fd2aSFam Zheng BlockDriverState **file) 221761007b31SStefan Hajnoczi { 22182e8bc787SEric Blake int64_t total_size; 22192e8bc787SEric Blake int64_t n; /* bytes */ 2220efa6e2edSEric Blake int ret; 22212e8bc787SEric Blake int64_t local_map = 0; 2222298a1665SEric Blake BlockDriverState *local_file = NULL; 2223efa6e2edSEric Blake int64_t aligned_offset, aligned_bytes; 2224efa6e2edSEric Blake uint32_t align; 2225549ec0d9SMax Reitz bool has_filtered_child; 222661007b31SStefan Hajnoczi 2227298a1665SEric Blake assert(pnum); 2228298a1665SEric Blake *pnum = 0; 22292e8bc787SEric Blake total_size = bdrv_getlength(bs); 22302e8bc787SEric Blake if (total_size < 0) { 22312e8bc787SEric Blake ret = total_size; 2232298a1665SEric Blake goto early_out; 223361007b31SStefan Hajnoczi } 223461007b31SStefan Hajnoczi 22352e8bc787SEric Blake if (offset >= total_size) { 2236298a1665SEric Blake ret = BDRV_BLOCK_EOF; 2237298a1665SEric Blake goto early_out; 223861007b31SStefan Hajnoczi } 22392e8bc787SEric Blake if (!bytes) { 2240298a1665SEric Blake ret = 0; 2241298a1665SEric Blake goto early_out; 22429cdcfd9fSEric Blake } 224361007b31SStefan Hajnoczi 22442e8bc787SEric Blake n = total_size - offset; 22452e8bc787SEric Blake if (n < bytes) { 22462e8bc787SEric Blake bytes = n; 224761007b31SStefan Hajnoczi } 224861007b31SStefan Hajnoczi 2249d470ad42SMax Reitz /* Must be non-NULL or bdrv_getlength() would have failed */ 2250d470ad42SMax Reitz assert(bs->drv); 2251549ec0d9SMax Reitz has_filtered_child = bdrv_filter_child(bs); 2252549ec0d9SMax Reitz if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { 22532e8bc787SEric Blake *pnum = bytes; 225461007b31SStefan Hajnoczi ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 22552e8bc787SEric Blake if (offset + bytes == total_size) { 2256fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2257fb0d8654SEric Blake } 225861007b31SStefan Hajnoczi if (bs->drv->protocol_name) { 22592e8bc787SEric Blake ret |= BDRV_BLOCK_OFFSET_VALID; 22602e8bc787SEric Blake local_map = offset; 2261298a1665SEric Blake local_file = bs; 226261007b31SStefan Hajnoczi } 2263298a1665SEric Blake goto early_out; 226461007b31SStefan Hajnoczi } 226561007b31SStefan Hajnoczi 226699723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2267efa6e2edSEric Blake 2268efa6e2edSEric Blake /* Round out to request_alignment boundaries */ 226986a3d5c6SEric Blake align = bs->bl.request_alignment; 2270efa6e2edSEric Blake aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2271efa6e2edSEric Blake aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2272efa6e2edSEric Blake 2273549ec0d9SMax Reitz if (bs->drv->bdrv_co_block_status) { 22740bc329fbSHanna Reitz /* 22750bc329fbSHanna Reitz * Use the block-status cache only for protocol nodes: Format 22760bc329fbSHanna Reitz * drivers are generally quick to inquire the status, but protocol 22770bc329fbSHanna Reitz * drivers often need to get information from outside of qemu, so 22780bc329fbSHanna Reitz * we do not have control over the actual implementation. There 22790bc329fbSHanna Reitz * have been cases where inquiring the status took an unreasonably 22800bc329fbSHanna Reitz * long time, and we can do nothing in qemu to fix it. 
22810bc329fbSHanna Reitz * This is especially problematic for images with large data areas, 22820bc329fbSHanna Reitz * because finding the few holes in them and giving them special 22830bc329fbSHanna Reitz * treatment does not gain much performance. Therefore, we try to 22840bc329fbSHanna Reitz * cache the last-identified data region. 22850bc329fbSHanna Reitz * 22860bc329fbSHanna Reitz * Second, limiting ourselves to protocol nodes allows us to assume 22870bc329fbSHanna Reitz * the block status for data regions to be DATA | OFFSET_VALID, and 22880bc329fbSHanna Reitz * that the host offset is the same as the guest offset. 22890bc329fbSHanna Reitz * 22900bc329fbSHanna Reitz * Note that it is possible that external writers zero parts of 22910bc329fbSHanna Reitz * the cached regions without the cache being invalidated, and so 22920bc329fbSHanna Reitz * we may report zeroes as data. This is not catastrophic, 22930bc329fbSHanna Reitz * however, because reporting zeroes as data is fine. 22940bc329fbSHanna Reitz */ 22950bc329fbSHanna Reitz if (QLIST_EMPTY(&bs->children) && 22960bc329fbSHanna Reitz bdrv_bsc_is_data(bs, aligned_offset, pnum)) 22970bc329fbSHanna Reitz { 22980bc329fbSHanna Reitz ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; 22990bc329fbSHanna Reitz local_file = bs; 23000bc329fbSHanna Reitz local_map = aligned_offset; 23010bc329fbSHanna Reitz } else { 230286a3d5c6SEric Blake ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 230386a3d5c6SEric Blake aligned_bytes, pnum, &local_map, 230486a3d5c6SEric Blake &local_file); 23050bc329fbSHanna Reitz 23060bc329fbSHanna Reitz /* 23070bc329fbSHanna Reitz * Note that checking QLIST_EMPTY(&bs->children) is also done when 23080bc329fbSHanna Reitz * the cache is queried above. Technically, we do not need to check 23090bc329fbSHanna Reitz * it here; the worst that can happen is that we fill the cache for 23100bc329fbSHanna Reitz * non-protocol nodes, and then it is never used. However, filling 23110bc329fbSHanna Reitz * the cache requires an RCU update, so double check here to avoid 23120bc329fbSHanna Reitz * such an update if possible. 2313113b727cSHanna Reitz * 2314113b727cSHanna Reitz * Check want_zero, because we only want to update the cache when we 2315113b727cSHanna Reitz * have accurate information about what is zero and what is data. 23160bc329fbSHanna Reitz */ 2317113b727cSHanna Reitz if (want_zero && 2318113b727cSHanna Reitz ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && 23190bc329fbSHanna Reitz QLIST_EMPTY(&bs->children)) 23200bc329fbSHanna Reitz { 23210bc329fbSHanna Reitz /* 23220bc329fbSHanna Reitz * When a protocol driver reports BLOCK_OFFSET_VALID, the 23230bc329fbSHanna Reitz * returned local_map value must be the same as the offset we 23240bc329fbSHanna Reitz * have passed (aligned_offset), and local_bs must be the node 23250bc329fbSHanna Reitz * itself. 23260bc329fbSHanna Reitz * Assert this, because we follow this rule when reading from 23270bc329fbSHanna Reitz * the cache (see the `local_file = bs` and 23280bc329fbSHanna Reitz * `local_map = aligned_offset` assignments above), and the 23290bc329fbSHanna Reitz * result the cache delivers must be the same as the driver 23300bc329fbSHanna Reitz * would deliver. 
23310bc329fbSHanna Reitz */ 23320bc329fbSHanna Reitz assert(local_file == bs); 23330bc329fbSHanna Reitz assert(local_map == aligned_offset); 23340bc329fbSHanna Reitz bdrv_bsc_fill(bs, aligned_offset, *pnum); 23350bc329fbSHanna Reitz } 23360bc329fbSHanna Reitz } 2337549ec0d9SMax Reitz } else { 2338549ec0d9SMax Reitz /* Default code for filters */ 2339549ec0d9SMax Reitz 2340549ec0d9SMax Reitz local_file = bdrv_filter_bs(bs); 2341549ec0d9SMax Reitz assert(local_file); 2342549ec0d9SMax Reitz 2343549ec0d9SMax Reitz *pnum = aligned_bytes; 2344549ec0d9SMax Reitz local_map = aligned_offset; 2345549ec0d9SMax Reitz ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2346549ec0d9SMax Reitz } 234786a3d5c6SEric Blake if (ret < 0) { 234886a3d5c6SEric Blake *pnum = 0; 234986a3d5c6SEric Blake goto out; 235086a3d5c6SEric Blake } 2351efa6e2edSEric Blake 2352efa6e2edSEric Blake /* 2353636cb512SEric Blake * The driver's result must be a non-zero multiple of request_alignment. 2354efa6e2edSEric Blake * Clamp pnum and adjust map to original request. 2355efa6e2edSEric Blake */ 2356636cb512SEric Blake assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2357636cb512SEric Blake align > offset - aligned_offset); 235869f47505SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_RECURSE) { 235969f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_DATA); 236069f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_OFFSET_VALID); 236169f47505SVladimir Sementsov-Ogievskiy assert(!(ret & BDRV_BLOCK_ZERO)); 236269f47505SVladimir Sementsov-Ogievskiy } 236369f47505SVladimir Sementsov-Ogievskiy 2364efa6e2edSEric Blake *pnum -= offset - aligned_offset; 2365efa6e2edSEric Blake if (*pnum > bytes) { 2366efa6e2edSEric Blake *pnum = bytes; 2367efa6e2edSEric Blake } 2368efa6e2edSEric Blake if (ret & BDRV_BLOCK_OFFSET_VALID) { 2369efa6e2edSEric Blake local_map += offset - aligned_offset; 2370efa6e2edSEric Blake } 237161007b31SStefan Hajnoczi 237261007b31SStefan Hajnoczi if (ret & BDRV_BLOCK_RAW) { 2373298a1665SEric Blake assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 23742e8bc787SEric Blake ret = bdrv_co_block_status(local_file, want_zero, local_map, 23752e8bc787SEric Blake *pnum, pnum, &local_map, &local_file); 237699723548SPaolo Bonzini goto out; 237761007b31SStefan Hajnoczi } 237861007b31SStefan Hajnoczi 237961007b31SStefan Hajnoczi if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 238061007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ALLOCATED; 2381d40f4a56SAlberto Garcia } else if (bs->drv->supports_backing) { 2382cb850315SMax Reitz BlockDriverState *cow_bs = bdrv_cow_bs(bs); 2383cb850315SMax Reitz 2384d40f4a56SAlberto Garcia if (!cow_bs) { 2385d40f4a56SAlberto Garcia ret |= BDRV_BLOCK_ZERO; 2386d40f4a56SAlberto Garcia } else if (want_zero) { 2387cb850315SMax Reitz int64_t size2 = bdrv_getlength(cow_bs); 2388c9ce8c4dSEric Blake 23892e8bc787SEric Blake if (size2 >= 0 && offset >= size2) { 239061007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 239161007b31SStefan Hajnoczi } 23927b1efe99SVladimir Sementsov-Ogievskiy } 239361007b31SStefan Hajnoczi } 239461007b31SStefan Hajnoczi 239569f47505SVladimir Sementsov-Ogievskiy if (want_zero && ret & BDRV_BLOCK_RECURSE && 239669f47505SVladimir Sementsov-Ogievskiy local_file && local_file != bs && 239761007b31SStefan Hajnoczi (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 239861007b31SStefan Hajnoczi (ret & BDRV_BLOCK_OFFSET_VALID)) { 23992e8bc787SEric Blake int64_t file_pnum; 24002e8bc787SEric Blake int ret2; 240161007b31SStefan Hajnoczi 24022e8bc787SEric Blake ret2 = 
bdrv_co_block_status(local_file, want_zero, local_map, 24032e8bc787SEric Blake *pnum, &file_pnum, NULL, NULL); 240461007b31SStefan Hajnoczi if (ret2 >= 0) { 240561007b31SStefan Hajnoczi /* Ignore errors. This is just providing extra information, it 240661007b31SStefan Hajnoczi * is useful but not necessary. 240761007b31SStefan Hajnoczi */ 2408c61e684eSEric Blake if (ret2 & BDRV_BLOCK_EOF && 2409c61e684eSEric Blake (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2410c61e684eSEric Blake /* 2411c61e684eSEric Blake * It is valid for the format block driver to read 2412c61e684eSEric Blake * beyond the end of the underlying file's current 2413c61e684eSEric Blake * size; such areas read as zero. 2414c61e684eSEric Blake */ 241561007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 241661007b31SStefan Hajnoczi } else { 241761007b31SStefan Hajnoczi /* Limit request to the range reported by the protocol driver */ 241861007b31SStefan Hajnoczi *pnum = file_pnum; 241961007b31SStefan Hajnoczi ret |= (ret2 & BDRV_BLOCK_ZERO); 242061007b31SStefan Hajnoczi } 242161007b31SStefan Hajnoczi } 242261007b31SStefan Hajnoczi } 242361007b31SStefan Hajnoczi 242499723548SPaolo Bonzini out: 242599723548SPaolo Bonzini bdrv_dec_in_flight(bs); 24262e8bc787SEric Blake if (ret >= 0 && offset + *pnum == total_size) { 2427fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2428fb0d8654SEric Blake } 2429298a1665SEric Blake early_out: 2430298a1665SEric Blake if (file) { 2431298a1665SEric Blake *file = local_file; 2432298a1665SEric Blake } 24332e8bc787SEric Blake if (map) { 24342e8bc787SEric Blake *map = local_map; 24352e8bc787SEric Blake } 243661007b31SStefan Hajnoczi return ret; 243761007b31SStefan Hajnoczi } 243861007b31SStefan Hajnoczi 243921c2283eSVladimir Sementsov-Ogievskiy int coroutine_fn 2440f9e694cbSVladimir Sementsov-Ogievskiy bdrv_co_common_block_status_above(BlockDriverState *bs, 2441ba3f0e25SFam Zheng BlockDriverState *base, 24423555a432SVladimir Sementsov-Ogievskiy bool include_base, 2443c9ce8c4dSEric Blake bool want_zero, 24445b648c67SEric Blake int64_t offset, 24455b648c67SEric Blake int64_t bytes, 24465b648c67SEric Blake int64_t *pnum, 24475b648c67SEric Blake int64_t *map, 2448a92b1b06SEric Blake BlockDriverState **file, 2449a92b1b06SEric Blake int *depth) 2450ba3f0e25SFam Zheng { 245167c095c8SVladimir Sementsov-Ogievskiy int ret; 2452ba3f0e25SFam Zheng BlockDriverState *p; 245367c095c8SVladimir Sementsov-Ogievskiy int64_t eof = 0; 2454a92b1b06SEric Blake int dummy; 24551581a70dSEmanuele Giuseppe Esposito IO_CODE(); 2456ba3f0e25SFam Zheng 24573555a432SVladimir Sementsov-Ogievskiy assert(!include_base || base); /* Can't include NULL base */ 245867c095c8SVladimir Sementsov-Ogievskiy 2459a92b1b06SEric Blake if (!depth) { 2460a92b1b06SEric Blake depth = &dummy; 2461a92b1b06SEric Blake } 2462a92b1b06SEric Blake *depth = 0; 2463a92b1b06SEric Blake 2464624f27bbSVladimir Sementsov-Ogievskiy if (!include_base && bs == base) { 2465624f27bbSVladimir Sementsov-Ogievskiy *pnum = bytes; 2466624f27bbSVladimir Sementsov-Ogievskiy return 0; 2467624f27bbSVladimir Sementsov-Ogievskiy } 2468624f27bbSVladimir Sementsov-Ogievskiy 246967c095c8SVladimir Sementsov-Ogievskiy ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file); 2470a92b1b06SEric Blake ++*depth; 24713555a432SVladimir Sementsov-Ogievskiy if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) { 247267c095c8SVladimir Sementsov-Ogievskiy return ret; 247367c095c8SVladimir Sementsov-Ogievskiy } 247467c095c8SVladimir Sementsov-Ogievskiy 247567c095c8SVladimir 
Sementsov-Ogievskiy if (ret & BDRV_BLOCK_EOF) { 247667c095c8SVladimir Sementsov-Ogievskiy eof = offset + *pnum; 247767c095c8SVladimir Sementsov-Ogievskiy } 247867c095c8SVladimir Sementsov-Ogievskiy 247967c095c8SVladimir Sementsov-Ogievskiy assert(*pnum <= bytes); 248067c095c8SVladimir Sementsov-Ogievskiy bytes = *pnum; 248167c095c8SVladimir Sementsov-Ogievskiy 24823555a432SVladimir Sementsov-Ogievskiy for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base; 248367c095c8SVladimir Sementsov-Ogievskiy p = bdrv_filter_or_cow_bs(p)) 248467c095c8SVladimir Sementsov-Ogievskiy { 24855b648c67SEric Blake ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 24865b648c67SEric Blake file); 2487a92b1b06SEric Blake ++*depth; 2488c61e684eSEric Blake if (ret < 0) { 248967c095c8SVladimir Sementsov-Ogievskiy return ret; 2490c61e684eSEric Blake } 249167c095c8SVladimir Sementsov-Ogievskiy if (*pnum == 0) { 2492c61e684eSEric Blake /* 249367c095c8SVladimir Sementsov-Ogievskiy * The top layer deferred to this layer, and because this layer is 249467c095c8SVladimir Sementsov-Ogievskiy * short, any zeroes that we synthesize beyond EOF behave as if they 249567c095c8SVladimir Sementsov-Ogievskiy * were allocated at this layer. 249667c095c8SVladimir Sementsov-Ogievskiy * 249767c095c8SVladimir Sementsov-Ogievskiy * We don't include BDRV_BLOCK_EOF into ret, as the upper layer may be 249867c095c8SVladimir Sementsov-Ogievskiy * larger. We'll add BDRV_BLOCK_EOF if needed at the end of the function; 249967c095c8SVladimir Sementsov-Ogievskiy * see below. 2500c61e684eSEric Blake */ 250167c095c8SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_EOF); 25025b648c67SEric Blake *pnum = bytes; 250367c095c8SVladimir Sementsov-Ogievskiy if (file) { 250467c095c8SVladimir Sementsov-Ogievskiy *file = p; 2505c61e684eSEric Blake } 250667c095c8SVladimir Sementsov-Ogievskiy ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED; 2507ba3f0e25SFam Zheng break; 2508ba3f0e25SFam Zheng } 250967c095c8SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_ALLOCATED) { 251067c095c8SVladimir Sementsov-Ogievskiy /* 251167c095c8SVladimir Sementsov-Ogievskiy * We've found the node and the status; we must break. 251267c095c8SVladimir Sementsov-Ogievskiy * 251367c095c8SVladimir Sementsov-Ogievskiy * Drop BDRV_BLOCK_EOF, as it's not for the upper layer, which may be 251467c095c8SVladimir Sementsov-Ogievskiy * larger. We'll add BDRV_BLOCK_EOF if needed at the end of the function; 251567c095c8SVladimir Sementsov-Ogievskiy * see below. 251667c095c8SVladimir Sementsov-Ogievskiy */ 251767c095c8SVladimir Sementsov-Ogievskiy ret &= ~BDRV_BLOCK_EOF; 251867c095c8SVladimir Sementsov-Ogievskiy break; 2519ba3f0e25SFam Zheng } 252067c095c8SVladimir Sementsov-Ogievskiy 25213555a432SVladimir Sementsov-Ogievskiy if (p == base) { 25223555a432SVladimir Sementsov-Ogievskiy assert(include_base); 25233555a432SVladimir Sementsov-Ogievskiy break; 25243555a432SVladimir Sementsov-Ogievskiy } 25253555a432SVladimir Sementsov-Ogievskiy 252667c095c8SVladimir Sementsov-Ogievskiy /* 252767c095c8SVladimir Sementsov-Ogievskiy * OK, the [offset, offset + *pnum) region is unallocated on this layer; 252867c095c8SVladimir Sementsov-Ogievskiy * let's continue diving.
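 *
 * Sketch of the dive (illustrative): for a chain base <- mid <- top where
 * a region is allocated only in base, both top and mid report it as
 * unallocated; each iteration narrows bytes to the returned *pnum and
 * descends via bdrv_filter_or_cow_bs() until BDRV_BLOCK_ALLOCATED is
 * seen or base is reached.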
252967c095c8SVladimir Sementsov-Ogievskiy */ 253067c095c8SVladimir Sementsov-Ogievskiy assert(*pnum <= bytes); 253167c095c8SVladimir Sementsov-Ogievskiy bytes = *pnum; 253267c095c8SVladimir Sementsov-Ogievskiy } 253367c095c8SVladimir Sementsov-Ogievskiy 253467c095c8SVladimir Sementsov-Ogievskiy if (offset + *pnum == eof) { 253567c095c8SVladimir Sementsov-Ogievskiy ret |= BDRV_BLOCK_EOF; 253667c095c8SVladimir Sementsov-Ogievskiy } 253767c095c8SVladimir Sementsov-Ogievskiy 2538ba3f0e25SFam Zheng return ret; 2539ba3f0e25SFam Zheng } 2540ba3f0e25SFam Zheng 254131826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 254231826642SEric Blake int64_t offset, int64_t bytes, int64_t *pnum, 254331826642SEric Blake int64_t *map, BlockDriverState **file) 2544c9ce8c4dSEric Blake { 2545384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 25463555a432SVladimir Sementsov-Ogievskiy return bdrv_common_block_status_above(bs, base, false, true, offset, bytes, 2547a92b1b06SEric Blake pnum, map, file, NULL); 2548c9ce8c4dSEric Blake } 2549c9ce8c4dSEric Blake 2550237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2551237d78f8SEric Blake int64_t *pnum, int64_t *map, BlockDriverState **file) 2552ba3f0e25SFam Zheng { 2553384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 2554cb850315SMax Reitz return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), 255531826642SEric Blake offset, bytes, pnum, map, file); 2556ba3f0e25SFam Zheng } 2557ba3f0e25SFam Zheng 255846cd1e8aSAlberto Garcia /* 255946cd1e8aSAlberto Garcia * Check @bs (and its backing chain) to see if the range defined 256046cd1e8aSAlberto Garcia * by @offset and @bytes is known to read as zeroes. 256146cd1e8aSAlberto Garcia * Return 1 if that is the case, 0 otherwise and -errno on error. 256246cd1e8aSAlberto Garcia * This test is meant to be fast rather than accurate so returning 0 256346cd1e8aSAlberto Garcia * does not guarantee non-zero data. 256446cd1e8aSAlberto Garcia */ 256546cd1e8aSAlberto Garcia int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, 256646cd1e8aSAlberto Garcia int64_t bytes) 256746cd1e8aSAlberto Garcia { 256846cd1e8aSAlberto Garcia int ret; 256946cd1e8aSAlberto Garcia int64_t pnum = bytes; 2570384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 257146cd1e8aSAlberto Garcia 257246cd1e8aSAlberto Garcia if (!bytes) { 257346cd1e8aSAlberto Garcia return 1; 257446cd1e8aSAlberto Garcia } 257546cd1e8aSAlberto Garcia 2576ce47ff20SAlberto Faria ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset, 2577a92b1b06SEric Blake bytes, &pnum, NULL, NULL, NULL); 257846cd1e8aSAlberto Garcia 257946cd1e8aSAlberto Garcia if (ret < 0) { 258046cd1e8aSAlberto Garcia return ret; 258146cd1e8aSAlberto Garcia } 258246cd1e8aSAlberto Garcia 258346cd1e8aSAlberto Garcia return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); 258446cd1e8aSAlberto Garcia } 258546cd1e8aSAlberto Garcia 25867c85803cSAlberto Faria int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, 25877c85803cSAlberto Faria int64_t *pnum) 258861007b31SStefan Hajnoczi { 25897ddb99b9SEric Blake int ret; 25907ddb99b9SEric Blake int64_t dummy; 2591384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 2592d6a644bbSEric Blake 25933555a432SVladimir Sementsov-Ogievskiy ret = bdrv_common_block_status_above(bs, bs, true, false, offset, 25943555a432SVladimir Sementsov-Ogievskiy bytes, pnum ? 
pnum : &dummy, NULL, 2595a92b1b06SEric Blake NULL, NULL); 259661007b31SStefan Hajnoczi if (ret < 0) { 259761007b31SStefan Hajnoczi return ret; 259861007b31SStefan Hajnoczi } 259961007b31SStefan Hajnoczi return !!(ret & BDRV_BLOCK_ALLOCATED); 260061007b31SStefan Hajnoczi } 260161007b31SStefan Hajnoczi 260261007b31SStefan Hajnoczi /* 260361007b31SStefan Hajnoczi * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 260461007b31SStefan Hajnoczi * 2605a92b1b06SEric Blake * Return a positive depth if (a prefix of) the given range is allocated 2606a92b1b06SEric Blake * in any image between BASE and TOP (BASE is only included if include_base 2607a92b1b06SEric Blake * is set). Depth 1 is TOP, 2 is the first backing layer, and so forth. 2608170d3bd3SAndrey Shinkevich * BASE can be NULL to check if the given offset is allocated in any 2609170d3bd3SAndrey Shinkevich * image of the chain. Return 0 otherwise, or negative errno on 2610170d3bd3SAndrey Shinkevich * failure. 261161007b31SStefan Hajnoczi * 261251b0a488SEric Blake * 'pnum' is set to the number of bytes (including and immediately 261351b0a488SEric Blake * following the specified offset) that are known to be in the same 261451b0a488SEric Blake * allocated/unallocated state. Note that a subsequent call starting 261551b0a488SEric Blake * at 'offset + *pnum' may return the same allocation status (in other 261651b0a488SEric Blake * words, the result is not necessarily the maximum possible range); 261751b0a488SEric Blake * but 'pnum' will only be 0 when end of file is reached. 261861007b31SStefan Hajnoczi */ 261961007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top, 262061007b31SStefan Hajnoczi BlockDriverState *base, 2621170d3bd3SAndrey Shinkevich bool include_base, int64_t offset, 2622170d3bd3SAndrey Shinkevich int64_t bytes, int64_t *pnum) 262361007b31SStefan Hajnoczi { 2624a92b1b06SEric Blake int depth; 26257e7e5100SVladimir Sementsov-Ogievskiy int ret = bdrv_common_block_status_above(top, base, include_base, false, 2626a92b1b06SEric Blake offset, bytes, pnum, NULL, NULL, 2627a92b1b06SEric Blake &depth); 2628384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 262961007b31SStefan Hajnoczi if (ret < 0) { 263061007b31SStefan Hajnoczi return ret; 2631d6a644bbSEric Blake } 263261007b31SStefan Hajnoczi 2633a92b1b06SEric Blake if (ret & BDRV_BLOCK_ALLOCATED) { 2634a92b1b06SEric Blake return depth; 2635a92b1b06SEric Blake } 2636a92b1b06SEric Blake return 0; 263761007b31SStefan Hajnoczi } 263861007b31SStefan Hajnoczi 263921c2283eSVladimir Sementsov-Ogievskiy int coroutine_fn 2640b33b354fSVladimir Sementsov-Ogievskiy bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 26411a8ae822SKevin Wolf { 26421a8ae822SKevin Wolf BlockDriver *drv = bs->drv; 2643c4db2e25SMax Reitz BlockDriverState *child_bs = bdrv_primary_bs(bs); 2644b984b296SVladimir Sementsov-Ogievskiy int ret; 26451581a70dSEmanuele Giuseppe Esposito IO_CODE(); 2646b984b296SVladimir Sementsov-Ogievskiy 2647b984b296SVladimir Sementsov-Ogievskiy ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); 2648b984b296SVladimir Sementsov-Ogievskiy if (ret < 0) { 2649b984b296SVladimir Sementsov-Ogievskiy return ret; 2650b984b296SVladimir Sementsov-Ogievskiy } 2651dc88a467SStefan Hajnoczi 2652b33b354fSVladimir Sementsov-Ogievskiy if (!drv) { 2653b33b354fSVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 2654b33b354fSVladimir Sementsov-Ogievskiy } 2655b33b354fSVladimir Sementsov-Ogievskiy 2656dc88a467SStefan Hajnoczi bdrv_inc_in_flight(bs); 
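    /* Descriptive note: the dispatch below prefers the driver's own
     * vmstate hook and otherwise forwards the request to the primary
     * child, e.g. the protocol node below a format driver. */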
26571a8ae822SKevin Wolf 2658b33b354fSVladimir Sementsov-Ogievskiy if (drv->bdrv_load_vmstate) { 2659dc88a467SStefan Hajnoczi ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2660c4db2e25SMax Reitz } else if (child_bs) { 2661b33b354fSVladimir Sementsov-Ogievskiy ret = bdrv_co_readv_vmstate(child_bs, qiov, pos); 2662b984b296SVladimir Sementsov-Ogievskiy } else { 2663b984b296SVladimir Sementsov-Ogievskiy ret = -ENOTSUP; 26641a8ae822SKevin Wolf } 26651a8ae822SKevin Wolf 2666dc88a467SStefan Hajnoczi bdrv_dec_in_flight(bs); 2667b33b354fSVladimir Sementsov-Ogievskiy 2668b33b354fSVladimir Sementsov-Ogievskiy return ret; 2669b33b354fSVladimir Sementsov-Ogievskiy } 2670b33b354fSVladimir Sementsov-Ogievskiy 2671b33b354fSVladimir Sementsov-Ogievskiy int coroutine_fn 2672b33b354fSVladimir Sementsov-Ogievskiy bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2673b33b354fSVladimir Sementsov-Ogievskiy { 2674b33b354fSVladimir Sementsov-Ogievskiy BlockDriver *drv = bs->drv; 2675b33b354fSVladimir Sementsov-Ogievskiy BlockDriverState *child_bs = bdrv_primary_bs(bs); 2676b984b296SVladimir Sementsov-Ogievskiy int ret; 26771581a70dSEmanuele Giuseppe Esposito IO_CODE(); 2678b984b296SVladimir Sementsov-Ogievskiy 2679b984b296SVladimir Sementsov-Ogievskiy ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); 2680b984b296SVladimir Sementsov-Ogievskiy if (ret < 0) { 2681b984b296SVladimir Sementsov-Ogievskiy return ret; 2682b984b296SVladimir Sementsov-Ogievskiy } 2683b33b354fSVladimir Sementsov-Ogievskiy 2684b33b354fSVladimir Sementsov-Ogievskiy if (!drv) { 2685b33b354fSVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 2686b33b354fSVladimir Sementsov-Ogievskiy } 2687b33b354fSVladimir Sementsov-Ogievskiy 2688b33b354fSVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(bs); 2689b33b354fSVladimir Sementsov-Ogievskiy 2690b33b354fSVladimir Sementsov-Ogievskiy if (drv->bdrv_save_vmstate) { 2691b33b354fSVladimir Sementsov-Ogievskiy ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2692b33b354fSVladimir Sementsov-Ogievskiy } else if (child_bs) { 2693b33b354fSVladimir Sementsov-Ogievskiy ret = bdrv_co_writev_vmstate(child_bs, qiov, pos); 2694b984b296SVladimir Sementsov-Ogievskiy } else { 2695b984b296SVladimir Sementsov-Ogievskiy ret = -ENOTSUP; 2696b33b354fSVladimir Sementsov-Ogievskiy } 2697b33b354fSVladimir Sementsov-Ogievskiy 2698b33b354fSVladimir Sementsov-Ogievskiy bdrv_dec_in_flight(bs); 2699b33b354fSVladimir Sementsov-Ogievskiy 2700dc88a467SStefan Hajnoczi return ret; 27011a8ae822SKevin Wolf } 27021a8ae822SKevin Wolf 270361007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 270461007b31SStefan Hajnoczi int64_t pos, int size) 270561007b31SStefan Hajnoczi { 27060d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2707b33b354fSVladimir Sementsov-Ogievskiy int ret = bdrv_writev_vmstate(bs, &qiov, pos); 2708384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 270961007b31SStefan Hajnoczi 2710b33b354fSVladimir Sementsov-Ogievskiy return ret < 0 ? 
ret : size; 271161007b31SStefan Hajnoczi } 271261007b31SStefan Hajnoczi 271361007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 271461007b31SStefan Hajnoczi int64_t pos, int size) 271561007b31SStefan Hajnoczi { 27160d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2717b33b354fSVladimir Sementsov-Ogievskiy int ret = bdrv_readv_vmstate(bs, &qiov, pos); 2718384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 27195ddda0b8SKevin Wolf 2720b33b354fSVladimir Sementsov-Ogievskiy return ret < 0 ? ret : size; 272161007b31SStefan Hajnoczi } 272261007b31SStefan Hajnoczi 272361007b31SStefan Hajnoczi /**************************************************************/ 272461007b31SStefan Hajnoczi /* async I/Os */ 272561007b31SStefan Hajnoczi 272661007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb) 272761007b31SStefan Hajnoczi { 2728384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 272961007b31SStefan Hajnoczi qemu_aio_ref(acb); 273061007b31SStefan Hajnoczi bdrv_aio_cancel_async(acb); 273161007b31SStefan Hajnoczi while (acb->refcnt > 1) { 273261007b31SStefan Hajnoczi if (acb->aiocb_info->get_aio_context) { 273361007b31SStefan Hajnoczi aio_poll(acb->aiocb_info->get_aio_context(acb), true); 273461007b31SStefan Hajnoczi } else if (acb->bs) { 27352f47da5fSPaolo Bonzini /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 27362f47da5fSPaolo Bonzini * assert that we're not using an I/O thread. Thread-safe 27372f47da5fSPaolo Bonzini * code should use bdrv_aio_cancel_async exclusively. 27382f47da5fSPaolo Bonzini */ 27392f47da5fSPaolo Bonzini assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 274061007b31SStefan Hajnoczi aio_poll(bdrv_get_aio_context(acb->bs), true); 274161007b31SStefan Hajnoczi } else { 274261007b31SStefan Hajnoczi abort(); 274361007b31SStefan Hajnoczi } 274461007b31SStefan Hajnoczi } 274561007b31SStefan Hajnoczi qemu_aio_unref(acb); 274661007b31SStefan Hajnoczi } 274761007b31SStefan Hajnoczi 274861007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements 274961007b31SStefan Hajnoczi * cancel_async, otherwise we do nothing and let the request normally complete. 275061007b31SStefan Hajnoczi * In either case the completion callback must be called. 
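 *
 * (Typical use, for illustration: a device model cancels an in-flight
 * request on reset with bdrv_aio_cancel_async() and relies on the normal
 * completion callback to release the ACB.)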
*/ 275161007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb) 275261007b31SStefan Hajnoczi { 2753384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 275461007b31SStefan Hajnoczi if (acb->aiocb_info->cancel_async) { 275561007b31SStefan Hajnoczi acb->aiocb_info->cancel_async(acb); 275661007b31SStefan Hajnoczi } 275761007b31SStefan Hajnoczi } 275861007b31SStefan Hajnoczi 275961007b31SStefan Hajnoczi /**************************************************************/ 276061007b31SStefan Hajnoczi /* Coroutine block device emulation */ 276161007b31SStefan Hajnoczi 276261007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 276361007b31SStefan Hajnoczi { 2764883833e2SMax Reitz BdrvChild *primary_child = bdrv_primary_child(bs); 2765883833e2SMax Reitz BdrvChild *child; 276649ca6259SFam Zheng int current_gen; 276749ca6259SFam Zheng int ret = 0; 2768384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 276961007b31SStefan Hajnoczi 277099723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2771c32b82afSPavel Dovgalyuk 2772e914404eSFam Zheng if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 277349ca6259SFam Zheng bdrv_is_sg(bs)) { 277449ca6259SFam Zheng goto early_exit; 277549ca6259SFam Zheng } 277649ca6259SFam Zheng 27773783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 2778d73415a3SStefan Hajnoczi current_gen = qatomic_read(&bs->write_gen); 27793ff2f67aSEvgeny Yakovlev 27803ff2f67aSEvgeny Yakovlev /* Wait until any previous flushes are completed */ 278199723548SPaolo Bonzini while (bs->active_flush_req) { 27823783fa3dSPaolo Bonzini qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 27833ff2f67aSEvgeny Yakovlev } 27843ff2f67aSEvgeny Yakovlev 27853783fa3dSPaolo Bonzini /* Flushes reach this point in nondecreasing current_gen order. */ 278699723548SPaolo Bonzini bs->active_flush_req = true; 27873783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 27883ff2f67aSEvgeny Yakovlev 2789c32b82afSPavel Dovgalyuk /* Write back all layers by calling one driver function */ 2790c32b82afSPavel Dovgalyuk if (bs->drv->bdrv_co_flush) { 2791c32b82afSPavel Dovgalyuk ret = bs->drv->bdrv_co_flush(bs); 2792c32b82afSPavel Dovgalyuk goto out; 2793c32b82afSPavel Dovgalyuk } 2794c32b82afSPavel Dovgalyuk 279561007b31SStefan Hajnoczi /* Write back cached data to the OS even with cache=unsafe */ 2796883833e2SMax Reitz BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); 279761007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_os) { 279861007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_os(bs); 279961007b31SStefan Hajnoczi if (ret < 0) { 2800cdb5e315SFam Zheng goto out; 280161007b31SStefan Hajnoczi } 280261007b31SStefan Hajnoczi } 280361007b31SStefan Hajnoczi 280461007b31SStefan Hajnoczi /* But don't actually force it to the disk with cache=unsafe */ 280561007b31SStefan Hajnoczi if (bs->open_flags & BDRV_O_NO_FLUSH) { 2806883833e2SMax Reitz goto flush_children; 280761007b31SStefan Hajnoczi } 280861007b31SStefan Hajnoczi 28093ff2f67aSEvgeny Yakovlev /* Check if we really need to flush anything */ 28103ff2f67aSEvgeny Yakovlev if (bs->flushed_gen == current_gen) { 2811883833e2SMax Reitz goto flush_children; 28123ff2f67aSEvgeny Yakovlev } 28133ff2f67aSEvgeny Yakovlev 2814883833e2SMax Reitz BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); 2815d470ad42SMax Reitz if (!bs->drv) { 2816d470ad42SMax Reitz /* bs->drv->bdrv_co_flush() might have ejected the BDS 2817d470ad42SMax Reitz * (even in case of apparent success) */ 2818d470ad42SMax Reitz ret = -ENOMEDIUM; 2819d470ad42SMax Reitz goto out; 
2820d470ad42SMax Reitz } 282161007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_disk) { 282261007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_disk(bs); 282361007b31SStefan Hajnoczi } else if (bs->drv->bdrv_aio_flush) { 282461007b31SStefan Hajnoczi BlockAIOCB *acb; 282561007b31SStefan Hajnoczi CoroutineIOCompletion co = { 282661007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 282761007b31SStefan Hajnoczi }; 282861007b31SStefan Hajnoczi 282961007b31SStefan Hajnoczi acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 283061007b31SStefan Hajnoczi if (acb == NULL) { 283161007b31SStefan Hajnoczi ret = -EIO; 283261007b31SStefan Hajnoczi } else { 283361007b31SStefan Hajnoczi qemu_coroutine_yield(); 283461007b31SStefan Hajnoczi ret = co.ret; 283561007b31SStefan Hajnoczi } 283661007b31SStefan Hajnoczi } else { 283761007b31SStefan Hajnoczi /* 283861007b31SStefan Hajnoczi * Some block drivers always operate in either writethrough or unsafe 283961007b31SStefan Hajnoczi * mode and don't support bdrv_flush therefore. Usually qemu doesn't 284061007b31SStefan Hajnoczi * know how the server works (because the behaviour is hardcoded or 284161007b31SStefan Hajnoczi * depends on server-side configuration), so we can't ensure that 284261007b31SStefan Hajnoczi * everything is safe on disk. Returning an error doesn't work because 284361007b31SStefan Hajnoczi * that would break guests even if the server operates in writethrough 284461007b31SStefan Hajnoczi * mode. 284561007b31SStefan Hajnoczi * 284661007b31SStefan Hajnoczi * Let's hope the user knows what he's doing. 284761007b31SStefan Hajnoczi */ 284861007b31SStefan Hajnoczi ret = 0; 284961007b31SStefan Hajnoczi } 28503ff2f67aSEvgeny Yakovlev 285161007b31SStefan Hajnoczi if (ret < 0) { 2852cdb5e315SFam Zheng goto out; 285361007b31SStefan Hajnoczi } 285461007b31SStefan Hajnoczi 285561007b31SStefan Hajnoczi /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 285661007b31SStefan Hajnoczi * in the case of cache=unsafe, so there are no useless flushes. 285761007b31SStefan Hajnoczi */ 2858883833e2SMax Reitz flush_children: 2859883833e2SMax Reitz ret = 0; 2860883833e2SMax Reitz QLIST_FOREACH(child, &bs->children, next) { 2861883833e2SMax Reitz if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { 2862883833e2SMax Reitz int this_child_ret = bdrv_co_flush(child->bs); 2863883833e2SMax Reitz if (!ret) { 2864883833e2SMax Reitz ret = this_child_ret; 2865883833e2SMax Reitz } 2866883833e2SMax Reitz } 2867883833e2SMax Reitz } 2868883833e2SMax Reitz 2869cdb5e315SFam Zheng out: 28703ff2f67aSEvgeny Yakovlev /* Notify any pending flushes that we have completed */ 2871e6af1e08SKevin Wolf if (ret == 0) { 28723ff2f67aSEvgeny Yakovlev bs->flushed_gen = current_gen; 2873e6af1e08SKevin Wolf } 28743783fa3dSPaolo Bonzini 28753783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 287699723548SPaolo Bonzini bs->active_flush_req = false; 2877156af3acSDenis V. Lunev /* Return value is ignored - it's ok if wait queue is empty */ 2878156af3acSDenis V. 
Lunev qemu_co_queue_next(&bs->flush_queue); 28793783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 28803ff2f67aSEvgeny Yakovlev 288149ca6259SFam Zheng early_exit: 288299723548SPaolo Bonzini bdrv_dec_in_flight(bs); 2883cdb5e315SFam Zheng return ret; 288461007b31SStefan Hajnoczi } 288561007b31SStefan Hajnoczi 2886d93e5726SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2887d93e5726SVladimir Sementsov-Ogievskiy int64_t bytes) 288861007b31SStefan Hajnoczi { 2889b1066c87SFam Zheng BdrvTrackedRequest req; 289039af49c0SVladimir Sementsov-Ogievskiy int ret; 289139af49c0SVladimir Sementsov-Ogievskiy int64_t max_pdiscard; 28923482b9bcSEric Blake int head, tail, align; 28930b9fd3f4SFam Zheng BlockDriverState *bs = child->bs; 2894384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 289561007b31SStefan Hajnoczi 2896d93e5726SVladimir Sementsov-Ogievskiy if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 289761007b31SStefan Hajnoczi return -ENOMEDIUM; 289861007b31SStefan Hajnoczi } 289961007b31SStefan Hajnoczi 2900d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 2901d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 2902d6883bc9SVladimir Sementsov-Ogievskiy } 2903d6883bc9SVladimir Sementsov-Ogievskiy 290469b55e03SVladimir Sementsov-Ogievskiy ret = bdrv_check_request(offset, bytes, NULL); 29058b117001SVladimir Sementsov-Ogievskiy if (ret < 0) { 29068b117001SVladimir Sementsov-Ogievskiy return ret; 290761007b31SStefan Hajnoczi } 290861007b31SStefan Hajnoczi 290961007b31SStefan Hajnoczi /* Do nothing if disabled. */ 291061007b31SStefan Hajnoczi if (!(bs->open_flags & BDRV_O_UNMAP)) { 291161007b31SStefan Hajnoczi return 0; 291261007b31SStefan Hajnoczi } 291361007b31SStefan Hajnoczi 291402aefe43SEric Blake if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 291561007b31SStefan Hajnoczi return 0; 291661007b31SStefan Hajnoczi } 291761007b31SStefan Hajnoczi 29180bc329fbSHanna Reitz /* Invalidate the cached block-status data range if this discard overlaps */ 29190bc329fbSHanna Reitz bdrv_bsc_invalidate_range(bs, offset, bytes); 29200bc329fbSHanna Reitz 29213482b9bcSEric Blake /* Discard is advisory, but some devices track and coalesce 29223482b9bcSEric Blake * unaligned requests, so we must pass everything down rather than 29233482b9bcSEric Blake * round here. Still, most devices will just silently ignore 29243482b9bcSEric Blake * unaligned requests (by returning -ENOTSUP), so we must fragment 29253482b9bcSEric Blake * the request accordingly. */ 292602aefe43SEric Blake align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2927b8d0a980SEric Blake assert(align % bs->bl.request_alignment == 0); 2928b8d0a980SEric Blake head = offset % align; 2929f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % align; 29309f1963b3SEric Blake 293199723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2932f5a5ca79SManos Pitsidianakis tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 293350824995SFam Zheng 293400695c27SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2935ec050f77SDenis V. Lunev if (ret < 0) { 2936ec050f77SDenis V. Lunev goto out; 2937ec050f77SDenis V. Lunev } 2938ec050f77SDenis V. 
Lunev 29396a8f3dbbSVladimir Sementsov-Ogievskiy max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX), 29409f1963b3SEric Blake align); 29413482b9bcSEric Blake assert(max_pdiscard >= bs->bl.request_alignment); 29429f1963b3SEric Blake 2943f5a5ca79SManos Pitsidianakis while (bytes > 0) { 2944d93e5726SVladimir Sementsov-Ogievskiy int64_t num = bytes; 29453482b9bcSEric Blake 29463482b9bcSEric Blake if (head) { 29473482b9bcSEric Blake /* Make small requests to get to alignment boundaries. */ 2948f5a5ca79SManos Pitsidianakis num = MIN(bytes, align - head); 29493482b9bcSEric Blake if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 29503482b9bcSEric Blake num %= bs->bl.request_alignment; 29513482b9bcSEric Blake } 29523482b9bcSEric Blake head = (head + num) % align; 29533482b9bcSEric Blake assert(num < max_pdiscard); 29543482b9bcSEric Blake } else if (tail) { 29553482b9bcSEric Blake if (num > align) { 29563482b9bcSEric Blake /* Shorten the request to the last aligned cluster. */ 29573482b9bcSEric Blake num -= tail; 29583482b9bcSEric Blake } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 29593482b9bcSEric Blake tail > bs->bl.request_alignment) { 29603482b9bcSEric Blake tail %= bs->bl.request_alignment; 29613482b9bcSEric Blake num -= tail; 29623482b9bcSEric Blake } 29633482b9bcSEric Blake } 29643482b9bcSEric Blake /* limit request size */ 29653482b9bcSEric Blake if (num > max_pdiscard) { 29663482b9bcSEric Blake num = max_pdiscard; 29673482b9bcSEric Blake } 296861007b31SStefan Hajnoczi 2969d470ad42SMax Reitz if (!bs->drv) { 2970d470ad42SMax Reitz ret = -ENOMEDIUM; 2971d470ad42SMax Reitz goto out; 2972d470ad42SMax Reitz } 297347a5486dSEric Blake if (bs->drv->bdrv_co_pdiscard) { 297447a5486dSEric Blake ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 297561007b31SStefan Hajnoczi } else { 297661007b31SStefan Hajnoczi BlockAIOCB *acb; 297761007b31SStefan Hajnoczi CoroutineIOCompletion co = { 297861007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 297961007b31SStefan Hajnoczi }; 298061007b31SStefan Hajnoczi 29814da444a0SEric Blake acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 298261007b31SStefan Hajnoczi bdrv_co_io_em_complete, &co); 298361007b31SStefan Hajnoczi if (acb == NULL) { 2984b1066c87SFam Zheng ret = -EIO; 2985b1066c87SFam Zheng goto out; 298661007b31SStefan Hajnoczi } else { 298761007b31SStefan Hajnoczi qemu_coroutine_yield(); 298861007b31SStefan Hajnoczi ret = co.ret; 298961007b31SStefan Hajnoczi } 299061007b31SStefan Hajnoczi } 299161007b31SStefan Hajnoczi if (ret && ret != -ENOTSUP) { 2992b1066c87SFam Zheng goto out; 299361007b31SStefan Hajnoczi } 299461007b31SStefan Hajnoczi 29959f1963b3SEric Blake offset += num; 2996f5a5ca79SManos Pitsidianakis bytes -= num; 299761007b31SStefan Hajnoczi } 2998b1066c87SFam Zheng ret = 0; 2999b1066c87SFam Zheng out: 300000695c27SFam Zheng bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3001b1066c87SFam Zheng tracked_request_end(&req); 300299723548SPaolo Bonzini bdrv_dec_in_flight(bs); 3003b1066c87SFam Zheng return ret; 300461007b31SStefan Hajnoczi } 300561007b31SStefan Hajnoczi 3006881a4c55SPaolo Bonzini int coroutine_fn bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 300761007b31SStefan Hajnoczi { 300861007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 30095c5ae76aSFam Zheng CoroutineIOCompletion co = { 30105c5ae76aSFam Zheng .coroutine = qemu_coroutine_self(), 30115c5ae76aSFam Zheng }; 30125c5ae76aSFam Zheng BlockAIOCB *acb; 3013384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 
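    /* Descriptive note: the code below prefers the coroutine ioctl hook
     * when the driver provides one; otherwise it falls back to the AIO
     * variant and yields until bdrv_co_io_em_complete() wakes this
     * coroutine with the result. */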
301461007b31SStefan Hajnoczi 301599723548SPaolo Bonzini bdrv_inc_in_flight(bs); 301616a389dcSKevin Wolf if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 30175c5ae76aSFam Zheng co.ret = -ENOTSUP; 30185c5ae76aSFam Zheng goto out; 30195c5ae76aSFam Zheng } 30205c5ae76aSFam Zheng 302116a389dcSKevin Wolf if (drv->bdrv_co_ioctl) { 302216a389dcSKevin Wolf co.ret = drv->bdrv_co_ioctl(bs, req, buf); 302316a389dcSKevin Wolf } else { 30245c5ae76aSFam Zheng acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 30255c5ae76aSFam Zheng if (!acb) { 3026c8a9fd80SFam Zheng co.ret = -ENOTSUP; 3027c8a9fd80SFam Zheng goto out; 30285c5ae76aSFam Zheng } 30295c5ae76aSFam Zheng qemu_coroutine_yield(); 303016a389dcSKevin Wolf } 30315c5ae76aSFam Zheng out: 303299723548SPaolo Bonzini bdrv_dec_in_flight(bs); 30335c5ae76aSFam Zheng return co.ret; 30345c5ae76aSFam Zheng } 30355c5ae76aSFam Zheng 303661007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size) 303761007b31SStefan Hajnoczi { 3038384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 303961007b31SStefan Hajnoczi return qemu_memalign(bdrv_opt_mem_align(bs), size); 304061007b31SStefan Hajnoczi } 304161007b31SStefan Hajnoczi 304261007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size) 304361007b31SStefan Hajnoczi { 3044384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 304561007b31SStefan Hajnoczi return memset(qemu_blockalign(bs, size), 0, size); 304661007b31SStefan Hajnoczi } 304761007b31SStefan Hajnoczi 304861007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 304961007b31SStefan Hajnoczi { 305061007b31SStefan Hajnoczi size_t align = bdrv_opt_mem_align(bs); 3051384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 305261007b31SStefan Hajnoczi 305361007b31SStefan Hajnoczi /* Ensure that NULL is never returned on success */ 305461007b31SStefan Hajnoczi assert(align > 0); 305561007b31SStefan Hajnoczi if (size == 0) { 305661007b31SStefan Hajnoczi size = align; 305761007b31SStefan Hajnoczi } 305861007b31SStefan Hajnoczi 305961007b31SStefan Hajnoczi return qemu_try_memalign(align, size); 306061007b31SStefan Hajnoczi } 306161007b31SStefan Hajnoczi 306261007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 306361007b31SStefan Hajnoczi { 306461007b31SStefan Hajnoczi void *mem = qemu_try_blockalign(bs, size); 3065384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 306661007b31SStefan Hajnoczi 306761007b31SStefan Hajnoczi if (mem) { 306861007b31SStefan Hajnoczi memset(mem, 0, size); 306961007b31SStefan Hajnoczi } 307061007b31SStefan Hajnoczi 307161007b31SStefan Hajnoczi return mem; 307261007b31SStefan Hajnoczi } 307361007b31SStefan Hajnoczi 307461007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs) 307561007b31SStefan Hajnoczi { 30766b98bd64SPaolo Bonzini BdrvChild *child; 3077384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 30786b98bd64SPaolo Bonzini 30796b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 30806b98bd64SPaolo Bonzini bdrv_io_plug(child->bs); 30816b98bd64SPaolo Bonzini } 30826b98bd64SPaolo Bonzini 3083d73415a3SStefan Hajnoczi if (qatomic_fetch_inc(&bs->io_plugged) == 0) { 308461007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 308561007b31SStefan Hajnoczi if (drv && drv->bdrv_io_plug) { 308661007b31SStefan Hajnoczi drv->bdrv_io_plug(bs); 30876b98bd64SPaolo Bonzini } 308861007b31SStefan Hajnoczi } 308961007b31SStefan Hajnoczi } 309061007b31SStefan Hajnoczi 309161007b31SStefan Hajnoczi void 
bdrv_io_unplug(BlockDriverState *bs) 309261007b31SStefan Hajnoczi { 30936b98bd64SPaolo Bonzini BdrvChild *child; 3094384a48fbSEmanuele Giuseppe Esposito IO_CODE(); 30956b98bd64SPaolo Bonzini 30966b98bd64SPaolo Bonzini assert(bs->io_plugged); 3097d73415a3SStefan Hajnoczi if (qatomic_fetch_dec(&bs->io_plugged) == 1) { 309861007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 309961007b31SStefan Hajnoczi if (drv && drv->bdrv_io_unplug) { 310061007b31SStefan Hajnoczi drv->bdrv_io_unplug(bs); 310161007b31SStefan Hajnoczi } 310261007b31SStefan Hajnoczi } 310361007b31SStefan Hajnoczi 31046b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 31056b98bd64SPaolo Bonzini bdrv_io_unplug(child->bs); 31066b98bd64SPaolo Bonzini } 31076b98bd64SPaolo Bonzini } 310823d0ba93SFam Zheng 3109f4ec04baSStefan Hajnoczi /* Helper that undoes bdrv_register_buf() when it fails partway through */ 3110f4ec04baSStefan Hajnoczi static void bdrv_register_buf_rollback(BlockDriverState *bs, 3111f4ec04baSStefan Hajnoczi void *host, 3112f4ec04baSStefan Hajnoczi size_t size, 3113f4ec04baSStefan Hajnoczi BdrvChild *final_child) 3114f4ec04baSStefan Hajnoczi { 3115f4ec04baSStefan Hajnoczi BdrvChild *child; 3116f4ec04baSStefan Hajnoczi 3117f4ec04baSStefan Hajnoczi QLIST_FOREACH(child, &bs->children, next) { 3118f4ec04baSStefan Hajnoczi if (child == final_child) { 3119f4ec04baSStefan Hajnoczi break; 3120f4ec04baSStefan Hajnoczi } 3121f4ec04baSStefan Hajnoczi 3122f4ec04baSStefan Hajnoczi bdrv_unregister_buf(child->bs, host, size); 3123f4ec04baSStefan Hajnoczi } 3124f4ec04baSStefan Hajnoczi 3125f4ec04baSStefan Hajnoczi if (bs->drv && bs->drv->bdrv_unregister_buf) { 3126f4ec04baSStefan Hajnoczi bs->drv->bdrv_unregister_buf(bs, host, size); 3127f4ec04baSStefan Hajnoczi } 3128f4ec04baSStefan Hajnoczi } 3129f4ec04baSStefan Hajnoczi 3130f4ec04baSStefan Hajnoczi bool bdrv_register_buf(BlockDriverState *bs, void *host, size_t size, 3131f4ec04baSStefan Hajnoczi Error **errp) 313223d0ba93SFam Zheng { 313323d0ba93SFam Zheng BdrvChild *child; 313423d0ba93SFam Zheng 3135f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 313623d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_register_buf) { 3137f4ec04baSStefan Hajnoczi if (!bs->drv->bdrv_register_buf(bs, host, size, errp)) { 3138f4ec04baSStefan Hajnoczi return false; 3139f4ec04baSStefan Hajnoczi } 314023d0ba93SFam Zheng } 314123d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 3142f4ec04baSStefan Hajnoczi if (!bdrv_register_buf(child->bs, host, size, errp)) { 3143f4ec04baSStefan Hajnoczi bdrv_register_buf_rollback(bs, host, size, child); 3144f4ec04baSStefan Hajnoczi return false; 314523d0ba93SFam Zheng } 314623d0ba93SFam Zheng } 3147f4ec04baSStefan Hajnoczi return true; 3148f4ec04baSStefan Hajnoczi } 314923d0ba93SFam Zheng 31504f384011SStefan Hajnoczi void bdrv_unregister_buf(BlockDriverState *bs, void *host, size_t size) 315123d0ba93SFam Zheng { 315223d0ba93SFam Zheng BdrvChild *child; 315323d0ba93SFam Zheng 3154f791bf7fSEmanuele Giuseppe Esposito GLOBAL_STATE_CODE(); 315523d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_unregister_buf) { 31564f384011SStefan Hajnoczi bs->drv->bdrv_unregister_buf(bs, host, size); 315723d0ba93SFam Zheng } 315823d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 31594f384011SStefan Hajnoczi bdrv_unregister_buf(child->bs, host, size); 316023d0ba93SFam Zheng } 316123d0ba93SFam Zheng } 3162fcc67678SFam Zheng 316367b51fb9SVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_co_copy_range_internal( 3164a5215b8fSVladimir 
Sementsov-Ogievskiy BdrvChild *src, int64_t src_offset, BdrvChild *dst, 3165a5215b8fSVladimir Sementsov-Ogievskiy int64_t dst_offset, int64_t bytes, 316667b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3167fcc67678SFam Zheng bool recurse_src) 3168fcc67678SFam Zheng { 3169999658a0SVladimir Sementsov-Ogievskiy BdrvTrackedRequest req; 3170fcc67678SFam Zheng int ret; 3171fcc67678SFam Zheng 3172fe0480d6SKevin Wolf /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3173fe0480d6SKevin Wolf assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3174fe0480d6SKevin Wolf assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 317545e62b46SVladimir Sementsov-Ogievskiy assert(!(read_flags & BDRV_REQ_NO_WAIT)); 317645e62b46SVladimir Sementsov-Ogievskiy assert(!(write_flags & BDRV_REQ_NO_WAIT)); 3177fe0480d6SKevin Wolf 3178f4dad307SVladimir Sementsov-Ogievskiy if (!dst || !dst->bs || !bdrv_is_inserted(dst->bs)) { 3179fcc67678SFam Zheng return -ENOMEDIUM; 3180fcc67678SFam Zheng } 318163f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(dst_offset, bytes, NULL, 0); 3182fcc67678SFam Zheng if (ret) { 3183fcc67678SFam Zheng return ret; 3184fcc67678SFam Zheng } 318567b51fb9SVladimir Sementsov-Ogievskiy if (write_flags & BDRV_REQ_ZERO_WRITE) { 318667b51fb9SVladimir Sementsov-Ogievskiy return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3187fcc67678SFam Zheng } 3188fcc67678SFam Zheng 3189f4dad307SVladimir Sementsov-Ogievskiy if (!src || !src->bs || !bdrv_is_inserted(src->bs)) { 3190d4d3e5a0SFam Zheng return -ENOMEDIUM; 3191d4d3e5a0SFam Zheng } 319263f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(src_offset, bytes, NULL, 0); 3193d4d3e5a0SFam Zheng if (ret) { 3194d4d3e5a0SFam Zheng return ret; 3195d4d3e5a0SFam Zheng } 3196d4d3e5a0SFam Zheng 3197fcc67678SFam Zheng if (!src->bs->drv->bdrv_co_copy_range_from 3198fcc67678SFam Zheng || !dst->bs->drv->bdrv_co_copy_range_to 3199fcc67678SFam Zheng || src->bs->encrypted || dst->bs->encrypted) { 3200fcc67678SFam Zheng return -ENOTSUP; 3201fcc67678SFam Zheng } 3202999658a0SVladimir Sementsov-Ogievskiy 3203999658a0SVladimir Sementsov-Ogievskiy if (recurse_src) { 3204d4d3e5a0SFam Zheng bdrv_inc_in_flight(src->bs); 3205999658a0SVladimir Sementsov-Ogievskiy tracked_request_begin(&req, src->bs, src_offset, bytes, 3206999658a0SVladimir Sementsov-Ogievskiy BDRV_TRACKED_READ); 320737aec7d7SFam Zheng 320809d2f948SVladimir Sementsov-Ogievskiy /* BDRV_REQ_SERIALISING is only for write operation */ 320909d2f948SVladimir Sementsov-Ogievskiy assert(!(read_flags & BDRV_REQ_SERIALISING)); 3210304d9d7fSMax Reitz bdrv_wait_serialising_requests(&req); 3211999658a0SVladimir Sementsov-Ogievskiy 321237aec7d7SFam Zheng ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3213fcc67678SFam Zheng src, src_offset, 3214fcc67678SFam Zheng dst, dst_offset, 321567b51fb9SVladimir Sementsov-Ogievskiy bytes, 321667b51fb9SVladimir Sementsov-Ogievskiy read_flags, write_flags); 3217999658a0SVladimir Sementsov-Ogievskiy 3218999658a0SVladimir Sementsov-Ogievskiy tracked_request_end(&req); 3219999658a0SVladimir Sementsov-Ogievskiy bdrv_dec_in_flight(src->bs); 3220fcc67678SFam Zheng } else { 3221999658a0SVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(dst->bs); 3222999658a0SVladimir Sementsov-Ogievskiy tracked_request_begin(&req, dst->bs, dst_offset, bytes, 3223999658a0SVladimir Sementsov-Ogievskiy BDRV_TRACKED_WRITE); 32240eb1e891SFam Zheng ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 32250eb1e891SFam Zheng write_flags); 
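            /* Descriptive note: the copy below is only issued if write
             * preparation succeeded; bdrv_co_write_req_finish() runs in
             * either case, so the tracked request unwinds cleanly. */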
32260eb1e891SFam Zheng if (!ret) { 322737aec7d7SFam Zheng ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3228fcc67678SFam Zheng src, src_offset, 3229fcc67678SFam Zheng dst, dst_offset, 323067b51fb9SVladimir Sementsov-Ogievskiy bytes, 323167b51fb9SVladimir Sementsov-Ogievskiy read_flags, write_flags); 32320eb1e891SFam Zheng } 32330eb1e891SFam Zheng bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3234999658a0SVladimir Sementsov-Ogievskiy tracked_request_end(&req); 3235d4d3e5a0SFam Zheng bdrv_dec_in_flight(dst->bs); 3236999658a0SVladimir Sementsov-Ogievskiy } 3237999658a0SVladimir Sementsov-Ogievskiy 323837aec7d7SFam Zheng return ret; 3239fcc67678SFam Zheng } 3240fcc67678SFam Zheng 3241fcc67678SFam Zheng /* Copy range from @src to @dst. 3242fcc67678SFam Zheng * 3243fcc67678SFam Zheng * See the comment of bdrv_co_copy_range for the parameter and return value 3244fcc67678SFam Zheng * semantics. */ 3245a5215b8fSVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset, 3246a5215b8fSVladimir Sementsov-Ogievskiy BdrvChild *dst, int64_t dst_offset, 3247a5215b8fSVladimir Sementsov-Ogievskiy int64_t bytes, 324867b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, 324967b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags write_flags) 3250fcc67678SFam Zheng { 3251967d7905SEmanuele Giuseppe Esposito IO_CODE(); 3252ecc983a5SFam Zheng trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3253ecc983a5SFam Zheng read_flags, write_flags); 3254fcc67678SFam Zheng return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 325567b51fb9SVladimir Sementsov-Ogievskiy bytes, read_flags, write_flags, true); 3256fcc67678SFam Zheng } 3257fcc67678SFam Zheng 3258fcc67678SFam Zheng /* Copy range from @src to @dst. 3259fcc67678SFam Zheng * 3260fcc67678SFam Zheng * See the comment of bdrv_co_copy_range for the parameter and return value 3261fcc67678SFam Zheng * semantics. 
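 *
 * (Illustrative use: a backup or commit job may call this to offload
 * copying a cluster from source to destination without bouncing the data
 * through a guest-visible buffer, provided both drivers support it.)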
 */
3262a5215b8fSVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
3263a5215b8fSVladimir Sementsov-Ogievskiy                                        BdrvChild *dst, int64_t dst_offset,
3264a5215b8fSVladimir Sementsov-Ogievskiy                                        int64_t bytes,
326567b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags read_flags,
326667b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags write_flags)
3267fcc67678SFam Zheng {
3268967d7905SEmanuele Giuseppe Esposito     IO_CODE();
3269ecc983a5SFam Zheng     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3270ecc983a5SFam Zheng                                 read_flags, write_flags);
3271fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
327267b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, false);
3273fcc67678SFam Zheng }
3274fcc67678SFam Zheng 
3275a5215b8fSVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
3276a5215b8fSVladimir Sementsov-Ogievskiy                                     BdrvChild *dst, int64_t dst_offset,
3277a5215b8fSVladimir Sementsov-Ogievskiy                                     int64_t bytes, BdrvRequestFlags read_flags,
327867b51fb9SVladimir Sementsov-Ogievskiy                                     BdrvRequestFlags write_flags)
3279fcc67678SFam Zheng {
3280384a48fbSEmanuele Giuseppe Esposito     IO_CODE();
328137aec7d7SFam Zheng     return bdrv_co_copy_range_from(src, src_offset,
3282fcc67678SFam Zheng                                    dst, dst_offset,
328367b51fb9SVladimir Sementsov-Ogievskiy                                    bytes, read_flags, write_flags);
3284fcc67678SFam Zheng }
32853d9f2d2aSKevin Wolf 
32863d9f2d2aSKevin Wolf static void bdrv_parent_cb_resize(BlockDriverState *bs)
32873d9f2d2aSKevin Wolf {
32883d9f2d2aSKevin Wolf     BdrvChild *c;
32893d9f2d2aSKevin Wolf     QLIST_FOREACH(c, &bs->parents, next_parent) {
3290bd86fb99SMax Reitz         if (c->klass->resize) {
3291bd86fb99SMax Reitz             c->klass->resize(c);
32923d9f2d2aSKevin Wolf         }
32933d9f2d2aSKevin Wolf     }
32943d9f2d2aSKevin Wolf }
32953d9f2d2aSKevin Wolf 
32963d9f2d2aSKevin Wolf /**
32973d9f2d2aSKevin Wolf  * Truncate file to 'offset' bytes (needed only for file protocols)
3298c80d8b06SMax Reitz  *
3299c80d8b06SMax Reitz  * If 'exact' is true, the file must be resized to exactly the given
3300c80d8b06SMax Reitz  * 'offset'. Otherwise, it is sufficient for the node to be at least
3301c80d8b06SMax Reitz  * 'offset' bytes in length.
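 *
 * For example, a caller that only needs the node to cover at least
 * 'new_size' bytes, without preallocation, might do (a sketch with
 * hypothetical 'child', 'new_size' and 'errp' variables):
 *
 *     ret = bdrv_co_truncate(child, new_size, false, PREALLOC_MODE_OFF,
 *                            0, errp);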
33023d9f2d2aSKevin Wolf  */
3303c80d8b06SMax Reitz int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
33047b8e4857SKevin Wolf                                   PreallocMode prealloc, BdrvRequestFlags flags,
33057b8e4857SKevin Wolf                                   Error **errp)
33063d9f2d2aSKevin Wolf {
33073d9f2d2aSKevin Wolf     BlockDriverState *bs = child->bs;
330823b93525SMax Reitz     BdrvChild *filtered, *backing;
33093d9f2d2aSKevin Wolf     BlockDriver *drv = bs->drv;
33101bc5f09fSKevin Wolf     BdrvTrackedRequest req;
33111bc5f09fSKevin Wolf     int64_t old_size, new_bytes;
33123d9f2d2aSKevin Wolf     int ret;
3313384a48fbSEmanuele Giuseppe Esposito     IO_CODE();
33143d9f2d2aSKevin Wolf 
33153d9f2d2aSKevin Wolf     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
33163d9f2d2aSKevin Wolf     if (!drv) {
33173d9f2d2aSKevin Wolf         error_setg(errp, "No medium inserted");
33183d9f2d2aSKevin Wolf         return -ENOMEDIUM;
33193d9f2d2aSKevin Wolf     }
33203d9f2d2aSKevin Wolf     if (offset < 0) {
33213d9f2d2aSKevin Wolf         error_setg(errp, "Image size cannot be negative");
33223d9f2d2aSKevin Wolf         return -EINVAL;
33233d9f2d2aSKevin Wolf     }
33243d9f2d2aSKevin Wolf 
332569b55e03SVladimir Sementsov-Ogievskiy     ret = bdrv_check_request(offset, 0, errp);
33268b117001SVladimir Sementsov-Ogievskiy     if (ret < 0) {
33278b117001SVladimir Sementsov-Ogievskiy         return ret;
33288b117001SVladimir Sementsov-Ogievskiy     }
33298b117001SVladimir Sementsov-Ogievskiy 
33301bc5f09fSKevin Wolf     old_size = bdrv_getlength(bs);
33311bc5f09fSKevin Wolf     if (old_size < 0) {
33321bc5f09fSKevin Wolf         error_setg_errno(errp, -old_size, "Failed to get old image size");
33331bc5f09fSKevin Wolf         return old_size;
33341bc5f09fSKevin Wolf     }
33351bc5f09fSKevin Wolf 
333697efa869SEric Blake     if (bdrv_is_read_only(bs)) {
333797efa869SEric Blake         error_setg(errp, "Image is read-only");
333897efa869SEric Blake         return -EACCES;
333997efa869SEric Blake     }
334097efa869SEric Blake 
33411bc5f09fSKevin Wolf     if (offset > old_size) {
33421bc5f09fSKevin Wolf         new_bytes = offset - old_size;
33431bc5f09fSKevin Wolf     } else {
33441bc5f09fSKevin Wolf         new_bytes = 0;
33451bc5f09fSKevin Wolf     }
33461bc5f09fSKevin Wolf 
33473d9f2d2aSKevin Wolf     bdrv_inc_in_flight(bs);
33485416a11eSFam Zheng     tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
33495416a11eSFam Zheng                           BDRV_TRACKED_TRUNCATE);
33501bc5f09fSKevin Wolf 
33511bc5f09fSKevin Wolf     /* If we are growing the image and potentially using preallocation for the
33521bc5f09fSKevin Wolf      * new area, we need to make sure that no write requests are made to it
33531bc5f09fSKevin Wolf      * concurrently or they might be overwritten by preallocation. */
33541bc5f09fSKevin Wolf     if (new_bytes) {
33558ac5aab2SVladimir Sementsov-Ogievskiy         bdrv_make_request_serialising(&req, 1);
3356cd47d792SFam Zheng     }
3357cd47d792SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3358cd47d792SFam Zheng                                     0);
3359cd47d792SFam Zheng     if (ret < 0) {
3360cd47d792SFam Zheng         error_setg_errno(errp, -ret,
3361cd47d792SFam Zheng                          "Failed to prepare request for truncation");
3362cd47d792SFam Zheng         goto out;
33631bc5f09fSKevin Wolf     }
33643d9f2d2aSKevin Wolf 
336593393e69SMax Reitz     filtered = bdrv_filter_child(bs);
336623b93525SMax Reitz     backing = bdrv_cow_child(bs);
336793393e69SMax Reitz 
3368955c7d66SKevin Wolf     /*
3369955c7d66SKevin Wolf      * If the image has a backing file that is large enough that it would
3370955c7d66SKevin Wolf      * provide data for the new area, we cannot leave it unallocated because
3371955c7d66SKevin Wolf      * then the backing file content would become visible. Instead, zero-fill
3372955c7d66SKevin Wolf      * the new area.
3373955c7d66SKevin Wolf      *
3374955c7d66SKevin Wolf      * Note that if the image has a backing file, but was opened without the
3375955c7d66SKevin Wolf      * backing file, taking care of keeping things consistent with that backing
3376955c7d66SKevin Wolf      * file is the user's responsibility.
3377955c7d66SKevin Wolf      */
337823b93525SMax Reitz     if (new_bytes && backing) {
3379955c7d66SKevin Wolf         int64_t backing_len;
3380955c7d66SKevin Wolf 
338123b93525SMax Reitz         backing_len = bdrv_getlength(backing->bs);
3382955c7d66SKevin Wolf         if (backing_len < 0) {
3383955c7d66SKevin Wolf             ret = backing_len;
3384955c7d66SKevin Wolf             error_setg_errno(errp, -ret, "Could not get backing file size");
3385955c7d66SKevin Wolf             goto out;
3386955c7d66SKevin Wolf         }
3387955c7d66SKevin Wolf 
3388955c7d66SKevin Wolf         if (backing_len > old_size) {
3389955c7d66SKevin Wolf             flags |= BDRV_REQ_ZERO_WRITE;
3390955c7d66SKevin Wolf         }
3391955c7d66SKevin Wolf     }
3392955c7d66SKevin Wolf 
33936b7e8f8bSMax Reitz     if (drv->bdrv_co_truncate) {
339492b92799SKevin Wolf         if (flags & ~bs->supported_truncate_flags) {
339592b92799SKevin Wolf             error_setg(errp, "Block driver does not support requested flags");
339692b92799SKevin Wolf             ret = -ENOTSUP;
339792b92799SKevin Wolf             goto out;
339892b92799SKevin Wolf         }
339992b92799SKevin Wolf         ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
340093393e69SMax Reitz     } else if (filtered) {
340193393e69SMax Reitz         ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
34026b7e8f8bSMax Reitz     } else {
34033d9f2d2aSKevin Wolf         error_setg(errp, "Image format driver does not support resize");
34043d9f2d2aSKevin Wolf         ret = -ENOTSUP;
34053d9f2d2aSKevin Wolf         goto out;
34063d9f2d2aSKevin Wolf     }
34073d9f2d2aSKevin Wolf     if (ret < 0) {
34083d9f2d2aSKevin Wolf         goto out;
34093d9f2d2aSKevin Wolf     }
34106b7e8f8bSMax Reitz 
34113d9f2d2aSKevin Wolf     ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
34123d9f2d2aSKevin Wolf     if (ret < 0) {
34133d9f2d2aSKevin Wolf         error_setg_errno(errp, -ret, "Could not refresh total sector count");
34143d9f2d2aSKevin Wolf     } else {
34153d9f2d2aSKevin Wolf         offset = bs->total_sectors * BDRV_SECTOR_SIZE;
34163d9f2d2aSKevin Wolf     }
3417cd47d792SFam Zheng     /* It's possible that truncation succeeded but refresh_total_sectors
3418cd47d792SFam Zheng      * failed; that doesn't affect how we should finish the request.
3419cd47d792SFam Zheng      * Pass 0 as the last parameter so that dirty bitmaps etc. are handled.
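 * (Since 0 is passed, bdrv_co_write_req_finish() takes its success path,
 * which for tracked truncates also updates bs->total_sectors and notifies
 * parents through their ->resize() callback, i.e. bdrv_parent_cb_resize()
 * above.)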
 */
3420cd47d792SFam Zheng     bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
34213d9f2d2aSKevin Wolf 
34223d9f2d2aSKevin Wolf out:
34231bc5f09fSKevin Wolf     tracked_request_end(&req);
34243d9f2d2aSKevin Wolf     bdrv_dec_in_flight(bs);
34251bc5f09fSKevin Wolf 
34263d9f2d2aSKevin Wolf     return ret;
34273d9f2d2aSKevin Wolf }
3428bd54669aSVladimir Sementsov-Ogievskiy 
3429bd54669aSVladimir Sementsov-Ogievskiy void bdrv_cancel_in_flight(BlockDriverState *bs)
3430bd54669aSVladimir Sementsov-Ogievskiy {
3431f791bf7fSEmanuele Giuseppe Esposito     GLOBAL_STATE_CODE();
3432bd54669aSVladimir Sementsov-Ogievskiy     if (!bs || !bs->drv) {
3433bd54669aSVladimir Sementsov-Ogievskiy         return;
3434bd54669aSVladimir Sementsov-Ogievskiy     }
3435bd54669aSVladimir Sementsov-Ogievskiy 
3436bd54669aSVladimir Sementsov-Ogievskiy     if (bs->drv->bdrv_cancel_in_flight) {
3437bd54669aSVladimir Sementsov-Ogievskiy         bs->drv->bdrv_cancel_in_flight(bs);
3438bd54669aSVladimir Sementsov-Ogievskiy     }
3439bd54669aSVladimir Sementsov-Ogievskiy }
3440ce14f3b4SVladimir Sementsov-Ogievskiy 
3441ce14f3b4SVladimir Sementsov-Ogievskiy int coroutine_fn
3442ce14f3b4SVladimir Sementsov-Ogievskiy bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
3443ce14f3b4SVladimir Sementsov-Ogievskiy                         QEMUIOVector *qiov, size_t qiov_offset)
3444ce14f3b4SVladimir Sementsov-Ogievskiy {
3445ce14f3b4SVladimir Sementsov-Ogievskiy     BlockDriverState *bs = child->bs;
3446ce14f3b4SVladimir Sementsov-Ogievskiy     BlockDriver *drv = bs->drv;
3447ce14f3b4SVladimir Sementsov-Ogievskiy     int ret;
3448ce14f3b4SVladimir Sementsov-Ogievskiy     IO_CODE();
3449ce14f3b4SVladimir Sementsov-Ogievskiy 
3450ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv) {
3451ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOMEDIUM;
3452ce14f3b4SVladimir Sementsov-Ogievskiy     }
3453ce14f3b4SVladimir Sementsov-Ogievskiy 
3454ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv->bdrv_co_preadv_snapshot) {
3455ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOTSUP;
3456ce14f3b4SVladimir Sementsov-Ogievskiy     }
3457ce14f3b4SVladimir Sementsov-Ogievskiy 
3458ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_inc_in_flight(bs);
3459ce14f3b4SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_preadv_snapshot(bs, offset, bytes, qiov, qiov_offset);
3460ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_dec_in_flight(bs);
3461ce14f3b4SVladimir Sementsov-Ogievskiy 
3462ce14f3b4SVladimir Sementsov-Ogievskiy     return ret;
3463ce14f3b4SVladimir Sementsov-Ogievskiy }
3464ce14f3b4SVladimir Sementsov-Ogievskiy 
3465ce14f3b4SVladimir Sementsov-Ogievskiy int coroutine_fn
3466ce14f3b4SVladimir Sementsov-Ogievskiy bdrv_co_snapshot_block_status(BlockDriverState *bs,
3467ce14f3b4SVladimir Sementsov-Ogievskiy                               bool want_zero, int64_t offset, int64_t bytes,
3468ce14f3b4SVladimir Sementsov-Ogievskiy                               int64_t *pnum, int64_t *map,
3469ce14f3b4SVladimir Sementsov-Ogievskiy                               BlockDriverState **file)
3470ce14f3b4SVladimir Sementsov-Ogievskiy {
3471ce14f3b4SVladimir Sementsov-Ogievskiy     BlockDriver *drv = bs->drv;
3472ce14f3b4SVladimir Sementsov-Ogievskiy     int ret;
3473ce14f3b4SVladimir Sementsov-Ogievskiy     IO_CODE();
3474ce14f3b4SVladimir Sementsov-Ogievskiy 
3475ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv) {
3476ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOMEDIUM;
3477ce14f3b4SVladimir Sementsov-Ogievskiy     }
3478ce14f3b4SVladimir Sementsov-Ogievskiy 
3479ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv->bdrv_co_snapshot_block_status) {
3480ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOTSUP;
3481ce14f3b4SVladimir Sementsov-Ogievskiy     }
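    /*
     * As elsewhere in this file, bump the in-flight counter around the
     * driver call so that drained sections wait for this request to
     * complete.
     */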
3482ce14f3b4SVladimir Sementsov-Ogievskiy 
3483ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_inc_in_flight(bs);
3484ce14f3b4SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
3485ce14f3b4SVladimir Sementsov-Ogievskiy                                              pnum, map, file);
3486ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_dec_in_flight(bs);
3487ce14f3b4SVladimir Sementsov-Ogievskiy 
3488ce14f3b4SVladimir Sementsov-Ogievskiy     return ret;
3489ce14f3b4SVladimir Sementsov-Ogievskiy }
3490ce14f3b4SVladimir Sementsov-Ogievskiy 
3491ce14f3b4SVladimir Sementsov-Ogievskiy int coroutine_fn
3492ce14f3b4SVladimir Sementsov-Ogievskiy bdrv_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
3493ce14f3b4SVladimir Sementsov-Ogievskiy {
3494ce14f3b4SVladimir Sementsov-Ogievskiy     BlockDriver *drv = bs->drv;
3495ce14f3b4SVladimir Sementsov-Ogievskiy     int ret;
3496ce14f3b4SVladimir Sementsov-Ogievskiy     IO_CODE();
3497ce14f3b4SVladimir Sementsov-Ogievskiy 
3498ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv) {
3499ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOMEDIUM;
3500ce14f3b4SVladimir Sementsov-Ogievskiy     }
3501ce14f3b4SVladimir Sementsov-Ogievskiy 
3502ce14f3b4SVladimir Sementsov-Ogievskiy     if (!drv->bdrv_co_pdiscard_snapshot) {
3503ce14f3b4SVladimir Sementsov-Ogievskiy         return -ENOTSUP;
3504ce14f3b4SVladimir Sementsov-Ogievskiy     }
3505ce14f3b4SVladimir Sementsov-Ogievskiy 
3506ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_inc_in_flight(bs);
3507ce14f3b4SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_pdiscard_snapshot(bs, offset, bytes);
3508ce14f3b4SVladimir Sementsov-Ogievskiy     bdrv_dec_in_flight(bs);
3509ce14f3b4SVladimir Sementsov-Ogievskiy 
3510ce14f3b4SVladimir Sementsov-Ogievskiy     return ret;
3511ce14f3b4SVladimir Sementsov-Ogievskiy }
3512
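/*
 * Driver-side sketch (not part of this file): a driver that provides the
 * snapshot-access callbacks used by the three wrappers above fills in the
 * corresponding BlockDriver hooks; the "mydrv_" names are hypothetical:
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name                   = "mydrv",
 *         .bdrv_co_preadv_snapshot       = mydrv_co_preadv_snapshot,
 *         .bdrv_co_snapshot_block_status = mydrv_co_snapshot_block_status,
 *         .bdrv_co_pdiscard_snapshot     = mydrv_co_pdiscard_snapshot,
 *     };
 */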