/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "block/coroutines.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "sysemu/replay.h"

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                                      bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}
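
/*
 * Note on the begin/end pairing below: draining is asymmetric.
 * bdrv_parent_drained_begin_single() completes synchronously (optionally
 * polling until the parent is quiescent), whereas .drained_end callbacks
 * may finish in the background. Each scheduled background operation is
 * counted in *drained_end_counter, and whoever set the counter up must
 * poll it back down to zero (see the comment above bdrv_do_drained_end()).
 */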

static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->klass->drained_end) {
        c->klass->drained_end(c, drained_end_counter);
    }
}

void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, qatomic_read(&drained_end_counter) > 0);
}

static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                                    bool ignore_bds_parents,
                                    int *drained_end_counter)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    c->parent_quiesce_counter++;
    if (c->klass->drained_begin) {
        c->klass->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}
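
/*
 * Merging rule for the limits below: max_transfer and max_iov take the
 * stricter bound via MIN_NON_ZERO (so 0 keeps meaning "no limit"), while
 * opt_transfer and the alignments take the larger value. For example,
 * merging max_transfer 0 with 65536 yields 65536, and merging
 * opt_mem_alignment 512 with 4096 yields 4096.
 */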
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    ERRP_GUARD();
    BlockDriver *drv = bs->drv;
    BdrvChild *c;
    bool have_limits;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    have_limits = false;
    QLIST_FOREACH(c, &bs->children, next) {
        if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
        {
            bdrv_refresh_limits(c->bs, errp);
            if (*errp) {
                return;
            }
            bdrv_merge_limits(&bs->bl, &c->bs->bl);
            have_limits = true;
        }
    }

    if (!have_limits) {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size;

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
        if (*errp) {
            return;
        }
    }

    if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
        error_setg(errp, "Driver requires too large request alignment");
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    qatomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = qatomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
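
/*
 * Illustrative example of the reference-count semantics:
 *
 *     bdrv_enable_copy_on_read(bs);    // count 0 -> 1: enabled
 *     bdrv_enable_copy_on_read(bs);    // count 1 -> 2: still enabled
 *     bdrv_disable_copy_on_read(bs);   // count 2 -> 1: still enabled
 *     bdrv_disable_copy_on_read(bs);   // count 1 -> 0: disabled again
 */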

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
    int *drained_end_counter;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    qatomic_mb_set(&data->done, true);
    if (!data->begin) {
        qatomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        qatomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (qatomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        aio_context_acquire(ctx);
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        aio_context_release(ctx);
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}
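
/*
 * The helper below cannot drain directly from coroutine context: draining
 * polls, and polling from within a coroutine could deadlock. It therefore
 * schedules bdrv_co_drain_bh_cb() as a bottom half in the node's AioContext,
 * yields, and is woken up once the drain work has been carried out there.
 */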
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;
    Coroutine *self = qemu_coroutine_self();
    AioContext *ctx = bdrv_get_aio_context(bs);
    AioContext *co_ctx = qemu_coroutine_get_aio_context(self);

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = self,
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        bdrv_inc_in_flight(bs);
    }

    /*
     * Temporarily drop the lock across yield or we would get deadlocks.
     * bdrv_co_drain_bh_cb() reacquires the lock as needed.
     *
     * When we yield below, the lock for the current context will be
     * released, so if this is actually the lock that protects bs, don't drop
     * it a second time.
     */
    if (ctx != co_ctx) {
        aio_context_release(ctx);
    }
    replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);

    /* Reacquire the AioContext of bs if we dropped it */
    if (ctx != co_ctx) {
        aio_context_acquire(ctx);
    }
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true, NULL);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}
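
/*
 * Illustrative usage: callers bracket graph or state changes in a drained
 * section so that no requests are in flight while they run:
 *
 *     bdrv_drained_begin(bs);
 *     ... modify bs, e.g. attach or detach children ...
 *     bdrv_drained_end(bs);
 */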

/**
 * This function does not poll, nor must any of its recursively called
 * functions. The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles. Therefore, the pointer must remain valid
 * until the pointee reaches 0. That implies that whoever sets up the
 * pointee has to poll until it is 0. We use atomic operations to
 * access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int drained_end_counter = 0;
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false,
                            &drained_end_counter);
    }

    BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the
 * BlockDriverState's AioContext lock.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(qatomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * The bdrv queue is managed by record/replay; waiting for the in-flight
     * I/O requests to finish could take forever, so bail out early.
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
{
    int drained_end_counter = 0;

    g_assert(bs->quiesce_counter > 0);
    g_assert(!bs->refcnt);

    while (bs->quiesce_counter) {
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
    }
    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    int drained_end_counter = 0;

    /*
     * The bdrv queue is managed by record/replay; waiting for the in-flight
     * I/O requests to finish could take forever, so bail out early.
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
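
/*
 * bdrv_drain_all() above is simply the one-shot form of the paired API:
 *
 *     bdrv_drain_all_begin();
 *     ... every node in the graph is quiescent here ...
 *     bdrv_drain_all_end();
 */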

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        qatomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  int64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    bdrv_check_request(offset, bytes, &error_abort);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, int64_t bytes)
{
    bdrv_check_request(offset, bytes, &error_abort);

    /* aaaa bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb aaaa */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
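
/*
 * Illustrative example for the half-open comparisons above: a request with
 * overlap_offset = 0 and overlap_bytes = 8192 overlaps offset = 4096,
 * bytes = 4096 (bytes 4096..8191 are shared), but not offset = 8192,
 * bytes = 4096.
 */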

/* Called with self->bs->reqs_lock held */
static BdrvTrackedRequest *
bdrv_find_conflicting_request(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;

    QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
        if (req == self || (!req->serialising && !self->serialising)) {
            continue;
        }
        if (tracked_request_overlaps(req, self->overlap_offset,
                                     self->overlap_bytes))
        {
            /*
             * Hitting this means there was a reentrant request, for
             * example, a block driver issuing nested requests. This must
             * never happen since it means deadlock.
             */
            assert(qemu_coroutine_self() != req->co);

            /*
             * If the request is already (indirectly) waiting for us, or
             * will wait for us as soon as it wakes up, then just go on
             * (instead of producing a deadlock in the former case).
             */
            if (!req->waiting_for) {
                return req;
            }
        }
    }

    return NULL;
}

/* Called with self->bs->reqs_lock held */
static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool waited = false;

    while ((req = bdrv_find_conflicting_request(self))) {
        self->waiting_for = req;
        qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
        self->waiting_for = NULL;
        waited = true;
    }

    return waited;
}

/* Called with req->bs->reqs_lock held */
static void tracked_request_set_serialising(BdrvTrackedRequest *req,
                                            uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    int64_t overlap_bytes =
        ROUND_UP(req->offset + req->bytes, align) - overlap_offset;

    bdrv_check_request(req->offset, req->bytes, &error_abort);

    if (!req->serialising) {
        qatomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
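
/*
 * Illustrative example for the rounding above: with align = 4096, a request
 * at offset = 5000 with bytes = 100 serialises against the whole aligned
 * window [4096, 8192), i.e. overlap_offset = 4096 and overlap_bytes = 4096.
 */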

/**
 * Return the tracked request on @bs for the current coroutine, or
 * NULL if there is none.
 */
BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
{
    BdrvTrackedRequest *req;
    Coroutine *self = qemu_coroutine_self();

    QLIST_FOREACH(req, &bs->tracked_requests, list) {
        if (req->co == self) {
            return req;
        }
    }

    return NULL;
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
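
/*
 * Illustrative example: with a 65536-byte cluster size, offset = 65000 and
 * bytes = 1000 round to cluster_offset = 0 and cluster_bytes = 131072,
 * because the region straddles the first cluster boundary.
 */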

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    qatomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    qatomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    bool waited = false;

    if (!qatomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    waited = bdrv_wait_serialising_requests_locked(self);
    qemu_co_mutex_unlock(&bs->reqs_lock);

    return waited;
}

bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
                                                uint64_t align)
{
    bool waited;

    qemu_co_mutex_lock(&req->bs->reqs_lock);

    tracked_request_set_serialising(req, align);
    waited = bdrv_wait_serialising_requests_locked(req);

    qemu_co_mutex_unlock(&req->bs->reqs_lock);

    return waited;
}

static int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
                                   QEMUIOVector *qiov, size_t qiov_offset,
                                   Error **errp)
{
    /*
     * Check generic offset/bytes correctness
     */

    if (offset < 0) {
        error_setg(errp, "offset is negative: %" PRIi64, offset);
        return -EIO;
    }

    if (bytes < 0) {
        error_setg(errp, "bytes is negative: %" PRIi64, bytes);
        return -EIO;
    }

    if (bytes > BDRV_MAX_LENGTH) {
        error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   bytes, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH) {
        error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
                   offset, BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (offset > BDRV_MAX_LENGTH - bytes) {
        error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
                   "exceeds maximum(%" PRIi64 ")", offset, bytes,
                   BDRV_MAX_LENGTH);
        return -EIO;
    }

    if (!qiov) {
        return 0;
    }

    /*
     * Check qiov and qiov_offset
     */

    if (qiov_offset > qiov->size) {
        error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
                   qiov_offset, qiov->size);
        return -EIO;
    }

    if (bytes > qiov->size - qiov_offset) {
        error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
                   "vector size(%zu)", bytes, qiov_offset, qiov->size);
        return -EIO;
    }

    return 0;
}

int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
{
    return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
}

static int bdrv_check_request32(int64_t offset, int64_t bytes,
                                QEMUIOVector *qiov, size_t qiov_offset)
{
    int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
    if (ret < 0) {
        return ret;
    }

    if (bytes > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    return 0;
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    return bdrv_pwritev(child, offset, bytes, NULL,
                        BDRV_REQ_ZERO_WRITE | flags);
}
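
/*
 * Illustrative usage: zero the first megabyte, allowing the driver to unmap
 * the range instead of writing literal zeroes where it can:
 *
 *     ret = bdrv_pwrite_zeroes(child, 0, 1024 * 1024, BDRV_REQ_MAY_UNMAP);
 */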

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    int ret;
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    ret = bdrv_preadv(child, offset, bytes, &qiov, 0);

    return ret < 0 ? ret : bytes;
}
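
/*
 * Illustrative usage: read a 512-byte header synchronously; on success the
 * byte count is returned, on failure a negative errno:
 *
 *     uint8_t header[512];
 *     int ret = bdrv_pread(child, 0, header, sizeof(header));
 */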

/* Return no. of bytes on success or < 0 on error. Important errors are:
 * -EIO         generic I/O error (may happen for all errors)
 * -ENOMEDIUM   No media inserted.
 * -EINVAL      Invalid offset or number of bytes
 * -EACCES      Trying to write a read-only device
 */
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    int ret;
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    ret = bdrv_pwritev(child, offset, bytes, &qiov, 0);

    return ret < 0 ? ret : bytes;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
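
/*
 * The driver read and write paths below probe the driver's callbacks from
 * most to least capable and fall back accordingly:
 *
 *     .bdrv_co_preadv_part / .bdrv_co_pwritev_part  (byte-based, qiov offset)
 *     .bdrv_co_preadv / .bdrv_co_pwritev            (byte-based)
 *     .bdrv_aio_preadv / .bdrv_aio_pwritev          (AIO, bridged by yielding)
 *     .bdrv_co_readv / .bdrv_co_writev              (sector-based, aligned)
 */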
BlockAIOCB *acb; 116208844473SKevin Wolf CoroutineIOCompletion co = { 116308844473SKevin Wolf .coroutine = qemu_coroutine_self(), 116408844473SKevin Wolf }; 116508844473SKevin Wolf 1166e31f6864SEric Blake acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags, 116708844473SKevin Wolf bdrv_co_io_em_complete, &co); 116808844473SKevin Wolf if (acb == NULL) { 1169ac850bf0SVladimir Sementsov-Ogievskiy ret = -EIO; 1170ac850bf0SVladimir Sementsov-Ogievskiy goto out; 117108844473SKevin Wolf } else { 117208844473SKevin Wolf qemu_coroutine_yield(); 1173ac850bf0SVladimir Sementsov-Ogievskiy ret = co.ret; 1174ac850bf0SVladimir Sementsov-Ogievskiy goto out; 117508844473SKevin Wolf } 117608844473SKevin Wolf } 1177edfab6a0SEric Blake 1178edfab6a0SEric Blake sector_num = offset >> BDRV_SECTOR_BITS; 1179edfab6a0SEric Blake nb_sectors = bytes >> BDRV_SECTOR_BITS; 1180edfab6a0SEric Blake 11811bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 11821bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 118341ae31e3SAlberto Garcia assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1184edfab6a0SEric Blake assert(drv->bdrv_co_readv); 1185edfab6a0SEric Blake 1186ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); 1187ac850bf0SVladimir Sementsov-Ogievskiy 1188ac850bf0SVladimir Sementsov-Ogievskiy out: 1189ac850bf0SVladimir Sementsov-Ogievskiy if (qiov == &local_qiov) { 1190ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1191ac850bf0SVladimir Sementsov-Ogievskiy } 1192ac850bf0SVladimir Sementsov-Ogievskiy 1193ac850bf0SVladimir Sementsov-Ogievskiy return ret; 1194166fe960SKevin Wolf } 1195166fe960SKevin Wolf 119678a07294SKevin Wolf static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, 119778a07294SKevin Wolf uint64_t offset, uint64_t bytes, 1198ac850bf0SVladimir Sementsov-Ogievskiy QEMUIOVector *qiov, 1199ac850bf0SVladimir Sementsov-Ogievskiy size_t qiov_offset, int flags) 120078a07294SKevin Wolf { 120178a07294SKevin Wolf BlockDriver *drv = bs->drv; 12023fb06697SKevin Wolf int64_t sector_num; 12033fb06697SKevin Wolf unsigned int nb_sectors; 1204ac850bf0SVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 120578a07294SKevin Wolf int ret; 120678a07294SKevin Wolf 1207fa166538SEric Blake assert(!(flags & ~BDRV_REQ_MASK)); 1208fe0480d6SKevin Wolf assert(!(flags & BDRV_REQ_NO_FALLBACK)); 1209fa166538SEric Blake 1210d470ad42SMax Reitz if (!drv) { 1211d470ad42SMax Reitz return -ENOMEDIUM; 1212d470ad42SMax Reitz } 1213d470ad42SMax Reitz 1214ac850bf0SVladimir Sementsov-Ogievskiy if (drv->bdrv_co_pwritev_part) { 1215ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 1216ac850bf0SVladimir Sementsov-Ogievskiy flags & bs->supported_write_flags); 1217ac850bf0SVladimir Sementsov-Ogievskiy flags &= ~bs->supported_write_flags; 1218ac850bf0SVladimir Sementsov-Ogievskiy goto emulate_flags; 1219ac850bf0SVladimir Sementsov-Ogievskiy } 1220ac850bf0SVladimir Sementsov-Ogievskiy 1221ac850bf0SVladimir Sementsov-Ogievskiy if (qiov_offset > 0 || bytes != qiov->size) { 1222ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1223ac850bf0SVladimir Sementsov-Ogievskiy qiov = &local_qiov; 1224ac850bf0SVladimir Sementsov-Ogievskiy } 1225ac850bf0SVladimir Sementsov-Ogievskiy 12263fb06697SKevin Wolf if (drv->bdrv_co_pwritev) { 1227515c2f43SKevin Wolf ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, 1228515c2f43SKevin Wolf flags & 
bs->supported_write_flags); 1229515c2f43SKevin Wolf flags &= ~bs->supported_write_flags; 12303fb06697SKevin Wolf goto emulate_flags; 12313fb06697SKevin Wolf } 12323fb06697SKevin Wolf 1233edfab6a0SEric Blake if (drv->bdrv_aio_pwritev) { 123408844473SKevin Wolf BlockAIOCB *acb; 123508844473SKevin Wolf CoroutineIOCompletion co = { 123608844473SKevin Wolf .coroutine = qemu_coroutine_self(), 123708844473SKevin Wolf }; 123808844473SKevin Wolf 1239e31f6864SEric Blake acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, 1240e31f6864SEric Blake flags & bs->supported_write_flags, 124108844473SKevin Wolf bdrv_co_io_em_complete, &co); 1242e31f6864SEric Blake flags &= ~bs->supported_write_flags; 124308844473SKevin Wolf if (acb == NULL) { 12443fb06697SKevin Wolf ret = -EIO; 124508844473SKevin Wolf } else { 124608844473SKevin Wolf qemu_coroutine_yield(); 12473fb06697SKevin Wolf ret = co.ret; 124808844473SKevin Wolf } 1249edfab6a0SEric Blake goto emulate_flags; 1250edfab6a0SEric Blake } 1251edfab6a0SEric Blake 1252edfab6a0SEric Blake sector_num = offset >> BDRV_SECTOR_BITS; 1253edfab6a0SEric Blake nb_sectors = bytes >> BDRV_SECTOR_BITS; 1254edfab6a0SEric Blake 12551bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 12561bbbf32dSNir Soffer assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 125741ae31e3SAlberto Garcia assert(bytes <= BDRV_REQUEST_MAX_BYTES); 1258edfab6a0SEric Blake 1259e18a58b4SEric Blake assert(drv->bdrv_co_writev); 1260e18a58b4SEric Blake ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, 1261edfab6a0SEric Blake flags & bs->supported_write_flags); 1262edfab6a0SEric Blake flags &= ~bs->supported_write_flags; 126378a07294SKevin Wolf 12643fb06697SKevin Wolf emulate_flags: 12654df863f3SEric Blake if (ret == 0 && (flags & BDRV_REQ_FUA)) { 126678a07294SKevin Wolf ret = bdrv_co_flush(bs); 126778a07294SKevin Wolf } 126878a07294SKevin Wolf 1269ac850bf0SVladimir Sementsov-Ogievskiy if (qiov == &local_qiov) { 1270ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1271ac850bf0SVladimir Sementsov-Ogievskiy } 1272ac850bf0SVladimir Sementsov-Ogievskiy 127378a07294SKevin Wolf return ret; 127478a07294SKevin Wolf } 127578a07294SKevin Wolf 127629a298afSPavel Butsykin static int coroutine_fn 127729a298afSPavel Butsykin bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset, 1278ac850bf0SVladimir Sementsov-Ogievskiy uint64_t bytes, QEMUIOVector *qiov, 1279ac850bf0SVladimir Sementsov-Ogievskiy size_t qiov_offset) 128029a298afSPavel Butsykin { 128129a298afSPavel Butsykin BlockDriver *drv = bs->drv; 1282ac850bf0SVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 1283ac850bf0SVladimir Sementsov-Ogievskiy int ret; 128429a298afSPavel Butsykin 1285d470ad42SMax Reitz if (!drv) { 1286d470ad42SMax Reitz return -ENOMEDIUM; 1287d470ad42SMax Reitz } 1288d470ad42SMax Reitz 1289ac850bf0SVladimir Sementsov-Ogievskiy if (!block_driver_can_compress(drv)) { 129029a298afSPavel Butsykin return -ENOTSUP; 129129a298afSPavel Butsykin } 129229a298afSPavel Butsykin 1293ac850bf0SVladimir Sementsov-Ogievskiy if (drv->bdrv_co_pwritev_compressed_part) { 1294ac850bf0SVladimir Sementsov-Ogievskiy return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes, 1295ac850bf0SVladimir Sementsov-Ogievskiy qiov, qiov_offset); 1296ac850bf0SVladimir Sementsov-Ogievskiy } 1297ac850bf0SVladimir Sementsov-Ogievskiy 1298ac850bf0SVladimir Sementsov-Ogievskiy if (qiov_offset == 0) { 129929a298afSPavel Butsykin return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov); 130029a298afSPavel 
Butsykin } 130129a298afSPavel Butsykin 1302ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes); 1303ac850bf0SVladimir Sementsov-Ogievskiy ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov); 1304ac850bf0SVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&local_qiov); 1305ac850bf0SVladimir Sementsov-Ogievskiy 1306ac850bf0SVladimir Sementsov-Ogievskiy return ret; 1307ac850bf0SVladimir Sementsov-Ogievskiy } 1308ac850bf0SVladimir Sementsov-Ogievskiy 130985c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, 13103299e5ecSVladimir Sementsov-Ogievskiy int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 13111143ec5eSVladimir Sementsov-Ogievskiy size_t qiov_offset, int flags) 131261007b31SStefan Hajnoczi { 131385c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 131485c97ca7SKevin Wolf 131561007b31SStefan Hajnoczi /* Perform I/O through a temporary buffer so that users who scribble over 131661007b31SStefan Hajnoczi * their read buffer while the operation is in progress do not end up 131761007b31SStefan Hajnoczi * modifying the image file. This is critical for zero-copy guest I/O 131861007b31SStefan Hajnoczi * where anything might happen inside guest memory. 131961007b31SStefan Hajnoczi */ 13202275cc90SVladimir Sementsov-Ogievskiy void *bounce_buffer = NULL; 132161007b31SStefan Hajnoczi 132261007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 1323244483e6SKevin Wolf int64_t cluster_offset; 13247cfd5275SEric Blake int64_t cluster_bytes; 132561007b31SStefan Hajnoczi size_t skip_bytes; 132661007b31SStefan Hajnoczi int ret; 1327cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 1328cb2e2878SEric Blake BDRV_REQUEST_MAX_BYTES); 1329cb2e2878SEric Blake unsigned int progress = 0; 13308644476eSMax Reitz bool skip_write; 133161007b31SStefan Hajnoczi 1332d470ad42SMax Reitz if (!drv) { 1333d470ad42SMax Reitz return -ENOMEDIUM; 1334d470ad42SMax Reitz } 1335d470ad42SMax Reitz 13368644476eSMax Reitz /* 13378644476eSMax Reitz * Do not write anything when the BDS is inactive. That is not 13388644476eSMax Reitz * allowed, and it would not help. 13398644476eSMax Reitz */ 13408644476eSMax Reitz skip_write = (bs->open_flags & BDRV_O_INACTIVE); 13418644476eSMax Reitz 13421bf03e66SKevin Wolf /* FIXME We cannot require callers to have write permissions when all they 13431bf03e66SKevin Wolf * are doing is a read request. If we did things right, write permissions 13441bf03e66SKevin Wolf * would be obtained anyway, but internally by the copy-on-read code. As 1345765d9df9SEric Blake * long as it is implemented here rather than in a separate filter driver, 13461bf03e66SKevin Wolf * the copy-on-read code doesn't have its own BdrvChild, however, for which 13471bf03e66SKevin Wolf * it could request permissions. Therefore we have to bypass the permission 13481bf03e66SKevin Wolf * system for the moment. */ 13491bf03e66SKevin Wolf // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 1350afa4b293SKevin Wolf 135161007b31SStefan Hajnoczi /* Cover entire cluster so no additional backing file I/O is required when 1352cb2e2878SEric Blake * allocating cluster in the image file. Note that this value may exceed 1353cb2e2878SEric Blake * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which 1354cb2e2878SEric Blake * is one reason we loop rather than doing it all at once. 
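 *
 * Worked example (hypothetical numbers): with a 64 KiB cluster size, a
 * 4 KiB guest read at offset 70 KiB is widened to cluster_offset = 64 KiB
 * and cluster_bytes = 64 KiB, with skip_bytes = 6 KiB locating the guest
 * data inside the bounce buffer.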
135561007b31SStefan Hajnoczi */ 1356244483e6SKevin Wolf bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); 1357cb2e2878SEric Blake skip_bytes = offset - cluster_offset; 135861007b31SStefan Hajnoczi 1359244483e6SKevin Wolf trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, 1360244483e6SKevin Wolf cluster_offset, cluster_bytes); 136161007b31SStefan Hajnoczi 1362cb2e2878SEric Blake while (cluster_bytes) { 1363cb2e2878SEric Blake int64_t pnum; 136461007b31SStefan Hajnoczi 13658644476eSMax Reitz if (skip_write) { 13668644476eSMax Reitz ret = 1; /* "already allocated", so nothing will be copied */ 13678644476eSMax Reitz pnum = MIN(cluster_bytes, max_transfer); 13688644476eSMax Reitz } else { 1369cb2e2878SEric Blake ret = bdrv_is_allocated(bs, cluster_offset, 1370cb2e2878SEric Blake MIN(cluster_bytes, max_transfer), &pnum); 1371cb2e2878SEric Blake if (ret < 0) { 13728644476eSMax Reitz /* 13738644476eSMax Reitz * Safe to treat errors in querying allocation as if 1374cb2e2878SEric Blake * unallocated; we'll probably fail again soon on the 1375cb2e2878SEric Blake * read, but at least that will set a decent errno. 1376cb2e2878SEric Blake */ 1377cb2e2878SEric Blake pnum = MIN(cluster_bytes, max_transfer); 1378cb2e2878SEric Blake } 1379cb2e2878SEric Blake 1380b0ddcbbbSKevin Wolf /* Stop at EOF if the image ends in the middle of the cluster */ 1381b0ddcbbbSKevin Wolf if (ret == 0 && pnum == 0) { 1382b0ddcbbbSKevin Wolf assert(progress >= bytes); 1383b0ddcbbbSKevin Wolf break; 1384b0ddcbbbSKevin Wolf } 1385b0ddcbbbSKevin Wolf 1386cb2e2878SEric Blake assert(skip_bytes < pnum); 13878644476eSMax Reitz } 1388cb2e2878SEric Blake 1389cb2e2878SEric Blake if (ret <= 0) { 13901143ec5eSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 13911143ec5eSVladimir Sementsov-Ogievskiy 1392cb2e2878SEric Blake /* Must copy-on-read; use the bounce buffer */ 13930d93ed08SVladimir Sementsov-Ogievskiy pnum = MIN(pnum, MAX_BOUNCE_BUFFER); 13942275cc90SVladimir Sementsov-Ogievskiy if (!bounce_buffer) { 13952275cc90SVladimir Sementsov-Ogievskiy int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); 13962275cc90SVladimir Sementsov-Ogievskiy int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); 13972275cc90SVladimir Sementsov-Ogievskiy int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); 13982275cc90SVladimir Sementsov-Ogievskiy 13992275cc90SVladimir Sementsov-Ogievskiy bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len); 14002275cc90SVladimir Sementsov-Ogievskiy if (!bounce_buffer) { 14012275cc90SVladimir Sementsov-Ogievskiy ret = -ENOMEM; 14022275cc90SVladimir Sementsov-Ogievskiy goto err; 14032275cc90SVladimir Sementsov-Ogievskiy } 14042275cc90SVladimir Sementsov-Ogievskiy } 14050d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); 1406cb2e2878SEric Blake 1407cb2e2878SEric Blake ret = bdrv_driver_preadv(bs, cluster_offset, pnum, 1408ac850bf0SVladimir Sementsov-Ogievskiy &local_qiov, 0, 0); 140961007b31SStefan Hajnoczi if (ret < 0) { 141061007b31SStefan Hajnoczi goto err; 141161007b31SStefan Hajnoczi } 141261007b31SStefan Hajnoczi 1413d855ebcdSEric Blake bdrv_debug_event(bs, BLKDBG_COR_WRITE); 1414c1499a5eSEric Blake if (drv->bdrv_co_pwrite_zeroes && 1415cb2e2878SEric Blake buffer_is_zero(bounce_buffer, pnum)) { 1416a604fa2bSEric Blake /* FIXME: Should we (perhaps conditionally) be setting 1417a604fa2bSEric Blake * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy 1418a604fa2bSEric Blake * that still correctly reads as zero? 
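 * (Note: BDRV_REQ_MAY_UNMAP only allows unmapping when the result still
 * reads back as zeroes, and buffer_is_zero() has just verified the data,
 * so guest-visible contents would stay unchanged either way; the open
 * question is only whether the sparser copy is always desirable.)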
*/ 14197adcf59fSMax Reitz ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 14207adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 142161007b31SStefan Hajnoczi } else { 1422cb2e2878SEric Blake /* This does not change the data on the disk, it is not 1423cb2e2878SEric Blake * necessary to flush even in cache=writethrough mode. 142461007b31SStefan Hajnoczi */ 1425cb2e2878SEric Blake ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, 1426ac850bf0SVladimir Sementsov-Ogievskiy &local_qiov, 0, 14277adcf59fSMax Reitz BDRV_REQ_WRITE_UNCHANGED); 142861007b31SStefan Hajnoczi } 142961007b31SStefan Hajnoczi 143061007b31SStefan Hajnoczi if (ret < 0) { 1431cb2e2878SEric Blake /* It might be okay to ignore write errors for guest 1432cb2e2878SEric Blake * requests. If this is a deliberate copy-on-read 1433cb2e2878SEric Blake * then we don't want to ignore the error. Simply 1434cb2e2878SEric Blake * report it in all cases. 143561007b31SStefan Hajnoczi */ 143661007b31SStefan Hajnoczi goto err; 143761007b31SStefan Hajnoczi } 143861007b31SStefan Hajnoczi 14393299e5ecSVladimir Sementsov-Ogievskiy if (!(flags & BDRV_REQ_PREFETCH)) { 14401143ec5eSVladimir Sementsov-Ogievskiy qemu_iovec_from_buf(qiov, qiov_offset + progress, 14411143ec5eSVladimir Sementsov-Ogievskiy bounce_buffer + skip_bytes, 14424ab78b19SVladimir Sementsov-Ogievskiy MIN(pnum - skip_bytes, bytes - progress)); 14433299e5ecSVladimir Sementsov-Ogievskiy } 14443299e5ecSVladimir Sementsov-Ogievskiy } else if (!(flags & BDRV_REQ_PREFETCH)) { 1445cb2e2878SEric Blake /* Read directly into the destination */ 14461143ec5eSVladimir Sementsov-Ogievskiy ret = bdrv_driver_preadv(bs, offset + progress, 14471143ec5eSVladimir Sementsov-Ogievskiy MIN(pnum - skip_bytes, bytes - progress), 14481143ec5eSVladimir Sementsov-Ogievskiy qiov, qiov_offset + progress, 0); 1449cb2e2878SEric Blake if (ret < 0) { 1450cb2e2878SEric Blake goto err; 1451cb2e2878SEric Blake } 1452cb2e2878SEric Blake } 1453cb2e2878SEric Blake 1454cb2e2878SEric Blake cluster_offset += pnum; 1455cb2e2878SEric Blake cluster_bytes -= pnum; 1456cb2e2878SEric Blake progress += pnum - skip_bytes; 1457cb2e2878SEric Blake skip_bytes = 0; 1458cb2e2878SEric Blake } 1459cb2e2878SEric Blake ret = 0; 146061007b31SStefan Hajnoczi 146161007b31SStefan Hajnoczi err: 146261007b31SStefan Hajnoczi qemu_vfree(bounce_buffer); 146361007b31SStefan Hajnoczi return ret; 146461007b31SStefan Hajnoczi } 146561007b31SStefan Hajnoczi 146661007b31SStefan Hajnoczi /* 146761007b31SStefan Hajnoczi * Forwards an already correctly aligned request to the BlockDriver. This 14681a62d0acSEric Blake * handles copy on read, zeroing after EOF, and fragmentation of large 14691a62d0acSEric Blake * reads; any other features must be implemented by the caller. 
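 *
 * A minimal caller sketch (illustrative only; real callers such as
 * bdrv_co_preadv_part() additionally pad the request and manage the
 * in-flight counter):
 *
 *   BdrvTrackedRequest req;
 *   tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
 *   ret = bdrv_aligned_preadv(child, &req, offset, bytes,
 *                             bs->bl.request_alignment, qiov, 0, 0);
 *   tracked_request_end(&req);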
147061007b31SStefan Hajnoczi */ 147185c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 147261007b31SStefan Hajnoczi BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 147365cd4424SVladimir Sementsov-Ogievskiy int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 147461007b31SStefan Hajnoczi { 147585c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 1476c9d20029SKevin Wolf int64_t total_bytes, max_bytes; 14771a62d0acSEric Blake int ret = 0; 14781a62d0acSEric Blake uint64_t bytes_remaining = bytes; 14791a62d0acSEric Blake int max_transfer; 148061007b31SStefan Hajnoczi 148149c07526SKevin Wolf assert(is_power_of_2(align)); 148249c07526SKevin Wolf assert((offset & (align - 1)) == 0); 148349c07526SKevin Wolf assert((bytes & (align - 1)) == 0); 1484abb06c5aSDaniel P. Berrange assert((bs->open_flags & BDRV_O_NO_IO) == 0); 14851a62d0acSEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 14861a62d0acSEric Blake align); 1487a604fa2bSEric Blake 1488a604fa2bSEric Blake /* TODO: We would need a per-BDS .supported_read_flags and 1489a604fa2bSEric Blake * potential fallback support, if we ever implement any read flags 1490a604fa2bSEric Blake * to pass through to drivers. For now, there aren't any 1491a604fa2bSEric Blake * passthrough flags. */ 1492c53cb427SPaolo Bonzini assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH))); 149361007b31SStefan Hajnoczi 149461007b31SStefan Hajnoczi /* Handle Copy on Read and associated serialisation */ 149561007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 149661007b31SStefan Hajnoczi /* If we touch the same cluster it counts as an overlap. This 149761007b31SStefan Hajnoczi * guarantees that allocating writes will be serialized and not race 149861007b31SStefan Hajnoczi * with each other for the same cluster. For example, in copy-on-read 149961007b31SStefan Hajnoczi * it ensures that the CoR read and write operations are atomic and 150061007b31SStefan Hajnoczi * guest writes cannot interleave between them. 
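 *
 * (For instance, if a CoR read touches cluster N while a guest write to
 * the same cluster is submitted, the overlap check serialises them, so
 * the guest write cannot land between the CoR read and its write-back.)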
*/ 15018ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); 150218fbd0deSPaolo Bonzini } else { 1503304d9d7fSMax Reitz bdrv_wait_serialising_requests(req); 150418fbd0deSPaolo Bonzini } 150561007b31SStefan Hajnoczi 150661007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 1507d6a644bbSEric Blake int64_t pnum; 150861007b31SStefan Hajnoczi 1509897dd0ecSAndrey Shinkevich /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */ 1510897dd0ecSAndrey Shinkevich flags &= ~BDRV_REQ_COPY_ON_READ; 1511897dd0ecSAndrey Shinkevich 151288e63df2SEric Blake ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 151361007b31SStefan Hajnoczi if (ret < 0) { 151461007b31SStefan Hajnoczi goto out; 151561007b31SStefan Hajnoczi } 151661007b31SStefan Hajnoczi 151788e63df2SEric Blake if (!ret || pnum != bytes) { 151865cd4424SVladimir Sementsov-Ogievskiy ret = bdrv_co_do_copy_on_readv(child, offset, bytes, 151965cd4424SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 15203299e5ecSVladimir Sementsov-Ogievskiy goto out; 15213299e5ecSVladimir Sementsov-Ogievskiy } else if (flags & BDRV_REQ_PREFETCH) { 152261007b31SStefan Hajnoczi goto out; 152361007b31SStefan Hajnoczi } 152461007b31SStefan Hajnoczi } 152561007b31SStefan Hajnoczi 15261a62d0acSEric Blake /* Forward the request to the BlockDriver, possibly fragmenting it */ 152749c07526SKevin Wolf total_bytes = bdrv_getlength(bs); 152849c07526SKevin Wolf if (total_bytes < 0) { 152949c07526SKevin Wolf ret = total_bytes; 153061007b31SStefan Hajnoczi goto out; 153161007b31SStefan Hajnoczi } 153261007b31SStefan Hajnoczi 1533897dd0ecSAndrey Shinkevich assert(!(flags & ~bs->supported_read_flags)); 1534897dd0ecSAndrey Shinkevich 153549c07526SKevin Wolf max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 15361a62d0acSEric Blake if (bytes <= max_bytes && bytes <= max_transfer) { 1537897dd0ecSAndrey Shinkevich ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags); 15381a62d0acSEric Blake goto out; 153961007b31SStefan Hajnoczi } 154061007b31SStefan Hajnoczi 15411a62d0acSEric Blake while (bytes_remaining) { 15421a62d0acSEric Blake int num; 15431a62d0acSEric Blake 15441a62d0acSEric Blake if (max_bytes) { 15451a62d0acSEric Blake num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 15461a62d0acSEric Blake assert(num); 15471a62d0acSEric Blake 15481a62d0acSEric Blake ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 1549134b7decSMax Reitz num, qiov, 1550897dd0ecSAndrey Shinkevich qiov_offset + bytes - bytes_remaining, 1551897dd0ecSAndrey Shinkevich flags); 15521a62d0acSEric Blake max_bytes -= num; 15531a62d0acSEric Blake } else { 15541a62d0acSEric Blake num = bytes_remaining; 1555134b7decSMax Reitz ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining, 1556134b7decSMax Reitz 0, bytes_remaining); 15571a62d0acSEric Blake } 15581a62d0acSEric Blake if (ret < 0) { 15591a62d0acSEric Blake goto out; 15601a62d0acSEric Blake } 15611a62d0acSEric Blake bytes_remaining -= num; 156261007b31SStefan Hajnoczi } 156361007b31SStefan Hajnoczi 156461007b31SStefan Hajnoczi out: 15651a62d0acSEric Blake return ret < 0 ? 
ret : 0; 156661007b31SStefan Hajnoczi } 156761007b31SStefan Hajnoczi 156861007b31SStefan Hajnoczi /* 15697a3f542fSVladimir Sementsov-Ogievskiy * Request padding 15707a3f542fSVladimir Sementsov-Ogievskiy * 15717a3f542fSVladimir Sementsov-Ogievskiy * |<---- align ----->| |<----- align ---->| 15727a3f542fSVladimir Sementsov-Ogievskiy * |<- head ->|<------------- bytes ------------->|<-- tail -->| 15737a3f542fSVladimir Sementsov-Ogievskiy * | | | | | | 15747a3f542fSVladimir Sementsov-Ogievskiy * -*----------$-------*-------- ... --------*-----$------------*--- 15757a3f542fSVladimir Sementsov-Ogievskiy * | | | | | | 15767a3f542fSVladimir Sementsov-Ogievskiy * | offset | | end | 15777a3f542fSVladimir Sementsov-Ogievskiy * ALIGN_DOWN(offset) ALIGN_UP(offset) ALIGN_DOWN(end) ALIGN_UP(end) 15787a3f542fSVladimir Sementsov-Ogievskiy * [buf ... ) [tail_buf ) 15797a3f542fSVladimir Sementsov-Ogievskiy * 15807a3f542fSVladimir Sementsov-Ogievskiy * @buf is an aligned allocation needed to store @head and @tail paddings. @head 15817a3f542fSVladimir Sementsov-Ogievskiy * is placed at the beginning of @buf and @tail at the @end. 15827a3f542fSVladimir Sementsov-Ogievskiy * 15837a3f542fSVladimir Sementsov-Ogievskiy * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk 15847a3f542fSVladimir Sementsov-Ogievskiy * around tail, if tail exists. 15857a3f542fSVladimir Sementsov-Ogievskiy * 15867a3f542fSVladimir Sementsov-Ogievskiy * @merge_reads is true for small requests, 15877a3f542fSVladimir Sementsov-Ogievskiy * if @buf_len == @head + bytes + @tail. In this case it is possible that both 15887a3f542fSVladimir Sementsov-Ogievskiy * head and tail exist but @buf_len == align and @tail_buf == @buf. 158961007b31SStefan Hajnoczi */ 15907a3f542fSVladimir Sementsov-Ogievskiy typedef struct BdrvRequestPadding { 15917a3f542fSVladimir Sementsov-Ogievskiy uint8_t *buf; 15927a3f542fSVladimir Sementsov-Ogievskiy size_t buf_len; 15937a3f542fSVladimir Sementsov-Ogievskiy uint8_t *tail_buf; 15947a3f542fSVladimir Sementsov-Ogievskiy size_t head; 15957a3f542fSVladimir Sementsov-Ogievskiy size_t tail; 15967a3f542fSVladimir Sementsov-Ogievskiy bool merge_reads; 15977a3f542fSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 15987a3f542fSVladimir Sementsov-Ogievskiy } BdrvRequestPadding; 15997a3f542fSVladimir Sementsov-Ogievskiy 16007a3f542fSVladimir Sementsov-Ogievskiy static bool bdrv_init_padding(BlockDriverState *bs, 16017a3f542fSVladimir Sementsov-Ogievskiy int64_t offset, int64_t bytes, 16027a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad) 16037a3f542fSVladimir Sementsov-Ogievskiy { 1604a56ed80cSVladimir Sementsov-Ogievskiy int64_t align = bs->bl.request_alignment; 1605a56ed80cSVladimir Sementsov-Ogievskiy int64_t sum; 1606a56ed80cSVladimir Sementsov-Ogievskiy 1607a56ed80cSVladimir Sementsov-Ogievskiy bdrv_check_request(offset, bytes, &error_abort); 1608a56ed80cSVladimir Sementsov-Ogievskiy assert(align <= INT_MAX); /* documented in block/block_int.h */ 1609a56ed80cSVladimir Sementsov-Ogievskiy assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */ 16107a3f542fSVladimir Sementsov-Ogievskiy 16117a3f542fSVladimir Sementsov-Ogievskiy memset(pad, 0, sizeof(*pad)); 16127a3f542fSVladimir Sementsov-Ogievskiy 16137a3f542fSVladimir Sementsov-Ogievskiy pad->head = offset & (align - 1); 16147a3f542fSVladimir Sementsov-Ogievskiy pad->tail = ((offset + bytes) & (align - 1)); 16157a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 16167a3f542fSVladimir Sementsov-Ogievskiy pad->tail = align - 
pad->tail; 16177a3f542fSVladimir Sementsov-Ogievskiy } 16187a3f542fSVladimir Sementsov-Ogievskiy 1619ac9d00bfSVladimir Sementsov-Ogievskiy if (!pad->head && !pad->tail) { 16207a3f542fSVladimir Sementsov-Ogievskiy return false; 16217a3f542fSVladimir Sementsov-Ogievskiy } 16227a3f542fSVladimir Sementsov-Ogievskiy 1623ac9d00bfSVladimir Sementsov-Ogievskiy assert(bytes); /* Nothing good in aligning zero-length requests */ 1624ac9d00bfSVladimir Sementsov-Ogievskiy 16257a3f542fSVladimir Sementsov-Ogievskiy sum = pad->head + bytes + pad->tail; 16267a3f542fSVladimir Sementsov-Ogievskiy pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align; 16277a3f542fSVladimir Sementsov-Ogievskiy pad->buf = qemu_blockalign(bs, pad->buf_len); 16287a3f542fSVladimir Sementsov-Ogievskiy pad->merge_reads = sum == pad->buf_len; 16297a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 16307a3f542fSVladimir Sementsov-Ogievskiy pad->tail_buf = pad->buf + pad->buf_len - align; 16317a3f542fSVladimir Sementsov-Ogievskiy } 16327a3f542fSVladimir Sementsov-Ogievskiy 16337a3f542fSVladimir Sementsov-Ogievskiy return true; 16347a3f542fSVladimir Sementsov-Ogievskiy } 16357a3f542fSVladimir Sementsov-Ogievskiy 16367a3f542fSVladimir Sementsov-Ogievskiy static int bdrv_padding_rmw_read(BdrvChild *child, 16377a3f542fSVladimir Sementsov-Ogievskiy BdrvTrackedRequest *req, 16387a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad, 16397a3f542fSVladimir Sementsov-Ogievskiy bool zero_middle) 16407a3f542fSVladimir Sementsov-Ogievskiy { 16417a3f542fSVladimir Sementsov-Ogievskiy QEMUIOVector local_qiov; 16427a3f542fSVladimir Sementsov-Ogievskiy BlockDriverState *bs = child->bs; 16437a3f542fSVladimir Sementsov-Ogievskiy uint64_t align = bs->bl.request_alignment; 16447a3f542fSVladimir Sementsov-Ogievskiy int ret; 16457a3f542fSVladimir Sementsov-Ogievskiy 16467a3f542fSVladimir Sementsov-Ogievskiy assert(req->serialising && pad->buf); 16477a3f542fSVladimir Sementsov-Ogievskiy 16487a3f542fSVladimir Sementsov-Ogievskiy if (pad->head || pad->merge_reads) { 16497a3f542fSVladimir Sementsov-Ogievskiy uint64_t bytes = pad->merge_reads ? 
pad->buf_len : align; 16507a3f542fSVladimir Sementsov-Ogievskiy 16517a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad->buf, bytes); 16527a3f542fSVladimir Sementsov-Ogievskiy 16537a3f542fSVladimir Sementsov-Ogievskiy if (pad->head) { 16547a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 16557a3f542fSVladimir Sementsov-Ogievskiy } 16567a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads && pad->tail) { 16577a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 16587a3f542fSVladimir Sementsov-Ogievskiy } 16597a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes, 166065cd4424SVladimir Sementsov-Ogievskiy align, &local_qiov, 0, 0); 16617a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0) { 16627a3f542fSVladimir Sementsov-Ogievskiy return ret; 16637a3f542fSVladimir Sementsov-Ogievskiy } 16647a3f542fSVladimir Sementsov-Ogievskiy if (pad->head) { 16657a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 16667a3f542fSVladimir Sementsov-Ogievskiy } 16677a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads && pad->tail) { 16687a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 16697a3f542fSVladimir Sementsov-Ogievskiy } 16707a3f542fSVladimir Sementsov-Ogievskiy 16717a3f542fSVladimir Sementsov-Ogievskiy if (pad->merge_reads) { 16727a3f542fSVladimir Sementsov-Ogievskiy goto zero_mem; 16737a3f542fSVladimir Sementsov-Ogievskiy } 16747a3f542fSVladimir Sementsov-Ogievskiy } 16757a3f542fSVladimir Sementsov-Ogievskiy 16767a3f542fSVladimir Sementsov-Ogievskiy if (pad->tail) { 16777a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align); 16787a3f542fSVladimir Sementsov-Ogievskiy 16797a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 16807a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv( 16817a3f542fSVladimir Sementsov-Ogievskiy child, req, 16827a3f542fSVladimir Sementsov-Ogievskiy req->overlap_offset + req->overlap_bytes - align, 168365cd4424SVladimir Sementsov-Ogievskiy align, align, &local_qiov, 0, 0); 16847a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0) { 16857a3f542fSVladimir Sementsov-Ogievskiy return ret; 16867a3f542fSVladimir Sementsov-Ogievskiy } 16877a3f542fSVladimir Sementsov-Ogievskiy bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 16887a3f542fSVladimir Sementsov-Ogievskiy } 16897a3f542fSVladimir Sementsov-Ogievskiy 16907a3f542fSVladimir Sementsov-Ogievskiy zero_mem: 16917a3f542fSVladimir Sementsov-Ogievskiy if (zero_middle) { 16927a3f542fSVladimir Sementsov-Ogievskiy memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail); 16937a3f542fSVladimir Sementsov-Ogievskiy } 16947a3f542fSVladimir Sementsov-Ogievskiy 16957a3f542fSVladimir Sementsov-Ogievskiy return 0; 16967a3f542fSVladimir Sementsov-Ogievskiy } 16977a3f542fSVladimir Sementsov-Ogievskiy 16987a3f542fSVladimir Sementsov-Ogievskiy static void bdrv_padding_destroy(BdrvRequestPadding *pad) 16997a3f542fSVladimir Sementsov-Ogievskiy { 17007a3f542fSVladimir Sementsov-Ogievskiy if (pad->buf) { 17017a3f542fSVladimir Sementsov-Ogievskiy qemu_vfree(pad->buf); 17027a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_destroy(&pad->local_qiov); 17037a3f542fSVladimir Sementsov-Ogievskiy } 170498ca4549SVladimir Sementsov-Ogievskiy memset(pad, 0, sizeof(*pad)); 17057a3f542fSVladimir Sementsov-Ogievskiy } 17067a3f542fSVladimir 
Sementsov-Ogievskiy 17077a3f542fSVladimir Sementsov-Ogievskiy /* 17087a3f542fSVladimir Sementsov-Ogievskiy * bdrv_pad_request 17097a3f542fSVladimir Sementsov-Ogievskiy * 17107a3f542fSVladimir Sementsov-Ogievskiy * Exchange request parameters with padded request if needed. Don't include the 17117a3f542fSVladimir Sementsov-Ogievskiy * RMW read of padding; bdrv_padding_rmw_read() should be called separately if 17127a3f542fSVladimir Sementsov-Ogievskiy * needed. 17137a3f542fSVladimir Sementsov-Ogievskiy * 171498ca4549SVladimir Sementsov-Ogievskiy * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out: 171598ca4549SVladimir Sementsov-Ogievskiy * - on function start they represent the original request 171698ca4549SVladimir Sementsov-Ogievskiy * - on failure or when padding is not needed they are unchanged 171798ca4549SVladimir Sementsov-Ogievskiy * - on success when padding is needed they represent the padded request 17187a3f542fSVladimir Sementsov-Ogievskiy */ 171998ca4549SVladimir Sementsov-Ogievskiy static int bdrv_pad_request(BlockDriverState *bs, 17201acc3466SVladimir Sementsov-Ogievskiy QEMUIOVector **qiov, size_t *qiov_offset, 17217a3f542fSVladimir Sementsov-Ogievskiy int64_t *offset, unsigned int *bytes, 172298ca4549SVladimir Sementsov-Ogievskiy BdrvRequestPadding *pad, bool *padded) 17237a3f542fSVladimir Sementsov-Ogievskiy { 17244c002cefSVladimir Sementsov-Ogievskiy int ret; 17254c002cefSVladimir Sementsov-Ogievskiy 17267a3f542fSVladimir Sementsov-Ogievskiy if (!bdrv_init_padding(bs, *offset, *bytes, pad)) { 172798ca4549SVladimir Sementsov-Ogievskiy if (padded) { 172898ca4549SVladimir Sementsov-Ogievskiy *padded = false; 172998ca4549SVladimir Sementsov-Ogievskiy } 173098ca4549SVladimir Sementsov-Ogievskiy return 0; 17317a3f542fSVladimir Sementsov-Ogievskiy } 17327a3f542fSVladimir Sementsov-Ogievskiy 17334c002cefSVladimir Sementsov-Ogievskiy ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head, 17341acc3466SVladimir Sementsov-Ogievskiy *qiov, *qiov_offset, *bytes, 17354c002cefSVladimir Sementsov-Ogievskiy pad->buf + pad->buf_len - pad->tail, 17364c002cefSVladimir Sementsov-Ogievskiy pad->tail); 173798ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) { 173898ca4549SVladimir Sementsov-Ogievskiy bdrv_padding_destroy(pad); 173998ca4549SVladimir Sementsov-Ogievskiy return ret; 174098ca4549SVladimir Sementsov-Ogievskiy } 17417a3f542fSVladimir Sementsov-Ogievskiy *bytes += pad->head + pad->tail; 17427a3f542fSVladimir Sementsov-Ogievskiy *offset -= pad->head; 17437a3f542fSVladimir Sementsov-Ogievskiy *qiov = &pad->local_qiov; 17441acc3466SVladimir Sementsov-Ogievskiy *qiov_offset = 0; 174598ca4549SVladimir Sementsov-Ogievskiy if (padded) { 174698ca4549SVladimir Sementsov-Ogievskiy *padded = true; 174798ca4549SVladimir Sementsov-Ogievskiy } 17487a3f542fSVladimir Sementsov-Ogievskiy 174998ca4549SVladimir Sementsov-Ogievskiy return 0; 17507a3f542fSVladimir Sementsov-Ogievskiy } 17517a3f542fSVladimir Sementsov-Ogievskiy 1752a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child, 175361007b31SStefan Hajnoczi int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 175461007b31SStefan Hajnoczi BdrvRequestFlags flags) 175561007b31SStefan Hajnoczi { 17561acc3466SVladimir Sementsov-Ogievskiy return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags); 17571acc3466SVladimir Sementsov-Ogievskiy } 17581acc3466SVladimir Sementsov-Ogievskiy 17591acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, 17601acc3466SVladimir
Sementsov-Ogievskiy int64_t offset, unsigned int bytes, 17611acc3466SVladimir Sementsov-Ogievskiy QEMUIOVector *qiov, size_t qiov_offset, 17621acc3466SVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 17631acc3466SVladimir Sementsov-Ogievskiy { 1764a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 176561007b31SStefan Hajnoczi BdrvTrackedRequest req; 17667a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 176761007b31SStefan Hajnoczi int ret; 176861007b31SStefan Hajnoczi 17697a3f542fSVladimir Sementsov-Ogievskiy trace_bdrv_co_preadv(bs, offset, bytes, flags); 177061007b31SStefan Hajnoczi 1771f4dad307SVladimir Sementsov-Ogievskiy if (!bdrv_is_inserted(bs)) { 1772f4dad307SVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 1773f4dad307SVladimir Sementsov-Ogievskiy } 1774f4dad307SVladimir Sementsov-Ogievskiy 177563f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); 177661007b31SStefan Hajnoczi if (ret < 0) { 177761007b31SStefan Hajnoczi return ret; 177861007b31SStefan Hajnoczi } 177961007b31SStefan Hajnoczi 1780ac9d00bfSVladimir Sementsov-Ogievskiy if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 1781ac9d00bfSVladimir Sementsov-Ogievskiy /* 1782ac9d00bfSVladimir Sementsov-Ogievskiy * Aligning a zero-length request is nonsense. Even if the driver gives 1783ac9d00bfSVladimir Sementsov-Ogievskiy * special meaning to zero-length (like qcow2_co_pwritev_compressed_part), 1784ac9d00bfSVladimir Sementsov-Ogievskiy * we can't pass it to the driver due to request_alignment. 1785ac9d00bfSVladimir Sementsov-Ogievskiy * 1786ac9d00bfSVladimir Sementsov-Ogievskiy * Still, no reason to return an error if someone does an unaligned 1787ac9d00bfSVladimir Sementsov-Ogievskiy * zero-length read occasionally. 1788ac9d00bfSVladimir Sementsov-Ogievskiy */ 1789ac9d00bfSVladimir Sementsov-Ogievskiy return 0; 1790ac9d00bfSVladimir Sementsov-Ogievskiy } 1791ac9d00bfSVladimir Sementsov-Ogievskiy 179299723548SPaolo Bonzini bdrv_inc_in_flight(bs); 179399723548SPaolo Bonzini 17949568b511SWen Congyang /* Don't do copy-on-read if we read data before write operation */ 1795d73415a3SStefan Hajnoczi if (qatomic_read(&bs->copy_on_read)) { 179661007b31SStefan Hajnoczi flags |= BDRV_REQ_COPY_ON_READ; 179761007b31SStefan Hajnoczi } 179861007b31SStefan Hajnoczi 179998ca4549SVladimir Sementsov-Ogievskiy ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, 180098ca4549SVladimir Sementsov-Ogievskiy NULL); 180198ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) { bdrv_dec_in_flight(bs); /* don't leak the in-flight counter on error */ 180298ca4549SVladimir Sementsov-Ogievskiy return ret; 180398ca4549SVladimir Sementsov-Ogievskiy } 180461007b31SStefan Hajnoczi 1805ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 18067a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_preadv(child, &req, offset, bytes, 18077a3f542fSVladimir Sementsov-Ogievskiy bs->bl.request_alignment, 18081acc3466SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 180961007b31SStefan Hajnoczi tracked_request_end(&req); 181099723548SPaolo Bonzini bdrv_dec_in_flight(bs); 181161007b31SStefan Hajnoczi 18127a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 181361007b31SStefan Hajnoczi 181461007b31SStefan Hajnoczi return ret; 181561007b31SStefan Hajnoczi } 181661007b31SStefan Hajnoczi 1817d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1818f5a5ca79SManos Pitsidianakis int64_t offset, int bytes, BdrvRequestFlags flags) 181961007b31SStefan Hajnoczi { 182061007b31SStefan
Hajnoczi BlockDriver *drv = bs->drv; 182161007b31SStefan Hajnoczi QEMUIOVector qiov; 18220d93ed08SVladimir Sementsov-Ogievskiy void *buf = NULL; 182361007b31SStefan Hajnoczi int ret = 0; 1824465fe887SEric Blake bool need_flush = false; 1825443668caSDenis V. Lunev int head = 0; 1826443668caSDenis V. Lunev int tail = 0; 182761007b31SStefan Hajnoczi 1828cf081fcaSEric Blake int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); 1829a5b8dd2cSEric Blake int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1830a5b8dd2cSEric Blake bs->bl.request_alignment); 1831cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1832cf081fcaSEric Blake 1833d470ad42SMax Reitz if (!drv) { 1834d470ad42SMax Reitz return -ENOMEDIUM; 1835d470ad42SMax Reitz } 1836d470ad42SMax Reitz 1837fe0480d6SKevin Wolf if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) { 1838fe0480d6SKevin Wolf return -ENOTSUP; 1839fe0480d6SKevin Wolf } 1840fe0480d6SKevin Wolf 1841b8d0a980SEric Blake assert(alignment % bs->bl.request_alignment == 0); 1842b8d0a980SEric Blake head = offset % alignment; 1843f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % alignment; 1844b8d0a980SEric Blake max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1845b8d0a980SEric Blake assert(max_write_zeroes >= bs->bl.request_alignment); 184661007b31SStefan Hajnoczi 1847f5a5ca79SManos Pitsidianakis while (bytes > 0 && !ret) { 1848f5a5ca79SManos Pitsidianakis int num = bytes; 184961007b31SStefan Hajnoczi 185061007b31SStefan Hajnoczi /* Align request. Block drivers can expect the "bulk" of the request 1851443668caSDenis V. Lunev * to be aligned, and that unaligned requests do not cross cluster 1852443668caSDenis V. Lunev * boundaries. 185361007b31SStefan Hajnoczi */ 1854443668caSDenis V. Lunev if (head) { 1855b2f95feeSEric Blake /* Make a small request up to the first aligned sector. For 1856b2f95feeSEric Blake * convenience, limit this request to max_transfer even if 1857b2f95feeSEric Blake * we don't need to fall back to writes. */ 1858f5a5ca79SManos Pitsidianakis num = MIN(MIN(bytes, max_transfer), alignment - head); 1859b2f95feeSEric Blake head = (head + num) % alignment; 1860b2f95feeSEric Blake assert(num < max_write_zeroes); 1861d05aa8bbSEric Blake } else if (tail && num > alignment) { 1862443668caSDenis V. Lunev /* Shorten the request to the last aligned sector. */ 1863443668caSDenis V. 
Lunev num -= tail; 186461007b31SStefan Hajnoczi } 186561007b31SStefan Hajnoczi 186661007b31SStefan Hajnoczi /* limit request size */ 186761007b31SStefan Hajnoczi if (num > max_write_zeroes) { 186861007b31SStefan Hajnoczi num = max_write_zeroes; 186961007b31SStefan Hajnoczi } 187061007b31SStefan Hajnoczi 187161007b31SStefan Hajnoczi ret = -ENOTSUP; 187261007b31SStefan Hajnoczi /* First try the efficient write zeroes operation */ 1873d05aa8bbSEric Blake if (drv->bdrv_co_pwrite_zeroes) { 1874d05aa8bbSEric Blake ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, 1875d05aa8bbSEric Blake flags & bs->supported_zero_flags); 1876d05aa8bbSEric Blake if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && 1877d05aa8bbSEric Blake !(bs->supported_zero_flags & BDRV_REQ_FUA)) { 1878d05aa8bbSEric Blake need_flush = true; 1879d05aa8bbSEric Blake } 1880465fe887SEric Blake } else { 1881465fe887SEric Blake assert(!bs->supported_zero_flags); 188261007b31SStefan Hajnoczi } 188361007b31SStefan Hajnoczi 1884294682ccSAndrey Shinkevich if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) { 188561007b31SStefan Hajnoczi /* Fall back to bounce buffer if write zeroes is unsupported */ 1886465fe887SEric Blake BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; 1887465fe887SEric Blake 1888465fe887SEric Blake if ((flags & BDRV_REQ_FUA) && 1889465fe887SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1890465fe887SEric Blake /* No need for bdrv_driver_pwritev() to do a fallback 1891465fe887SEric Blake * flush on each chunk; use just one at the end */ 1892465fe887SEric Blake write_flags &= ~BDRV_REQ_FUA; 1893465fe887SEric Blake need_flush = true; 1894465fe887SEric Blake } 18955def6b80SEric Blake num = MIN(num, max_transfer); 18960d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) { 18970d93ed08SVladimir Sementsov-Ogievskiy buf = qemu_try_blockalign0(bs, num); 18980d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) { 189961007b31SStefan Hajnoczi ret = -ENOMEM; 190061007b31SStefan Hajnoczi goto fail; 190161007b31SStefan Hajnoczi } 190261007b31SStefan Hajnoczi } 19030d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&qiov, buf, num); 190461007b31SStefan Hajnoczi 1905ac850bf0SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags); 190661007b31SStefan Hajnoczi 190761007b31SStefan Hajnoczi /* Keep bounce buffer around if it is big enough for 190861007b31SStefan Hajnoczi * all future requests.
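 * (Illustrative: after a short head or tail chunk the buffer is freed
 * and, if more work remains, reallocated at the next chunk's size;
 * full max_transfer-sized chunks keep reusing a single buffer.)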
190961007b31SStefan Hajnoczi */ 19105def6b80SEric Blake if (num < max_transfer) { 19110d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 19120d93ed08SVladimir Sementsov-Ogievskiy buf = NULL; 191361007b31SStefan Hajnoczi } 191461007b31SStefan Hajnoczi } 191561007b31SStefan Hajnoczi 1916d05aa8bbSEric Blake offset += num; 1917f5a5ca79SManos Pitsidianakis bytes -= num; 191861007b31SStefan Hajnoczi } 191961007b31SStefan Hajnoczi 192061007b31SStefan Hajnoczi fail: 1921465fe887SEric Blake if (ret == 0 && need_flush) { 1922465fe887SEric Blake ret = bdrv_co_flush(bs); 1923465fe887SEric Blake } 19240d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 192561007b31SStefan Hajnoczi return ret; 192661007b31SStefan Hajnoczi } 192761007b31SStefan Hajnoczi 192885fe2479SFam Zheng static inline int coroutine_fn 192985fe2479SFam Zheng bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, 193085fe2479SFam Zheng BdrvTrackedRequest *req, int flags) 193185fe2479SFam Zheng { 193285fe2479SFam Zheng BlockDriverState *bs = child->bs; 193385fe2479SFam Zheng int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 193485fe2479SFam Zheng 193585fe2479SFam Zheng if (bs->read_only) { 193685fe2479SFam Zheng return -EPERM; 193785fe2479SFam Zheng } 193885fe2479SFam Zheng 193985fe2479SFam Zheng assert(!(bs->open_flags & BDRV_O_INACTIVE)); 194085fe2479SFam Zheng assert((bs->open_flags & BDRV_O_NO_IO) == 0); 194185fe2479SFam Zheng assert(!(flags & ~BDRV_REQ_MASK)); 1942d1a764d1SVladimir Sementsov-Ogievskiy assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING))); 194385fe2479SFam Zheng 194485fe2479SFam Zheng if (flags & BDRV_REQ_SERIALISING) { 1945d1a764d1SVladimir Sementsov-Ogievskiy QEMU_LOCK_GUARD(&bs->reqs_lock); 1946d1a764d1SVladimir Sementsov-Ogievskiy 1947d1a764d1SVladimir Sementsov-Ogievskiy tracked_request_set_serialising(req, bdrv_get_cluster_size(bs)); 1948d1a764d1SVladimir Sementsov-Ogievskiy 1949d1a764d1SVladimir Sementsov-Ogievskiy if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) { 1950d1a764d1SVladimir Sementsov-Ogievskiy return -EBUSY; 1951d1a764d1SVladimir Sementsov-Ogievskiy } 1952d1a764d1SVladimir Sementsov-Ogievskiy 1953d1a764d1SVladimir Sementsov-Ogievskiy bdrv_wait_serialising_requests_locked(req); 195418fbd0deSPaolo Bonzini } else { 195518fbd0deSPaolo Bonzini bdrv_wait_serialising_requests(req); 195685fe2479SFam Zheng } 195785fe2479SFam Zheng 195885fe2479SFam Zheng assert(req->overlap_offset <= offset); 195985fe2479SFam Zheng assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1960cd47d792SFam Zheng assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); 196185fe2479SFam Zheng 1962cd47d792SFam Zheng switch (req->type) { 1963cd47d792SFam Zheng case BDRV_TRACKED_WRITE: 1964cd47d792SFam Zheng case BDRV_TRACKED_DISCARD: 196585fe2479SFam Zheng if (flags & BDRV_REQ_WRITE_UNCHANGED) { 196685fe2479SFam Zheng assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 196785fe2479SFam Zheng } else { 196885fe2479SFam Zheng assert(child->perm & BLK_PERM_WRITE); 196985fe2479SFam Zheng } 1970cd47d792SFam Zheng return notifier_with_return_list_notify(&bs->before_write_notifiers, 1971cd47d792SFam Zheng req); 1972cd47d792SFam Zheng case BDRV_TRACKED_TRUNCATE: 1973cd47d792SFam Zheng assert(child->perm & BLK_PERM_RESIZE); 1974cd47d792SFam Zheng return 0; 1975cd47d792SFam Zheng default: 1976cd47d792SFam Zheng abort(); 1977cd47d792SFam Zheng } 197885fe2479SFam Zheng } 197985fe2479SFam Zheng 198085fe2479SFam 
Zheng static inline void coroutine_fn 198185fe2479SFam Zheng bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, 198285fe2479SFam Zheng BdrvTrackedRequest *req, int ret) 198385fe2479SFam Zheng { 198485fe2479SFam Zheng int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 198585fe2479SFam Zheng BlockDriverState *bs = child->bs; 198685fe2479SFam Zheng 1987d73415a3SStefan Hajnoczi qatomic_inc(&bs->write_gen); 198885fe2479SFam Zheng 198900695c27SFam Zheng /* 199000695c27SFam Zheng * Discard cannot extend the image, but in error handling cases, such as 199100695c27SFam Zheng * when reverting a qcow2 cluster allocation, the discarded range can pass 199200695c27SFam Zheng * the end of the image file, so we cannot assert about BDRV_TRACKED_DISCARD 199300695c27SFam Zheng * here. Instead, just skip it, since semantically a discard request 199400695c27SFam Zheng * beyond EOF cannot expand the image anyway. 199500695c27SFam Zheng */ 19967f8f03efSFam Zheng if (ret == 0 && 1997cd47d792SFam Zheng (req->type == BDRV_TRACKED_TRUNCATE || 1998cd47d792SFam Zheng end_sector > bs->total_sectors) && 199900695c27SFam Zheng req->type != BDRV_TRACKED_DISCARD) { 20007f8f03efSFam Zheng bs->total_sectors = end_sector; 20017f8f03efSFam Zheng bdrv_parent_cb_resize(bs); 20027f8f03efSFam Zheng bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); 200385fe2479SFam Zheng } 200400695c27SFam Zheng if (req->bytes) { 200500695c27SFam Zheng switch (req->type) { 200600695c27SFam Zheng case BDRV_TRACKED_WRITE: 200700695c27SFam Zheng stat64_max(&bs->wr_highest_offset, offset + bytes); 200800695c27SFam Zheng /* fall through, to set dirty bits */ 200900695c27SFam Zheng case BDRV_TRACKED_DISCARD: 20107f8f03efSFam Zheng bdrv_set_dirty(bs, offset, bytes); 201100695c27SFam Zheng break; 201200695c27SFam Zheng default: 201300695c27SFam Zheng break; 201400695c27SFam Zheng } 201500695c27SFam Zheng } 201685fe2479SFam Zheng } 201785fe2479SFam Zheng 201861007b31SStefan Hajnoczi /* 201904ed95f4SEric Blake * Forwards an already correctly aligned write request to the BlockDriver, 202004ed95f4SEric Blake * after possibly fragmenting it.
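 *
 * For example (hypothetical limits), with max_transfer = 1 MiB an aligned
 * 2.5 MiB write is issued as two 1 MiB chunks plus one 512 KiB chunk; if
 * FUA must be emulated by a flush, only the final chunk keeps BDRV_REQ_FUA.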
202161007b31SStefan Hajnoczi */ 202285c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 202361007b31SStefan Hajnoczi BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 202428c4da28SVladimir Sementsov-Ogievskiy int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) 202561007b31SStefan Hajnoczi { 202685c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 202761007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 202861007b31SStefan Hajnoczi int ret; 202961007b31SStefan Hajnoczi 203004ed95f4SEric Blake uint64_t bytes_remaining = bytes; 203104ed95f4SEric Blake int max_transfer; 203261007b31SStefan Hajnoczi 2033d470ad42SMax Reitz if (!drv) { 2034d470ad42SMax Reitz return -ENOMEDIUM; 2035d470ad42SMax Reitz } 2036d470ad42SMax Reitz 2037d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 2038d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 2039d6883bc9SVladimir Sementsov-Ogievskiy } 2040d6883bc9SVladimir Sementsov-Ogievskiy 2041cff86b38SEric Blake assert(is_power_of_2(align)); 2042cff86b38SEric Blake assert((offset & (align - 1)) == 0); 2043cff86b38SEric Blake assert((bytes & (align - 1)) == 0); 204428c4da28SVladimir Sementsov-Ogievskiy assert(!qiov || qiov_offset + bytes <= qiov->size); 204504ed95f4SEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 204604ed95f4SEric Blake align); 204761007b31SStefan Hajnoczi 204885fe2479SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 204961007b31SStefan Hajnoczi 205061007b31SStefan Hajnoczi if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 2051c1499a5eSEric Blake !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 205228c4da28SVladimir Sementsov-Ogievskiy qemu_iovec_is_zero(qiov, qiov_offset, bytes)) { 205361007b31SStefan Hajnoczi flags |= BDRV_REQ_ZERO_WRITE; 205461007b31SStefan Hajnoczi if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 205561007b31SStefan Hajnoczi flags |= BDRV_REQ_MAY_UNMAP; 205661007b31SStefan Hajnoczi } 205761007b31SStefan Hajnoczi } 205861007b31SStefan Hajnoczi 205961007b31SStefan Hajnoczi if (ret < 0) { 206061007b31SStefan Hajnoczi /* Do nothing, write notifier decided to fail this request */ 206161007b31SStefan Hajnoczi } else if (flags & BDRV_REQ_ZERO_WRITE) { 20629a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 20639896c876SKevin Wolf ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 20643ea1a091SPavel Butsykin } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 206528c4da28SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, 206628c4da28SVladimir Sementsov-Ogievskiy qiov, qiov_offset); 206704ed95f4SEric Blake } else if (bytes <= max_transfer) { 20689a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV); 206928c4da28SVladimir Sementsov-Ogievskiy ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags); 207004ed95f4SEric Blake } else { 207104ed95f4SEric Blake bdrv_debug_event(bs, BLKDBG_PWRITEV); 207204ed95f4SEric Blake while (bytes_remaining) { 207304ed95f4SEric Blake int num = MIN(bytes_remaining, max_transfer); 207404ed95f4SEric Blake int local_flags = flags; 207504ed95f4SEric Blake 207604ed95f4SEric Blake assert(num); 207704ed95f4SEric Blake if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 207804ed95f4SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 207904ed95f4SEric Blake /* If FUA is going to be emulated by flush, we only 208004ed95f4SEric Blake * need to flush on 
the last iteration */ 208104ed95f4SEric Blake local_flags &= ~BDRV_REQ_FUA; 208204ed95f4SEric Blake } 208304ed95f4SEric Blake 208404ed95f4SEric Blake ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 2085134b7decSMax Reitz num, qiov, 2086134b7decSMax Reitz qiov_offset + bytes - bytes_remaining, 208728c4da28SVladimir Sementsov-Ogievskiy local_flags); 208804ed95f4SEric Blake if (ret < 0) { 208904ed95f4SEric Blake break; 209004ed95f4SEric Blake } 209104ed95f4SEric Blake bytes_remaining -= num; 209204ed95f4SEric Blake } 209361007b31SStefan Hajnoczi } 20949a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 209561007b31SStefan Hajnoczi 209661007b31SStefan Hajnoczi if (ret >= 0) { 209704ed95f4SEric Blake ret = 0; 209861007b31SStefan Hajnoczi } 209985fe2479SFam Zheng bdrv_co_write_req_finish(child, offset, bytes, req, ret); 210061007b31SStefan Hajnoczi 210161007b31SStefan Hajnoczi return ret; 210261007b31SStefan Hajnoczi } 210361007b31SStefan Hajnoczi 210485c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 21059eeb6dd1SFam Zheng int64_t offset, 21069eeb6dd1SFam Zheng unsigned int bytes, 21079eeb6dd1SFam Zheng BdrvRequestFlags flags, 21089eeb6dd1SFam Zheng BdrvTrackedRequest *req) 21099eeb6dd1SFam Zheng { 211085c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 21119eeb6dd1SFam Zheng QEMUIOVector local_qiov; 2112a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 21139eeb6dd1SFam Zheng int ret = 0; 21147a3f542fSVladimir Sementsov-Ogievskiy bool padding; 21157a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 21169eeb6dd1SFam Zheng 21177a3f542fSVladimir Sementsov-Ogievskiy padding = bdrv_init_padding(bs, offset, bytes, &pad); 21187a3f542fSVladimir Sementsov-Ogievskiy if (padding) { 21198ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(req, align); 21209eeb6dd1SFam Zheng 21217a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_rmw_read(child, req, &pad, true); 21227a3f542fSVladimir Sementsov-Ogievskiy 21237a3f542fSVladimir Sementsov-Ogievskiy if (pad.head || pad.merge_reads) { 21247a3f542fSVladimir Sementsov-Ogievskiy int64_t aligned_offset = offset & ~(align - 1); 21257a3f542fSVladimir Sementsov-Ogievskiy int64_t write_bytes = pad.merge_reads ? pad.buf_len : align; 21267a3f542fSVladimir Sementsov-Ogievskiy 21277a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes); 21287a3f542fSVladimir Sementsov-Ogievskiy ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes, 212928c4da28SVladimir Sementsov-Ogievskiy align, &local_qiov, 0, 21309eeb6dd1SFam Zheng flags & ~BDRV_REQ_ZERO_WRITE); 21317a3f542fSVladimir Sementsov-Ogievskiy if (ret < 0 || pad.merge_reads) { 21327a3f542fSVladimir Sementsov-Ogievskiy /* Error or all work is done */ 21337a3f542fSVladimir Sementsov-Ogievskiy goto out; 21349eeb6dd1SFam Zheng } 21357a3f542fSVladimir Sementsov-Ogievskiy offset += write_bytes - pad.head; 21367a3f542fSVladimir Sementsov-Ogievskiy bytes -= write_bytes - pad.head; 21377a3f542fSVladimir Sementsov-Ogievskiy } 21389eeb6dd1SFam Zheng } 21399eeb6dd1SFam Zheng 21409eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 21419eeb6dd1SFam Zheng if (bytes >= align) { 21429eeb6dd1SFam Zheng /* Write the aligned part in the middle. 
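 * (Illustrative: with align = 4096, zeroing 10000 bytes at offset
 * 5000 becomes an RMW head write for [4096, 8192), this aligned
 * middle write for [8192, 12288), and an RMW tail write for
 * [12288, 16384).)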
*/ 21439eeb6dd1SFam Zheng uint64_t aligned_bytes = bytes & ~(align - 1); 214485c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 214528c4da28SVladimir Sementsov-Ogievskiy NULL, 0, flags); 21469eeb6dd1SFam Zheng if (ret < 0) { 21477a3f542fSVladimir Sementsov-Ogievskiy goto out; 21489eeb6dd1SFam Zheng } 21499eeb6dd1SFam Zheng bytes -= aligned_bytes; 21509eeb6dd1SFam Zheng offset += aligned_bytes; 21519eeb6dd1SFam Zheng } 21529eeb6dd1SFam Zheng 21539eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 21549eeb6dd1SFam Zheng if (bytes) { 21557a3f542fSVladimir Sementsov-Ogievskiy assert(align == pad.tail + bytes); 21569eeb6dd1SFam Zheng 21577a3f542fSVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align); 215885c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, align, align, 215928c4da28SVladimir Sementsov-Ogievskiy &local_qiov, 0, 216028c4da28SVladimir Sementsov-Ogievskiy flags & ~BDRV_REQ_ZERO_WRITE); 21619eeb6dd1SFam Zheng } 21629eeb6dd1SFam Zheng 21637a3f542fSVladimir Sementsov-Ogievskiy out: 21647a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 21657a3f542fSVladimir Sementsov-Ogievskiy 21667a3f542fSVladimir Sementsov-Ogievskiy return ret; 21679eeb6dd1SFam Zheng } 21689eeb6dd1SFam Zheng 216961007b31SStefan Hajnoczi /* 217061007b31SStefan Hajnoczi * Handle a write request in coroutine context 217161007b31SStefan Hajnoczi */ 2172a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 217361007b31SStefan Hajnoczi int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 217461007b31SStefan Hajnoczi BdrvRequestFlags flags) 217561007b31SStefan Hajnoczi { 21761acc3466SVladimir Sementsov-Ogievskiy return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); 21771acc3466SVladimir Sementsov-Ogievskiy } 21781acc3466SVladimir Sementsov-Ogievskiy 21791acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, 21801acc3466SVladimir Sementsov-Ogievskiy int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, 21811acc3466SVladimir Sementsov-Ogievskiy BdrvRequestFlags flags) 21821acc3466SVladimir Sementsov-Ogievskiy { 2183a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 218461007b31SStefan Hajnoczi BdrvTrackedRequest req; 2185a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 21867a3f542fSVladimir Sementsov-Ogievskiy BdrvRequestPadding pad; 218761007b31SStefan Hajnoczi int ret; 2188f0deecffSVladimir Sementsov-Ogievskiy bool padded = false; 218961007b31SStefan Hajnoczi 2190f42cf447SDaniel P. Berrange trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); 2191f42cf447SDaniel P. 
Berrange 2192f4dad307SVladimir Sementsov-Ogievskiy if (!bdrv_is_inserted(bs)) { 219361007b31SStefan Hajnoczi return -ENOMEDIUM; 219461007b31SStefan Hajnoczi } 219561007b31SStefan Hajnoczi 219663f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); 219761007b31SStefan Hajnoczi if (ret < 0) { 219861007b31SStefan Hajnoczi return ret; 219961007b31SStefan Hajnoczi } 220061007b31SStefan Hajnoczi 2201f2208fdcSAlberto Garcia /* If the request is misaligned then we can't make it efficient */ 2202f2208fdcSAlberto Garcia if ((flags & BDRV_REQ_NO_FALLBACK) && 2203f2208fdcSAlberto Garcia !QEMU_IS_ALIGNED(offset | bytes, align)) 2204f2208fdcSAlberto Garcia { 2205f2208fdcSAlberto Garcia return -ENOTSUP; 2206f2208fdcSAlberto Garcia } 2207f2208fdcSAlberto Garcia 2208ac9d00bfSVladimir Sementsov-Ogievskiy if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) { 2209ac9d00bfSVladimir Sementsov-Ogievskiy /* 2210ac9d00bfSVladimir Sementsov-Ogievskiy * Aligning a zero-length request is nonsense. Even if the driver has a 2211ac9d00bfSVladimir Sementsov-Ogievskiy * special meaning for zero length (like qcow2_co_pwritev_compressed_part), 2212ac9d00bfSVladimir Sementsov-Ogievskiy * we can't pass it to the driver due to request_alignment. 2213ac9d00bfSVladimir Sementsov-Ogievskiy * 2214ac9d00bfSVladimir Sementsov-Ogievskiy * Still, there is no reason to return an error if someone does an 2215ac9d00bfSVladimir Sementsov-Ogievskiy * unaligned zero-length write occasionally. 2216ac9d00bfSVladimir Sementsov-Ogievskiy */ 2217ac9d00bfSVladimir Sementsov-Ogievskiy return 0; 2218ac9d00bfSVladimir Sementsov-Ogievskiy } 2219ac9d00bfSVladimir Sementsov-Ogievskiy 2220f0deecffSVladimir Sementsov-Ogievskiy if (!(flags & BDRV_REQ_ZERO_WRITE)) { 222161007b31SStefan Hajnoczi /* 2222f0deecffSVladimir Sementsov-Ogievskiy * Pad the request for the following read-modify-write cycle. 2223f0deecffSVladimir Sementsov-Ogievskiy * bdrv_co_do_zero_pwritev() does the aligning by itself, so we do 2224f0deecffSVladimir Sementsov-Ogievskiy * the alignment here only if there is no ZERO flag. 222561007b31SStefan Hajnoczi */ 222698ca4549SVladimir Sementsov-Ogievskiy ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad, 222798ca4549SVladimir Sementsov-Ogievskiy &padded); 222898ca4549SVladimir Sementsov-Ogievskiy if (ret < 0) { 222998ca4549SVladimir Sementsov-Ogievskiy return ret; 223098ca4549SVladimir Sementsov-Ogievskiy } 2231f0deecffSVladimir Sementsov-Ogievskiy } 2232f0deecffSVladimir Sementsov-Ogievskiy 2233f0deecffSVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(bs); 2234ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 223561007b31SStefan Hajnoczi 223618a59f03SAnton Nefedov if (flags & BDRV_REQ_ZERO_WRITE) { 2237f0deecffSVladimir Sementsov-Ogievskiy assert(!padded); 223885c97ca7SKevin Wolf ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 22399eeb6dd1SFam Zheng goto out; 22409eeb6dd1SFam Zheng } 22419eeb6dd1SFam Zheng 2242f0deecffSVladimir Sementsov-Ogievskiy if (padded) { 2243f0deecffSVladimir Sementsov-Ogievskiy /* 2244f0deecffSVladimir Sementsov-Ogievskiy * Request was unaligned to request_alignment and therefore 2245f0deecffSVladimir Sementsov-Ogievskiy * padded. We are going to do read-modify-write, and must 2246f0deecffSVladimir Sementsov-Ogievskiy * serialize the request to prevent interactions of the 2247f0deecffSVladimir Sementsov-Ogievskiy * widened region with other transactions. 
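 * (bdrv_make_request_serialising() below rounds the serialised range
 * out to the request alignment, so the padded head and tail are
 * covered as well.)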
2248f0deecffSVladimir Sementsov-Ogievskiy */ 22498ac5aab2SVladimir Sementsov-Ogievskiy bdrv_make_request_serialising(&req, align); 22507a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_rmw_read(child, &req, &pad, false); 225161007b31SStefan Hajnoczi } 225261007b31SStefan Hajnoczi 225385c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 22541acc3466SVladimir Sementsov-Ogievskiy qiov, qiov_offset, flags); 225561007b31SStefan Hajnoczi 22567a3f542fSVladimir Sementsov-Ogievskiy bdrv_padding_destroy(&pad); 225761007b31SStefan Hajnoczi 22589eeb6dd1SFam Zheng out: 22599eeb6dd1SFam Zheng tracked_request_end(&req); 226099723548SPaolo Bonzini bdrv_dec_in_flight(bs); 22617a3f542fSVladimir Sementsov-Ogievskiy 226261007b31SStefan Hajnoczi return ret; 226361007b31SStefan Hajnoczi } 226461007b31SStefan Hajnoczi 2265a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 2266f5a5ca79SManos Pitsidianakis int bytes, BdrvRequestFlags flags) 226761007b31SStefan Hajnoczi { 2268f5a5ca79SManos Pitsidianakis trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 226961007b31SStefan Hajnoczi 2270a03ef88fSKevin Wolf if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 227161007b31SStefan Hajnoczi flags &= ~BDRV_REQ_MAY_UNMAP; 227261007b31SStefan Hajnoczi } 227361007b31SStefan Hajnoczi 2274f5a5ca79SManos Pitsidianakis return bdrv_co_pwritev(child, offset, bytes, NULL, 227561007b31SStefan Hajnoczi BDRV_REQ_ZERO_WRITE | flags); 227661007b31SStefan Hajnoczi } 227761007b31SStefan Hajnoczi 22784085f5c7SJohn Snow /* 22794085f5c7SJohn Snow * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not. 22804085f5c7SJohn Snow */ 22814085f5c7SJohn Snow int bdrv_flush_all(void) 22824085f5c7SJohn Snow { 22834085f5c7SJohn Snow BdrvNextIterator it; 22844085f5c7SJohn Snow BlockDriverState *bs = NULL; 22854085f5c7SJohn Snow int result = 0; 22864085f5c7SJohn Snow 2287c8aa7895SPavel Dovgalyuk /* 2288c8aa7895SPavel Dovgalyuk * bdrv queue is managed by record/replay, 2289c8aa7895SPavel Dovgalyuk * creating new flush request for stopping 2290c8aa7895SPavel Dovgalyuk * the VM may break the determinism 2291c8aa7895SPavel Dovgalyuk */ 2292c8aa7895SPavel Dovgalyuk if (replay_events_enabled()) { 2293c8aa7895SPavel Dovgalyuk return result; 2294c8aa7895SPavel Dovgalyuk } 2295c8aa7895SPavel Dovgalyuk 22964085f5c7SJohn Snow for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 22974085f5c7SJohn Snow AioContext *aio_context = bdrv_get_aio_context(bs); 22984085f5c7SJohn Snow int ret; 22994085f5c7SJohn Snow 23004085f5c7SJohn Snow aio_context_acquire(aio_context); 23014085f5c7SJohn Snow ret = bdrv_flush(bs); 23024085f5c7SJohn Snow if (ret < 0 && !result) { 23034085f5c7SJohn Snow result = ret; 23044085f5c7SJohn Snow } 23054085f5c7SJohn Snow aio_context_release(aio_context); 23064085f5c7SJohn Snow } 23074085f5c7SJohn Snow 23084085f5c7SJohn Snow return result; 23094085f5c7SJohn Snow } 23104085f5c7SJohn Snow 231161007b31SStefan Hajnoczi /* 231261007b31SStefan Hajnoczi * Returns the allocation status of the specified sectors. 231361007b31SStefan Hajnoczi * Drivers not implementing the functionality are assumed to not support 231461007b31SStefan Hajnoczi * backing files, hence all their sectors are reported as allocated. 
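 *
 * For illustration only (not a call site in this file), a caller of
 * the public wrapper defined further down might do:
 *
 *     int64_t pnum, map;
 *     BlockDriverState *file;
 *     int ret = bdrv_block_status(bs, offset, bytes, &pnum, &map, &file);
 *     if (ret >= 0 && (ret & BDRV_BLOCK_ZERO)) {
 *         ... the first pnum bytes at offset read as zeroes ...
 *     }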
231561007b31SStefan Hajnoczi * 231686a3d5c6SEric Blake * If 'want_zero' is true, the caller is querying for mapping 231786a3d5c6SEric Blake * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 231886a3d5c6SEric Blake * _ZERO where possible; otherwise, the result favors larger 'pnum', 231986a3d5c6SEric Blake * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2320c9ce8c4dSEric Blake * 23212e8bc787SEric Blake * If 'offset' is beyond the end of the disk image the return value is 2322fb0d8654SEric Blake * BDRV_BLOCK_EOF and 'pnum' is set to 0. 232361007b31SStefan Hajnoczi * 23242e8bc787SEric Blake * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2325fb0d8654SEric Blake * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2326fb0d8654SEric Blake * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 232767a0fd2aSFam Zheng * 23282e8bc787SEric Blake * 'pnum' is set to the number of bytes (including and immediately 23292e8bc787SEric Blake * following the specified offset) that are easily known to be in the 23302e8bc787SEric Blake * same allocated/unallocated state. Note that a second call starting 23312e8bc787SEric Blake * at the original offset plus returned pnum may have the same status. 23322e8bc787SEric Blake * The returned value is non-zero on success except at end-of-file. 23332e8bc787SEric Blake * 23342e8bc787SEric Blake * Returns negative errno on failure. Otherwise, if the 23352e8bc787SEric Blake * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 23362e8bc787SEric Blake * set to the host mapping and BDS corresponding to the guest offset. 233761007b31SStefan Hajnoczi */ 23382e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2339c9ce8c4dSEric Blake bool want_zero, 23402e8bc787SEric Blake int64_t offset, int64_t bytes, 23412e8bc787SEric Blake int64_t *pnum, int64_t *map, 234267a0fd2aSFam Zheng BlockDriverState **file) 234361007b31SStefan Hajnoczi { 23442e8bc787SEric Blake int64_t total_size; 23452e8bc787SEric Blake int64_t n; /* bytes */ 2346efa6e2edSEric Blake int ret; 23472e8bc787SEric Blake int64_t local_map = 0; 2348298a1665SEric Blake BlockDriverState *local_file = NULL; 2349efa6e2edSEric Blake int64_t aligned_offset, aligned_bytes; 2350efa6e2edSEric Blake uint32_t align; 2351549ec0d9SMax Reitz bool has_filtered_child; 235261007b31SStefan Hajnoczi 2353298a1665SEric Blake assert(pnum); 2354298a1665SEric Blake *pnum = 0; 23552e8bc787SEric Blake total_size = bdrv_getlength(bs); 23562e8bc787SEric Blake if (total_size < 0) { 23572e8bc787SEric Blake ret = total_size; 2358298a1665SEric Blake goto early_out; 235961007b31SStefan Hajnoczi } 236061007b31SStefan Hajnoczi 23612e8bc787SEric Blake if (offset >= total_size) { 2362298a1665SEric Blake ret = BDRV_BLOCK_EOF; 2363298a1665SEric Blake goto early_out; 236461007b31SStefan Hajnoczi } 23652e8bc787SEric Blake if (!bytes) { 2366298a1665SEric Blake ret = 0; 2367298a1665SEric Blake goto early_out; 23689cdcfd9fSEric Blake } 236961007b31SStefan Hajnoczi 23702e8bc787SEric Blake n = total_size - offset; 23712e8bc787SEric Blake if (n < bytes) { 23722e8bc787SEric Blake bytes = n; 237361007b31SStefan Hajnoczi } 237461007b31SStefan Hajnoczi 2375d470ad42SMax Reitz /* Must be non-NULL or bdrv_getlength() would have failed */ 2376d470ad42SMax Reitz assert(bs->drv); 2377549ec0d9SMax Reitz has_filtered_child = bdrv_filter_child(bs); 2378549ec0d9SMax Reitz if (!bs->drv->bdrv_co_block_status && !has_filtered_child) { 23792e8bc787SEric 
Blake *pnum = bytes; 238061007b31SStefan Hajnoczi ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 23812e8bc787SEric Blake if (offset + bytes == total_size) { 2382fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2383fb0d8654SEric Blake } 238461007b31SStefan Hajnoczi if (bs->drv->protocol_name) { 23852e8bc787SEric Blake ret |= BDRV_BLOCK_OFFSET_VALID; 23862e8bc787SEric Blake local_map = offset; 2387298a1665SEric Blake local_file = bs; 238861007b31SStefan Hajnoczi } 2389298a1665SEric Blake goto early_out; 239061007b31SStefan Hajnoczi } 239161007b31SStefan Hajnoczi 239299723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2393efa6e2edSEric Blake 2394efa6e2edSEric Blake /* Round out to request_alignment boundaries */ 239586a3d5c6SEric Blake align = bs->bl.request_alignment; 2396efa6e2edSEric Blake aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2397efa6e2edSEric Blake aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2398efa6e2edSEric Blake 2399549ec0d9SMax Reitz if (bs->drv->bdrv_co_block_status) { 240086a3d5c6SEric Blake ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 240186a3d5c6SEric Blake aligned_bytes, pnum, &local_map, 240286a3d5c6SEric Blake &local_file); 2403549ec0d9SMax Reitz } else { 2404549ec0d9SMax Reitz /* Default code for filters */ 2405549ec0d9SMax Reitz 2406549ec0d9SMax Reitz local_file = bdrv_filter_bs(bs); 2407549ec0d9SMax Reitz assert(local_file); 2408549ec0d9SMax Reitz 2409549ec0d9SMax Reitz *pnum = aligned_bytes; 2410549ec0d9SMax Reitz local_map = aligned_offset; 2411549ec0d9SMax Reitz ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 2412549ec0d9SMax Reitz } 241386a3d5c6SEric Blake if (ret < 0) { 241486a3d5c6SEric Blake *pnum = 0; 241586a3d5c6SEric Blake goto out; 241686a3d5c6SEric Blake } 2417efa6e2edSEric Blake 2418efa6e2edSEric Blake /* 2419636cb512SEric Blake * The driver's result must be a non-zero multiple of request_alignment. 2420efa6e2edSEric Blake * Clamp pnum and adjust map to original request. 
2421efa6e2edSEric Blake */ 2422636cb512SEric Blake assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2423636cb512SEric Blake align > offset - aligned_offset); 242469f47505SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_RECURSE) { 242569f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_DATA); 242669f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_OFFSET_VALID); 242769f47505SVladimir Sementsov-Ogievskiy assert(!(ret & BDRV_BLOCK_ZERO)); 242869f47505SVladimir Sementsov-Ogievskiy } 242969f47505SVladimir Sementsov-Ogievskiy 2430efa6e2edSEric Blake *pnum -= offset - aligned_offset; 2431efa6e2edSEric Blake if (*pnum > bytes) { 2432efa6e2edSEric Blake *pnum = bytes; 2433efa6e2edSEric Blake } 2434efa6e2edSEric Blake if (ret & BDRV_BLOCK_OFFSET_VALID) { 2435efa6e2edSEric Blake local_map += offset - aligned_offset; 2436efa6e2edSEric Blake } 243761007b31SStefan Hajnoczi 243861007b31SStefan Hajnoczi if (ret & BDRV_BLOCK_RAW) { 2439298a1665SEric Blake assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 24402e8bc787SEric Blake ret = bdrv_co_block_status(local_file, want_zero, local_map, 24412e8bc787SEric Blake *pnum, pnum, &local_map, &local_file); 244299723548SPaolo Bonzini goto out; 244361007b31SStefan Hajnoczi } 244461007b31SStefan Hajnoczi 244561007b31SStefan Hajnoczi if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 244661007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ALLOCATED; 2447d40f4a56SAlberto Garcia } else if (bs->drv->supports_backing) { 2448cb850315SMax Reitz BlockDriverState *cow_bs = bdrv_cow_bs(bs); 2449cb850315SMax Reitz 2450d40f4a56SAlberto Garcia if (!cow_bs) { 2451d40f4a56SAlberto Garcia ret |= BDRV_BLOCK_ZERO; 2452d40f4a56SAlberto Garcia } else if (want_zero) { 2453cb850315SMax Reitz int64_t size2 = bdrv_getlength(cow_bs); 2454c9ce8c4dSEric Blake 24552e8bc787SEric Blake if (size2 >= 0 && offset >= size2) { 245661007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 245761007b31SStefan Hajnoczi } 24587b1efe99SVladimir Sementsov-Ogievskiy } 245961007b31SStefan Hajnoczi } 246061007b31SStefan Hajnoczi 246169f47505SVladimir Sementsov-Ogievskiy if (want_zero && ret & BDRV_BLOCK_RECURSE && 246269f47505SVladimir Sementsov-Ogievskiy local_file && local_file != bs && 246361007b31SStefan Hajnoczi (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 246461007b31SStefan Hajnoczi (ret & BDRV_BLOCK_OFFSET_VALID)) { 24652e8bc787SEric Blake int64_t file_pnum; 24662e8bc787SEric Blake int ret2; 246761007b31SStefan Hajnoczi 24682e8bc787SEric Blake ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 24692e8bc787SEric Blake *pnum, &file_pnum, NULL, NULL); 247061007b31SStefan Hajnoczi if (ret2 >= 0) { 247161007b31SStefan Hajnoczi /* Ignore errors. This is just providing extra information, it 247261007b31SStefan Hajnoczi * is useful but not necessary. 247361007b31SStefan Hajnoczi */ 2474c61e684eSEric Blake if (ret2 & BDRV_BLOCK_EOF && 2475c61e684eSEric Blake (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2476c61e684eSEric Blake /* 2477c61e684eSEric Blake * It is valid for the format block driver to read 2478c61e684eSEric Blake * beyond the end of the underlying file's current 2479c61e684eSEric Blake * size; such areas read as zero. 
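 * (For example, a format driver may have grown the virtual disk
 * before extending the protocol file; reading the not-yet-written
 * area is defined to return zeroes.)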
2480c61e684eSEric Blake */ 248161007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 248261007b31SStefan Hajnoczi } else { 248361007b31SStefan Hajnoczi /* Limit request to the range reported by the protocol driver */ 248461007b31SStefan Hajnoczi *pnum = file_pnum; 248561007b31SStefan Hajnoczi ret |= (ret2 & BDRV_BLOCK_ZERO); 248661007b31SStefan Hajnoczi } 248761007b31SStefan Hajnoczi } 248861007b31SStefan Hajnoczi } 248961007b31SStefan Hajnoczi 249099723548SPaolo Bonzini out: 249199723548SPaolo Bonzini bdrv_dec_in_flight(bs); 24922e8bc787SEric Blake if (ret >= 0 && offset + *pnum == total_size) { 2493fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2494fb0d8654SEric Blake } 2495298a1665SEric Blake early_out: 2496298a1665SEric Blake if (file) { 2497298a1665SEric Blake *file = local_file; 2498298a1665SEric Blake } 24992e8bc787SEric Blake if (map) { 25002e8bc787SEric Blake *map = local_map; 25012e8bc787SEric Blake } 250261007b31SStefan Hajnoczi return ret; 250361007b31SStefan Hajnoczi } 250461007b31SStefan Hajnoczi 250521c2283eSVladimir Sementsov-Ogievskiy int coroutine_fn 2506f9e694cbSVladimir Sementsov-Ogievskiy bdrv_co_common_block_status_above(BlockDriverState *bs, 2507ba3f0e25SFam Zheng BlockDriverState *base, 25083555a432SVladimir Sementsov-Ogievskiy bool include_base, 2509c9ce8c4dSEric Blake bool want_zero, 25105b648c67SEric Blake int64_t offset, 25115b648c67SEric Blake int64_t bytes, 25125b648c67SEric Blake int64_t *pnum, 25135b648c67SEric Blake int64_t *map, 2514a92b1b06SEric Blake BlockDriverState **file, 2515a92b1b06SEric Blake int *depth) 2516ba3f0e25SFam Zheng { 251767c095c8SVladimir Sementsov-Ogievskiy int ret; 2518ba3f0e25SFam Zheng BlockDriverState *p; 251967c095c8SVladimir Sementsov-Ogievskiy int64_t eof = 0; 2520a92b1b06SEric Blake int dummy; 2521ba3f0e25SFam Zheng 25223555a432SVladimir Sementsov-Ogievskiy assert(!include_base || base); /* Can't include NULL base */ 252367c095c8SVladimir Sementsov-Ogievskiy 2524a92b1b06SEric Blake if (!depth) { 2525a92b1b06SEric Blake depth = &dummy; 2526a92b1b06SEric Blake } 2527a92b1b06SEric Blake *depth = 0; 2528a92b1b06SEric Blake 2529624f27bbSVladimir Sementsov-Ogievskiy if (!include_base && bs == base) { 2530624f27bbSVladimir Sementsov-Ogievskiy *pnum = bytes; 2531624f27bbSVladimir Sementsov-Ogievskiy return 0; 2532624f27bbSVladimir Sementsov-Ogievskiy } 2533624f27bbSVladimir Sementsov-Ogievskiy 253467c095c8SVladimir Sementsov-Ogievskiy ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file); 2535a92b1b06SEric Blake ++*depth; 25363555a432SVladimir Sementsov-Ogievskiy if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) { 253767c095c8SVladimir Sementsov-Ogievskiy return ret; 253867c095c8SVladimir Sementsov-Ogievskiy } 253967c095c8SVladimir Sementsov-Ogievskiy 254067c095c8SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_EOF) { 254167c095c8SVladimir Sementsov-Ogievskiy eof = offset + *pnum; 254267c095c8SVladimir Sementsov-Ogievskiy } 254367c095c8SVladimir Sementsov-Ogievskiy 254467c095c8SVladimir Sementsov-Ogievskiy assert(*pnum <= bytes); 254567c095c8SVladimir Sementsov-Ogievskiy bytes = *pnum; 254667c095c8SVladimir Sementsov-Ogievskiy 25473555a432SVladimir Sementsov-Ogievskiy for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base; 254867c095c8SVladimir Sementsov-Ogievskiy p = bdrv_filter_or_cow_bs(p)) 254967c095c8SVladimir Sementsov-Ogievskiy { 25505b648c67SEric Blake ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 25515b648c67SEric Blake file); 2552a92b1b06SEric Blake ++*depth; 
2553c61e684eSEric Blake if (ret < 0) { 255467c095c8SVladimir Sementsov-Ogievskiy return ret; 2555c61e684eSEric Blake } 255667c095c8SVladimir Sementsov-Ogievskiy if (*pnum == 0) { 2557c61e684eSEric Blake /* 255867c095c8SVladimir Sementsov-Ogievskiy * The top layer deferred to this layer, and because this layer is 255967c095c8SVladimir Sementsov-Ogievskiy * short, any zeroes that we synthesize beyond EOF behave as if they 256067c095c8SVladimir Sementsov-Ogievskiy * were allocated at this layer. 256167c095c8SVladimir Sementsov-Ogievskiy * 256267c095c8SVladimir Sementsov-Ogievskiy * We don't include BDRV_BLOCK_EOF in ret, as the upper layer may be 256367c095c8SVladimir Sementsov-Ogievskiy * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 256467c095c8SVladimir Sementsov-Ogievskiy * below. 2565c61e684eSEric Blake */ 256667c095c8SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_EOF); 25675b648c67SEric Blake *pnum = bytes; 256867c095c8SVladimir Sementsov-Ogievskiy if (file) { 256967c095c8SVladimir Sementsov-Ogievskiy *file = p; 2570c61e684eSEric Blake } 257167c095c8SVladimir Sementsov-Ogievskiy ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED; 2572ba3f0e25SFam Zheng break; 2573ba3f0e25SFam Zheng } 257467c095c8SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_ALLOCATED) { 257567c095c8SVladimir Sementsov-Ogievskiy /* 257667c095c8SVladimir Sementsov-Ogievskiy * We've found the node and the status; we must break. 257767c095c8SVladimir Sementsov-Ogievskiy * 257867c095c8SVladimir Sementsov-Ogievskiy * Drop BDRV_BLOCK_EOF, as it's not for the upper layer, which may be 257967c095c8SVladimir Sementsov-Ogievskiy * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see 258067c095c8SVladimir Sementsov-Ogievskiy * below. 258167c095c8SVladimir Sementsov-Ogievskiy */ 258267c095c8SVladimir Sementsov-Ogievskiy ret &= ~BDRV_BLOCK_EOF; 258367c095c8SVladimir Sementsov-Ogievskiy break; 2584ba3f0e25SFam Zheng } 258567c095c8SVladimir Sementsov-Ogievskiy 25863555a432SVladimir Sementsov-Ogievskiy if (p == base) { 25873555a432SVladimir Sementsov-Ogievskiy assert(include_base); 25883555a432SVladimir Sementsov-Ogievskiy break; 25893555a432SVladimir Sementsov-Ogievskiy } 25903555a432SVladimir Sementsov-Ogievskiy 259167c095c8SVladimir Sementsov-Ogievskiy /* 259267c095c8SVladimir Sementsov-Ogievskiy * OK, the [offset, offset + *pnum) region is unallocated on this layer; 259367c095c8SVladimir Sementsov-Ogievskiy * let's continue diving. 
259467c095c8SVladimir Sementsov-Ogievskiy */ 259567c095c8SVladimir Sementsov-Ogievskiy assert(*pnum <= bytes); 259667c095c8SVladimir Sementsov-Ogievskiy bytes = *pnum; 259767c095c8SVladimir Sementsov-Ogievskiy } 259867c095c8SVladimir Sementsov-Ogievskiy 259967c095c8SVladimir Sementsov-Ogievskiy if (offset + *pnum == eof) { 260067c095c8SVladimir Sementsov-Ogievskiy ret |= BDRV_BLOCK_EOF; 260167c095c8SVladimir Sementsov-Ogievskiy } 260267c095c8SVladimir Sementsov-Ogievskiy 2603ba3f0e25SFam Zheng return ret; 2604ba3f0e25SFam Zheng } 2605ba3f0e25SFam Zheng 260631826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 260731826642SEric Blake int64_t offset, int64_t bytes, int64_t *pnum, 260831826642SEric Blake int64_t *map, BlockDriverState **file) 2609c9ce8c4dSEric Blake { 26103555a432SVladimir Sementsov-Ogievskiy return bdrv_common_block_status_above(bs, base, false, true, offset, bytes, 2611a92b1b06SEric Blake pnum, map, file, NULL); 2612c9ce8c4dSEric Blake } 2613c9ce8c4dSEric Blake 2614237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2615237d78f8SEric Blake int64_t *pnum, int64_t *map, BlockDriverState **file) 2616ba3f0e25SFam Zheng { 2617cb850315SMax Reitz return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs), 261831826642SEric Blake offset, bytes, pnum, map, file); 2619ba3f0e25SFam Zheng } 2620ba3f0e25SFam Zheng 262146cd1e8aSAlberto Garcia /* 262246cd1e8aSAlberto Garcia * Check @bs (and its backing chain) to see if the range defined 262346cd1e8aSAlberto Garcia * by @offset and @bytes is known to read as zeroes. 262446cd1e8aSAlberto Garcia * Return 1 if that is the case, 0 otherwise and -errno on error. 262546cd1e8aSAlberto Garcia * This test is meant to be fast rather than accurate so returning 0 262646cd1e8aSAlberto Garcia * does not guarantee non-zero data. 262746cd1e8aSAlberto Garcia */ 262846cd1e8aSAlberto Garcia int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, 262946cd1e8aSAlberto Garcia int64_t bytes) 263046cd1e8aSAlberto Garcia { 263146cd1e8aSAlberto Garcia int ret; 263246cd1e8aSAlberto Garcia int64_t pnum = bytes; 263346cd1e8aSAlberto Garcia 263446cd1e8aSAlberto Garcia if (!bytes) { 263546cd1e8aSAlberto Garcia return 1; 263646cd1e8aSAlberto Garcia } 263746cd1e8aSAlberto Garcia 263846cd1e8aSAlberto Garcia ret = bdrv_common_block_status_above(bs, NULL, false, false, offset, 2639a92b1b06SEric Blake bytes, &pnum, NULL, NULL, NULL); 264046cd1e8aSAlberto Garcia 264146cd1e8aSAlberto Garcia if (ret < 0) { 264246cd1e8aSAlberto Garcia return ret; 264346cd1e8aSAlberto Garcia } 264446cd1e8aSAlberto Garcia 264546cd1e8aSAlberto Garcia return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); 264646cd1e8aSAlberto Garcia } 264746cd1e8aSAlberto Garcia 2648d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2649d6a644bbSEric Blake int64_t bytes, int64_t *pnum) 265061007b31SStefan Hajnoczi { 26517ddb99b9SEric Blake int ret; 26527ddb99b9SEric Blake int64_t dummy; 2653d6a644bbSEric Blake 26543555a432SVladimir Sementsov-Ogievskiy ret = bdrv_common_block_status_above(bs, bs, true, false, offset, 26553555a432SVladimir Sementsov-Ogievskiy bytes, pnum ? 
pnum : &dummy, NULL, 2656a92b1b06SEric Blake NULL, NULL); 265761007b31SStefan Hajnoczi if (ret < 0) { 265861007b31SStefan Hajnoczi return ret; 265961007b31SStefan Hajnoczi } 266061007b31SStefan Hajnoczi return !!(ret & BDRV_BLOCK_ALLOCATED); 266161007b31SStefan Hajnoczi } 266261007b31SStefan Hajnoczi 266361007b31SStefan Hajnoczi /* 266461007b31SStefan Hajnoczi * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 266561007b31SStefan Hajnoczi * 2666a92b1b06SEric Blake * Return a positive depth if (a prefix of) the given range is allocated 2667a92b1b06SEric Blake * in any image between BASE and TOP (BASE is only included if include_base 2668a92b1b06SEric Blake * is set). Depth 1 is TOP, 2 is the first backing layer, and so forth. 2669170d3bd3SAndrey Shinkevich * BASE can be NULL to check if the given offset is allocated in any 2670170d3bd3SAndrey Shinkevich * image of the chain. Return 0 otherwise, or negative errno on 2671170d3bd3SAndrey Shinkevich * failure. 267261007b31SStefan Hajnoczi * 267351b0a488SEric Blake * 'pnum' is set to the number of bytes (including and immediately 267451b0a488SEric Blake * following the specified offset) that are known to be in the same 267551b0a488SEric Blake * allocated/unallocated state. Note that a subsequent call starting 267651b0a488SEric Blake * at 'offset + *pnum' may return the same allocation status (in other 267751b0a488SEric Blake * words, the result is not necessarily the maximum possible range); 267851b0a488SEric Blake * but 'pnum' will only be 0 when end of file is reached. 267961007b31SStefan Hajnoczi */ 268061007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top, 268161007b31SStefan Hajnoczi BlockDriverState *base, 2682170d3bd3SAndrey Shinkevich bool include_base, int64_t offset, 2683170d3bd3SAndrey Shinkevich int64_t bytes, int64_t *pnum) 268461007b31SStefan Hajnoczi { 2685a92b1b06SEric Blake int depth; 26867e7e5100SVladimir Sementsov-Ogievskiy int ret = bdrv_common_block_status_above(top, base, include_base, false, 2687a92b1b06SEric Blake offset, bytes, pnum, NULL, NULL, 2688a92b1b06SEric Blake &depth); 268961007b31SStefan Hajnoczi if (ret < 0) { 269061007b31SStefan Hajnoczi return ret; 2691d6a644bbSEric Blake } 269261007b31SStefan Hajnoczi 2693a92b1b06SEric Blake if (ret & BDRV_BLOCK_ALLOCATED) { 2694a92b1b06SEric Blake return depth; 2695a92b1b06SEric Blake } 2696a92b1b06SEric Blake return 0; 269761007b31SStefan Hajnoczi } 269861007b31SStefan Hajnoczi 269921c2283eSVladimir Sementsov-Ogievskiy int coroutine_fn 2700b33b354fSVladimir Sementsov-Ogievskiy bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 27011a8ae822SKevin Wolf { 27021a8ae822SKevin Wolf BlockDriver *drv = bs->drv; 2703c4db2e25SMax Reitz BlockDriverState *child_bs = bdrv_primary_bs(bs); 2704dc88a467SStefan Hajnoczi int ret = -ENOTSUP; 2705dc88a467SStefan Hajnoczi 2706b33b354fSVladimir Sementsov-Ogievskiy if (!drv) { 2707b33b354fSVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 2708b33b354fSVladimir Sementsov-Ogievskiy } 2709b33b354fSVladimir Sementsov-Ogievskiy 2710dc88a467SStefan Hajnoczi bdrv_inc_in_flight(bs); 27111a8ae822SKevin Wolf 2712b33b354fSVladimir Sementsov-Ogievskiy if (drv->bdrv_load_vmstate) { 2713dc88a467SStefan Hajnoczi ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2714c4db2e25SMax Reitz } else if (child_bs) { 2715b33b354fSVladimir Sementsov-Ogievskiy ret = bdrv_co_readv_vmstate(child_bs, qiov, pos); 27161a8ae822SKevin Wolf } 27171a8ae822SKevin Wolf 2718dc88a467SStefan Hajnoczi bdrv_dec_in_flight(bs); 
2719b33b354fSVladimir Sementsov-Ogievskiy 2720b33b354fSVladimir Sementsov-Ogievskiy return ret; 2721b33b354fSVladimir Sementsov-Ogievskiy } 2722b33b354fSVladimir Sementsov-Ogievskiy 2723b33b354fSVladimir Sementsov-Ogievskiy int coroutine_fn 2724b33b354fSVladimir Sementsov-Ogievskiy bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 2725b33b354fSVladimir Sementsov-Ogievskiy { 2726b33b354fSVladimir Sementsov-Ogievskiy BlockDriver *drv = bs->drv; 2727b33b354fSVladimir Sementsov-Ogievskiy BlockDriverState *child_bs = bdrv_primary_bs(bs); 2728b33b354fSVladimir Sementsov-Ogievskiy int ret = -ENOTSUP; 2729b33b354fSVladimir Sementsov-Ogievskiy 2730b33b354fSVladimir Sementsov-Ogievskiy if (!drv) { 2731b33b354fSVladimir Sementsov-Ogievskiy return -ENOMEDIUM; 2732b33b354fSVladimir Sementsov-Ogievskiy } 2733b33b354fSVladimir Sementsov-Ogievskiy 2734b33b354fSVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(bs); 2735b33b354fSVladimir Sementsov-Ogievskiy 2736b33b354fSVladimir Sementsov-Ogievskiy if (drv->bdrv_save_vmstate) { 2737b33b354fSVladimir Sementsov-Ogievskiy ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2738b33b354fSVladimir Sementsov-Ogievskiy } else if (child_bs) { 2739b33b354fSVladimir Sementsov-Ogievskiy ret = bdrv_co_writev_vmstate(child_bs, qiov, pos); 2740b33b354fSVladimir Sementsov-Ogievskiy } 2741b33b354fSVladimir Sementsov-Ogievskiy 2742b33b354fSVladimir Sementsov-Ogievskiy bdrv_dec_in_flight(bs); 2743b33b354fSVladimir Sementsov-Ogievskiy 2744dc88a467SStefan Hajnoczi return ret; 27451a8ae822SKevin Wolf } 27461a8ae822SKevin Wolf 274761007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 274861007b31SStefan Hajnoczi int64_t pos, int size) 274961007b31SStefan Hajnoczi { 27500d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2751b33b354fSVladimir Sementsov-Ogievskiy int ret = bdrv_writev_vmstate(bs, &qiov, pos); 275261007b31SStefan Hajnoczi 2753b33b354fSVladimir Sementsov-Ogievskiy return ret < 0 ? ret : size; 275461007b31SStefan Hajnoczi } 275561007b31SStefan Hajnoczi 275661007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, 275761007b31SStefan Hajnoczi int64_t pos, int size) 275861007b31SStefan Hajnoczi { 27590d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2760b33b354fSVladimir Sementsov-Ogievskiy int ret = bdrv_readv_vmstate(bs, &qiov, pos); 27615ddda0b8SKevin Wolf 2762b33b354fSVladimir Sementsov-Ogievskiy return ret < 0 ? ret : size; 276361007b31SStefan Hajnoczi } 276461007b31SStefan Hajnoczi 276561007b31SStefan Hajnoczi /**************************************************************/ 276661007b31SStefan Hajnoczi /* async I/Os */ 276761007b31SStefan Hajnoczi 276861007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb) 276961007b31SStefan Hajnoczi { 277061007b31SStefan Hajnoczi qemu_aio_ref(acb); 277161007b31SStefan Hajnoczi bdrv_aio_cancel_async(acb); 277261007b31SStefan Hajnoczi while (acb->refcnt > 1) { 277361007b31SStefan Hajnoczi if (acb->aiocb_info->get_aio_context) { 277461007b31SStefan Hajnoczi aio_poll(acb->aiocb_info->get_aio_context(acb), true); 277561007b31SStefan Hajnoczi } else if (acb->bs) { 27762f47da5fSPaolo Bonzini /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so 27772f47da5fSPaolo Bonzini * assert that we're not using an I/O thread. Thread-safe 27782f47da5fSPaolo Bonzini * code should use bdrv_aio_cancel_async exclusively. 
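 * The assertion below makes that expectation explicit.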
27792f47da5fSPaolo Bonzini */ 27802f47da5fSPaolo Bonzini assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context()); 278161007b31SStefan Hajnoczi aio_poll(bdrv_get_aio_context(acb->bs), true); 278261007b31SStefan Hajnoczi } else { 278361007b31SStefan Hajnoczi abort(); 278461007b31SStefan Hajnoczi } 278561007b31SStefan Hajnoczi } 278661007b31SStefan Hajnoczi qemu_aio_unref(acb); 278761007b31SStefan Hajnoczi } 278861007b31SStefan Hajnoczi 278961007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements 279061007b31SStefan Hajnoczi * cancel_async, otherwise we do nothing and let the request normally complete. 279161007b31SStefan Hajnoczi * In either case the completion callback must be called. */ 279261007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb) 279361007b31SStefan Hajnoczi { 279461007b31SStefan Hajnoczi if (acb->aiocb_info->cancel_async) { 279561007b31SStefan Hajnoczi acb->aiocb_info->cancel_async(acb); 279661007b31SStefan Hajnoczi } 279761007b31SStefan Hajnoczi } 279861007b31SStefan Hajnoczi 279961007b31SStefan Hajnoczi /**************************************************************/ 280061007b31SStefan Hajnoczi /* Coroutine block device emulation */ 280161007b31SStefan Hajnoczi 280261007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 280361007b31SStefan Hajnoczi { 2804883833e2SMax Reitz BdrvChild *primary_child = bdrv_primary_child(bs); 2805883833e2SMax Reitz BdrvChild *child; 280649ca6259SFam Zheng int current_gen; 280749ca6259SFam Zheng int ret = 0; 280861007b31SStefan Hajnoczi 280999723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2810c32b82afSPavel Dovgalyuk 2811e914404eSFam Zheng if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 281249ca6259SFam Zheng bdrv_is_sg(bs)) { 281349ca6259SFam Zheng goto early_exit; 281449ca6259SFam Zheng } 281549ca6259SFam Zheng 28163783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 2817d73415a3SStefan Hajnoczi current_gen = qatomic_read(&bs->write_gen); 28183ff2f67aSEvgeny Yakovlev 28193ff2f67aSEvgeny Yakovlev /* Wait until any previous flushes are completed */ 282099723548SPaolo Bonzini while (bs->active_flush_req) { 28213783fa3dSPaolo Bonzini qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 28223ff2f67aSEvgeny Yakovlev } 28233ff2f67aSEvgeny Yakovlev 28243783fa3dSPaolo Bonzini /* Flushes reach this point in nondecreasing current_gen order. 
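 * (Later flushes wait on flush_queue above until active_flush_req is
 * cleared again, so they cannot overtake this one.)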
*/ 282599723548SPaolo Bonzini bs->active_flush_req = true; 28263783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 28273ff2f67aSEvgeny Yakovlev 2828c32b82afSPavel Dovgalyuk /* Write back all layers by calling one driver function */ 2829c32b82afSPavel Dovgalyuk if (bs->drv->bdrv_co_flush) { 2830c32b82afSPavel Dovgalyuk ret = bs->drv->bdrv_co_flush(bs); 2831c32b82afSPavel Dovgalyuk goto out; 2832c32b82afSPavel Dovgalyuk } 2833c32b82afSPavel Dovgalyuk 283461007b31SStefan Hajnoczi /* Write back cached data to the OS even with cache=unsafe */ 2835883833e2SMax Reitz BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS); 283661007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_os) { 283761007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_os(bs); 283861007b31SStefan Hajnoczi if (ret < 0) { 2839cdb5e315SFam Zheng goto out; 284061007b31SStefan Hajnoczi } 284161007b31SStefan Hajnoczi } 284261007b31SStefan Hajnoczi 284361007b31SStefan Hajnoczi /* But don't actually force it to the disk with cache=unsafe */ 284461007b31SStefan Hajnoczi if (bs->open_flags & BDRV_O_NO_FLUSH) { 2845883833e2SMax Reitz goto flush_children; 284661007b31SStefan Hajnoczi } 284761007b31SStefan Hajnoczi 28483ff2f67aSEvgeny Yakovlev /* Check if we really need to flush anything */ 28493ff2f67aSEvgeny Yakovlev if (bs->flushed_gen == current_gen) { 2850883833e2SMax Reitz goto flush_children; 28513ff2f67aSEvgeny Yakovlev } 28523ff2f67aSEvgeny Yakovlev 2853883833e2SMax Reitz BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK); 2854d470ad42SMax Reitz if (!bs->drv) { 2855d470ad42SMax Reitz /* bs->drv->bdrv_co_flush() might have ejected the BDS 2856d470ad42SMax Reitz * (even in case of apparent success) */ 2857d470ad42SMax Reitz ret = -ENOMEDIUM; 2858d470ad42SMax Reitz goto out; 2859d470ad42SMax Reitz } 286061007b31SStefan Hajnoczi if (bs->drv->bdrv_co_flush_to_disk) { 286161007b31SStefan Hajnoczi ret = bs->drv->bdrv_co_flush_to_disk(bs); 286261007b31SStefan Hajnoczi } else if (bs->drv->bdrv_aio_flush) { 286361007b31SStefan Hajnoczi BlockAIOCB *acb; 286461007b31SStefan Hajnoczi CoroutineIOCompletion co = { 286561007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 286661007b31SStefan Hajnoczi }; 286761007b31SStefan Hajnoczi 286861007b31SStefan Hajnoczi acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); 286961007b31SStefan Hajnoczi if (acb == NULL) { 287061007b31SStefan Hajnoczi ret = -EIO; 287161007b31SStefan Hajnoczi } else { 287261007b31SStefan Hajnoczi qemu_coroutine_yield(); 287361007b31SStefan Hajnoczi ret = co.ret; 287461007b31SStefan Hajnoczi } 287561007b31SStefan Hajnoczi } else { 287661007b31SStefan Hajnoczi /* 287761007b31SStefan Hajnoczi * Some block drivers always operate in either writethrough or unsafe 287861007b31SStefan Hajnoczi * mode and don't support bdrv_flush therefore. Usually qemu doesn't 287961007b31SStefan Hajnoczi * know how the server works (because the behaviour is hardcoded or 288061007b31SStefan Hajnoczi * depends on server-side configuration), so we can't ensure that 288161007b31SStefan Hajnoczi * everything is safe on disk. Returning an error doesn't work because 288261007b31SStefan Hajnoczi * that would break guests even if the server operates in writethrough 288361007b31SStefan Hajnoczi * mode. 288461007b31SStefan Hajnoczi * 288561007b31SStefan Hajnoczi * Let's hope the user knows what he's doing. 
288661007b31SStefan Hajnoczi */ 288761007b31SStefan Hajnoczi ret = 0; 288861007b31SStefan Hajnoczi } 28893ff2f67aSEvgeny Yakovlev 289061007b31SStefan Hajnoczi if (ret < 0) { 2891cdb5e315SFam Zheng goto out; 289261007b31SStefan Hajnoczi } 289361007b31SStefan Hajnoczi 289461007b31SStefan Hajnoczi /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 289561007b31SStefan Hajnoczi * in the case of cache=unsafe, so there are no useless flushes. 289661007b31SStefan Hajnoczi */ 2897883833e2SMax Reitz flush_children: 2898883833e2SMax Reitz ret = 0; 2899883833e2SMax Reitz QLIST_FOREACH(child, &bs->children, next) { 2900883833e2SMax Reitz if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { 2901883833e2SMax Reitz int this_child_ret = bdrv_co_flush(child->bs); 2902883833e2SMax Reitz if (!ret) { 2903883833e2SMax Reitz ret = this_child_ret; 2904883833e2SMax Reitz } 2905883833e2SMax Reitz } 2906883833e2SMax Reitz } 2907883833e2SMax Reitz 2908cdb5e315SFam Zheng out: 29093ff2f67aSEvgeny Yakovlev /* Notify any pending flushes that we have completed */ 2910e6af1e08SKevin Wolf if (ret == 0) { 29113ff2f67aSEvgeny Yakovlev bs->flushed_gen = current_gen; 2912e6af1e08SKevin Wolf } 29133783fa3dSPaolo Bonzini 29143783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 291599723548SPaolo Bonzini bs->active_flush_req = false; 2916156af3acSDenis V. Lunev /* Return value is ignored - it's ok if wait queue is empty */ 2917156af3acSDenis V. Lunev qemu_co_queue_next(&bs->flush_queue); 29183783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 29193ff2f67aSEvgeny Yakovlev 292049ca6259SFam Zheng early_exit: 292199723548SPaolo Bonzini bdrv_dec_in_flight(bs); 2922cdb5e315SFam Zheng return ret; 292361007b31SStefan Hajnoczi } 292461007b31SStefan Hajnoczi 2925d93e5726SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, 2926d93e5726SVladimir Sementsov-Ogievskiy int64_t bytes) 292761007b31SStefan Hajnoczi { 2928b1066c87SFam Zheng BdrvTrackedRequest req; 29299f1963b3SEric Blake int max_pdiscard, ret; 29303482b9bcSEric Blake int head, tail, align; 29310b9fd3f4SFam Zheng BlockDriverState *bs = child->bs; 293261007b31SStefan Hajnoczi 2933d93e5726SVladimir Sementsov-Ogievskiy if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { 293461007b31SStefan Hajnoczi return -ENOMEDIUM; 293561007b31SStefan Hajnoczi } 293661007b31SStefan Hajnoczi 2937d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 2938d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 2939d6883bc9SVladimir Sementsov-Ogievskiy } 2940d6883bc9SVladimir Sementsov-Ogievskiy 294169b55e03SVladimir Sementsov-Ogievskiy ret = bdrv_check_request(offset, bytes, NULL); 29428b117001SVladimir Sementsov-Ogievskiy if (ret < 0) { 29438b117001SVladimir Sementsov-Ogievskiy return ret; 294461007b31SStefan Hajnoczi } 294561007b31SStefan Hajnoczi 294661007b31SStefan Hajnoczi /* Do nothing if disabled. */ 294761007b31SStefan Hajnoczi if (!(bs->open_flags & BDRV_O_UNMAP)) { 294861007b31SStefan Hajnoczi return 0; 294961007b31SStefan Hajnoczi } 295061007b31SStefan Hajnoczi 295102aefe43SEric Blake if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 295261007b31SStefan Hajnoczi return 0; 295361007b31SStefan Hajnoczi } 295461007b31SStefan Hajnoczi 29553482b9bcSEric Blake /* Discard is advisory, but some devices track and coalesce 29563482b9bcSEric Blake * unaligned requests, so we must pass everything down rather than 29573482b9bcSEric Blake * round here. 
Still, most devices will just silently ignore 29583482b9bcSEric Blake * unaligned requests (by returning -ENOTSUP), so we must fragment 29593482b9bcSEric Blake * the request accordingly. */ 296002aefe43SEric Blake align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2961b8d0a980SEric Blake assert(align % bs->bl.request_alignment == 0); 2962b8d0a980SEric Blake head = offset % align; 2963f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % align; 29649f1963b3SEric Blake 296599723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2966f5a5ca79SManos Pitsidianakis tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 296750824995SFam Zheng 296800695c27SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2969ec050f77SDenis V. Lunev if (ret < 0) { 2970ec050f77SDenis V. Lunev goto out; 2971ec050f77SDenis V. Lunev } 2972ec050f77SDenis V. Lunev 29739f1963b3SEric Blake max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 29749f1963b3SEric Blake align); 29753482b9bcSEric Blake assert(max_pdiscard >= bs->bl.request_alignment); 29769f1963b3SEric Blake 2977f5a5ca79SManos Pitsidianakis while (bytes > 0) { 2978d93e5726SVladimir Sementsov-Ogievskiy int64_t num = bytes; 29793482b9bcSEric Blake 29803482b9bcSEric Blake if (head) { 29813482b9bcSEric Blake /* Make small requests to get to alignment boundaries. */ 2982f5a5ca79SManos Pitsidianakis num = MIN(bytes, align - head); 29833482b9bcSEric Blake if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 29843482b9bcSEric Blake num %= bs->bl.request_alignment; 29853482b9bcSEric Blake } 29863482b9bcSEric Blake head = (head + num) % align; 29873482b9bcSEric Blake assert(num < max_pdiscard); 29883482b9bcSEric Blake } else if (tail) { 29893482b9bcSEric Blake if (num > align) { 29903482b9bcSEric Blake /* Shorten the request to the last aligned cluster. 
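 * The tail itself is then submitted in a later, smaller iteration.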
*/ 29913482b9bcSEric Blake num -= tail; 29923482b9bcSEric Blake } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 29933482b9bcSEric Blake tail > bs->bl.request_alignment) { 29943482b9bcSEric Blake tail %= bs->bl.request_alignment; 29953482b9bcSEric Blake num -= tail; 29963482b9bcSEric Blake } 29973482b9bcSEric Blake } 29983482b9bcSEric Blake /* limit request size */ 29993482b9bcSEric Blake if (num > max_pdiscard) { 30003482b9bcSEric Blake num = max_pdiscard; 30013482b9bcSEric Blake } 300261007b31SStefan Hajnoczi 3003d470ad42SMax Reitz if (!bs->drv) { 3004d470ad42SMax Reitz ret = -ENOMEDIUM; 3005d470ad42SMax Reitz goto out; 3006d470ad42SMax Reitz } 300747a5486dSEric Blake if (bs->drv->bdrv_co_pdiscard) { 300847a5486dSEric Blake ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 300961007b31SStefan Hajnoczi } else { 301061007b31SStefan Hajnoczi BlockAIOCB *acb; 301161007b31SStefan Hajnoczi CoroutineIOCompletion co = { 301261007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 301361007b31SStefan Hajnoczi }; 301461007b31SStefan Hajnoczi 30154da444a0SEric Blake acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 301661007b31SStefan Hajnoczi bdrv_co_io_em_complete, &co); 301761007b31SStefan Hajnoczi if (acb == NULL) { 3018b1066c87SFam Zheng ret = -EIO; 3019b1066c87SFam Zheng goto out; 302061007b31SStefan Hajnoczi } else { 302161007b31SStefan Hajnoczi qemu_coroutine_yield(); 302261007b31SStefan Hajnoczi ret = co.ret; 302361007b31SStefan Hajnoczi } 302461007b31SStefan Hajnoczi } 302561007b31SStefan Hajnoczi if (ret && ret != -ENOTSUP) { 3026b1066c87SFam Zheng goto out; 302761007b31SStefan Hajnoczi } 302861007b31SStefan Hajnoczi 30299f1963b3SEric Blake offset += num; 3030f5a5ca79SManos Pitsidianakis bytes -= num; 303161007b31SStefan Hajnoczi } 3032b1066c87SFam Zheng ret = 0; 3033b1066c87SFam Zheng out: 303400695c27SFam Zheng bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 3035b1066c87SFam Zheng tracked_request_end(&req); 303699723548SPaolo Bonzini bdrv_dec_in_flight(bs); 3037b1066c87SFam Zheng return ret; 303861007b31SStefan Hajnoczi } 303961007b31SStefan Hajnoczi 304048af776aSKevin Wolf int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 304161007b31SStefan Hajnoczi { 304261007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 30435c5ae76aSFam Zheng CoroutineIOCompletion co = { 30445c5ae76aSFam Zheng .coroutine = qemu_coroutine_self(), 30455c5ae76aSFam Zheng }; 30465c5ae76aSFam Zheng BlockAIOCB *acb; 304761007b31SStefan Hajnoczi 304899723548SPaolo Bonzini bdrv_inc_in_flight(bs); 304916a389dcSKevin Wolf if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { 30505c5ae76aSFam Zheng co.ret = -ENOTSUP; 30515c5ae76aSFam Zheng goto out; 30525c5ae76aSFam Zheng } 30535c5ae76aSFam Zheng 305416a389dcSKevin Wolf if (drv->bdrv_co_ioctl) { 305516a389dcSKevin Wolf co.ret = drv->bdrv_co_ioctl(bs, req, buf); 305616a389dcSKevin Wolf } else { 30575c5ae76aSFam Zheng acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); 30585c5ae76aSFam Zheng if (!acb) { 3059c8a9fd80SFam Zheng co.ret = -ENOTSUP; 3060c8a9fd80SFam Zheng goto out; 30615c5ae76aSFam Zheng } 30625c5ae76aSFam Zheng qemu_coroutine_yield(); 306316a389dcSKevin Wolf } 30645c5ae76aSFam Zheng out: 306599723548SPaolo Bonzini bdrv_dec_in_flight(bs); 30665c5ae76aSFam Zheng return co.ret; 30675c5ae76aSFam Zheng } 30685c5ae76aSFam Zheng 306961007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size) 307061007b31SStefan Hajnoczi { 307161007b31SStefan Hajnoczi return 
qemu_memalign(bdrv_opt_mem_align(bs), size); 307261007b31SStefan Hajnoczi } 307361007b31SStefan Hajnoczi 307461007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size) 307561007b31SStefan Hajnoczi { 307661007b31SStefan Hajnoczi return memset(qemu_blockalign(bs, size), 0, size); 307761007b31SStefan Hajnoczi } 307861007b31SStefan Hajnoczi 307961007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size) 308061007b31SStefan Hajnoczi { 308161007b31SStefan Hajnoczi size_t align = bdrv_opt_mem_align(bs); 308261007b31SStefan Hajnoczi 308361007b31SStefan Hajnoczi /* Ensure that NULL is never returned on success */ 308461007b31SStefan Hajnoczi assert(align > 0); 308561007b31SStefan Hajnoczi if (size == 0) { 308661007b31SStefan Hajnoczi size = align; 308761007b31SStefan Hajnoczi } 308861007b31SStefan Hajnoczi 308961007b31SStefan Hajnoczi return qemu_try_memalign(align, size); 309061007b31SStefan Hajnoczi } 309161007b31SStefan Hajnoczi 309261007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) 309361007b31SStefan Hajnoczi { 309461007b31SStefan Hajnoczi void *mem = qemu_try_blockalign(bs, size); 309561007b31SStefan Hajnoczi 309661007b31SStefan Hajnoczi if (mem) { 309761007b31SStefan Hajnoczi memset(mem, 0, size); 309861007b31SStefan Hajnoczi } 309961007b31SStefan Hajnoczi 310061007b31SStefan Hajnoczi return mem; 310161007b31SStefan Hajnoczi } 310261007b31SStefan Hajnoczi 310361007b31SStefan Hajnoczi /* 310461007b31SStefan Hajnoczi * Check if all memory in this vector is sector aligned. 310561007b31SStefan Hajnoczi */ 310661007b31SStefan Hajnoczi bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) 310761007b31SStefan Hajnoczi { 310861007b31SStefan Hajnoczi int i; 31094196d2f0SDenis V. 
Lunev size_t alignment = bdrv_min_mem_align(bs); 311061007b31SStefan Hajnoczi 311161007b31SStefan Hajnoczi for (i = 0; i < qiov->niov; i++) { 311261007b31SStefan Hajnoczi if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 311361007b31SStefan Hajnoczi return false; 311461007b31SStefan Hajnoczi } 311561007b31SStefan Hajnoczi if (qiov->iov[i].iov_len % alignment) { 311661007b31SStefan Hajnoczi return false; 311761007b31SStefan Hajnoczi } 311861007b31SStefan Hajnoczi } 311961007b31SStefan Hajnoczi 312061007b31SStefan Hajnoczi return true; 312161007b31SStefan Hajnoczi } 312261007b31SStefan Hajnoczi 312361007b31SStefan Hajnoczi void bdrv_add_before_write_notifier(BlockDriverState *bs, 312461007b31SStefan Hajnoczi NotifierWithReturn *notifier) 312561007b31SStefan Hajnoczi { 312661007b31SStefan Hajnoczi notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 312761007b31SStefan Hajnoczi } 312861007b31SStefan Hajnoczi 312961007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs) 313061007b31SStefan Hajnoczi { 31316b98bd64SPaolo Bonzini BdrvChild *child; 31326b98bd64SPaolo Bonzini 31336b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 31346b98bd64SPaolo Bonzini bdrv_io_plug(child->bs); 31356b98bd64SPaolo Bonzini } 31366b98bd64SPaolo Bonzini 3137d73415a3SStefan Hajnoczi if (qatomic_fetch_inc(&bs->io_plugged) == 0) { 313861007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 313961007b31SStefan Hajnoczi if (drv && drv->bdrv_io_plug) { 314061007b31SStefan Hajnoczi drv->bdrv_io_plug(bs); 31416b98bd64SPaolo Bonzini } 314261007b31SStefan Hajnoczi } 314361007b31SStefan Hajnoczi } 314461007b31SStefan Hajnoczi 314561007b31SStefan Hajnoczi void bdrv_io_unplug(BlockDriverState *bs) 314661007b31SStefan Hajnoczi { 31476b98bd64SPaolo Bonzini BdrvChild *child; 31486b98bd64SPaolo Bonzini 31496b98bd64SPaolo Bonzini assert(bs->io_plugged); 3150d73415a3SStefan Hajnoczi if (qatomic_fetch_dec(&bs->io_plugged) == 1) { 315161007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 315261007b31SStefan Hajnoczi if (drv && drv->bdrv_io_unplug) { 315361007b31SStefan Hajnoczi drv->bdrv_io_unplug(bs); 315461007b31SStefan Hajnoczi } 315561007b31SStefan Hajnoczi } 315661007b31SStefan Hajnoczi 31576b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 31586b98bd64SPaolo Bonzini bdrv_io_unplug(child->bs); 31596b98bd64SPaolo Bonzini } 31606b98bd64SPaolo Bonzini } 316123d0ba93SFam Zheng 316223d0ba93SFam Zheng void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 316323d0ba93SFam Zheng { 316423d0ba93SFam Zheng BdrvChild *child; 316523d0ba93SFam Zheng 316623d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_register_buf) { 316723d0ba93SFam Zheng bs->drv->bdrv_register_buf(bs, host, size); 316823d0ba93SFam Zheng } 316923d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 317023d0ba93SFam Zheng bdrv_register_buf(child->bs, host, size); 317123d0ba93SFam Zheng } 317223d0ba93SFam Zheng } 317323d0ba93SFam Zheng 317423d0ba93SFam Zheng void bdrv_unregister_buf(BlockDriverState *bs, void *host) 317523d0ba93SFam Zheng { 317623d0ba93SFam Zheng BdrvChild *child; 317723d0ba93SFam Zheng 317823d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_unregister_buf) { 317923d0ba93SFam Zheng bs->drv->bdrv_unregister_buf(bs, host); 318023d0ba93SFam Zheng } 318123d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 318223d0ba93SFam Zheng bdrv_unregister_buf(child->bs, host); 318323d0ba93SFam Zheng } 318423d0ba93SFam Zheng } 3185fcc67678SFam Zheng 318667b51fb9SVladimir Sementsov-Ogievskiy static 
int coroutine_fn bdrv_co_copy_range_internal( 318767b51fb9SVladimir Sementsov-Ogievskiy BdrvChild *src, uint64_t src_offset, BdrvChild *dst, 318867b51fb9SVladimir Sementsov-Ogievskiy uint64_t dst_offset, uint64_t bytes, 318967b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, 3190fcc67678SFam Zheng bool recurse_src) 3191fcc67678SFam Zheng { 3192999658a0SVladimir Sementsov-Ogievskiy BdrvTrackedRequest req; 3193fcc67678SFam Zheng int ret; 3194fcc67678SFam Zheng 3195fe0480d6SKevin Wolf /* TODO We can support BDRV_REQ_NO_FALLBACK here */ 3196fe0480d6SKevin Wolf assert(!(read_flags & BDRV_REQ_NO_FALLBACK)); 3197fe0480d6SKevin Wolf assert(!(write_flags & BDRV_REQ_NO_FALLBACK)); 3198fe0480d6SKevin Wolf 3199f4dad307SVladimir Sementsov-Ogievskiy if (!dst || !dst->bs || !bdrv_is_inserted(dst->bs)) { 3200fcc67678SFam Zheng return -ENOMEDIUM; 3201fcc67678SFam Zheng } 320263f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(dst_offset, bytes, NULL, 0); 3203fcc67678SFam Zheng if (ret) { 3204fcc67678SFam Zheng return ret; 3205fcc67678SFam Zheng } 320667b51fb9SVladimir Sementsov-Ogievskiy if (write_flags & BDRV_REQ_ZERO_WRITE) { 320767b51fb9SVladimir Sementsov-Ogievskiy return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); 3208fcc67678SFam Zheng } 3209fcc67678SFam Zheng 3210f4dad307SVladimir Sementsov-Ogievskiy if (!src || !src->bs || !bdrv_is_inserted(src->bs)) { 3211d4d3e5a0SFam Zheng return -ENOMEDIUM; 3212d4d3e5a0SFam Zheng } 321363f4ad11SVladimir Sementsov-Ogievskiy ret = bdrv_check_request32(src_offset, bytes, NULL, 0); 3214d4d3e5a0SFam Zheng if (ret) { 3215d4d3e5a0SFam Zheng return ret; 3216d4d3e5a0SFam Zheng } 3217d4d3e5a0SFam Zheng 3218fcc67678SFam Zheng if (!src->bs->drv->bdrv_co_copy_range_from 3219fcc67678SFam Zheng || !dst->bs->drv->bdrv_co_copy_range_to 3220fcc67678SFam Zheng || src->bs->encrypted || dst->bs->encrypted) { 3221fcc67678SFam Zheng return -ENOTSUP; 3222fcc67678SFam Zheng } 3223999658a0SVladimir Sementsov-Ogievskiy 3224999658a0SVladimir Sementsov-Ogievskiy if (recurse_src) { 3225d4d3e5a0SFam Zheng bdrv_inc_in_flight(src->bs); 3226999658a0SVladimir Sementsov-Ogievskiy tracked_request_begin(&req, src->bs, src_offset, bytes, 3227999658a0SVladimir Sementsov-Ogievskiy BDRV_TRACKED_READ); 322837aec7d7SFam Zheng 322909d2f948SVladimir Sementsov-Ogievskiy /* BDRV_REQ_SERIALISING is only for write operation */ 323009d2f948SVladimir Sementsov-Ogievskiy assert(!(read_flags & BDRV_REQ_SERIALISING)); 3231304d9d7fSMax Reitz bdrv_wait_serialising_requests(&req); 3232999658a0SVladimir Sementsov-Ogievskiy 323337aec7d7SFam Zheng ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, 3234fcc67678SFam Zheng src, src_offset, 3235fcc67678SFam Zheng dst, dst_offset, 323667b51fb9SVladimir Sementsov-Ogievskiy bytes, 323767b51fb9SVladimir Sementsov-Ogievskiy read_flags, write_flags); 3238999658a0SVladimir Sementsov-Ogievskiy 3239999658a0SVladimir Sementsov-Ogievskiy tracked_request_end(&req); 3240999658a0SVladimir Sementsov-Ogievskiy bdrv_dec_in_flight(src->bs); 3241fcc67678SFam Zheng } else { 3242999658a0SVladimir Sementsov-Ogievskiy bdrv_inc_in_flight(dst->bs); 3243999658a0SVladimir Sementsov-Ogievskiy tracked_request_begin(&req, dst->bs, dst_offset, bytes, 3244999658a0SVladimir Sementsov-Ogievskiy BDRV_TRACKED_WRITE); 32450eb1e891SFam Zheng ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, 32460eb1e891SFam Zheng write_flags); 32470eb1e891SFam Zheng if (!ret) { 324837aec7d7SFam Zheng ret = 
dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 3249fcc67678SFam Zheng src, src_offset, 3250fcc67678SFam Zheng dst, dst_offset, 325167b51fb9SVladimir Sementsov-Ogievskiy bytes, 325267b51fb9SVladimir Sementsov-Ogievskiy read_flags, write_flags); 32530eb1e891SFam Zheng } 32540eb1e891SFam Zheng bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 3255999658a0SVladimir Sementsov-Ogievskiy tracked_request_end(&req); 3256d4d3e5a0SFam Zheng bdrv_dec_in_flight(dst->bs); 3257999658a0SVladimir Sementsov-Ogievskiy } 3258999658a0SVladimir Sementsov-Ogievskiy 325937aec7d7SFam Zheng return ret; 3260fcc67678SFam Zheng } 3261fcc67678SFam Zheng 3262fcc67678SFam Zheng /* Copy range from @src to @dst. 3263fcc67678SFam Zheng * 3264fcc67678SFam Zheng * See the comment of bdrv_co_copy_range for the parameter and return value 3265fcc67678SFam Zheng * semantics. */ 3266fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, 3267fcc67678SFam Zheng BdrvChild *dst, uint64_t dst_offset, 326867b51fb9SVladimir Sementsov-Ogievskiy uint64_t bytes, 326967b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, 327067b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags write_flags) 3271fcc67678SFam Zheng { 3272ecc983a5SFam Zheng trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3273ecc983a5SFam Zheng read_flags, write_flags); 3274fcc67678SFam Zheng return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 327567b51fb9SVladimir Sementsov-Ogievskiy bytes, read_flags, write_flags, true); 3276fcc67678SFam Zheng } 3277fcc67678SFam Zheng 3278fcc67678SFam Zheng /* Copy range from @src to @dst. 3279fcc67678SFam Zheng * 3280fcc67678SFam Zheng * See the comment of bdrv_co_copy_range for the parameter and return value 3281fcc67678SFam Zheng * semantics. 
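 *
 * For illustration only (coroutine context; offsets and byte count are
 * hypothetical):
 *
 *     ret = bdrv_co_copy_range(src_child, src_off, dst_child, dst_off,
 *                              nbytes, 0, 0);
 *
 * The request fails with -ENOTSUP if either driver cannot offload it.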
/* Copy range from @src to @dst.
 *
 * See the comment on bdrv_co_copy_range for the parameter and return value
 * semantics.
 */
int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
                                       BdrvChild *dst, uint64_t dst_offset,
                                       uint64_t bytes,
                                       BdrvRequestFlags read_flags,
                                       BdrvRequestFlags write_flags)
{
    trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
                                read_flags, write_flags);
    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
                                       bytes, read_flags, write_flags, false);
}

int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
                                    BdrvChild *dst, uint64_t dst_offset,
                                    uint64_t bytes, BdrvRequestFlags read_flags,
                                    BdrvRequestFlags write_flags)
{
    return bdrv_co_copy_range_from(src, src_offset,
                                   dst, dst_offset,
                                   bytes, read_flags, write_flags);
}
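/*
 * For callers, copy offloading is best-effort: bdrv_co_copy_range()
 * returns -ENOTSUP when either driver lacks the callbacks, and the
 * caller is expected to fall back to an ordinary read/write pair.  A
 * minimal sketch of such a caller, using hypothetical names and
 * assuming coroutine context:
 *
 *     int coroutine_fn copy_one_region(BdrvChild *from, BdrvChild *to,
 *                                      uint64_t offset, uint64_t bytes)
 *     {
 *         int ret = bdrv_co_copy_range(from, offset, to, offset,
 *                                      bytes, 0, 0);
 *         if (ret == -ENOTSUP) {
 *             // fall back to a bounce buffer, e.g.
 *             // bdrv_co_pread() followed by bdrv_co_pwrite()
 *         }
 *         return ret;
 *     }
 */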
static void bdrv_parent_cb_resize(BlockDriverState *bs)
{
    BdrvChild *c;
    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->klass->resize) {
            c->klass->resize(c);
        }
    }
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 *
 * If 'exact' is true, the file must be resized to exactly the given
 * 'offset'.  Otherwise, it is sufficient for the node to be at least
 * 'offset' bytes in length.
 */
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
                                  PreallocMode prealloc, BdrvRequestFlags flags,
                                  Error **errp)
{
    BlockDriverState *bs = child->bs;
    BdrvChild *filtered, *backing;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int64_t old_size, new_bytes;
    int ret;

    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        error_setg(errp, "No medium inserted");
        return -ENOMEDIUM;
    }
    if (offset < 0) {
        error_setg(errp, "Image size cannot be negative");
        return -EINVAL;
    }

    ret = bdrv_check_request(offset, 0, errp);
    if (ret < 0) {
        return ret;
    }

    old_size = bdrv_getlength(bs);
    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get old image size");
        return old_size;
    }

    if (offset > old_size) {
        new_bytes = offset - old_size;
    } else {
        new_bytes = 0;
    }

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
                          BDRV_TRACKED_TRUNCATE);

    /* If we are growing the image and potentially using preallocation for the
     * new area, we need to make sure that no write requests are made to it
     * concurrently or they might be overwritten by preallocation. */
    if (new_bytes) {
        bdrv_make_request_serialising(&req, 1);
    }
    if (bs->read_only) {
        error_setg(errp, "Image is read-only");
        ret = -EACCES;
        goto out;
    }
    ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
                                    0);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Failed to prepare request for truncation");
        goto out;
    }

    filtered = bdrv_filter_child(bs);
    backing = bdrv_cow_child(bs);
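    /*
     * A concrete instance of the case handled below, with illustrative
     * sizes: growing a 1 GiB image that sits on a 2 GiB backing file to
     * 3 GiB would, without BDRV_REQ_ZERO_WRITE, let bytes
     * [1 GiB, 2 GiB) of the backing file show through in the newly
     * added area.
     */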
    /*
     * If the image has a backing file that is large enough that it would
     * provide data for the new area, we cannot leave it unallocated because
     * then the backing file content would become visible. Instead, zero-fill
     * the new area.
     *
     * Note that if the image has a backing file, but was opened without the
     * backing file, taking care of keeping things consistent with that backing
     * file is the user's responsibility.
     */
    if (new_bytes && backing) {
        int64_t backing_len;

        backing_len = bdrv_getlength(backing->bs);
        if (backing_len < 0) {
            ret = backing_len;
            error_setg_errno(errp, -ret, "Could not get backing file size");
            goto out;
        }

        if (backing_len > old_size) {
            flags |= BDRV_REQ_ZERO_WRITE;
        }
    }

    if (drv->bdrv_co_truncate) {
        if (flags & ~bs->supported_truncate_flags) {
            error_setg(errp, "Block driver does not support requested flags");
            ret = -ENOTSUP;
            goto out;
        }
        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
    } else if (filtered) {
        ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
    } else {
        error_setg(errp, "Image format driver does not support resize");
        ret = -ENOTSUP;
        goto out;
    }
    if (ret < 0) {
        goto out;
    }

    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
    } else {
        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    }
    /* It's possible that truncation succeeded but refresh_total_sectors
     * failed, but the latter doesn't affect how we should finish the request.
     * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
    bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}
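/*
 * A typical growing resize, sketched with hypothetical values and
 * assuming coroutine context: extend a node to at least 10 MiB without
 * preallocation, letting the driver round up if it cannot resize to the
 * exact byte count (exact=false):
 *
 *     Error *local_err = NULL;
 *     int ret = bdrv_co_truncate(child, 10 * MiB, false,
 *                                PREALLOC_MODE_OFF, 0, &local_err);
 *     if (ret < 0) {
 *         error_report_err(local_err);
 *     }
 */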