/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                               bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

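/* Propagate the end of a drained section to all of @bs's parents, skipping
 * @ignore and, if @ignore_bds_parents is true, parents that are block nodes
 * themselves. Counterpart to bdrv_parent_drained_begin(). */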
void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                             bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->role->drained_poll) {
        return c->role->drained_poll(c);
    }
    return false;
}

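/* Returns true if any parent of @bs except @ignore still reports in-flight
 * activity that the drained section has to wait for */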
static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    if (c->role->drained_begin) {
        c->role->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}

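/* Merge the limits of @src into @dst, keeping the stricter value of each
 * field (e.g. the larger alignment, the smaller maximum transfer size) */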
static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_dec_in_flight(bs);

    if (data->begin) {
        g_free(data);
    }
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents);

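/* Bottom half that runs a queued drain operation outside of coroutine
 * context and then wakes up the coroutine that scheduled it */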
static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
    };
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

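/* Quiesce @bs without polling for request completion: disable external
 * events, notify the parents and invoke the driver's drain_begin callback.
 * Must be called outside of coroutine context. */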
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent,
                               ignore_bds_parents, false);
        return;
    }
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);

    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL, false);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL, false);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                             - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
{
    /*
     * If the request is serialising, overlap_offset and overlap_bytes are set,
     * so we can check if the request is aligned. Otherwise, don't care and
     * return false.
     */

    return req->serialising && (req->offset == req->overlap_offset) &&
           (req->bytes == req->overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

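/* Returns true if [offset, offset + bytes) intersects the serialised
 * overlap range of @req */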
static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

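/* Wait for overlapping tracked requests that take part in serialisation to
 * complete. Returns true if this coroutine had to wait. */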
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
    aio_wait_kick();
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);

    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* Return no. of bytes on success or < 0 on error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid offset or number of bytes
  -EACCES      Trying to write a read-only device
*/
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

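/* Dispatch a read to the driver, preferring the byte-based interfaces and
 * falling back to the legacy sector-based .bdrv_co_readv */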
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}

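/* Dispatch a write to the driver, preferring the byte-based interfaces and
 * falling back to the legacy sector-based .bdrv_co_writev. Flags that the
 * driver does not support (e.g. BDRV_REQ_FUA) are emulated afterwards. */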
static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}

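/* Forward a compressed write to the driver, if it implements one */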
static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

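/* Implements the copy-on-read part of a read request: unallocated clusters
 * are read into a bounce buffer, written back to the image, and then copied
 * into the caller's vector; allocated parts are read directly. */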
If this is a deliberate copy-on-read 1212cb2e2878SEric Blake * then we don't want to ignore the error. Simply 1213cb2e2878SEric Blake * report it in all cases. 121461007b31SStefan Hajnoczi */ 121561007b31SStefan Hajnoczi goto err; 121661007b31SStefan Hajnoczi } 121761007b31SStefan Hajnoczi 1218cb2e2878SEric Blake qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes, 1219cb2e2878SEric Blake pnum - skip_bytes); 1220cb2e2878SEric Blake } else { 1221cb2e2878SEric Blake /* Read directly into the destination */ 1222cb2e2878SEric Blake qemu_iovec_init(&local_qiov, qiov->niov); 1223cb2e2878SEric Blake qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes); 1224cb2e2878SEric Blake ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size, 1225cb2e2878SEric Blake &local_qiov, 0); 1226cb2e2878SEric Blake qemu_iovec_destroy(&local_qiov); 1227cb2e2878SEric Blake if (ret < 0) { 1228cb2e2878SEric Blake goto err; 1229cb2e2878SEric Blake } 1230cb2e2878SEric Blake } 1231cb2e2878SEric Blake 1232cb2e2878SEric Blake cluster_offset += pnum; 1233cb2e2878SEric Blake cluster_bytes -= pnum; 1234cb2e2878SEric Blake progress += pnum - skip_bytes; 1235cb2e2878SEric Blake skip_bytes = 0; 1236cb2e2878SEric Blake } 1237cb2e2878SEric Blake ret = 0; 123861007b31SStefan Hajnoczi 123961007b31SStefan Hajnoczi err: 124061007b31SStefan Hajnoczi qemu_vfree(bounce_buffer); 124161007b31SStefan Hajnoczi return ret; 124261007b31SStefan Hajnoczi } 124361007b31SStefan Hajnoczi 124461007b31SStefan Hajnoczi /* 124561007b31SStefan Hajnoczi * Forwards an already correctly aligned request to the BlockDriver. This 12461a62d0acSEric Blake * handles copy on read, zeroing after EOF, and fragmentation of large 12471a62d0acSEric Blake * reads; any other features must be implemented by the caller. 124861007b31SStefan Hajnoczi */ 124985c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, 125061007b31SStefan Hajnoczi BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 125161007b31SStefan Hajnoczi int64_t align, QEMUIOVector *qiov, int flags) 125261007b31SStefan Hajnoczi { 125385c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 1254c9d20029SKevin Wolf int64_t total_bytes, max_bytes; 12551a62d0acSEric Blake int ret = 0; 12561a62d0acSEric Blake uint64_t bytes_remaining = bytes; 12571a62d0acSEric Blake int max_transfer; 125861007b31SStefan Hajnoczi 125949c07526SKevin Wolf assert(is_power_of_2(align)); 126049c07526SKevin Wolf assert((offset & (align - 1)) == 0); 126149c07526SKevin Wolf assert((bytes & (align - 1)) == 0); 126261007b31SStefan Hajnoczi assert(!qiov || bytes == qiov->size); 1263abb06c5aSDaniel P. Berrange assert((bs->open_flags & BDRV_O_NO_IO) == 0); 12641a62d0acSEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 12651a62d0acSEric Blake align); 1266a604fa2bSEric Blake 1267a604fa2bSEric Blake /* TODO: We would need a per-BDS .supported_read_flags and 1268a604fa2bSEric Blake * potential fallback support, if we ever implement any read flags 1269a604fa2bSEric Blake * to pass through to drivers. For now, there aren't any 1270a604fa2bSEric Blake * passthrough flags. */ 1271a604fa2bSEric Blake assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); 127261007b31SStefan Hajnoczi 127361007b31SStefan Hajnoczi /* Handle Copy on Read and associated serialisation */ 127461007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 127561007b31SStefan Hajnoczi /* If we touch the same cluster it counts as an overlap. 
This 127661007b31SStefan Hajnoczi * guarantees that allocating writes will be serialized and not race 127761007b31SStefan Hajnoczi * with each other for the same cluster. For example, in copy-on-read 127861007b31SStefan Hajnoczi * it ensures that the CoR read and write operations are atomic and 127961007b31SStefan Hajnoczi * guest writes cannot interleave between them. */ 128061007b31SStefan Hajnoczi mark_request_serialising(req, bdrv_get_cluster_size(bs)); 128161007b31SStefan Hajnoczi } 128261007b31SStefan Hajnoczi 128309d2f948SVladimir Sementsov-Ogievskiy /* BDRV_REQ_SERIALISING is only for write operation */ 128409d2f948SVladimir Sementsov-Ogievskiy assert(!(flags & BDRV_REQ_SERIALISING)); 128509d2f948SVladimir Sementsov-Ogievskiy 128661408b25SFam Zheng if (!(flags & BDRV_REQ_NO_SERIALISING)) { 128761007b31SStefan Hajnoczi wait_serialising_requests(req); 128861408b25SFam Zheng } 128961007b31SStefan Hajnoczi 129061007b31SStefan Hajnoczi if (flags & BDRV_REQ_COPY_ON_READ) { 1291d6a644bbSEric Blake int64_t pnum; 129261007b31SStefan Hajnoczi 129388e63df2SEric Blake ret = bdrv_is_allocated(bs, offset, bytes, &pnum); 129461007b31SStefan Hajnoczi if (ret < 0) { 129561007b31SStefan Hajnoczi goto out; 129661007b31SStefan Hajnoczi } 129761007b31SStefan Hajnoczi 129888e63df2SEric Blake if (!ret || pnum != bytes) { 129985c97ca7SKevin Wolf ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov); 130061007b31SStefan Hajnoczi goto out; 130161007b31SStefan Hajnoczi } 130261007b31SStefan Hajnoczi } 130361007b31SStefan Hajnoczi 13041a62d0acSEric Blake /* Forward the request to the BlockDriver, possibly fragmenting it */ 130549c07526SKevin Wolf total_bytes = bdrv_getlength(bs); 130649c07526SKevin Wolf if (total_bytes < 0) { 130749c07526SKevin Wolf ret = total_bytes; 130861007b31SStefan Hajnoczi goto out; 130961007b31SStefan Hajnoczi } 131061007b31SStefan Hajnoczi 131149c07526SKevin Wolf max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align); 13121a62d0acSEric Blake if (bytes <= max_bytes && bytes <= max_transfer) { 1313166fe960SKevin Wolf ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0); 13141a62d0acSEric Blake goto out; 131561007b31SStefan Hajnoczi } 131661007b31SStefan Hajnoczi 13171a62d0acSEric Blake while (bytes_remaining) { 13181a62d0acSEric Blake int num; 13191a62d0acSEric Blake 13201a62d0acSEric Blake if (max_bytes) { 13211a62d0acSEric Blake QEMUIOVector local_qiov; 13221a62d0acSEric Blake 13231a62d0acSEric Blake num = MIN(bytes_remaining, MIN(max_bytes, max_transfer)); 13241a62d0acSEric Blake assert(num); 13251a62d0acSEric Blake qemu_iovec_init(&local_qiov, qiov->niov); 13261a62d0acSEric Blake qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 13271a62d0acSEric Blake 13281a62d0acSEric Blake ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining, 13291a62d0acSEric Blake num, &local_qiov, 0); 13301a62d0acSEric Blake max_bytes -= num; 13311a62d0acSEric Blake qemu_iovec_destroy(&local_qiov); 13321a62d0acSEric Blake } else { 13331a62d0acSEric Blake num = bytes_remaining; 13341a62d0acSEric Blake ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0, 13351a62d0acSEric Blake bytes_remaining); 13361a62d0acSEric Blake } 13371a62d0acSEric Blake if (ret < 0) { 13381a62d0acSEric Blake goto out; 13391a62d0acSEric Blake } 13401a62d0acSEric Blake bytes_remaining -= num; 134161007b31SStefan Hajnoczi } 134261007b31SStefan Hajnoczi 134361007b31SStefan Hajnoczi out: 13441a62d0acSEric Blake return ret < 0 ? 
ret : 0; 134561007b31SStefan Hajnoczi } 134661007b31SStefan Hajnoczi 134761007b31SStefan Hajnoczi /* 134861007b31SStefan Hajnoczi * Handle a read request in coroutine context 134961007b31SStefan Hajnoczi */ 1350a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child, 135161007b31SStefan Hajnoczi int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 135261007b31SStefan Hajnoczi BdrvRequestFlags flags) 135361007b31SStefan Hajnoczi { 1354a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 135561007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 135661007b31SStefan Hajnoczi BdrvTrackedRequest req; 135761007b31SStefan Hajnoczi 1358a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 135961007b31SStefan Hajnoczi uint8_t *head_buf = NULL; 136061007b31SStefan Hajnoczi uint8_t *tail_buf = NULL; 136161007b31SStefan Hajnoczi QEMUIOVector local_qiov; 136261007b31SStefan Hajnoczi bool use_local_qiov = false; 136361007b31SStefan Hajnoczi int ret; 136461007b31SStefan Hajnoczi 1365f42cf447SDaniel P. Berrange trace_bdrv_co_preadv(child->bs, offset, bytes, flags); 1366f42cf447SDaniel P. Berrange 136761007b31SStefan Hajnoczi if (!drv) { 136861007b31SStefan Hajnoczi return -ENOMEDIUM; 136961007b31SStefan Hajnoczi } 137061007b31SStefan Hajnoczi 137161007b31SStefan Hajnoczi ret = bdrv_check_byte_request(bs, offset, bytes); 137261007b31SStefan Hajnoczi if (ret < 0) { 137361007b31SStefan Hajnoczi return ret; 137461007b31SStefan Hajnoczi } 137561007b31SStefan Hajnoczi 137699723548SPaolo Bonzini bdrv_inc_in_flight(bs); 137799723548SPaolo Bonzini 13789568b511SWen Congyang /* Don't do copy-on-read if we read data before write operation */ 1379d3faa13eSPaolo Bonzini if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) { 138061007b31SStefan Hajnoczi flags |= BDRV_REQ_COPY_ON_READ; 138161007b31SStefan Hajnoczi } 138261007b31SStefan Hajnoczi 138361007b31SStefan Hajnoczi /* Align read if necessary by padding qiov */ 138461007b31SStefan Hajnoczi if (offset & (align - 1)) { 138561007b31SStefan Hajnoczi head_buf = qemu_blockalign(bs, align); 138661007b31SStefan Hajnoczi qemu_iovec_init(&local_qiov, qiov->niov + 2); 138761007b31SStefan Hajnoczi qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 138861007b31SStefan Hajnoczi qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 138961007b31SStefan Hajnoczi use_local_qiov = true; 139061007b31SStefan Hajnoczi 139161007b31SStefan Hajnoczi bytes += offset & (align - 1); 139261007b31SStefan Hajnoczi offset = offset & ~(align - 1); 139361007b31SStefan Hajnoczi } 139461007b31SStefan Hajnoczi 139561007b31SStefan Hajnoczi if ((offset + bytes) & (align - 1)) { 139661007b31SStefan Hajnoczi if (!use_local_qiov) { 139761007b31SStefan Hajnoczi qemu_iovec_init(&local_qiov, qiov->niov + 1); 139861007b31SStefan Hajnoczi qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 139961007b31SStefan Hajnoczi use_local_qiov = true; 140061007b31SStefan Hajnoczi } 140161007b31SStefan Hajnoczi tail_buf = qemu_blockalign(bs, align); 140261007b31SStefan Hajnoczi qemu_iovec_add(&local_qiov, tail_buf, 140361007b31SStefan Hajnoczi align - ((offset + bytes) & (align - 1))); 140461007b31SStefan Hajnoczi 140561007b31SStefan Hajnoczi bytes = ROUND_UP(bytes, align); 140661007b31SStefan Hajnoczi } 140761007b31SStefan Hajnoczi 1408ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); 140985c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, &req, offset, bytes, align, 141061007b31SStefan Hajnoczi use_local_qiov ? 
&local_qiov : qiov, 141161007b31SStefan Hajnoczi flags); 141261007b31SStefan Hajnoczi tracked_request_end(&req); 141399723548SPaolo Bonzini bdrv_dec_in_flight(bs); 141461007b31SStefan Hajnoczi 141561007b31SStefan Hajnoczi if (use_local_qiov) { 141661007b31SStefan Hajnoczi qemu_iovec_destroy(&local_qiov); 141761007b31SStefan Hajnoczi qemu_vfree(head_buf); 141861007b31SStefan Hajnoczi qemu_vfree(tail_buf); 141961007b31SStefan Hajnoczi } 142061007b31SStefan Hajnoczi 142161007b31SStefan Hajnoczi return ret; 142261007b31SStefan Hajnoczi } 142361007b31SStefan Hajnoczi 1424d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, 1425f5a5ca79SManos Pitsidianakis int64_t offset, int bytes, BdrvRequestFlags flags) 142661007b31SStefan Hajnoczi { 142761007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 142861007b31SStefan Hajnoczi QEMUIOVector qiov; 14290d93ed08SVladimir Sementsov-Ogievskiy void *buf = NULL; 143061007b31SStefan Hajnoczi int ret = 0; 1431465fe887SEric Blake bool need_flush = false; 1432443668caSDenis V. Lunev int head = 0; 1433443668caSDenis V. Lunev int tail = 0; 143461007b31SStefan Hajnoczi 1435cf081fcaSEric Blake int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); 1436a5b8dd2cSEric Blake int alignment = MAX(bs->bl.pwrite_zeroes_alignment, 1437a5b8dd2cSEric Blake bs->bl.request_alignment); 1438cb2e2878SEric Blake int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); 1439cf081fcaSEric Blake 1440d470ad42SMax Reitz if (!drv) { 1441d470ad42SMax Reitz return -ENOMEDIUM; 1442d470ad42SMax Reitz } 1443d470ad42SMax Reitz 1444fe0480d6SKevin Wolf if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) { 1445fe0480d6SKevin Wolf return -ENOTSUP; 1446fe0480d6SKevin Wolf } 1447fe0480d6SKevin Wolf 1448b8d0a980SEric Blake assert(alignment % bs->bl.request_alignment == 0); 1449b8d0a980SEric Blake head = offset % alignment; 1450f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % alignment; 1451b8d0a980SEric Blake max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); 1452b8d0a980SEric Blake assert(max_write_zeroes >= bs->bl.request_alignment); 145361007b31SStefan Hajnoczi 1454f5a5ca79SManos Pitsidianakis while (bytes > 0 && !ret) { 1455f5a5ca79SManos Pitsidianakis int num = bytes; 145661007b31SStefan Hajnoczi 145761007b31SStefan Hajnoczi /* Align request. Block drivers can expect the "bulk" of the request 1458443668caSDenis V. Lunev * to be aligned, and that unaligned requests do not cross cluster 1459443668caSDenis V. Lunev * boundaries. 146061007b31SStefan Hajnoczi */ 1461443668caSDenis V. Lunev if (head) { 1462b2f95feeSEric Blake /* Make a small request up to the first aligned sector. For 1463b2f95feeSEric Blake * convenience, limit this request to max_transfer even if 1464b2f95feeSEric Blake * we don't need to fall back to writes. */ 1465f5a5ca79SManos Pitsidianakis num = MIN(MIN(bytes, max_transfer), alignment - head); 1466b2f95feeSEric Blake head = (head + num) % alignment; 1467b2f95feeSEric Blake assert(num < max_write_zeroes); 1468d05aa8bbSEric Blake } else if (tail && num > alignment) { 1469443668caSDenis V. Lunev /* Shorten the request to the last aligned sector. */ 1470443668caSDenis V. 
Lunev num -= tail; 147161007b31SStefan Hajnoczi } 147261007b31SStefan Hajnoczi 147361007b31SStefan Hajnoczi /* limit request size */ 147461007b31SStefan Hajnoczi if (num > max_write_zeroes) { 147561007b31SStefan Hajnoczi num = max_write_zeroes; 147661007b31SStefan Hajnoczi } 147761007b31SStefan Hajnoczi 147861007b31SStefan Hajnoczi ret = -ENOTSUP; 147961007b31SStefan Hajnoczi /* First try the efficient write zeroes operation */ 1480d05aa8bbSEric Blake if (drv->bdrv_co_pwrite_zeroes) { 1481d05aa8bbSEric Blake ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num, 1482d05aa8bbSEric Blake flags & bs->supported_zero_flags); 1483d05aa8bbSEric Blake if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) && 1484d05aa8bbSEric Blake !(bs->supported_zero_flags & BDRV_REQ_FUA)) { 1485d05aa8bbSEric Blake need_flush = true; 1486d05aa8bbSEric Blake } 1487465fe887SEric Blake } else { 1488465fe887SEric Blake assert(!bs->supported_zero_flags); 148961007b31SStefan Hajnoczi } 149061007b31SStefan Hajnoczi 1491118f9944SAndrey Shinkevich if (ret < 0 && !(flags & BDRV_REQ_NO_FALLBACK)) { 149261007b31SStefan Hajnoczi /* Fall back to bounce buffer if write zeroes is unsupported */ 1493465fe887SEric Blake BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; 1494465fe887SEric Blake 1495465fe887SEric Blake if ((flags & BDRV_REQ_FUA) && 1496465fe887SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 1497465fe887SEric Blake /* No need for bdrv_driver_pwritev() to do a fallback 1498465fe887SEric Blake * flush on each chunk; use just one at the end */ 1499465fe887SEric Blake write_flags &= ~BDRV_REQ_FUA; 1500465fe887SEric Blake need_flush = true; 1501465fe887SEric Blake } 15025def6b80SEric Blake num = MIN(num, max_transfer); 15030d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) { 15040d93ed08SVladimir Sementsov-Ogievskiy buf = qemu_try_blockalign0(bs, num); 15050d93ed08SVladimir Sementsov-Ogievskiy if (buf == NULL) { 150661007b31SStefan Hajnoczi ret = -ENOMEM; 150761007b31SStefan Hajnoczi goto fail; 150861007b31SStefan Hajnoczi } 150961007b31SStefan Hajnoczi } 15100d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&qiov, buf, num); 151161007b31SStefan Hajnoczi 1512d05aa8bbSEric Blake ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags); 151361007b31SStefan Hajnoczi 151461007b31SStefan Hajnoczi /* Keep bounce buffer around if it is big enough for 151561007b31SStefan Hajnoczi * all future requests.
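 * (A worked example with assumed numbers, purely illustrative: with
 * max_transfer == 64 KiB, a first head-aligning chunk of 4 KiB gets a
 * 4 KiB bounce buffer; since 4 KiB < 64 KiB it is freed below, and the
 * next full 64 KiB chunk allocates a buffer large enough to be kept
 * for every remaining iteration.)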
151661007b31SStefan Hajnoczi */ 15175def6b80SEric Blake if (num < max_transfer) { 15180d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 15190d93ed08SVladimir Sementsov-Ogievskiy buf = NULL; 152061007b31SStefan Hajnoczi } 152161007b31SStefan Hajnoczi } 152261007b31SStefan Hajnoczi 1523d05aa8bbSEric Blake offset += num; 1524f5a5ca79SManos Pitsidianakis bytes -= num; 152561007b31SStefan Hajnoczi } 152661007b31SStefan Hajnoczi 152761007b31SStefan Hajnoczi fail: 1528465fe887SEric Blake if (ret == 0 && need_flush) { 1529465fe887SEric Blake ret = bdrv_co_flush(bs); 1530465fe887SEric Blake } 15310d93ed08SVladimir Sementsov-Ogievskiy qemu_vfree(buf); 153261007b31SStefan Hajnoczi return ret; 153361007b31SStefan Hajnoczi } 153461007b31SStefan Hajnoczi 153585fe2479SFam Zheng static inline int coroutine_fn 153685fe2479SFam Zheng bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, 153785fe2479SFam Zheng BdrvTrackedRequest *req, int flags) 153885fe2479SFam Zheng { 153985fe2479SFam Zheng BlockDriverState *bs = child->bs; 154085fe2479SFam Zheng bool waited; 154185fe2479SFam Zheng int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 154285fe2479SFam Zheng 154385fe2479SFam Zheng if (bs->read_only) { 154485fe2479SFam Zheng return -EPERM; 154585fe2479SFam Zheng } 154685fe2479SFam Zheng 154785fe2479SFam Zheng /* BDRV_REQ_NO_SERIALISING is only for read operation */ 154885fe2479SFam Zheng assert(!(flags & BDRV_REQ_NO_SERIALISING)); 154985fe2479SFam Zheng assert(!(bs->open_flags & BDRV_O_INACTIVE)); 155085fe2479SFam Zheng assert((bs->open_flags & BDRV_O_NO_IO) == 0); 155185fe2479SFam Zheng assert(!(flags & ~BDRV_REQ_MASK)); 155285fe2479SFam Zheng 155385fe2479SFam Zheng if (flags & BDRV_REQ_SERIALISING) { 155485fe2479SFam Zheng mark_request_serialising(req, bdrv_get_cluster_size(bs)); 155585fe2479SFam Zheng } 155685fe2479SFam Zheng 155785fe2479SFam Zheng waited = wait_serialising_requests(req); 155885fe2479SFam Zheng 155985fe2479SFam Zheng assert(!waited || !req->serialising || 156085fe2479SFam Zheng is_request_serialising_and_aligned(req)); 156185fe2479SFam Zheng assert(req->overlap_offset <= offset); 156285fe2479SFam Zheng assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); 1563cd47d792SFam Zheng assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); 156485fe2479SFam Zheng 1565cd47d792SFam Zheng switch (req->type) { 1566cd47d792SFam Zheng case BDRV_TRACKED_WRITE: 1567cd47d792SFam Zheng case BDRV_TRACKED_DISCARD: 156885fe2479SFam Zheng if (flags & BDRV_REQ_WRITE_UNCHANGED) { 156985fe2479SFam Zheng assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); 157085fe2479SFam Zheng } else { 157185fe2479SFam Zheng assert(child->perm & BLK_PERM_WRITE); 157285fe2479SFam Zheng } 1573cd47d792SFam Zheng return notifier_with_return_list_notify(&bs->before_write_notifiers, 1574cd47d792SFam Zheng req); 1575cd47d792SFam Zheng case BDRV_TRACKED_TRUNCATE: 1576cd47d792SFam Zheng assert(child->perm & BLK_PERM_RESIZE); 1577cd47d792SFam Zheng return 0; 1578cd47d792SFam Zheng default: 1579cd47d792SFam Zheng abort(); 1580cd47d792SFam Zheng } 158185fe2479SFam Zheng } 158285fe2479SFam Zheng 158385fe2479SFam Zheng static inline void coroutine_fn 158485fe2479SFam Zheng bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, 158585fe2479SFam Zheng BdrvTrackedRequest *req, int ret) 158685fe2479SFam Zheng { 158785fe2479SFam Zheng int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); 158885fe2479SFam Zheng BlockDriverState 
*bs = child->bs; 158985fe2479SFam Zheng 159085fe2479SFam Zheng atomic_inc(&bs->write_gen); 159185fe2479SFam Zheng 159200695c27SFam Zheng /* 159300695c27SFam Zheng * Discard cannot extend the image, but in error handling cases, such as 159400695c27SFam Zheng * when reverting a qcow2 cluster allocation, the discarded range can pass 159500695c27SFam Zheng * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD 159600695c27SFam Zheng * here. Instead, just skip it, since semantically a discard request 159700695c27SFam Zheng * beyond EOF cannot expand the image anyway. 159800695c27SFam Zheng */ 15997f8f03efSFam Zheng if (ret == 0 && 1600cd47d792SFam Zheng (req->type == BDRV_TRACKED_TRUNCATE || 1601cd47d792SFam Zheng end_sector > bs->total_sectors) && 160200695c27SFam Zheng req->type != BDRV_TRACKED_DISCARD) { 16037f8f03efSFam Zheng bs->total_sectors = end_sector; 16047f8f03efSFam Zheng bdrv_parent_cb_resize(bs); 16057f8f03efSFam Zheng bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); 160685fe2479SFam Zheng } 160700695c27SFam Zheng if (req->bytes) { 160800695c27SFam Zheng switch (req->type) { 160900695c27SFam Zheng case BDRV_TRACKED_WRITE: 161000695c27SFam Zheng stat64_max(&bs->wr_highest_offset, offset + bytes); 161100695c27SFam Zheng /* fall through, to set dirty bits */ 161200695c27SFam Zheng case BDRV_TRACKED_DISCARD: 16137f8f03efSFam Zheng bdrv_set_dirty(bs, offset, bytes); 161400695c27SFam Zheng break; 161500695c27SFam Zheng default: 161600695c27SFam Zheng break; 161700695c27SFam Zheng } 161800695c27SFam Zheng } 161985fe2479SFam Zheng } 162085fe2479SFam Zheng 162161007b31SStefan Hajnoczi /* 162204ed95f4SEric Blake * Forwards an already correctly aligned write request to the BlockDriver, 162304ed95f4SEric Blake * after possibly fragmenting it. 
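 *
 * A sketch of the fragmentation with made-up numbers (a max_transfer of
 * 64 KiB is assumed here, not read from any device): a 150 KiB write is
 * passed to the driver as chunks of 64 KiB, 64 KiB, and 22 KiB, and if
 * BDRV_REQ_FUA has to be emulated by a flush, the flag is kept only on
 * the final chunk.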
162461007b31SStefan Hajnoczi */ 162585c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, 162661007b31SStefan Hajnoczi BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, 1627cff86b38SEric Blake int64_t align, QEMUIOVector *qiov, int flags) 162861007b31SStefan Hajnoczi { 162985c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 163061007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 163161007b31SStefan Hajnoczi int ret; 163261007b31SStefan Hajnoczi 163304ed95f4SEric Blake uint64_t bytes_remaining = bytes; 163404ed95f4SEric Blake int max_transfer; 163561007b31SStefan Hajnoczi 1636d470ad42SMax Reitz if (!drv) { 1637d470ad42SMax Reitz return -ENOMEDIUM; 1638d470ad42SMax Reitz } 1639d470ad42SMax Reitz 1640d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 1641d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 1642d6883bc9SVladimir Sementsov-Ogievskiy } 1643d6883bc9SVladimir Sementsov-Ogievskiy 1644cff86b38SEric Blake assert(is_power_of_2(align)); 1645cff86b38SEric Blake assert((offset & (align - 1)) == 0); 1646cff86b38SEric Blake assert((bytes & (align - 1)) == 0); 164761007b31SStefan Hajnoczi assert(!qiov || bytes == qiov->size); 164804ed95f4SEric Blake max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), 164904ed95f4SEric Blake align); 165061007b31SStefan Hajnoczi 165185fe2479SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); 165261007b31SStefan Hajnoczi 165361007b31SStefan Hajnoczi if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && 1654c1499a5eSEric Blake !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && 165561007b31SStefan Hajnoczi qemu_iovec_is_zero(qiov)) { 165661007b31SStefan Hajnoczi flags |= BDRV_REQ_ZERO_WRITE; 165761007b31SStefan Hajnoczi if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { 165861007b31SStefan Hajnoczi flags |= BDRV_REQ_MAY_UNMAP; 165961007b31SStefan Hajnoczi } 166061007b31SStefan Hajnoczi } 166161007b31SStefan Hajnoczi 166261007b31SStefan Hajnoczi if (ret < 0) { 166361007b31SStefan Hajnoczi /* Do nothing, write notifier decided to fail this request */ 166461007b31SStefan Hajnoczi } else if (flags & BDRV_REQ_ZERO_WRITE) { 16659a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); 16669896c876SKevin Wolf ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags); 16673ea1a091SPavel Butsykin } else if (flags & BDRV_REQ_WRITE_COMPRESSED) { 16683ea1a091SPavel Butsykin ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov); 166904ed95f4SEric Blake } else if (bytes <= max_transfer) { 16709a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV); 167178a07294SKevin Wolf ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags); 167204ed95f4SEric Blake } else { 167304ed95f4SEric Blake bdrv_debug_event(bs, BLKDBG_PWRITEV); 167404ed95f4SEric Blake while (bytes_remaining) { 167504ed95f4SEric Blake int num = MIN(bytes_remaining, max_transfer); 167604ed95f4SEric Blake QEMUIOVector local_qiov; 167704ed95f4SEric Blake int local_flags = flags; 167804ed95f4SEric Blake 167904ed95f4SEric Blake assert(num); 168004ed95f4SEric Blake if (num < bytes_remaining && (flags & BDRV_REQ_FUA) && 168104ed95f4SEric Blake !(bs->supported_write_flags & BDRV_REQ_FUA)) { 168204ed95f4SEric Blake /* If FUA is going to be emulated by flush, we only 168304ed95f4SEric Blake * need to flush on the last iteration */ 168404ed95f4SEric Blake local_flags &= ~BDRV_REQ_FUA; 168504ed95f4SEric Blake } 168604ed95f4SEric Blake 
qemu_iovec_init(&local_qiov, qiov->niov); 168704ed95f4SEric Blake qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num); 168804ed95f4SEric Blake 168904ed95f4SEric Blake ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining, 169004ed95f4SEric Blake num, &local_qiov, local_flags); 169104ed95f4SEric Blake qemu_iovec_destroy(&local_qiov); 169204ed95f4SEric Blake if (ret < 0) { 169304ed95f4SEric Blake break; 169404ed95f4SEric Blake } 169504ed95f4SEric Blake bytes_remaining -= num; 169604ed95f4SEric Blake } 169761007b31SStefan Hajnoczi } 16989a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); 169961007b31SStefan Hajnoczi 170061007b31SStefan Hajnoczi if (ret >= 0) { 170104ed95f4SEric Blake ret = 0; 170261007b31SStefan Hajnoczi } 170385fe2479SFam Zheng bdrv_co_write_req_finish(child, offset, bytes, req, ret); 170461007b31SStefan Hajnoczi 170561007b31SStefan Hajnoczi return ret; 170661007b31SStefan Hajnoczi } 170761007b31SStefan Hajnoczi 170885c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, 17099eeb6dd1SFam Zheng int64_t offset, 17109eeb6dd1SFam Zheng unsigned int bytes, 17119eeb6dd1SFam Zheng BdrvRequestFlags flags, 17129eeb6dd1SFam Zheng BdrvTrackedRequest *req) 17139eeb6dd1SFam Zheng { 171485c97ca7SKevin Wolf BlockDriverState *bs = child->bs; 17159eeb6dd1SFam Zheng uint8_t *buf = NULL; 17169eeb6dd1SFam Zheng QEMUIOVector local_qiov; 1717a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 17189eeb6dd1SFam Zheng unsigned int head_padding_bytes, tail_padding_bytes; 17199eeb6dd1SFam Zheng int ret = 0; 17209eeb6dd1SFam Zheng 17219eeb6dd1SFam Zheng head_padding_bytes = offset & (align - 1); 1722f13ce1beSDenis V. Lunev tail_padding_bytes = (align - (offset + bytes)) & (align - 1); 17239eeb6dd1SFam Zheng 17249eeb6dd1SFam Zheng 17259eeb6dd1SFam Zheng assert(flags & BDRV_REQ_ZERO_WRITE); 17269eeb6dd1SFam Zheng if (head_padding_bytes || tail_padding_bytes) { 17279eeb6dd1SFam Zheng buf = qemu_blockalign(bs, align); 17280d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&local_qiov, buf, align); 17299eeb6dd1SFam Zheng } 17309eeb6dd1SFam Zheng if (head_padding_bytes) { 17319eeb6dd1SFam Zheng uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); 17329eeb6dd1SFam Zheng 17339eeb6dd1SFam Zheng /* RMW the unaligned part before head. */ 17349eeb6dd1SFam Zheng mark_request_serialising(req, align); 17359eeb6dd1SFam Zheng wait_serialising_requests(req); 17369a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 173785c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align, 17389eeb6dd1SFam Zheng align, &local_qiov, 0); 17399eeb6dd1SFam Zheng if (ret < 0) { 17409eeb6dd1SFam Zheng goto fail; 17419eeb6dd1SFam Zheng } 17429a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 17439eeb6dd1SFam Zheng 17449eeb6dd1SFam Zheng memset(buf + head_padding_bytes, 0, zero_bytes); 174585c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align, 1746cff86b38SEric Blake align, &local_qiov, 17479eeb6dd1SFam Zheng flags & ~BDRV_REQ_ZERO_WRITE); 17489eeb6dd1SFam Zheng if (ret < 0) { 17499eeb6dd1SFam Zheng goto fail; 17509eeb6dd1SFam Zheng } 17519eeb6dd1SFam Zheng offset += zero_bytes; 17529eeb6dd1SFam Zheng bytes -= zero_bytes; 17539eeb6dd1SFam Zheng } 17549eeb6dd1SFam Zheng 17559eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 17569eeb6dd1SFam Zheng if (bytes >= align) { 17579eeb6dd1SFam Zheng /* Write the aligned part in the middle. 
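 * (Illustrative case, assuming align == 4096: a zero write covering
 * [1024, 10240) is split into an RMW head for [0, 4096), this aligned
 * zero write for [4096, 8192), and an RMW tail for [8192, 12288).)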
*/ 17589eeb6dd1SFam Zheng uint64_t aligned_bytes = bytes & ~(align - 1); 175985c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align, 17609eeb6dd1SFam Zheng NULL, flags); 17619eeb6dd1SFam Zheng if (ret < 0) { 17629eeb6dd1SFam Zheng goto fail; 17639eeb6dd1SFam Zheng } 17649eeb6dd1SFam Zheng bytes -= aligned_bytes; 17659eeb6dd1SFam Zheng offset += aligned_bytes; 17669eeb6dd1SFam Zheng } 17679eeb6dd1SFam Zheng 17689eeb6dd1SFam Zheng assert(!bytes || (offset & (align - 1)) == 0); 17699eeb6dd1SFam Zheng if (bytes) { 17709eeb6dd1SFam Zheng assert(align == tail_padding_bytes + bytes); 17719eeb6dd1SFam Zheng /* RMW the unaligned part after tail. */ 17729eeb6dd1SFam Zheng mark_request_serialising(req, align); 17739eeb6dd1SFam Zheng wait_serialising_requests(req); 17749a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 177585c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, req, offset, align, 17769eeb6dd1SFam Zheng align, &local_qiov, 0); 17779eeb6dd1SFam Zheng if (ret < 0) { 17789eeb6dd1SFam Zheng goto fail; 17799eeb6dd1SFam Zheng } 17809a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 17819eeb6dd1SFam Zheng 17829eeb6dd1SFam Zheng memset(buf, 0, bytes); 178385c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, req, offset, align, align, 17849eeb6dd1SFam Zheng &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); 17859eeb6dd1SFam Zheng } 17869eeb6dd1SFam Zheng fail: 17879eeb6dd1SFam Zheng qemu_vfree(buf); 17889eeb6dd1SFam Zheng return ret; 17899eeb6dd1SFam Zheng 17909eeb6dd1SFam Zheng } 17919eeb6dd1SFam Zheng 179261007b31SStefan Hajnoczi /* 179361007b31SStefan Hajnoczi * Handle a write request in coroutine context 179461007b31SStefan Hajnoczi */ 1795a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child, 179661007b31SStefan Hajnoczi int64_t offset, unsigned int bytes, QEMUIOVector *qiov, 179761007b31SStefan Hajnoczi BdrvRequestFlags flags) 179861007b31SStefan Hajnoczi { 1799a03ef88fSKevin Wolf BlockDriverState *bs = child->bs; 180061007b31SStefan Hajnoczi BdrvTrackedRequest req; 1801a5b8dd2cSEric Blake uint64_t align = bs->bl.request_alignment; 180261007b31SStefan Hajnoczi uint8_t *head_buf = NULL; 180361007b31SStefan Hajnoczi uint8_t *tail_buf = NULL; 180461007b31SStefan Hajnoczi QEMUIOVector local_qiov; 180561007b31SStefan Hajnoczi bool use_local_qiov = false; 180661007b31SStefan Hajnoczi int ret; 180761007b31SStefan Hajnoczi 1808f42cf447SDaniel P. Berrange trace_bdrv_co_pwritev(child->bs, offset, bytes, flags); 1809f42cf447SDaniel P. Berrange 181061007b31SStefan Hajnoczi if (!bs->drv) { 181161007b31SStefan Hajnoczi return -ENOMEDIUM; 181261007b31SStefan Hajnoczi } 181361007b31SStefan Hajnoczi 181461007b31SStefan Hajnoczi ret = bdrv_check_byte_request(bs, offset, bytes); 181561007b31SStefan Hajnoczi if (ret < 0) { 181661007b31SStefan Hajnoczi return ret; 181761007b31SStefan Hajnoczi } 181861007b31SStefan Hajnoczi 181999723548SPaolo Bonzini bdrv_inc_in_flight(bs); 182061007b31SStefan Hajnoczi /* 182161007b31SStefan Hajnoczi * Align write if necessary by performing a read-modify-write cycle. 182261007b31SStefan Hajnoczi * Pad qiov with the read parts and be sure to have a tracked request not 182361007b31SStefan Hajnoczi * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
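 *
 * A hedged sketch (values are assumptions, not from this file): with a
 * request_alignment of 512, a 100-byte write at offset 600 reads back
 * the 512-byte block [512, 1024) that it straddles, splices the
 * caller's bytes into the padded qiov, and submits a single aligned
 * write of [512, 1024).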
182461007b31SStefan Hajnoczi */ 1825ebde595cSFam Zheng tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); 182661007b31SStefan Hajnoczi 182718a59f03SAnton Nefedov if (flags & BDRV_REQ_ZERO_WRITE) { 182885c97ca7SKevin Wolf ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); 18299eeb6dd1SFam Zheng goto out; 18309eeb6dd1SFam Zheng } 18319eeb6dd1SFam Zheng 183261007b31SStefan Hajnoczi if (offset & (align - 1)) { 183361007b31SStefan Hajnoczi QEMUIOVector head_qiov; 183461007b31SStefan Hajnoczi 183561007b31SStefan Hajnoczi mark_request_serialising(&req, align); 183661007b31SStefan Hajnoczi wait_serialising_requests(&req); 183761007b31SStefan Hajnoczi 183861007b31SStefan Hajnoczi head_buf = qemu_blockalign(bs, align); 18390d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&head_qiov, head_buf, align); 184061007b31SStefan Hajnoczi 18419a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); 184285c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align, 184361007b31SStefan Hajnoczi align, &head_qiov, 0); 184461007b31SStefan Hajnoczi if (ret < 0) { 184561007b31SStefan Hajnoczi goto fail; 184661007b31SStefan Hajnoczi } 18479a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); 184861007b31SStefan Hajnoczi 184961007b31SStefan Hajnoczi qemu_iovec_init(&local_qiov, qiov->niov + 2); 185061007b31SStefan Hajnoczi qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); 185161007b31SStefan Hajnoczi qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 185261007b31SStefan Hajnoczi use_local_qiov = true; 185361007b31SStefan Hajnoczi 185461007b31SStefan Hajnoczi bytes += offset & (align - 1); 185561007b31SStefan Hajnoczi offset = offset & ~(align - 1); 1856117bc3faSPeter Lieven 1857117bc3faSPeter Lieven /* We have read the tail already if the request is smaller 1858117bc3faSPeter Lieven * than one aligned block. 
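 * (E.g., assuming align == 512: a 10-byte write at offset 100 lies
 * entirely inside the head block [0, 512), so the head read above
 * already holds the tail bytes [110, 512) and no second read is
 * needed.)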
1859117bc3faSPeter Lieven */ 1860117bc3faSPeter Lieven if (bytes < align) { 1861117bc3faSPeter Lieven qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes); 1862117bc3faSPeter Lieven bytes = align; 1863117bc3faSPeter Lieven } 186461007b31SStefan Hajnoczi } 186561007b31SStefan Hajnoczi 186661007b31SStefan Hajnoczi if ((offset + bytes) & (align - 1)) { 186761007b31SStefan Hajnoczi QEMUIOVector tail_qiov; 186861007b31SStefan Hajnoczi size_t tail_bytes; 186961007b31SStefan Hajnoczi bool waited; 187061007b31SStefan Hajnoczi 187161007b31SStefan Hajnoczi mark_request_serialising(&req, align); 187261007b31SStefan Hajnoczi waited = wait_serialising_requests(&req); 187361007b31SStefan Hajnoczi assert(!waited || !use_local_qiov); 187461007b31SStefan Hajnoczi 187561007b31SStefan Hajnoczi tail_buf = qemu_blockalign(bs, align); 18760d93ed08SVladimir Sementsov-Ogievskiy qemu_iovec_init_buf(&tail_qiov, tail_buf, align); 187761007b31SStefan Hajnoczi 18789a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); 187985c97ca7SKevin Wolf ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1), 188085c97ca7SKevin Wolf align, align, &tail_qiov, 0); 188161007b31SStefan Hajnoczi if (ret < 0) { 188261007b31SStefan Hajnoczi goto fail; 188361007b31SStefan Hajnoczi } 18849a4f4c31SKevin Wolf bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); 188561007b31SStefan Hajnoczi 188661007b31SStefan Hajnoczi if (!use_local_qiov) { 188761007b31SStefan Hajnoczi qemu_iovec_init(&local_qiov, qiov->niov + 1); 188861007b31SStefan Hajnoczi qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); 188961007b31SStefan Hajnoczi use_local_qiov = true; 189061007b31SStefan Hajnoczi } 189161007b31SStefan Hajnoczi 189261007b31SStefan Hajnoczi tail_bytes = (offset + bytes) & (align - 1); 189361007b31SStefan Hajnoczi qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); 189461007b31SStefan Hajnoczi 189561007b31SStefan Hajnoczi bytes = ROUND_UP(bytes, align); 189661007b31SStefan Hajnoczi } 189761007b31SStefan Hajnoczi 189885c97ca7SKevin Wolf ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align, 189961007b31SStefan Hajnoczi use_local_qiov ? 
&local_qiov : qiov, 190061007b31SStefan Hajnoczi flags); 190161007b31SStefan Hajnoczi 190261007b31SStefan Hajnoczi fail: 190361007b31SStefan Hajnoczi 190461007b31SStefan Hajnoczi if (use_local_qiov) { 190561007b31SStefan Hajnoczi qemu_iovec_destroy(&local_qiov); 190661007b31SStefan Hajnoczi } 190761007b31SStefan Hajnoczi qemu_vfree(head_buf); 190861007b31SStefan Hajnoczi qemu_vfree(tail_buf); 19099eeb6dd1SFam Zheng out: 19109eeb6dd1SFam Zheng tracked_request_end(&req); 191199723548SPaolo Bonzini bdrv_dec_in_flight(bs); 191261007b31SStefan Hajnoczi return ret; 191361007b31SStefan Hajnoczi } 191461007b31SStefan Hajnoczi 1915a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, 1916f5a5ca79SManos Pitsidianakis int bytes, BdrvRequestFlags flags) 191761007b31SStefan Hajnoczi { 1918f5a5ca79SManos Pitsidianakis trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); 191961007b31SStefan Hajnoczi 1920a03ef88fSKevin Wolf if (!(child->bs->open_flags & BDRV_O_UNMAP)) { 192161007b31SStefan Hajnoczi flags &= ~BDRV_REQ_MAY_UNMAP; 192261007b31SStefan Hajnoczi } 192361007b31SStefan Hajnoczi 1924f5a5ca79SManos Pitsidianakis return bdrv_co_pwritev(child, offset, bytes, NULL, 192561007b31SStefan Hajnoczi BDRV_REQ_ZERO_WRITE | flags); 192661007b31SStefan Hajnoczi } 192761007b31SStefan Hajnoczi 19284085f5c7SJohn Snow /* 19294085f5c7SJohn Snow * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend. 19304085f5c7SJohn Snow */ 19314085f5c7SJohn Snow int bdrv_flush_all(void) 19324085f5c7SJohn Snow { 19334085f5c7SJohn Snow BdrvNextIterator it; 19344085f5c7SJohn Snow BlockDriverState *bs = NULL; 19354085f5c7SJohn Snow int result = 0; 19364085f5c7SJohn Snow 19374085f5c7SJohn Snow for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 19384085f5c7SJohn Snow AioContext *aio_context = bdrv_get_aio_context(bs); 19394085f5c7SJohn Snow int ret; 19404085f5c7SJohn Snow 19414085f5c7SJohn Snow aio_context_acquire(aio_context); 19424085f5c7SJohn Snow ret = bdrv_flush(bs); 19434085f5c7SJohn Snow if (ret < 0 && !result) { 19444085f5c7SJohn Snow result = ret; 19454085f5c7SJohn Snow } 19464085f5c7SJohn Snow aio_context_release(aio_context); 19474085f5c7SJohn Snow } 19484085f5c7SJohn Snow 19494085f5c7SJohn Snow return result; 19504085f5c7SJohn Snow } 19514085f5c7SJohn Snow 19524085f5c7SJohn Snow 19534bcd936eSEric Blake typedef struct BdrvCoBlockStatusData { 195461007b31SStefan Hajnoczi BlockDriverState *bs; 195561007b31SStefan Hajnoczi BlockDriverState *base; 1956c9ce8c4dSEric Blake bool want_zero; 19574bcd936eSEric Blake int64_t offset; 19584bcd936eSEric Blake int64_t bytes; 19594bcd936eSEric Blake int64_t *pnum; 19604bcd936eSEric Blake int64_t *map; 1961c9ce8c4dSEric Blake BlockDriverState **file; 19624bcd936eSEric Blake int ret; 196361007b31SStefan Hajnoczi bool done; 19644bcd936eSEric Blake } BdrvCoBlockStatusData; 196561007b31SStefan Hajnoczi 19663e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, 19673e4d0e72SEric Blake bool want_zero, 19683e4d0e72SEric Blake int64_t offset, 19693e4d0e72SEric Blake int64_t bytes, 19703e4d0e72SEric Blake int64_t *pnum, 19713e4d0e72SEric Blake int64_t *map, 1972f7cc69b3SManos Pitsidianakis BlockDriverState **file) 1973f7cc69b3SManos Pitsidianakis { 1974f7cc69b3SManos Pitsidianakis assert(bs->file && bs->file->bs); 19753e4d0e72SEric Blake *pnum = bytes; 19763e4d0e72SEric Blake *map = offset; 1977f7cc69b3SManos Pitsidianakis *file = bs->file->bs; 19783e4d0e72SEric Blake return BDRV_BLOCK_RAW |
BDRV_BLOCK_OFFSET_VALID; 1979f7cc69b3SManos Pitsidianakis } 1980f7cc69b3SManos Pitsidianakis 19813e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs, 19823e4d0e72SEric Blake bool want_zero, 19833e4d0e72SEric Blake int64_t offset, 19843e4d0e72SEric Blake int64_t bytes, 19853e4d0e72SEric Blake int64_t *pnum, 19863e4d0e72SEric Blake int64_t *map, 1987f7cc69b3SManos Pitsidianakis BlockDriverState **file) 1988f7cc69b3SManos Pitsidianakis { 1989f7cc69b3SManos Pitsidianakis assert(bs->backing && bs->backing->bs); 19903e4d0e72SEric Blake *pnum = bytes; 19913e4d0e72SEric Blake *map = offset; 1992f7cc69b3SManos Pitsidianakis *file = bs->backing->bs; 19933e4d0e72SEric Blake return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID; 1994f7cc69b3SManos Pitsidianakis } 1995f7cc69b3SManos Pitsidianakis 199661007b31SStefan Hajnoczi /* 199761007b31SStefan Hajnoczi * Returns the allocation status of the specified sectors. 199861007b31SStefan Hajnoczi * Drivers not implementing the functionality are assumed to not support 199961007b31SStefan Hajnoczi * backing files, hence all their sectors are reported as allocated. 200061007b31SStefan Hajnoczi * 200186a3d5c6SEric Blake * If 'want_zero' is true, the caller is querying for mapping 200286a3d5c6SEric Blake * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and 200386a3d5c6SEric Blake * _ZERO where possible; otherwise, the result favors larger 'pnum', 200486a3d5c6SEric Blake * with a focus on accurate BDRV_BLOCK_ALLOCATED. 2005c9ce8c4dSEric Blake * 20062e8bc787SEric Blake * If 'offset' is beyond the end of the disk image the return value is 2007fb0d8654SEric Blake * BDRV_BLOCK_EOF and 'pnum' is set to 0. 200861007b31SStefan Hajnoczi * 20092e8bc787SEric Blake * 'bytes' is the max value 'pnum' should be set to. If bytes goes 2010fb0d8654SEric Blake * beyond the end of the disk image it will be clamped; if 'pnum' is set to 2011fb0d8654SEric Blake * the end of the image, then the returned value will include BDRV_BLOCK_EOF. 201267a0fd2aSFam Zheng * 20132e8bc787SEric Blake * 'pnum' is set to the number of bytes (including and immediately 20142e8bc787SEric Blake * following the specified offset) that are easily known to be in the 20152e8bc787SEric Blake * same allocated/unallocated state. Note that a second call starting 20162e8bc787SEric Blake * at the original offset plus returned pnum may have the same status. 20172e8bc787SEric Blake * The returned value is non-zero on success except at end-of-file. 20182e8bc787SEric Blake * 20192e8bc787SEric Blake * Returns negative errno on failure. Otherwise, if the 20202e8bc787SEric Blake * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are 20212e8bc787SEric Blake * set to the host mapping and BDS corresponding to the guest offset. 
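 *
 * A hedged caller-side sketch using the public wrapper defined later in
 * this file (variable names and values are illustrative):
 *
 *     int64_t pnum, map;
 *     BlockDriverState *file;
 *     int ret = bdrv_block_status(bs, offset, bytes, &pnum, &map, &file);
 *     if (ret >= 0 && (ret & BDRV_BLOCK_OFFSET_VALID)) {
 *         // [offset, offset + pnum) lives at [map, map + pnum) in 'file';
 *         // bits such as BDRV_BLOCK_DATA/_ZERO describe its contents
 *     }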
202261007b31SStefan Hajnoczi */ 20232e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, 2024c9ce8c4dSEric Blake bool want_zero, 20252e8bc787SEric Blake int64_t offset, int64_t bytes, 20262e8bc787SEric Blake int64_t *pnum, int64_t *map, 202767a0fd2aSFam Zheng BlockDriverState **file) 202861007b31SStefan Hajnoczi { 20292e8bc787SEric Blake int64_t total_size; 20302e8bc787SEric Blake int64_t n; /* bytes */ 2031efa6e2edSEric Blake int ret; 20322e8bc787SEric Blake int64_t local_map = 0; 2033298a1665SEric Blake BlockDriverState *local_file = NULL; 2034efa6e2edSEric Blake int64_t aligned_offset, aligned_bytes; 2035efa6e2edSEric Blake uint32_t align; 203661007b31SStefan Hajnoczi 2037298a1665SEric Blake assert(pnum); 2038298a1665SEric Blake *pnum = 0; 20392e8bc787SEric Blake total_size = bdrv_getlength(bs); 20402e8bc787SEric Blake if (total_size < 0) { 20412e8bc787SEric Blake ret = total_size; 2042298a1665SEric Blake goto early_out; 204361007b31SStefan Hajnoczi } 204461007b31SStefan Hajnoczi 20452e8bc787SEric Blake if (offset >= total_size) { 2046298a1665SEric Blake ret = BDRV_BLOCK_EOF; 2047298a1665SEric Blake goto early_out; 204861007b31SStefan Hajnoczi } 20492e8bc787SEric Blake if (!bytes) { 2050298a1665SEric Blake ret = 0; 2051298a1665SEric Blake goto early_out; 20529cdcfd9fSEric Blake } 205361007b31SStefan Hajnoczi 20542e8bc787SEric Blake n = total_size - offset; 20552e8bc787SEric Blake if (n < bytes) { 20562e8bc787SEric Blake bytes = n; 205761007b31SStefan Hajnoczi } 205861007b31SStefan Hajnoczi 2059d470ad42SMax Reitz /* Must be non-NULL or bdrv_getlength() would have failed */ 2060d470ad42SMax Reitz assert(bs->drv); 2061636cb512SEric Blake if (!bs->drv->bdrv_co_block_status) { 20622e8bc787SEric Blake *pnum = bytes; 206361007b31SStefan Hajnoczi ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; 20642e8bc787SEric Blake if (offset + bytes == total_size) { 2065fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2066fb0d8654SEric Blake } 206761007b31SStefan Hajnoczi if (bs->drv->protocol_name) { 20682e8bc787SEric Blake ret |= BDRV_BLOCK_OFFSET_VALID; 20692e8bc787SEric Blake local_map = offset; 2070298a1665SEric Blake local_file = bs; 207161007b31SStefan Hajnoczi } 2072298a1665SEric Blake goto early_out; 207361007b31SStefan Hajnoczi } 207461007b31SStefan Hajnoczi 207599723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2076efa6e2edSEric Blake 2077efa6e2edSEric Blake /* Round out to request_alignment boundaries */ 207886a3d5c6SEric Blake align = bs->bl.request_alignment; 2079efa6e2edSEric Blake aligned_offset = QEMU_ALIGN_DOWN(offset, align); 2080efa6e2edSEric Blake aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; 2081efa6e2edSEric Blake 208286a3d5c6SEric Blake ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, 208386a3d5c6SEric Blake aligned_bytes, pnum, &local_map, 208486a3d5c6SEric Blake &local_file); 208586a3d5c6SEric Blake if (ret < 0) { 208686a3d5c6SEric Blake *pnum = 0; 208786a3d5c6SEric Blake goto out; 208886a3d5c6SEric Blake } 2089efa6e2edSEric Blake 2090efa6e2edSEric Blake /* 2091636cb512SEric Blake * The driver's result must be a non-zero multiple of request_alignment. 2092efa6e2edSEric Blake * Clamp pnum and adjust map to original request. 
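 * (Worked numbers, assumed for illustration: align == 512, offset ==
 * 700, bytes == 100. The driver is queried for [512, 1024); if it
 * reports *pnum == 512, the code below subtracts the 188 bytes of
 * rounding (700 - 512), clamps *pnum to the requested 100, and shifts
 * local_map forward by the same 188 bytes.)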
2093efa6e2edSEric Blake */ 2094636cb512SEric Blake assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) && 2095636cb512SEric Blake align > offset - aligned_offset); 209669f47505SVladimir Sementsov-Ogievskiy if (ret & BDRV_BLOCK_RECURSE) { 209769f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_DATA); 209869f47505SVladimir Sementsov-Ogievskiy assert(ret & BDRV_BLOCK_OFFSET_VALID); 209969f47505SVladimir Sementsov-Ogievskiy assert(!(ret & BDRV_BLOCK_ZERO)); 210069f47505SVladimir Sementsov-Ogievskiy } 210169f47505SVladimir Sementsov-Ogievskiy 2102efa6e2edSEric Blake *pnum -= offset - aligned_offset; 2103efa6e2edSEric Blake if (*pnum > bytes) { 2104efa6e2edSEric Blake *pnum = bytes; 2105efa6e2edSEric Blake } 2106efa6e2edSEric Blake if (ret & BDRV_BLOCK_OFFSET_VALID) { 2107efa6e2edSEric Blake local_map += offset - aligned_offset; 2108efa6e2edSEric Blake } 210961007b31SStefan Hajnoczi 211061007b31SStefan Hajnoczi if (ret & BDRV_BLOCK_RAW) { 2111298a1665SEric Blake assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); 21122e8bc787SEric Blake ret = bdrv_co_block_status(local_file, want_zero, local_map, 21132e8bc787SEric Blake *pnum, pnum, &local_map, &local_file); 211499723548SPaolo Bonzini goto out; 211561007b31SStefan Hajnoczi } 211661007b31SStefan Hajnoczi 211761007b31SStefan Hajnoczi if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { 211861007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ALLOCATED; 2119c9ce8c4dSEric Blake } else if (want_zero) { 212061007b31SStefan Hajnoczi if (bdrv_unallocated_blocks_are_zero(bs)) { 212161007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 2122760e0063SKevin Wolf } else if (bs->backing) { 2123760e0063SKevin Wolf BlockDriverState *bs2 = bs->backing->bs; 21242e8bc787SEric Blake int64_t size2 = bdrv_getlength(bs2); 2125c9ce8c4dSEric Blake 21262e8bc787SEric Blake if (size2 >= 0 && offset >= size2) { 212761007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 212861007b31SStefan Hajnoczi } 212961007b31SStefan Hajnoczi } 213061007b31SStefan Hajnoczi } 213161007b31SStefan Hajnoczi 213269f47505SVladimir Sementsov-Ogievskiy if (want_zero && ret & BDRV_BLOCK_RECURSE && 213369f47505SVladimir Sementsov-Ogievskiy local_file && local_file != bs && 213461007b31SStefan Hajnoczi (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && 213561007b31SStefan Hajnoczi (ret & BDRV_BLOCK_OFFSET_VALID)) { 21362e8bc787SEric Blake int64_t file_pnum; 21372e8bc787SEric Blake int ret2; 213861007b31SStefan Hajnoczi 21392e8bc787SEric Blake ret2 = bdrv_co_block_status(local_file, want_zero, local_map, 21402e8bc787SEric Blake *pnum, &file_pnum, NULL, NULL); 214161007b31SStefan Hajnoczi if (ret2 >= 0) { 214261007b31SStefan Hajnoczi /* Ignore errors. This is just providing extra information, it 214361007b31SStefan Hajnoczi * is useful but not necessary. 214461007b31SStefan Hajnoczi */ 2145c61e684eSEric Blake if (ret2 & BDRV_BLOCK_EOF && 2146c61e684eSEric Blake (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) { 2147c61e684eSEric Blake /* 2148c61e684eSEric Blake * It is valid for the format block driver to read 2149c61e684eSEric Blake * beyond the end of the underlying file's current 2150c61e684eSEric Blake * size; such areas read as zero. 
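 * (One illustrative case, not asserted about any particular driver: a
 * format layer may map a guest cluster at a host offset at or past the
 * protocol file's current EOF; such a read returns zeroes, so the
 * BDRV_BLOCK_ZERO bit can still be merged in here.)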
2151c61e684eSEric Blake */ 215261007b31SStefan Hajnoczi ret |= BDRV_BLOCK_ZERO; 215361007b31SStefan Hajnoczi } else { 215461007b31SStefan Hajnoczi /* Limit request to the range reported by the protocol driver */ 215561007b31SStefan Hajnoczi *pnum = file_pnum; 215661007b31SStefan Hajnoczi ret |= (ret2 & BDRV_BLOCK_ZERO); 215761007b31SStefan Hajnoczi } 215861007b31SStefan Hajnoczi } 215961007b31SStefan Hajnoczi } 216061007b31SStefan Hajnoczi 216199723548SPaolo Bonzini out: 216299723548SPaolo Bonzini bdrv_dec_in_flight(bs); 21632e8bc787SEric Blake if (ret >= 0 && offset + *pnum == total_size) { 2164fb0d8654SEric Blake ret |= BDRV_BLOCK_EOF; 2165fb0d8654SEric Blake } 2166298a1665SEric Blake early_out: 2167298a1665SEric Blake if (file) { 2168298a1665SEric Blake *file = local_file; 2169298a1665SEric Blake } 21702e8bc787SEric Blake if (map) { 21712e8bc787SEric Blake *map = local_map; 21722e8bc787SEric Blake } 217361007b31SStefan Hajnoczi return ret; 217461007b31SStefan Hajnoczi } 217561007b31SStefan Hajnoczi 21765b648c67SEric Blake static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, 2177ba3f0e25SFam Zheng BlockDriverState *base, 2178c9ce8c4dSEric Blake bool want_zero, 21795b648c67SEric Blake int64_t offset, 21805b648c67SEric Blake int64_t bytes, 21815b648c67SEric Blake int64_t *pnum, 21825b648c67SEric Blake int64_t *map, 218367a0fd2aSFam Zheng BlockDriverState **file) 2184ba3f0e25SFam Zheng { 2185ba3f0e25SFam Zheng BlockDriverState *p; 21865b648c67SEric Blake int ret = 0; 2187c61e684eSEric Blake bool first = true; 2188ba3f0e25SFam Zheng 2189ba3f0e25SFam Zheng assert(bs != base); 2190760e0063SKevin Wolf for (p = bs; p != base; p = backing_bs(p)) { 21915b648c67SEric Blake ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, 21925b648c67SEric Blake file); 2193c61e684eSEric Blake if (ret < 0) { 2194c61e684eSEric Blake break; 2195c61e684eSEric Blake } 2196c61e684eSEric Blake if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) { 2197c61e684eSEric Blake /* 2198c61e684eSEric Blake * Reading beyond the end of the file continues to read 2199c61e684eSEric Blake * zeroes, but we can only widen the result to the 2200c61e684eSEric Blake * unallocated length we learned from an earlier 2201c61e684eSEric Blake * iteration. 2202c61e684eSEric Blake */ 22035b648c67SEric Blake *pnum = bytes; 2204c61e684eSEric Blake } 2205c61e684eSEric Blake if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) { 2206ba3f0e25SFam Zheng break; 2207ba3f0e25SFam Zheng } 22085b648c67SEric Blake /* [offset, pnum] unallocated on this layer, which could be only 22095b648c67SEric Blake * the first part of [offset, bytes]. 
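 * (Sketch with made-up sizes: if a 1 MiB query is unallocated in the
 * top layer for only its first 64 KiB, *pnum comes back as 64 KiB and
 * the backing layer is then queried for just that 64 KiB window.)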
*/ 22105b648c67SEric Blake bytes = MIN(bytes, *pnum); 2211c61e684eSEric Blake first = false; 2212ba3f0e25SFam Zheng } 2213ba3f0e25SFam Zheng return ret; 2214ba3f0e25SFam Zheng } 2215ba3f0e25SFam Zheng 221631826642SEric Blake /* Coroutine wrapper for bdrv_block_status_above() */ 22175b648c67SEric Blake static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque) 221861007b31SStefan Hajnoczi { 22194bcd936eSEric Blake BdrvCoBlockStatusData *data = opaque; 222061007b31SStefan Hajnoczi 22215b648c67SEric Blake data->ret = bdrv_co_block_status_above(data->bs, data->base, 2222c9ce8c4dSEric Blake data->want_zero, 22235b648c67SEric Blake data->offset, data->bytes, 22245b648c67SEric Blake data->pnum, data->map, data->file); 222561007b31SStefan Hajnoczi data->done = true; 22264720cbeeSKevin Wolf aio_wait_kick(); 222761007b31SStefan Hajnoczi } 222861007b31SStefan Hajnoczi 222961007b31SStefan Hajnoczi /* 22305b648c67SEric Blake * Synchronous wrapper around bdrv_co_block_status_above(). 223161007b31SStefan Hajnoczi * 22325b648c67SEric Blake * See bdrv_co_block_status_above() for details. 223361007b31SStefan Hajnoczi */ 22347ddb99b9SEric Blake static int bdrv_common_block_status_above(BlockDriverState *bs, 2235ba3f0e25SFam Zheng BlockDriverState *base, 22367ddb99b9SEric Blake bool want_zero, int64_t offset, 22377ddb99b9SEric Blake int64_t bytes, int64_t *pnum, 22387ddb99b9SEric Blake int64_t *map, 223967a0fd2aSFam Zheng BlockDriverState **file) 224061007b31SStefan Hajnoczi { 224161007b31SStefan Hajnoczi Coroutine *co; 22424bcd936eSEric Blake BdrvCoBlockStatusData data = { 224361007b31SStefan Hajnoczi .bs = bs, 2244ba3f0e25SFam Zheng .base = base, 2245c9ce8c4dSEric Blake .want_zero = want_zero, 22467ddb99b9SEric Blake .offset = offset, 22477ddb99b9SEric Blake .bytes = bytes, 22487ddb99b9SEric Blake .pnum = pnum, 22497ddb99b9SEric Blake .map = map, 2250c9ce8c4dSEric Blake .file = file, 225161007b31SStefan Hajnoczi .done = false, 225261007b31SStefan Hajnoczi }; 225361007b31SStefan Hajnoczi 225461007b31SStefan Hajnoczi if (qemu_in_coroutine()) { 225561007b31SStefan Hajnoczi /* Fast-path if already in coroutine context */ 22565b648c67SEric Blake bdrv_block_status_above_co_entry(&data); 225761007b31SStefan Hajnoczi } else { 22585b648c67SEric Blake co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data); 2259e92f0e19SFam Zheng bdrv_coroutine_enter(bs, co); 226088b062c2SPaolo Bonzini BDRV_POLL_WHILE(bs, !data.done); 226161007b31SStefan Hajnoczi } 226261007b31SStefan Hajnoczi return data.ret; 226361007b31SStefan Hajnoczi } 226461007b31SStefan Hajnoczi 226531826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, 226631826642SEric Blake int64_t offset, int64_t bytes, int64_t *pnum, 226731826642SEric Blake int64_t *map, BlockDriverState **file) 2268c9ce8c4dSEric Blake { 226931826642SEric Blake return bdrv_common_block_status_above(bs, base, true, offset, bytes, 227031826642SEric Blake pnum, map, file); 2271c9ce8c4dSEric Blake } 2272c9ce8c4dSEric Blake 2273237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, 2274237d78f8SEric Blake int64_t *pnum, int64_t *map, BlockDriverState **file) 2275ba3f0e25SFam Zheng { 227631826642SEric Blake return bdrv_block_status_above(bs, backing_bs(bs), 227731826642SEric Blake offset, bytes, pnum, map, file); 2278ba3f0e25SFam Zheng } 2279ba3f0e25SFam Zheng 2280d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, 2281d6a644bbSEric Blake int64_t 
bytes, int64_t *pnum) 228261007b31SStefan Hajnoczi { 22837ddb99b9SEric Blake int ret; 22847ddb99b9SEric Blake int64_t dummy; 2285d6a644bbSEric Blake 22867ddb99b9SEric Blake ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, 22877ddb99b9SEric Blake bytes, pnum ? pnum : &dummy, NULL, 2288298a1665SEric Blake NULL); 228961007b31SStefan Hajnoczi if (ret < 0) { 229061007b31SStefan Hajnoczi return ret; 229161007b31SStefan Hajnoczi } 229261007b31SStefan Hajnoczi return !!(ret & BDRV_BLOCK_ALLOCATED); 229361007b31SStefan Hajnoczi } 229461007b31SStefan Hajnoczi 229561007b31SStefan Hajnoczi /* 229661007b31SStefan Hajnoczi * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] 229761007b31SStefan Hajnoczi * 229851b0a488SEric Blake * Return true if (a prefix of) the given range is allocated in any image 229951b0a488SEric Blake * between BASE and TOP (inclusive). BASE can be NULL to check if the given 230051b0a488SEric Blake * offset is allocated in any image of the chain. Return false otherwise, 2301d6a644bbSEric Blake * or negative errno on failure. 230261007b31SStefan Hajnoczi * 230351b0a488SEric Blake * 'pnum' is set to the number of bytes (including and immediately 230451b0a488SEric Blake * following the specified offset) that are known to be in the same 230551b0a488SEric Blake * allocated/unallocated state. Note that a subsequent call starting 230651b0a488SEric Blake * at 'offset + *pnum' may return the same allocation status (in other 230751b0a488SEric Blake * words, the result is not necessarily the maximum possible range); 230851b0a488SEric Blake * but 'pnum' will only be 0 when end of file is reached. 230961007b31SStefan Hajnoczi * 231061007b31SStefan Hajnoczi */ 231161007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top, 231261007b31SStefan Hajnoczi BlockDriverState *base, 231351b0a488SEric Blake int64_t offset, int64_t bytes, int64_t *pnum) 231461007b31SStefan Hajnoczi { 231561007b31SStefan Hajnoczi BlockDriverState *intermediate; 231651b0a488SEric Blake int ret; 231751b0a488SEric Blake int64_t n = bytes; 231861007b31SStefan Hajnoczi 231961007b31SStefan Hajnoczi intermediate = top; 232061007b31SStefan Hajnoczi while (intermediate && intermediate != base) { 2321d6a644bbSEric Blake int64_t pnum_inter; 2322c00716beSEric Blake int64_t size_inter; 2323d6a644bbSEric Blake 232451b0a488SEric Blake ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); 232561007b31SStefan Hajnoczi if (ret < 0) { 232661007b31SStefan Hajnoczi return ret; 2327d6a644bbSEric Blake } 2328d6a644bbSEric Blake if (ret) { 232951b0a488SEric Blake *pnum = pnum_inter; 233061007b31SStefan Hajnoczi return 1; 233161007b31SStefan Hajnoczi } 233261007b31SStefan Hajnoczi 233351b0a488SEric Blake size_inter = bdrv_getlength(intermediate); 2334c00716beSEric Blake if (size_inter < 0) { 2335c00716beSEric Blake return size_inter; 2336c00716beSEric Blake } 233751b0a488SEric Blake if (n > pnum_inter && 233851b0a488SEric Blake (intermediate == top || offset + pnum_inter < size_inter)) { 233951b0a488SEric Blake n = pnum_inter; 234061007b31SStefan Hajnoczi } 234161007b31SStefan Hajnoczi 2342760e0063SKevin Wolf intermediate = backing_bs(intermediate); 234361007b31SStefan Hajnoczi } 234461007b31SStefan Hajnoczi 234561007b31SStefan Hajnoczi *pnum = n; 234661007b31SStefan Hajnoczi return 0; 234761007b31SStefan Hajnoczi } 234861007b31SStefan Hajnoczi 23491a8ae822SKevin Wolf typedef struct BdrvVmstateCo { 23501a8ae822SKevin Wolf BlockDriverState *bs; 23511a8ae822SKevin Wolf 
QEMUIOVector *qiov; 23521a8ae822SKevin Wolf int64_t pos; 23531a8ae822SKevin Wolf bool is_read; 23541a8ae822SKevin Wolf int ret; 23551a8ae822SKevin Wolf } BdrvVmstateCo; 23561a8ae822SKevin Wolf 23571a8ae822SKevin Wolf static int coroutine_fn 23581a8ae822SKevin Wolf bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 23591a8ae822SKevin Wolf bool is_read) 23601a8ae822SKevin Wolf { 23611a8ae822SKevin Wolf BlockDriver *drv = bs->drv; 2362dc88a467SStefan Hajnoczi int ret = -ENOTSUP; 2363dc88a467SStefan Hajnoczi 2364dc88a467SStefan Hajnoczi bdrv_inc_in_flight(bs); 23651a8ae822SKevin Wolf 23661a8ae822SKevin Wolf if (!drv) { 2367dc88a467SStefan Hajnoczi ret = -ENOMEDIUM; 23681a8ae822SKevin Wolf } else if (drv->bdrv_load_vmstate) { 2369dc88a467SStefan Hajnoczi if (is_read) { 2370dc88a467SStefan Hajnoczi ret = drv->bdrv_load_vmstate(bs, qiov, pos); 2371dc88a467SStefan Hajnoczi } else { 2372dc88a467SStefan Hajnoczi ret = drv->bdrv_save_vmstate(bs, qiov, pos); 2373dc88a467SStefan Hajnoczi } 23741a8ae822SKevin Wolf } else if (bs->file) { 2375dc88a467SStefan Hajnoczi ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read); 23761a8ae822SKevin Wolf } 23771a8ae822SKevin Wolf 2378dc88a467SStefan Hajnoczi bdrv_dec_in_flight(bs); 2379dc88a467SStefan Hajnoczi return ret; 23801a8ae822SKevin Wolf } 23811a8ae822SKevin Wolf 23821a8ae822SKevin Wolf static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) 23831a8ae822SKevin Wolf { 23841a8ae822SKevin Wolf BdrvVmstateCo *co = opaque; 23851a8ae822SKevin Wolf co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); 23864720cbeeSKevin Wolf aio_wait_kick(); 23871a8ae822SKevin Wolf } 23881a8ae822SKevin Wolf 23891a8ae822SKevin Wolf static inline int 23901a8ae822SKevin Wolf bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, 23911a8ae822SKevin Wolf bool is_read) 23921a8ae822SKevin Wolf { 23931a8ae822SKevin Wolf if (qemu_in_coroutine()) { 23941a8ae822SKevin Wolf return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); 23951a8ae822SKevin Wolf } else { 23961a8ae822SKevin Wolf BdrvVmstateCo data = { 23971a8ae822SKevin Wolf .bs = bs, 23981a8ae822SKevin Wolf .qiov = qiov, 23991a8ae822SKevin Wolf .pos = pos, 24001a8ae822SKevin Wolf .is_read = is_read, 24011a8ae822SKevin Wolf .ret = -EINPROGRESS, 24021a8ae822SKevin Wolf }; 24030b8b8753SPaolo Bonzini Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); 24041a8ae822SKevin Wolf 2405e92f0e19SFam Zheng bdrv_coroutine_enter(bs, co); 2406ea17c9d2SStefan Hajnoczi BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS); 24071a8ae822SKevin Wolf return data.ret; 24081a8ae822SKevin Wolf } 24091a8ae822SKevin Wolf } 24101a8ae822SKevin Wolf 241161007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, 241261007b31SStefan Hajnoczi int64_t pos, int size) 241361007b31SStefan Hajnoczi { 24140d93ed08SVladimir Sementsov-Ogievskiy QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); 2415b433d942SKevin Wolf int ret; 241661007b31SStefan Hajnoczi 2417b433d942SKevin Wolf ret = bdrv_writev_vmstate(bs, &qiov, pos); 2418b433d942SKevin Wolf if (ret < 0) { 2419b433d942SKevin Wolf return ret; 2420b433d942SKevin Wolf } 2421b433d942SKevin Wolf 2422b433d942SKevin Wolf return size; 242361007b31SStefan Hajnoczi } 242461007b31SStefan Hajnoczi 242561007b31SStefan Hajnoczi int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) 242661007b31SStefan Hajnoczi { 24271a8ae822SKevin Wolf return bdrv_rw_vmstate(bs, qiov, pos, false); 242861007b31SStefan 
Hajnoczi }
242961007b31SStefan Hajnoczi 
243061007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
243161007b31SStefan Hajnoczi                       int64_t pos, int size)
243261007b31SStefan Hajnoczi {
24330d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2434b433d942SKevin Wolf     int ret;
24355ddda0b8SKevin Wolf 
2436b433d942SKevin Wolf     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2437b433d942SKevin Wolf     if (ret < 0) {
2438b433d942SKevin Wolf         return ret;
2439b433d942SKevin Wolf     }
2440b433d942SKevin Wolf 
2441b433d942SKevin Wolf     return size;
24425ddda0b8SKevin Wolf }
24435ddda0b8SKevin Wolf 
24445ddda0b8SKevin Wolf int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
24455ddda0b8SKevin Wolf {
24461a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, true);
244761007b31SStefan Hajnoczi }
244861007b31SStefan Hajnoczi 
244961007b31SStefan Hajnoczi /**************************************************************/
245061007b31SStefan Hajnoczi /* async I/Os */
245161007b31SStefan Hajnoczi 
245261007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb)
245361007b31SStefan Hajnoczi {
245461007b31SStefan Hajnoczi     qemu_aio_ref(acb);
245561007b31SStefan Hajnoczi     bdrv_aio_cancel_async(acb);
245661007b31SStefan Hajnoczi     while (acb->refcnt > 1) {
245761007b31SStefan Hajnoczi         if (acb->aiocb_info->get_aio_context) {
245861007b31SStefan Hajnoczi             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
245961007b31SStefan Hajnoczi         } else if (acb->bs) {
24602f47da5fSPaolo Bonzini             /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
24612f47da5fSPaolo Bonzini              * assert that we're not using an I/O thread.  Thread-safe
24622f47da5fSPaolo Bonzini              * code should use bdrv_aio_cancel_async exclusively.
24632f47da5fSPaolo Bonzini              */
24642f47da5fSPaolo Bonzini             assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
246561007b31SStefan Hajnoczi             aio_poll(bdrv_get_aio_context(acb->bs), true);
246661007b31SStefan Hajnoczi         } else {
246761007b31SStefan Hajnoczi             abort();
246861007b31SStefan Hajnoczi         }
246961007b31SStefan Hajnoczi     }
247061007b31SStefan Hajnoczi     qemu_aio_unref(acb);
247161007b31SStefan Hajnoczi }
247261007b31SStefan Hajnoczi 
247361007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements
247461007b31SStefan Hajnoczi  * cancel_async, otherwise we do nothing and let the request complete normally.
247561007b31SStefan Hajnoczi  * In either case the completion callback must be called.
*/ 247661007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb) 247761007b31SStefan Hajnoczi { 247861007b31SStefan Hajnoczi if (acb->aiocb_info->cancel_async) { 247961007b31SStefan Hajnoczi acb->aiocb_info->cancel_async(acb); 248061007b31SStefan Hajnoczi } 248161007b31SStefan Hajnoczi } 248261007b31SStefan Hajnoczi 248361007b31SStefan Hajnoczi /**************************************************************/ 248461007b31SStefan Hajnoczi /* Coroutine block device emulation */ 248561007b31SStefan Hajnoczi 2486e293b7a3SKevin Wolf typedef struct FlushCo { 2487e293b7a3SKevin Wolf BlockDriverState *bs; 2488e293b7a3SKevin Wolf int ret; 2489e293b7a3SKevin Wolf } FlushCo; 2490e293b7a3SKevin Wolf 2491e293b7a3SKevin Wolf 249261007b31SStefan Hajnoczi static void coroutine_fn bdrv_flush_co_entry(void *opaque) 249361007b31SStefan Hajnoczi { 2494e293b7a3SKevin Wolf FlushCo *rwco = opaque; 249561007b31SStefan Hajnoczi 249661007b31SStefan Hajnoczi rwco->ret = bdrv_co_flush(rwco->bs); 24974720cbeeSKevin Wolf aio_wait_kick(); 249861007b31SStefan Hajnoczi } 249961007b31SStefan Hajnoczi 250061007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs) 250161007b31SStefan Hajnoczi { 250249ca6259SFam Zheng int current_gen; 250349ca6259SFam Zheng int ret = 0; 250461007b31SStefan Hajnoczi 250599723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2506c32b82afSPavel Dovgalyuk 2507e914404eSFam Zheng if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || 250849ca6259SFam Zheng bdrv_is_sg(bs)) { 250949ca6259SFam Zheng goto early_exit; 251049ca6259SFam Zheng } 251149ca6259SFam Zheng 25123783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 251347fec599SPaolo Bonzini current_gen = atomic_read(&bs->write_gen); 25143ff2f67aSEvgeny Yakovlev 25153ff2f67aSEvgeny Yakovlev /* Wait until any previous flushes are completed */ 251699723548SPaolo Bonzini while (bs->active_flush_req) { 25173783fa3dSPaolo Bonzini qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock); 25183ff2f67aSEvgeny Yakovlev } 25193ff2f67aSEvgeny Yakovlev 25203783fa3dSPaolo Bonzini /* Flushes reach this point in nondecreasing current_gen order. 
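 * (current_gen was sampled under reqs_lock above, and flushes wait their
 * turn on flush_queue while active_flush_req is set, so a flush that
 * starts later can only observe an equal or newer generation.)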
 */
252199723548SPaolo Bonzini     bs->active_flush_req = true;
25223783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
25233ff2f67aSEvgeny Yakovlev 
2524c32b82afSPavel Dovgalyuk     /* Write back all layers by calling one driver function */
2525c32b82afSPavel Dovgalyuk     if (bs->drv->bdrv_co_flush) {
2526c32b82afSPavel Dovgalyuk         ret = bs->drv->bdrv_co_flush(bs);
2527c32b82afSPavel Dovgalyuk         goto out;
2528c32b82afSPavel Dovgalyuk     }
2529c32b82afSPavel Dovgalyuk 
253061007b31SStefan Hajnoczi     /* Write back cached data to the OS even with cache=unsafe */
253161007b31SStefan Hajnoczi     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
253261007b31SStefan Hajnoczi     if (bs->drv->bdrv_co_flush_to_os) {
253361007b31SStefan Hajnoczi         ret = bs->drv->bdrv_co_flush_to_os(bs);
253461007b31SStefan Hajnoczi         if (ret < 0) {
2535cdb5e315SFam Zheng             goto out;
253661007b31SStefan Hajnoczi         }
253761007b31SStefan Hajnoczi     }
253861007b31SStefan Hajnoczi 
253961007b31SStefan Hajnoczi     /* But don't actually force it to the disk with cache=unsafe */
254061007b31SStefan Hajnoczi     if (bs->open_flags & BDRV_O_NO_FLUSH) {
254161007b31SStefan Hajnoczi         goto flush_parent;
254261007b31SStefan Hajnoczi     }
254361007b31SStefan Hajnoczi 
25443ff2f67aSEvgeny Yakovlev     /* Check if we really need to flush anything */
25453ff2f67aSEvgeny Yakovlev     if (bs->flushed_gen == current_gen) {
25463ff2f67aSEvgeny Yakovlev         goto flush_parent;
25473ff2f67aSEvgeny Yakovlev     }
25483ff2f67aSEvgeny Yakovlev 
254961007b31SStefan Hajnoczi     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2550d470ad42SMax Reitz     if (!bs->drv) {
2551d470ad42SMax Reitz         /* bs->drv->bdrv_co_flush() might have ejected the BDS
2552d470ad42SMax Reitz          * (even in case of apparent success) */
2553d470ad42SMax Reitz         ret = -ENOMEDIUM;
2554d470ad42SMax Reitz         goto out;
2555d470ad42SMax Reitz     }
255661007b31SStefan Hajnoczi     if (bs->drv->bdrv_co_flush_to_disk) {
255761007b31SStefan Hajnoczi         ret = bs->drv->bdrv_co_flush_to_disk(bs);
255861007b31SStefan Hajnoczi     } else if (bs->drv->bdrv_aio_flush) {
255961007b31SStefan Hajnoczi         BlockAIOCB *acb;
256061007b31SStefan Hajnoczi         CoroutineIOCompletion co = {
256161007b31SStefan Hajnoczi             .coroutine = qemu_coroutine_self(),
256261007b31SStefan Hajnoczi         };
256361007b31SStefan Hajnoczi 
256461007b31SStefan Hajnoczi         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
256561007b31SStefan Hajnoczi         if (acb == NULL) {
256661007b31SStefan Hajnoczi             ret = -EIO;
256761007b31SStefan Hajnoczi         } else {
256861007b31SStefan Hajnoczi             qemu_coroutine_yield();
256961007b31SStefan Hajnoczi             ret = co.ret;
257061007b31SStefan Hajnoczi         }
257161007b31SStefan Hajnoczi     } else {
257261007b31SStefan Hajnoczi         /*
257361007b31SStefan Hajnoczi          * Some block drivers always operate in either writethrough or unsafe
257461007b31SStefan Hajnoczi          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
257561007b31SStefan Hajnoczi          * know how the server works (because the behaviour is hardcoded or
257661007b31SStefan Hajnoczi          * depends on server-side configuration), so we can't ensure that
257761007b31SStefan Hajnoczi          * everything is safe on disk. Returning an error doesn't work because
257861007b31SStefan Hajnoczi          * that would break guests even if the server operates in writethrough
257961007b31SStefan Hajnoczi          * mode.
258061007b31SStefan Hajnoczi          *
258161007b31SStefan Hajnoczi          * Let's hope the user knows what they're doing.
258261007b31SStefan Hajnoczi */ 258361007b31SStefan Hajnoczi ret = 0; 258461007b31SStefan Hajnoczi } 25853ff2f67aSEvgeny Yakovlev 258661007b31SStefan Hajnoczi if (ret < 0) { 2587cdb5e315SFam Zheng goto out; 258861007b31SStefan Hajnoczi } 258961007b31SStefan Hajnoczi 259061007b31SStefan Hajnoczi /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH 259161007b31SStefan Hajnoczi * in the case of cache=unsafe, so there are no useless flushes. 259261007b31SStefan Hajnoczi */ 259361007b31SStefan Hajnoczi flush_parent: 2594cdb5e315SFam Zheng ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; 2595cdb5e315SFam Zheng out: 25963ff2f67aSEvgeny Yakovlev /* Notify any pending flushes that we have completed */ 2597e6af1e08SKevin Wolf if (ret == 0) { 25983ff2f67aSEvgeny Yakovlev bs->flushed_gen = current_gen; 2599e6af1e08SKevin Wolf } 26003783fa3dSPaolo Bonzini 26013783fa3dSPaolo Bonzini qemu_co_mutex_lock(&bs->reqs_lock); 260299723548SPaolo Bonzini bs->active_flush_req = false; 2603156af3acSDenis V. Lunev /* Return value is ignored - it's ok if wait queue is empty */ 2604156af3acSDenis V. Lunev qemu_co_queue_next(&bs->flush_queue); 26053783fa3dSPaolo Bonzini qemu_co_mutex_unlock(&bs->reqs_lock); 26063ff2f67aSEvgeny Yakovlev 260749ca6259SFam Zheng early_exit: 260899723548SPaolo Bonzini bdrv_dec_in_flight(bs); 2609cdb5e315SFam Zheng return ret; 261061007b31SStefan Hajnoczi } 261161007b31SStefan Hajnoczi 261261007b31SStefan Hajnoczi int bdrv_flush(BlockDriverState *bs) 261361007b31SStefan Hajnoczi { 261461007b31SStefan Hajnoczi Coroutine *co; 2615e293b7a3SKevin Wolf FlushCo flush_co = { 261661007b31SStefan Hajnoczi .bs = bs, 261761007b31SStefan Hajnoczi .ret = NOT_DONE, 261861007b31SStefan Hajnoczi }; 261961007b31SStefan Hajnoczi 262061007b31SStefan Hajnoczi if (qemu_in_coroutine()) { 262161007b31SStefan Hajnoczi /* Fast-path if already in coroutine context */ 2622e293b7a3SKevin Wolf bdrv_flush_co_entry(&flush_co); 262361007b31SStefan Hajnoczi } else { 26240b8b8753SPaolo Bonzini co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); 2625e92f0e19SFam Zheng bdrv_coroutine_enter(bs, co); 262688b062c2SPaolo Bonzini BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE); 262761007b31SStefan Hajnoczi } 262861007b31SStefan Hajnoczi 2629e293b7a3SKevin Wolf return flush_co.ret; 263061007b31SStefan Hajnoczi } 263161007b31SStefan Hajnoczi 263261007b31SStefan Hajnoczi typedef struct DiscardCo { 26330b9fd3f4SFam Zheng BdrvChild *child; 26340c51a893SEric Blake int64_t offset; 2635f5a5ca79SManos Pitsidianakis int bytes; 263661007b31SStefan Hajnoczi int ret; 263761007b31SStefan Hajnoczi } DiscardCo; 26380c51a893SEric Blake static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) 263961007b31SStefan Hajnoczi { 264061007b31SStefan Hajnoczi DiscardCo *rwco = opaque; 264161007b31SStefan Hajnoczi 26420b9fd3f4SFam Zheng rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); 26434720cbeeSKevin Wolf aio_wait_kick(); 264461007b31SStefan Hajnoczi } 264561007b31SStefan Hajnoczi 26460b9fd3f4SFam Zheng int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) 264761007b31SStefan Hajnoczi { 2648b1066c87SFam Zheng BdrvTrackedRequest req; 26499f1963b3SEric Blake int max_pdiscard, ret; 26503482b9bcSEric Blake int head, tail, align; 26510b9fd3f4SFam Zheng BlockDriverState *bs = child->bs; 265261007b31SStefan Hajnoczi 26530b9fd3f4SFam Zheng if (!bs || !bs->drv) { 265461007b31SStefan Hajnoczi return -ENOMEDIUM; 265561007b31SStefan Hajnoczi } 265661007b31SStefan Hajnoczi 
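    /*
     * Overview of the path below: validate the byte range, do nothing if
     * BDRV_O_UNMAP is clear or the driver has no discard callback, then
     * fragment the request at the discard alignment so the driver never
     * sees a piece that straddles an alignment boundary.  As a worked
     * illustration (limits assumed for the example, not taken from a real
     * driver): with pdiscard_alignment = 64 KiB, request_alignment = 512
     * and a large max_pdiscard, a request at offset 1536 for 192 KiB has
     * head = 1536 and tail = 1536, and the loop issues three pieces:
     * 64000 bytes up to the first 64 KiB boundary, a 128 KiB aligned
     * body, and the final 1536-byte tail.
     */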
2657d6883bc9SVladimir Sementsov-Ogievskiy if (bdrv_has_readonly_bitmaps(bs)) { 2658d6883bc9SVladimir Sementsov-Ogievskiy return -EPERM; 2659d6883bc9SVladimir Sementsov-Ogievskiy } 2660d6883bc9SVladimir Sementsov-Ogievskiy 2661f5a5ca79SManos Pitsidianakis ret = bdrv_check_byte_request(bs, offset, bytes); 266261007b31SStefan Hajnoczi if (ret < 0) { 266361007b31SStefan Hajnoczi return ret; 266461007b31SStefan Hajnoczi } 266561007b31SStefan Hajnoczi 266661007b31SStefan Hajnoczi /* Do nothing if disabled. */ 266761007b31SStefan Hajnoczi if (!(bs->open_flags & BDRV_O_UNMAP)) { 266861007b31SStefan Hajnoczi return 0; 266961007b31SStefan Hajnoczi } 267061007b31SStefan Hajnoczi 267102aefe43SEric Blake if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { 267261007b31SStefan Hajnoczi return 0; 267361007b31SStefan Hajnoczi } 267461007b31SStefan Hajnoczi 26753482b9bcSEric Blake /* Discard is advisory, but some devices track and coalesce 26763482b9bcSEric Blake * unaligned requests, so we must pass everything down rather than 26773482b9bcSEric Blake * round here. Still, most devices will just silently ignore 26783482b9bcSEric Blake * unaligned requests (by returning -ENOTSUP), so we must fragment 26793482b9bcSEric Blake * the request accordingly. */ 268002aefe43SEric Blake align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); 2681b8d0a980SEric Blake assert(align % bs->bl.request_alignment == 0); 2682b8d0a980SEric Blake head = offset % align; 2683f5a5ca79SManos Pitsidianakis tail = (offset + bytes) % align; 26849f1963b3SEric Blake 268599723548SPaolo Bonzini bdrv_inc_in_flight(bs); 2686f5a5ca79SManos Pitsidianakis tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); 268750824995SFam Zheng 268800695c27SFam Zheng ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); 2689ec050f77SDenis V. Lunev if (ret < 0) { 2690ec050f77SDenis V. Lunev goto out; 2691ec050f77SDenis V. Lunev } 2692ec050f77SDenis V. Lunev 26939f1963b3SEric Blake max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), 26949f1963b3SEric Blake align); 26953482b9bcSEric Blake assert(max_pdiscard >= bs->bl.request_alignment); 26969f1963b3SEric Blake 2697f5a5ca79SManos Pitsidianakis while (bytes > 0) { 2698f5a5ca79SManos Pitsidianakis int num = bytes; 26993482b9bcSEric Blake 27003482b9bcSEric Blake if (head) { 27013482b9bcSEric Blake /* Make small requests to get to alignment boundaries. */ 2702f5a5ca79SManos Pitsidianakis num = MIN(bytes, align - head); 27033482b9bcSEric Blake if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) { 27043482b9bcSEric Blake num %= bs->bl.request_alignment; 27053482b9bcSEric Blake } 27063482b9bcSEric Blake head = (head + num) % align; 27073482b9bcSEric Blake assert(num < max_pdiscard); 27083482b9bcSEric Blake } else if (tail) { 27093482b9bcSEric Blake if (num > align) { 27103482b9bcSEric Blake /* Shorten the request to the last aligned cluster. 
*/ 27113482b9bcSEric Blake num -= tail; 27123482b9bcSEric Blake } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) && 27133482b9bcSEric Blake tail > bs->bl.request_alignment) { 27143482b9bcSEric Blake tail %= bs->bl.request_alignment; 27153482b9bcSEric Blake num -= tail; 27163482b9bcSEric Blake } 27173482b9bcSEric Blake } 27183482b9bcSEric Blake /* limit request size */ 27193482b9bcSEric Blake if (num > max_pdiscard) { 27203482b9bcSEric Blake num = max_pdiscard; 27213482b9bcSEric Blake } 272261007b31SStefan Hajnoczi 2723d470ad42SMax Reitz if (!bs->drv) { 2724d470ad42SMax Reitz ret = -ENOMEDIUM; 2725d470ad42SMax Reitz goto out; 2726d470ad42SMax Reitz } 272747a5486dSEric Blake if (bs->drv->bdrv_co_pdiscard) { 272847a5486dSEric Blake ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); 272961007b31SStefan Hajnoczi } else { 273061007b31SStefan Hajnoczi BlockAIOCB *acb; 273161007b31SStefan Hajnoczi CoroutineIOCompletion co = { 273261007b31SStefan Hajnoczi .coroutine = qemu_coroutine_self(), 273361007b31SStefan Hajnoczi }; 273461007b31SStefan Hajnoczi 27354da444a0SEric Blake acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, 273661007b31SStefan Hajnoczi bdrv_co_io_em_complete, &co); 273761007b31SStefan Hajnoczi if (acb == NULL) { 2738b1066c87SFam Zheng ret = -EIO; 2739b1066c87SFam Zheng goto out; 274061007b31SStefan Hajnoczi } else { 274161007b31SStefan Hajnoczi qemu_coroutine_yield(); 274261007b31SStefan Hajnoczi ret = co.ret; 274361007b31SStefan Hajnoczi } 274461007b31SStefan Hajnoczi } 274561007b31SStefan Hajnoczi if (ret && ret != -ENOTSUP) { 2746b1066c87SFam Zheng goto out; 274761007b31SStefan Hajnoczi } 274861007b31SStefan Hajnoczi 27499f1963b3SEric Blake offset += num; 2750f5a5ca79SManos Pitsidianakis bytes -= num; 275161007b31SStefan Hajnoczi } 2752b1066c87SFam Zheng ret = 0; 2753b1066c87SFam Zheng out: 275400695c27SFam Zheng bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); 2755b1066c87SFam Zheng tracked_request_end(&req); 275699723548SPaolo Bonzini bdrv_dec_in_flight(bs); 2757b1066c87SFam Zheng return ret; 275861007b31SStefan Hajnoczi } 275961007b31SStefan Hajnoczi 27600b9fd3f4SFam Zheng int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes) 276161007b31SStefan Hajnoczi { 276261007b31SStefan Hajnoczi Coroutine *co; 276361007b31SStefan Hajnoczi DiscardCo rwco = { 27640b9fd3f4SFam Zheng .child = child, 27650c51a893SEric Blake .offset = offset, 2766f5a5ca79SManos Pitsidianakis .bytes = bytes, 276761007b31SStefan Hajnoczi .ret = NOT_DONE, 276861007b31SStefan Hajnoczi }; 276961007b31SStefan Hajnoczi 277061007b31SStefan Hajnoczi if (qemu_in_coroutine()) { 277161007b31SStefan Hajnoczi /* Fast-path if already in coroutine context */ 27720c51a893SEric Blake bdrv_pdiscard_co_entry(&rwco); 277361007b31SStefan Hajnoczi } else { 27740c51a893SEric Blake co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); 27750b9fd3f4SFam Zheng bdrv_coroutine_enter(child->bs, co); 27760b9fd3f4SFam Zheng BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); 277761007b31SStefan Hajnoczi } 277861007b31SStefan Hajnoczi 277961007b31SStefan Hajnoczi return rwco.ret; 278061007b31SStefan Hajnoczi } 278161007b31SStefan Hajnoczi 278248af776aSKevin Wolf int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) 278361007b31SStefan Hajnoczi { 278461007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 27855c5ae76aSFam Zheng CoroutineIOCompletion co = { 27865c5ae76aSFam Zheng .coroutine = qemu_coroutine_self(), 27875c5ae76aSFam Zheng }; 27885c5ae76aSFam Zheng BlockAIOCB *acb; 278961007b31SStefan 
Hajnoczi 
279099723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
279116a389dcSKevin Wolf     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
27925c5ae76aSFam Zheng         co.ret = -ENOTSUP;
27935c5ae76aSFam Zheng         goto out;
27945c5ae76aSFam Zheng     }
27955c5ae76aSFam Zheng 
279616a389dcSKevin Wolf     if (drv->bdrv_co_ioctl) {
279716a389dcSKevin Wolf         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
279816a389dcSKevin Wolf     } else {
27995c5ae76aSFam Zheng         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
28005c5ae76aSFam Zheng         if (!acb) {
2801c8a9fd80SFam Zheng             co.ret = -ENOTSUP;
2802c8a9fd80SFam Zheng             goto out;
28035c5ae76aSFam Zheng         }
28045c5ae76aSFam Zheng         qemu_coroutine_yield();
280516a389dcSKevin Wolf     }
28065c5ae76aSFam Zheng out:
280799723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
28085c5ae76aSFam Zheng     return co.ret;
28095c5ae76aSFam Zheng }
28105c5ae76aSFam Zheng 
281161007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size)
281261007b31SStefan Hajnoczi {
281361007b31SStefan Hajnoczi     return qemu_memalign(bdrv_opt_mem_align(bs), size);
281461007b31SStefan Hajnoczi }
281561007b31SStefan Hajnoczi 
281661007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size)
281761007b31SStefan Hajnoczi {
281861007b31SStefan Hajnoczi     return memset(qemu_blockalign(bs, size), 0, size);
281961007b31SStefan Hajnoczi }
282061007b31SStefan Hajnoczi 
282161007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
282261007b31SStefan Hajnoczi {
282361007b31SStefan Hajnoczi     size_t align = bdrv_opt_mem_align(bs);
282461007b31SStefan Hajnoczi 
282561007b31SStefan Hajnoczi     /* Ensure that NULL is never returned on success */
282661007b31SStefan Hajnoczi     assert(align > 0);
282761007b31SStefan Hajnoczi     if (size == 0) {
282861007b31SStefan Hajnoczi         size = align;
282961007b31SStefan Hajnoczi     }
283061007b31SStefan Hajnoczi 
283161007b31SStefan Hajnoczi     return qemu_try_memalign(align, size);
283261007b31SStefan Hajnoczi }
283361007b31SStefan Hajnoczi 
283461007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
283561007b31SStefan Hajnoczi {
283661007b31SStefan Hajnoczi     void *mem = qemu_try_blockalign(bs, size);
283761007b31SStefan Hajnoczi 
283861007b31SStefan Hajnoczi     if (mem) {
283961007b31SStefan Hajnoczi         memset(mem, 0, size);
284061007b31SStefan Hajnoczi     }
284161007b31SStefan Hajnoczi 
284261007b31SStefan Hajnoczi     return mem;
284361007b31SStefan Hajnoczi }
284461007b31SStefan Hajnoczi 
284561007b31SStefan Hajnoczi /*
284661007b31SStefan Hajnoczi  * Check if all memory in this vector is aligned to bdrv_min_mem_align().
284761007b31SStefan Hajnoczi  */
284861007b31SStefan Hajnoczi bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
284961007b31SStefan Hajnoczi {
285061007b31SStefan Hajnoczi     int i;
28514196d2f0SDenis V.
Lunev size_t alignment = bdrv_min_mem_align(bs); 285261007b31SStefan Hajnoczi 285361007b31SStefan Hajnoczi for (i = 0; i < qiov->niov; i++) { 285461007b31SStefan Hajnoczi if ((uintptr_t) qiov->iov[i].iov_base % alignment) { 285561007b31SStefan Hajnoczi return false; 285661007b31SStefan Hajnoczi } 285761007b31SStefan Hajnoczi if (qiov->iov[i].iov_len % alignment) { 285861007b31SStefan Hajnoczi return false; 285961007b31SStefan Hajnoczi } 286061007b31SStefan Hajnoczi } 286161007b31SStefan Hajnoczi 286261007b31SStefan Hajnoczi return true; 286361007b31SStefan Hajnoczi } 286461007b31SStefan Hajnoczi 286561007b31SStefan Hajnoczi void bdrv_add_before_write_notifier(BlockDriverState *bs, 286661007b31SStefan Hajnoczi NotifierWithReturn *notifier) 286761007b31SStefan Hajnoczi { 286861007b31SStefan Hajnoczi notifier_with_return_list_add(&bs->before_write_notifiers, notifier); 286961007b31SStefan Hajnoczi } 287061007b31SStefan Hajnoczi 287161007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs) 287261007b31SStefan Hajnoczi { 28736b98bd64SPaolo Bonzini BdrvChild *child; 28746b98bd64SPaolo Bonzini 28756b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 28766b98bd64SPaolo Bonzini bdrv_io_plug(child->bs); 28776b98bd64SPaolo Bonzini } 28786b98bd64SPaolo Bonzini 2879850d54a2SPaolo Bonzini if (atomic_fetch_inc(&bs->io_plugged) == 0) { 288061007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 288161007b31SStefan Hajnoczi if (drv && drv->bdrv_io_plug) { 288261007b31SStefan Hajnoczi drv->bdrv_io_plug(bs); 28836b98bd64SPaolo Bonzini } 288461007b31SStefan Hajnoczi } 288561007b31SStefan Hajnoczi } 288661007b31SStefan Hajnoczi 288761007b31SStefan Hajnoczi void bdrv_io_unplug(BlockDriverState *bs) 288861007b31SStefan Hajnoczi { 28896b98bd64SPaolo Bonzini BdrvChild *child; 28906b98bd64SPaolo Bonzini 28916b98bd64SPaolo Bonzini assert(bs->io_plugged); 2892850d54a2SPaolo Bonzini if (atomic_fetch_dec(&bs->io_plugged) == 1) { 289361007b31SStefan Hajnoczi BlockDriver *drv = bs->drv; 289461007b31SStefan Hajnoczi if (drv && drv->bdrv_io_unplug) { 289561007b31SStefan Hajnoczi drv->bdrv_io_unplug(bs); 289661007b31SStefan Hajnoczi } 289761007b31SStefan Hajnoczi } 289861007b31SStefan Hajnoczi 28996b98bd64SPaolo Bonzini QLIST_FOREACH(child, &bs->children, next) { 29006b98bd64SPaolo Bonzini bdrv_io_unplug(child->bs); 29016b98bd64SPaolo Bonzini } 29026b98bd64SPaolo Bonzini } 290323d0ba93SFam Zheng 290423d0ba93SFam Zheng void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) 290523d0ba93SFam Zheng { 290623d0ba93SFam Zheng BdrvChild *child; 290723d0ba93SFam Zheng 290823d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_register_buf) { 290923d0ba93SFam Zheng bs->drv->bdrv_register_buf(bs, host, size); 291023d0ba93SFam Zheng } 291123d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 291223d0ba93SFam Zheng bdrv_register_buf(child->bs, host, size); 291323d0ba93SFam Zheng } 291423d0ba93SFam Zheng } 291523d0ba93SFam Zheng 291623d0ba93SFam Zheng void bdrv_unregister_buf(BlockDriverState *bs, void *host) 291723d0ba93SFam Zheng { 291823d0ba93SFam Zheng BdrvChild *child; 291923d0ba93SFam Zheng 292023d0ba93SFam Zheng if (bs->drv && bs->drv->bdrv_unregister_buf) { 292123d0ba93SFam Zheng bs->drv->bdrv_unregister_buf(bs, host); 292223d0ba93SFam Zheng } 292323d0ba93SFam Zheng QLIST_FOREACH(child, &bs->children, next) { 292423d0ba93SFam Zheng bdrv_unregister_buf(child->bs, host); 292523d0ba93SFam Zheng } 292623d0ba93SFam Zheng } 2927fcc67678SFam Zheng 292867b51fb9SVladimir Sementsov-Ogievskiy static int 
coroutine_fn bdrv_co_copy_range_internal(
292967b51fb9SVladimir Sementsov-Ogievskiy         BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
293067b51fb9SVladimir Sementsov-Ogievskiy         uint64_t dst_offset, uint64_t bytes,
293167b51fb9SVladimir Sementsov-Ogievskiy         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
2932fcc67678SFam Zheng         bool recurse_src)
2933fcc67678SFam Zheng {
2934999658a0SVladimir Sementsov-Ogievskiy     BdrvTrackedRequest req;
2935fcc67678SFam Zheng     int ret;
2936fcc67678SFam Zheng 
2937fe0480d6SKevin Wolf     /* TODO We can support BDRV_REQ_NO_FALLBACK here */
2938fe0480d6SKevin Wolf     assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
2939fe0480d6SKevin Wolf     assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
2940fe0480d6SKevin Wolf 
2941d4d3e5a0SFam Zheng     if (!dst || !dst->bs) {
2942fcc67678SFam Zheng         return -ENOMEDIUM;
2943fcc67678SFam Zheng     }
2944fcc67678SFam Zheng     ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
2945fcc67678SFam Zheng     if (ret) {
2946fcc67678SFam Zheng         return ret;
2947fcc67678SFam Zheng     }
294867b51fb9SVladimir Sementsov-Ogievskiy     if (write_flags & BDRV_REQ_ZERO_WRITE) {
294967b51fb9SVladimir Sementsov-Ogievskiy         return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
2950fcc67678SFam Zheng     }
2951fcc67678SFam Zheng 
2952d4d3e5a0SFam Zheng     if (!src || !src->bs) {
2953d4d3e5a0SFam Zheng         return -ENOMEDIUM;
2954d4d3e5a0SFam Zheng     }
2955d4d3e5a0SFam Zheng     ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
2956d4d3e5a0SFam Zheng     if (ret) {
2957d4d3e5a0SFam Zheng         return ret;
2958d4d3e5a0SFam Zheng     }
2959d4d3e5a0SFam Zheng 
2960fcc67678SFam Zheng     if (!src->bs->drv->bdrv_co_copy_range_from
2961fcc67678SFam Zheng         || !dst->bs->drv->bdrv_co_copy_range_to
2962fcc67678SFam Zheng         || src->bs->encrypted || dst->bs->encrypted) {
2963fcc67678SFam Zheng         return -ENOTSUP;
2964fcc67678SFam Zheng     }
2965999658a0SVladimir Sementsov-Ogievskiy 
2966999658a0SVladimir Sementsov-Ogievskiy     if (recurse_src) {
2967d4d3e5a0SFam Zheng         bdrv_inc_in_flight(src->bs);
2968999658a0SVladimir Sementsov-Ogievskiy         tracked_request_begin(&req, src->bs, src_offset, bytes,
2969999658a0SVladimir Sementsov-Ogievskiy                               BDRV_TRACKED_READ);
297037aec7d7SFam Zheng 
297109d2f948SVladimir Sementsov-Ogievskiy         /* BDRV_REQ_SERIALISING is only for write operations */
297209d2f948SVladimir Sementsov-Ogievskiy         assert(!(read_flags & BDRV_REQ_SERIALISING));
297367b51fb9SVladimir Sementsov-Ogievskiy         if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
2974999658a0SVladimir Sementsov-Ogievskiy             wait_serialising_requests(&req);
2975dee12de8SFam Zheng         }
2976999658a0SVladimir Sementsov-Ogievskiy 
297737aec7d7SFam Zheng         ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
2978fcc67678SFam Zheng                                                     src, src_offset,
2979fcc67678SFam Zheng                                                     dst, dst_offset,
298067b51fb9SVladimir Sementsov-Ogievskiy                                                     bytes,
298167b51fb9SVladimir Sementsov-Ogievskiy                                                     read_flags, write_flags);
2982999658a0SVladimir Sementsov-Ogievskiy 
2983999658a0SVladimir Sementsov-Ogievskiy         tracked_request_end(&req);
2984999658a0SVladimir Sementsov-Ogievskiy         bdrv_dec_in_flight(src->bs);
2985fcc67678SFam Zheng     } else {
2986999658a0SVladimir Sementsov-Ogievskiy         bdrv_inc_in_flight(dst->bs);
2987999658a0SVladimir Sementsov-Ogievskiy         tracked_request_begin(&req, dst->bs, dst_offset, bytes,
2988999658a0SVladimir Sementsov-Ogievskiy                               BDRV_TRACKED_WRITE);
29890eb1e891SFam Zheng         ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
29900eb1e891SFam Zheng                                         write_flags);
29910eb1e891SFam Zheng         if (!ret) {
299237aec7d7SFam Zheng             ret =
dst->bs->drv->bdrv_co_copy_range_to(dst->bs, 2993fcc67678SFam Zheng src, src_offset, 2994fcc67678SFam Zheng dst, dst_offset, 299567b51fb9SVladimir Sementsov-Ogievskiy bytes, 299667b51fb9SVladimir Sementsov-Ogievskiy read_flags, write_flags); 29970eb1e891SFam Zheng } 29980eb1e891SFam Zheng bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); 2999999658a0SVladimir Sementsov-Ogievskiy tracked_request_end(&req); 3000d4d3e5a0SFam Zheng bdrv_dec_in_flight(dst->bs); 3001999658a0SVladimir Sementsov-Ogievskiy } 3002999658a0SVladimir Sementsov-Ogievskiy 300337aec7d7SFam Zheng return ret; 3004fcc67678SFam Zheng } 3005fcc67678SFam Zheng 3006fcc67678SFam Zheng /* Copy range from @src to @dst. 3007fcc67678SFam Zheng * 3008fcc67678SFam Zheng * See the comment of bdrv_co_copy_range for the parameter and return value 3009fcc67678SFam Zheng * semantics. */ 3010fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, 3011fcc67678SFam Zheng BdrvChild *dst, uint64_t dst_offset, 301267b51fb9SVladimir Sementsov-Ogievskiy uint64_t bytes, 301367b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, 301467b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags write_flags) 3015fcc67678SFam Zheng { 3016ecc983a5SFam Zheng trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 3017ecc983a5SFam Zheng read_flags, write_flags); 3018fcc67678SFam Zheng return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 301967b51fb9SVladimir Sementsov-Ogievskiy bytes, read_flags, write_flags, true); 3020fcc67678SFam Zheng } 3021fcc67678SFam Zheng 3022fcc67678SFam Zheng /* Copy range from @src to @dst. 3023fcc67678SFam Zheng * 3024fcc67678SFam Zheng * See the comment of bdrv_co_copy_range for the parameter and return value 3025fcc67678SFam Zheng * semantics. 
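 *
 * This variant tracks the operation as a write on @dst, while
 * bdrv_co_copy_range_from() above tracks it as a read on @src (the
 * recurse_src flag of bdrv_co_copy_range_internal() selects between
 * the two).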
*/ 3026fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, 3027fcc67678SFam Zheng BdrvChild *dst, uint64_t dst_offset, 302867b51fb9SVladimir Sementsov-Ogievskiy uint64_t bytes, 302967b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags read_flags, 303067b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags write_flags) 3031fcc67678SFam Zheng { 3032ecc983a5SFam Zheng trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 3033ecc983a5SFam Zheng read_flags, write_flags); 3034fcc67678SFam Zheng return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, 303567b51fb9SVladimir Sementsov-Ogievskiy bytes, read_flags, write_flags, false); 3036fcc67678SFam Zheng } 3037fcc67678SFam Zheng 3038fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, 3039fcc67678SFam Zheng BdrvChild *dst, uint64_t dst_offset, 304067b51fb9SVladimir Sementsov-Ogievskiy uint64_t bytes, BdrvRequestFlags read_flags, 304167b51fb9SVladimir Sementsov-Ogievskiy BdrvRequestFlags write_flags) 3042fcc67678SFam Zheng { 304337aec7d7SFam Zheng return bdrv_co_copy_range_from(src, src_offset, 3044fcc67678SFam Zheng dst, dst_offset, 304567b51fb9SVladimir Sementsov-Ogievskiy bytes, read_flags, write_flags); 3046fcc67678SFam Zheng } 30473d9f2d2aSKevin Wolf 30483d9f2d2aSKevin Wolf static void bdrv_parent_cb_resize(BlockDriverState *bs) 30493d9f2d2aSKevin Wolf { 30503d9f2d2aSKevin Wolf BdrvChild *c; 30513d9f2d2aSKevin Wolf QLIST_FOREACH(c, &bs->parents, next_parent) { 30523d9f2d2aSKevin Wolf if (c->role->resize) { 30533d9f2d2aSKevin Wolf c->role->resize(c); 30543d9f2d2aSKevin Wolf } 30553d9f2d2aSKevin Wolf } 30563d9f2d2aSKevin Wolf } 30573d9f2d2aSKevin Wolf 30583d9f2d2aSKevin Wolf /** 30593d9f2d2aSKevin Wolf * Truncate file to 'offset' bytes (needed only for file protocols) 30603d9f2d2aSKevin Wolf */ 30613d9f2d2aSKevin Wolf int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, 30623d9f2d2aSKevin Wolf PreallocMode prealloc, Error **errp) 30633d9f2d2aSKevin Wolf { 30643d9f2d2aSKevin Wolf BlockDriverState *bs = child->bs; 30653d9f2d2aSKevin Wolf BlockDriver *drv = bs->drv; 30661bc5f09fSKevin Wolf BdrvTrackedRequest req; 30671bc5f09fSKevin Wolf int64_t old_size, new_bytes; 30683d9f2d2aSKevin Wolf int ret; 30693d9f2d2aSKevin Wolf 30703d9f2d2aSKevin Wolf 30713d9f2d2aSKevin Wolf /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ 30723d9f2d2aSKevin Wolf if (!drv) { 30733d9f2d2aSKevin Wolf error_setg(errp, "No medium inserted"); 30743d9f2d2aSKevin Wolf return -ENOMEDIUM; 30753d9f2d2aSKevin Wolf } 30763d9f2d2aSKevin Wolf if (offset < 0) { 30773d9f2d2aSKevin Wolf error_setg(errp, "Image size cannot be negative"); 30783d9f2d2aSKevin Wolf return -EINVAL; 30793d9f2d2aSKevin Wolf } 30803d9f2d2aSKevin Wolf 30811bc5f09fSKevin Wolf old_size = bdrv_getlength(bs); 30821bc5f09fSKevin Wolf if (old_size < 0) { 30831bc5f09fSKevin Wolf error_setg_errno(errp, -old_size, "Failed to get old image size"); 30841bc5f09fSKevin Wolf return old_size; 30851bc5f09fSKevin Wolf } 30861bc5f09fSKevin Wolf 30871bc5f09fSKevin Wolf if (offset > old_size) { 30881bc5f09fSKevin Wolf new_bytes = offset - old_size; 30891bc5f09fSKevin Wolf } else { 30901bc5f09fSKevin Wolf new_bytes = 0; 30911bc5f09fSKevin Wolf } 30921bc5f09fSKevin Wolf 30933d9f2d2aSKevin Wolf bdrv_inc_in_flight(bs); 30945416a11eSFam Zheng tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, 30955416a11eSFam Zheng BDRV_TRACKED_TRUNCATE); 30961bc5f09fSKevin Wolf 30971bc5f09fSKevin 
Wolf /* If we are growing the image and potentially using preallocation for the 30981bc5f09fSKevin Wolf * new area, we need to make sure that no write requests are made to it 30991bc5f09fSKevin Wolf * concurrently or they might be overwritten by preallocation. */ 31001bc5f09fSKevin Wolf if (new_bytes) { 31011bc5f09fSKevin Wolf mark_request_serialising(&req, 1); 3102cd47d792SFam Zheng } 3103cd47d792SFam Zheng if (bs->read_only) { 3104cd47d792SFam Zheng error_setg(errp, "Image is read-only"); 3105cd47d792SFam Zheng ret = -EACCES; 3106cd47d792SFam Zheng goto out; 3107cd47d792SFam Zheng } 3108cd47d792SFam Zheng ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, 3109cd47d792SFam Zheng 0); 3110cd47d792SFam Zheng if (ret < 0) { 3111cd47d792SFam Zheng error_setg_errno(errp, -ret, 3112cd47d792SFam Zheng "Failed to prepare request for truncation"); 3113cd47d792SFam Zheng goto out; 31141bc5f09fSKevin Wolf } 31153d9f2d2aSKevin Wolf 31163d9f2d2aSKevin Wolf if (!drv->bdrv_co_truncate) { 31173d9f2d2aSKevin Wolf if (bs->file && drv->is_filter) { 31183d9f2d2aSKevin Wolf ret = bdrv_co_truncate(bs->file, offset, prealloc, errp); 31193d9f2d2aSKevin Wolf goto out; 31203d9f2d2aSKevin Wolf } 31213d9f2d2aSKevin Wolf error_setg(errp, "Image format driver does not support resize"); 31223d9f2d2aSKevin Wolf ret = -ENOTSUP; 31233d9f2d2aSKevin Wolf goto out; 31243d9f2d2aSKevin Wolf } 31253d9f2d2aSKevin Wolf 31263d9f2d2aSKevin Wolf ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp); 31273d9f2d2aSKevin Wolf if (ret < 0) { 31283d9f2d2aSKevin Wolf goto out; 31293d9f2d2aSKevin Wolf } 31303d9f2d2aSKevin Wolf ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); 31313d9f2d2aSKevin Wolf if (ret < 0) { 31323d9f2d2aSKevin Wolf error_setg_errno(errp, -ret, "Could not refresh total sector count"); 31333d9f2d2aSKevin Wolf } else { 31343d9f2d2aSKevin Wolf offset = bs->total_sectors * BDRV_SECTOR_SIZE; 31353d9f2d2aSKevin Wolf } 3136cd47d792SFam Zheng /* It's possible that truncation succeeded but refresh_total_sectors 3137cd47d792SFam Zheng * failed, but the latter doesn't affect how we should finish the request. 3138cd47d792SFam Zheng * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. 
*/ 3139cd47d792SFam Zheng bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); 31403d9f2d2aSKevin Wolf 31413d9f2d2aSKevin Wolf out: 31421bc5f09fSKevin Wolf tracked_request_end(&req); 31433d9f2d2aSKevin Wolf bdrv_dec_in_flight(bs); 31441bc5f09fSKevin Wolf 31453d9f2d2aSKevin Wolf return ret; 31463d9f2d2aSKevin Wolf } 31473d9f2d2aSKevin Wolf 31483d9f2d2aSKevin Wolf typedef struct TruncateCo { 31493d9f2d2aSKevin Wolf BdrvChild *child; 31503d9f2d2aSKevin Wolf int64_t offset; 31513d9f2d2aSKevin Wolf PreallocMode prealloc; 31523d9f2d2aSKevin Wolf Error **errp; 31533d9f2d2aSKevin Wolf int ret; 31543d9f2d2aSKevin Wolf } TruncateCo; 31553d9f2d2aSKevin Wolf 31563d9f2d2aSKevin Wolf static void coroutine_fn bdrv_truncate_co_entry(void *opaque) 31573d9f2d2aSKevin Wolf { 31583d9f2d2aSKevin Wolf TruncateCo *tco = opaque; 31593d9f2d2aSKevin Wolf tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc, 31603d9f2d2aSKevin Wolf tco->errp); 31614720cbeeSKevin Wolf aio_wait_kick(); 31623d9f2d2aSKevin Wolf } 31633d9f2d2aSKevin Wolf 31643d9f2d2aSKevin Wolf int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc, 31653d9f2d2aSKevin Wolf Error **errp) 31663d9f2d2aSKevin Wolf { 31673d9f2d2aSKevin Wolf Coroutine *co; 31683d9f2d2aSKevin Wolf TruncateCo tco = { 31693d9f2d2aSKevin Wolf .child = child, 31703d9f2d2aSKevin Wolf .offset = offset, 31713d9f2d2aSKevin Wolf .prealloc = prealloc, 31723d9f2d2aSKevin Wolf .errp = errp, 31733d9f2d2aSKevin Wolf .ret = NOT_DONE, 31743d9f2d2aSKevin Wolf }; 31753d9f2d2aSKevin Wolf 31763d9f2d2aSKevin Wolf if (qemu_in_coroutine()) { 31773d9f2d2aSKevin Wolf /* Fast-path if already in coroutine context */ 31783d9f2d2aSKevin Wolf bdrv_truncate_co_entry(&tco); 31793d9f2d2aSKevin Wolf } else { 31803d9f2d2aSKevin Wolf co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco); 31814720cbeeSKevin Wolf bdrv_coroutine_enter(child->bs, co); 31823d9f2d2aSKevin Wolf BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE); 31833d9f2d2aSKevin Wolf } 31843d9f2d2aSKevin Wolf 31853d9f2d2aSKevin Wolf return tco.ret; 31863d9f2d2aSKevin Wolf } 3187
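/*
 * The synchronous wrappers in this file (bdrv_flush, bdrv_pdiscard,
 * bdrv_truncate, bdrv_rw_vmstate, ...) all follow one pattern: call the
 * coroutine body directly when already in coroutine context, otherwise
 * create a coroutine and poll the AioContext until it reports completion.
 * A minimal sketch of that pattern follows; ExampleCo, bdrv_example and
 * bdrv_co_example are hypothetical names, not part of this file:
 */
#if 0 /* illustrative only, not compiled */
typedef struct ExampleCo {
    BlockDriverState *bs;
    int ret;
} ExampleCo;

static void coroutine_fn bdrv_example_co_entry(void *opaque)
{
    ExampleCo *eco = opaque;

    eco->ret = bdrv_co_example(eco->bs);    /* hypothetical coroutine_fn */
    aio_wait_kick();                        /* wake up BDRV_POLL_WHILE() */
}

int bdrv_example(BlockDriverState *bs)
{
    ExampleCo eco = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_example_co_entry(&eco);
    } else {
        Coroutine *co = qemu_coroutine_create(bdrv_example_co_entry, &eco);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, eco.ret == NOT_DONE);
    }

    return eco.ret;
}
#endif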