xref: /qemu/block/io.c (revision 2e11d7562ac9f065b9fe696fda51273a1e6671e9)
161007b31SStefan Hajnoczi /*
261007b31SStefan Hajnoczi  * Block layer I/O functions
361007b31SStefan Hajnoczi  *
461007b31SStefan Hajnoczi  * Copyright (c) 2003 Fabrice Bellard
561007b31SStefan Hajnoczi  *
661007b31SStefan Hajnoczi  * Permission is hereby granted, free of charge, to any person obtaining a copy
761007b31SStefan Hajnoczi  * of this software and associated documentation files (the "Software"), to deal
861007b31SStefan Hajnoczi  * in the Software without restriction, including without limitation the rights
961007b31SStefan Hajnoczi  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1061007b31SStefan Hajnoczi  * copies of the Software, and to permit persons to whom the Software is
1161007b31SStefan Hajnoczi  * furnished to do so, subject to the following conditions:
1261007b31SStefan Hajnoczi  *
1361007b31SStefan Hajnoczi  * The above copyright notice and this permission notice shall be included in
1461007b31SStefan Hajnoczi  * all copies or substantial portions of the Software.
1561007b31SStefan Hajnoczi  *
1661007b31SStefan Hajnoczi  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1761007b31SStefan Hajnoczi  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1861007b31SStefan Hajnoczi  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
1961007b31SStefan Hajnoczi  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2061007b31SStefan Hajnoczi  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2161007b31SStefan Hajnoczi  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2261007b31SStefan Hajnoczi  * THE SOFTWARE.
2361007b31SStefan Hajnoczi  */
2461007b31SStefan Hajnoczi 
2580c71a24SPeter Maydell #include "qemu/osdep.h"
2661007b31SStefan Hajnoczi #include "trace.h"
277f0e9da6SMax Reitz #include "sysemu/block-backend.h"
287719f3c9SStefan Hajnoczi #include "block/aio-wait.h"
2961007b31SStefan Hajnoczi #include "block/blockjob.h"
30f321dcb5SPaolo Bonzini #include "block/blockjob_int.h"
3161007b31SStefan Hajnoczi #include "block/block_int.h"
32f348b6d1SVeronia Bahaa #include "qemu/cutils.h"
33da34e65cSMarkus Armbruster #include "qapi/error.h"
34d49b6836SMarkus Armbruster #include "qemu/error-report.h"
3561007b31SStefan Hajnoczi 
3661007b31SStefan Hajnoczi #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
3761007b31SStefan Hajnoczi 
38cb2e2878SEric Blake /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
39cb2e2878SEric Blake #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
40cb2e2878SEric Blake 
417f8f03efSFam Zheng static void bdrv_parent_cb_resize(BlockDriverState *bs);
42d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
43f5a5ca79SManos Pitsidianakis     int64_t offset, int bytes, BdrvRequestFlags flags);
4461007b31SStefan Hajnoczi 
456cd5c9d7SKevin Wolf void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
466cd5c9d7SKevin Wolf                                bool ignore_bds_parents)
4761007b31SStefan Hajnoczi {
4802d21300SKevin Wolf     BdrvChild *c, *next;
4927ccdd52SKevin Wolf 
5002d21300SKevin Wolf     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
516cd5c9d7SKevin Wolf         if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
520152bf40SKevin Wolf             continue;
530152bf40SKevin Wolf         }
544be6a6d1SKevin Wolf         bdrv_parent_drained_begin_single(c, false);
55ce0f1412SPaolo Bonzini     }
56ce0f1412SPaolo Bonzini }
57ce0f1412SPaolo Bonzini 
586cd5c9d7SKevin Wolf void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
596cd5c9d7SKevin Wolf                              bool ignore_bds_parents)
60ce0f1412SPaolo Bonzini {
6102d21300SKevin Wolf     BdrvChild *c, *next;
6227ccdd52SKevin Wolf 
6302d21300SKevin Wolf     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
646cd5c9d7SKevin Wolf         if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
650152bf40SKevin Wolf             continue;
660152bf40SKevin Wolf         }
67c2066af0SKevin Wolf         if (c->role->drained_end) {
68c2066af0SKevin Wolf             c->role->drained_end(c);
6927ccdd52SKevin Wolf         }
70c2066af0SKevin Wolf     }
7161007b31SStefan Hajnoczi }
7261007b31SStefan Hajnoczi 
734be6a6d1SKevin Wolf static bool bdrv_parent_drained_poll_single(BdrvChild *c)
744be6a6d1SKevin Wolf {
754be6a6d1SKevin Wolf     if (c->role->drained_poll) {
764be6a6d1SKevin Wolf         return c->role->drained_poll(c);
774be6a6d1SKevin Wolf     }
784be6a6d1SKevin Wolf     return false;
794be6a6d1SKevin Wolf }
804be6a6d1SKevin Wolf 
816cd5c9d7SKevin Wolf static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
826cd5c9d7SKevin Wolf                                      bool ignore_bds_parents)
8389bd0305SKevin Wolf {
8489bd0305SKevin Wolf     BdrvChild *c, *next;
8589bd0305SKevin Wolf     bool busy = false;
8689bd0305SKevin Wolf 
8789bd0305SKevin Wolf     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
886cd5c9d7SKevin Wolf         if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
8989bd0305SKevin Wolf             continue;
9089bd0305SKevin Wolf         }
914be6a6d1SKevin Wolf         busy |= bdrv_parent_drained_poll_single(c);
9289bd0305SKevin Wolf     }
9389bd0305SKevin Wolf 
9489bd0305SKevin Wolf     return busy;
9589bd0305SKevin Wolf }
9689bd0305SKevin Wolf 
/* Quiesce a single parent link @c.  If @poll is true, additionally wait in
 * BDRV_POLL_WHILE() until the parent reports itself idle through its
 * drained_poll callback. */
void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    /* The drained_begin callback is optional */
    if (c->role->drained_begin) {
        c->role->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}
1064be6a6d1SKevin Wolf 
107d9e0dfa2SEric Blake static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
108d9e0dfa2SEric Blake {
109d9e0dfa2SEric Blake     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
110d9e0dfa2SEric Blake     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
111d9e0dfa2SEric Blake     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
112d9e0dfa2SEric Blake                                  src->opt_mem_alignment);
113d9e0dfa2SEric Blake     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
114d9e0dfa2SEric Blake                                  src->min_mem_alignment);
115d9e0dfa2SEric Blake     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
116d9e0dfa2SEric Blake }
117d9e0dfa2SEric Blake 
/*
 * Recompute bs->bl from scratch: reset it, apply defaults, merge in the
 * (recursively refreshed) limits of bs->file and bs->backing, and finally
 * let the driver's .bdrv_refresh_limits callback override the result.
 *
 * On error, *errp is set and bs->bl may be left only partially initialized.
 */
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    /* A node without a driver has no limits to report */
    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        /* No child to inherit from: use conservative protocol defaults */
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
16361007b31SStefan Hajnoczi 
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}
17361007b31SStefan Hajnoczi 
/* Drop one copy-on-read reference; must be paired with a previous
 * bdrv_enable_copy_on_read() call (asserted). */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
17961007b31SStefan Hajnoczi 
/* State shared between a drain requester and the coroutine/bottom half that
 * performs the drain on its behalf (see bdrv_drain_invoke() and
 * bdrv_co_yield_to_drain()). */
typedef struct {
    Coroutine *co;            /* coroutine to wake when the drain is done */
    BlockDriverState *bs;     /* node to drain; NULL requests drain_all */
    bool done;                /* set once the request has been processed */
    bool begin;               /* true: drained_begin, false: drained_end */
    bool recursive;           /* also drain all child nodes */
    bool poll;                /* wait for in-flight requests after quiescing */
    BdrvChild *parent;        /* parent link to exclude from quiescing */
    bool ignore_bds_parents;  /* also exclude parents that are BDS nodes */
} BdrvCoDrainData;
19061124f03SPaolo Bonzini 
/* Coroutine context: run the driver's .bdrv_co_drain_begin/end callback for
 * a single node, then signal completion to bdrv_drain_invoke(). */
static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup.  */
    atomic_mb_set(&data->done, true);
    bdrv_dec_in_flight(bs);

    if (data->begin) {
        /* In the begin case nobody polls data->done (see bdrv_drain_invoke()),
         * so ownership of @data is ours and we free it here. */
        g_free(data);
    }
}
21061124f03SPaolo Bonzini 
/* Invoke the driver's .bdrv_co_drain_begin/end callback for @bs, if the
 * driver implements it, by scheduling a coroutine in the node's AioContext. */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{
    BdrvCoDrainData *data;

    /* Nothing to do when the driver doesn't provide the relevant callback */
    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    /* drain_end has no later polling phase, so wait for completion here;
     * in the begin case the coroutine frees @data itself. */
    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
}
23961124f03SPaolo Bonzini 
/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    /* A parent that has not yet become quiescent keeps the drain busy */
    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    /* So do requests still in flight on this node itself */
    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        /* ignore_bds_parents is only meaningful for the non-recursive case */
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}
265fe4f0614SKevin Wolf 
/* Polling condition used by BDRV_POLL_WHILE() in bdrv_do_drained_begin();
 * never ignores BDS-level parents. */
static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}
2711cc8e54aSKevin Wolf 
272b0165585SKevin Wolf static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
2736cd5c9d7SKevin Wolf                                   BdrvChild *parent, bool ignore_bds_parents,
2746cd5c9d7SKevin Wolf                                   bool poll);
275b0165585SKevin Wolf static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
2766cd5c9d7SKevin Wolf                                 BdrvChild *parent, bool ignore_bds_parents);
2770152bf40SKevin Wolf 
/* Bottom half scheduled by bdrv_co_yield_to_drain(): performs the requested
 * drained_begin/end outside of coroutine context, then wakes the coroutine
 * that asked for it.  data->bs == NULL requests bdrv_drain_all_begin(). */
static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        /* Balances the bdrv_inc_in_flight() in bdrv_co_yield_to_drain() */
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        /* bs == NULL only supports the drain_all begin operation */
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}
316a77fd4bbSFam Zheng 
/* Coroutine context: hand a drained_begin/end request over to a bottom half
 * and yield until it has been processed.  With bs == NULL, a
 * bdrv_drain_all_begin() is requested instead (begin must be true then, see
 * bdrv_co_drain_bh_cb()). */
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
    };
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    /* NOTE(review): with bs == NULL this relies on bdrv_get_aio_context(NULL)
     * resolving to the main loop context -- confirm against block.c */
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}
350a77fd4bbSFam Zheng 
/* Quiesce @bs without polling: stop external event handling (on the first
 * nested drain), notify the parents and invoke the driver's drain callback.
 * In-flight requests may still be pending afterwards; callers poll for them
 * separately. */
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true);
}
364dcf94a23SKevin Wolf 
/* Begin a drained section on @bs: quiesce it (and, if @recursive, all of its
 * children), then optionally poll until all in-flight requests have finished.
 * @parent is the parent link to exclude; @ignore_bds_parents additionally
 * excludes parents that are BDS nodes and is only valid without recursion or
 * polling (asserted below). */
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    /* Coroutines must not poll here; defer the whole operation to a BH */
    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            /* Children are quiesced without polling; the single poll below
             * covers the whole subtree */
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}
4026820643fSKevin Wolf 
/* Begin a drained section on @bs only (no recursion into children) and wait
 * for its in-flight requests to complete. */
void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}
4070152bf40SKevin Wolf 
/* Like bdrv_drained_begin(), but also drains the whole subtree below @bs */
void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}
412b0165585SKevin Wolf 
/* Counterpart of bdrv_do_drained_begin(): resume request processing on @bs
 * (and, if @recursive, on all of its children). */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    /* Coroutines defer the operation to a bottom half, like in _begin */
    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
    if (old_quiesce_counter == 1) {
        /* Last nested drained section ended: accept external events again */
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
        }
    }
}
4426820643fSKevin Wolf 
/* End a drained section started with bdrv_drained_begin() */
void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL, false);
}
447b0165585SKevin Wolf 
/* End a drained section started with bdrv_subtree_drained_begin() */
void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL, false);
}
4520152bf40SKevin Wolf 
453d736f119SKevin Wolf void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
454d736f119SKevin Wolf {
455d736f119SKevin Wolf     int i;
456d736f119SKevin Wolf 
457d736f119SKevin Wolf     for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
4586cd5c9d7SKevin Wolf         bdrv_do_drained_begin(child->bs, true, child, false, true);
459d736f119SKevin Wolf     }
460d736f119SKevin Wolf }
461d736f119SKevin Wolf 
462d736f119SKevin Wolf void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
463d736f119SKevin Wolf {
464d736f119SKevin Wolf     int i;
465d736f119SKevin Wolf 
466d736f119SKevin Wolf     for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
4676cd5c9d7SKevin Wolf         bdrv_do_drained_end(child->bs, true, child, false);
468d736f119SKevin Wolf     }
469d736f119SKevin Wolf }
470d736f119SKevin Wolf 
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    /* An empty begin/end pair waits for everything in flight to complete */
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
484b6e84c97SPaolo Bonzini 
/* Non-coroutine version of bdrv_co_drain(): drain @bs by entering and
 * immediately leaving a drained section. */
void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
49061007b31SStefan Hajnoczi 
491c13ad59fSKevin Wolf static void bdrv_drain_assert_idle(BlockDriverState *bs)
492c13ad59fSKevin Wolf {
493c13ad59fSKevin Wolf     BdrvChild *child, *next;
494c13ad59fSKevin Wolf 
495c13ad59fSKevin Wolf     assert(atomic_read(&bs->in_flight) == 0);
496c13ad59fSKevin Wolf     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
497c13ad59fSKevin Wolf         bdrv_drain_assert_idle(child->bs);
498c13ad59fSKevin Wolf     }
499c13ad59fSKevin Wolf }
500c13ad59fSKevin Wolf 
/* Nesting depth of active bdrv_drain_all_begin() sections; incremented in
 * bdrv_drain_all_begin() (bounded by INT_MAX) and decremented in
 * bdrv_drain_all_end() (asserted > 0 there). */
unsigned int bdrv_drain_all_count = 0;
5020f12264eSKevin Wolf 
5030f12264eSKevin Wolf static bool bdrv_drain_all_poll(void)
5040f12264eSKevin Wolf {
5050f12264eSKevin Wolf     BlockDriverState *bs = NULL;
5060f12264eSKevin Wolf     bool result = false;
5070f12264eSKevin Wolf 
5080f12264eSKevin Wolf     /* bdrv_drain_poll() can't make changes to the graph and we are holding the
5090f12264eSKevin Wolf      * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
5100f12264eSKevin Wolf     while ((bs = bdrv_next_all_states(bs))) {
5110f12264eSKevin Wolf         AioContext *aio_context = bdrv_get_aio_context(bs);
5120f12264eSKevin Wolf         aio_context_acquire(aio_context);
5130f12264eSKevin Wolf         result |= bdrv_drain_poll(bs, false, NULL, true);
5140f12264eSKevin Wolf         aio_context_release(aio_context);
5150f12264eSKevin Wolf     }
5160f12264eSKevin Wolf 
5170f12264eSKevin Wolf     return result;
5180f12264eSKevin Wolf }
5190f12264eSKevin Wolf 
/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    /* Coroutines must not block here; defer the operation to a bottom half */
    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    /* Sanity check: after polling, nothing may be in flight anywhere */
    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}
564c0778f66SAlberto Garcia 
/* Resume request processing on all nodes; pairs with bdrv_drain_all_begin() */
void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}
58061007b31SStefan Hajnoczi 
/* Convenience wrapper: drain all nodes once and resume them immediately */
void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
586c0778f66SAlberto Garcia 
58761007b31SStefan Hajnoczi /**
58861007b31SStefan Hajnoczi  * Remove an active request from the tracked requests list
58961007b31SStefan Hajnoczi  *
59061007b31SStefan Hajnoczi  * This function should be called when a tracked request is completing.
59161007b31SStefan Hajnoczi  */
59261007b31SStefan Hajnoczi static void tracked_request_end(BdrvTrackedRequest *req)
59361007b31SStefan Hajnoczi {
59461007b31SStefan Hajnoczi     if (req->serialising) {
59520fc71b2SPaolo Bonzini         atomic_dec(&req->bs->serialising_in_flight);
59661007b31SStefan Hajnoczi     }
59761007b31SStefan Hajnoczi 
5983783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&req->bs->reqs_lock);
59961007b31SStefan Hajnoczi     QLIST_REMOVE(req, list);
60061007b31SStefan Hajnoczi     qemu_co_queue_restart_all(&req->wait_queue);
6013783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&req->bs->reqs_lock);
60261007b31SStefan Hajnoczi }
60361007b31SStefan Hajnoczi 
/**
 * Add an active request to the tracked requests list
 *
 * Initializes @req for the byte range [@offset, @offset + @bytes) and
 * inserts it into @bs->tracked_requests so that overlapping requests can
 * serialise against it.  Must be paired with tracked_request_end().
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    /* Reject ranges that would overflow the signed 64-bit offset space */
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    /* The compound literal zeroes all unnamed fields (e.g. waiting_for);
     * wait_queue is set up separately below, after this assignment. */
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    /* reqs_lock protects the tracked_requests list */
    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}
63261007b31SStefan Hajnoczi 
63361007b31SStefan Hajnoczi static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
63461007b31SStefan Hajnoczi {
63561007b31SStefan Hajnoczi     int64_t overlap_offset = req->offset & ~(align - 1);
63622931a15SFam Zheng     uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
63761007b31SStefan Hajnoczi                                - overlap_offset;
63861007b31SStefan Hajnoczi 
63961007b31SStefan Hajnoczi     if (!req->serialising) {
64020fc71b2SPaolo Bonzini         atomic_inc(&req->bs->serialising_in_flight);
64161007b31SStefan Hajnoczi         req->serialising = true;
64261007b31SStefan Hajnoczi     }
64361007b31SStefan Hajnoczi 
64461007b31SStefan Hajnoczi     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
64561007b31SStefan Hajnoczi     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
64661007b31SStefan Hajnoczi }
64761007b31SStefan Hajnoczi 
64809d2f948SVladimir Sementsov-Ogievskiy static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
64909d2f948SVladimir Sementsov-Ogievskiy {
65009d2f948SVladimir Sementsov-Ogievskiy     /*
65109d2f948SVladimir Sementsov-Ogievskiy      * If the request is serialising, overlap_offset and overlap_bytes are set,
65209d2f948SVladimir Sementsov-Ogievskiy      * so we can check if the request is aligned. Otherwise, don't care and
65309d2f948SVladimir Sementsov-Ogievskiy      * return false.
65409d2f948SVladimir Sementsov-Ogievskiy      */
65509d2f948SVladimir Sementsov-Ogievskiy 
65609d2f948SVladimir Sementsov-Ogievskiy     return req->serialising && (req->offset == req->overlap_offset) &&
65709d2f948SVladimir Sementsov-Ogievskiy            (req->bytes == req->overlap_bytes);
65809d2f948SVladimir Sementsov-Ogievskiy }
65909d2f948SVladimir Sementsov-Ogievskiy 
66061007b31SStefan Hajnoczi /**
661244483e6SKevin Wolf  * Round a region to cluster boundaries
662244483e6SKevin Wolf  */
663244483e6SKevin Wolf void bdrv_round_to_clusters(BlockDriverState *bs,
6647cfd5275SEric Blake                             int64_t offset, int64_t bytes,
665244483e6SKevin Wolf                             int64_t *cluster_offset,
6667cfd5275SEric Blake                             int64_t *cluster_bytes)
667244483e6SKevin Wolf {
668244483e6SKevin Wolf     BlockDriverInfo bdi;
669244483e6SKevin Wolf 
670244483e6SKevin Wolf     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
671244483e6SKevin Wolf         *cluster_offset = offset;
672244483e6SKevin Wolf         *cluster_bytes = bytes;
673244483e6SKevin Wolf     } else {
674244483e6SKevin Wolf         int64_t c = bdi.cluster_size;
675244483e6SKevin Wolf         *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
676244483e6SKevin Wolf         *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
677244483e6SKevin Wolf     }
678244483e6SKevin Wolf }
679244483e6SKevin Wolf 
68061007b31SStefan Hajnoczi static int bdrv_get_cluster_size(BlockDriverState *bs)
68161007b31SStefan Hajnoczi {
68261007b31SStefan Hajnoczi     BlockDriverInfo bdi;
68361007b31SStefan Hajnoczi     int ret;
68461007b31SStefan Hajnoczi 
68561007b31SStefan Hajnoczi     ret = bdrv_get_info(bs, &bdi);
68661007b31SStefan Hajnoczi     if (ret < 0 || bdi.cluster_size == 0) {
687a5b8dd2cSEric Blake         return bs->bl.request_alignment;
68861007b31SStefan Hajnoczi     } else {
68961007b31SStefan Hajnoczi         return bdi.cluster_size;
69061007b31SStefan Hajnoczi     }
69161007b31SStefan Hajnoczi }
69261007b31SStefan Hajnoczi 
69361007b31SStefan Hajnoczi static bool tracked_request_overlaps(BdrvTrackedRequest *req,
69422931a15SFam Zheng                                      int64_t offset, uint64_t bytes)
69561007b31SStefan Hajnoczi {
69661007b31SStefan Hajnoczi     /*        aaaa   bbbb */
69761007b31SStefan Hajnoczi     if (offset >= req->overlap_offset + req->overlap_bytes) {
69861007b31SStefan Hajnoczi         return false;
69961007b31SStefan Hajnoczi     }
70061007b31SStefan Hajnoczi     /* bbbb   aaaa        */
70161007b31SStefan Hajnoczi     if (req->overlap_offset >= offset + bytes) {
70261007b31SStefan Hajnoczi         return false;
70361007b31SStefan Hajnoczi     }
70461007b31SStefan Hajnoczi     return true;
70561007b31SStefan Hajnoczi }
70661007b31SStefan Hajnoczi 
/* Account one more in-flight request on @bs; pair with bdrv_dec_in_flight() */
void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}
71199723548SPaolo Bonzini 
/* Kick pollers blocked in AIO_WAIT_WHILE() so they re-check their condition */
void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}
716c9d1a561SPaolo Bonzini 
/* Drop one in-flight request and wake anybody waiting for @bs to go idle */
void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}
72299723548SPaolo Bonzini 
/*
 * Wait until every tracked request that overlaps @self's serialising window
 * has completed.
 *
 * Returns true if this coroutine actually had to yield at least once,
 * false if it could proceed immediately.
 */
static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    /* Fast path: no serialising request in flight, no conflict possible */
    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* Only conflict when at least one of the two is serialising */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    /* Drops reqs_lock while asleep, retakes it on wakeup */
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    /* The list may have changed while we slept: rescan it */
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}
76861007b31SStefan Hajnoczi 
/*
 * Basic sanity checks for a byte-granularity request: bounded size, medium
 * present, non-negative offset.
 *
 * Returns 0 if the request may proceed, -EIO for an oversized request or a
 * negative offset, -ENOMEDIUM if no medium is inserted.  The order of the
 * checks determines which error wins when several apply.
 */
static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    /* Cap a single request at BDRV_REQUEST_MAX_SECTORS worth of bytes */
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}
78661007b31SStefan Hajnoczi 
/* Parameter/result bundle for bdrv_rw_co_entry() (synchronous r/w path) */
typedef struct RwCo {
    BdrvChild *child;       /* child to read from / write to */
    int64_t offset;         /* byte offset of the request */
    QEMUIOVector *qiov;     /* data buffer; qiov->size is the length */
    bool is_write;          /* true: bdrv_co_pwritev, false: bdrv_co_preadv */
    int ret;                /* NOT_DONE until the coroutine completes */
    BdrvRequestFlags flags; /* passed through to the request */
} RwCo;
79561007b31SStefan Hajnoczi 
79661007b31SStefan Hajnoczi static void coroutine_fn bdrv_rw_co_entry(void *opaque)
79761007b31SStefan Hajnoczi {
79861007b31SStefan Hajnoczi     RwCo *rwco = opaque;
79961007b31SStefan Hajnoczi 
80061007b31SStefan Hajnoczi     if (!rwco->is_write) {
801a03ef88fSKevin Wolf         rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
80261007b31SStefan Hajnoczi                                    rwco->qiov->size, rwco->qiov,
80361007b31SStefan Hajnoczi                                    rwco->flags);
80461007b31SStefan Hajnoczi     } else {
805a03ef88fSKevin Wolf         rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
80661007b31SStefan Hajnoczi                                     rwco->qiov->size, rwco->qiov,
80761007b31SStefan Hajnoczi                                     rwco->flags);
80861007b31SStefan Hajnoczi     }
8094720cbeeSKevin Wolf     aio_wait_kick();
81061007b31SStefan Hajnoczi }
81161007b31SStefan Hajnoczi 
81261007b31SStefan Hajnoczi /*
81361007b31SStefan Hajnoczi  * Process a vectored synchronous request using coroutines
81461007b31SStefan Hajnoczi  */
815e293b7a3SKevin Wolf static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
81661007b31SStefan Hajnoczi                         QEMUIOVector *qiov, bool is_write,
81761007b31SStefan Hajnoczi                         BdrvRequestFlags flags)
81861007b31SStefan Hajnoczi {
81961007b31SStefan Hajnoczi     Coroutine *co;
82061007b31SStefan Hajnoczi     RwCo rwco = {
821e293b7a3SKevin Wolf         .child = child,
82261007b31SStefan Hajnoczi         .offset = offset,
82361007b31SStefan Hajnoczi         .qiov = qiov,
82461007b31SStefan Hajnoczi         .is_write = is_write,
82561007b31SStefan Hajnoczi         .ret = NOT_DONE,
82661007b31SStefan Hajnoczi         .flags = flags,
82761007b31SStefan Hajnoczi     };
82861007b31SStefan Hajnoczi 
82961007b31SStefan Hajnoczi     if (qemu_in_coroutine()) {
83061007b31SStefan Hajnoczi         /* Fast-path if already in coroutine context */
83161007b31SStefan Hajnoczi         bdrv_rw_co_entry(&rwco);
83261007b31SStefan Hajnoczi     } else {
8330b8b8753SPaolo Bonzini         co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
834e92f0e19SFam Zheng         bdrv_coroutine_enter(child->bs, co);
83588b062c2SPaolo Bonzini         BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
83661007b31SStefan Hajnoczi     }
83761007b31SStefan Hajnoczi     return rwco.ret;
83861007b31SStefan Hajnoczi }
83961007b31SStefan Hajnoczi 
840720ff280SKevin Wolf int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
841f5a5ca79SManos Pitsidianakis                        int bytes, BdrvRequestFlags flags)
84261007b31SStefan Hajnoczi {
8430d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
84474021bc4SEric Blake 
845e293b7a3SKevin Wolf     return bdrv_prwv_co(child, offset, &qiov, true,
84661007b31SStefan Hajnoczi                         BDRV_REQ_ZERO_WRITE | flags);
84761007b31SStefan Hajnoczi }
84861007b31SStefan Hajnoczi 
84961007b31SStefan Hajnoczi /*
85074021bc4SEric Blake  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
85161007b31SStefan Hajnoczi  * The operation is sped up by checking the block status and only writing
85261007b31SStefan Hajnoczi  * zeroes to the device if they currently do not return zeroes. Optional
85374021bc4SEric Blake  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
854465fe887SEric Blake  * BDRV_REQ_FUA).
85561007b31SStefan Hajnoczi  *
85661007b31SStefan Hajnoczi  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
85761007b31SStefan Hajnoczi  */
858720ff280SKevin Wolf int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
85961007b31SStefan Hajnoczi {
860237d78f8SEric Blake     int ret;
861237d78f8SEric Blake     int64_t target_size, bytes, offset = 0;
862720ff280SKevin Wolf     BlockDriverState *bs = child->bs;
86361007b31SStefan Hajnoczi 
8647286d610SEric Blake     target_size = bdrv_getlength(bs);
8657286d610SEric Blake     if (target_size < 0) {
8667286d610SEric Blake         return target_size;
86761007b31SStefan Hajnoczi     }
86861007b31SStefan Hajnoczi 
86961007b31SStefan Hajnoczi     for (;;) {
8707286d610SEric Blake         bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
8717286d610SEric Blake         if (bytes <= 0) {
87261007b31SStefan Hajnoczi             return 0;
87361007b31SStefan Hajnoczi         }
874237d78f8SEric Blake         ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
87561007b31SStefan Hajnoczi         if (ret < 0) {
87661007b31SStefan Hajnoczi             return ret;
87761007b31SStefan Hajnoczi         }
87861007b31SStefan Hajnoczi         if (ret & BDRV_BLOCK_ZERO) {
879237d78f8SEric Blake             offset += bytes;
88061007b31SStefan Hajnoczi             continue;
88161007b31SStefan Hajnoczi         }
882237d78f8SEric Blake         ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
88361007b31SStefan Hajnoczi         if (ret < 0) {
88461007b31SStefan Hajnoczi             return ret;
88561007b31SStefan Hajnoczi         }
886237d78f8SEric Blake         offset += bytes;
88761007b31SStefan Hajnoczi     }
88861007b31SStefan Hajnoczi }
88961007b31SStefan Hajnoczi 
890cf2ab8fcSKevin Wolf int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
891f1e84741SKevin Wolf {
892f1e84741SKevin Wolf     int ret;
893f1e84741SKevin Wolf 
894e293b7a3SKevin Wolf     ret = bdrv_prwv_co(child, offset, qiov, false, 0);
895f1e84741SKevin Wolf     if (ret < 0) {
896f1e84741SKevin Wolf         return ret;
897f1e84741SKevin Wolf     }
898f1e84741SKevin Wolf 
899f1e84741SKevin Wolf     return qiov->size;
900f1e84741SKevin Wolf }
901f1e84741SKevin Wolf 
902*2e11d756SAlberto Garcia /* See bdrv_pwrite() for the return codes */
903cf2ab8fcSKevin Wolf int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
90461007b31SStefan Hajnoczi {
9050d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
90661007b31SStefan Hajnoczi 
90761007b31SStefan Hajnoczi     if (bytes < 0) {
90861007b31SStefan Hajnoczi         return -EINVAL;
90961007b31SStefan Hajnoczi     }
91061007b31SStefan Hajnoczi 
911cf2ab8fcSKevin Wolf     return bdrv_preadv(child, offset, &qiov);
91261007b31SStefan Hajnoczi }
91361007b31SStefan Hajnoczi 
914d9ca2ea2SKevin Wolf int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
91561007b31SStefan Hajnoczi {
91661007b31SStefan Hajnoczi     int ret;
91761007b31SStefan Hajnoczi 
918e293b7a3SKevin Wolf     ret = bdrv_prwv_co(child, offset, qiov, true, 0);
91961007b31SStefan Hajnoczi     if (ret < 0) {
92061007b31SStefan Hajnoczi         return ret;
92161007b31SStefan Hajnoczi     }
92261007b31SStefan Hajnoczi 
92361007b31SStefan Hajnoczi     return qiov->size;
92461007b31SStefan Hajnoczi }
92561007b31SStefan Hajnoczi 
926*2e11d756SAlberto Garcia /* Return no. of bytes on success or < 0 on error. Important errors are:
927*2e11d756SAlberto Garcia   -EIO         generic I/O error (may happen for all errors)
928*2e11d756SAlberto Garcia   -ENOMEDIUM   No media inserted.
929*2e11d756SAlberto Garcia   -EINVAL      Invalid offset or number of bytes
930*2e11d756SAlberto Garcia   -EACCES      Trying to write a read-only device
931*2e11d756SAlberto Garcia */
932d9ca2ea2SKevin Wolf int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
93361007b31SStefan Hajnoczi {
9340d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
93561007b31SStefan Hajnoczi 
93661007b31SStefan Hajnoczi     if (bytes < 0) {
93761007b31SStefan Hajnoczi         return -EINVAL;
93861007b31SStefan Hajnoczi     }
93961007b31SStefan Hajnoczi 
940d9ca2ea2SKevin Wolf     return bdrv_pwritev(child, offset, &qiov);
94161007b31SStefan Hajnoczi }
94261007b31SStefan Hajnoczi 
94361007b31SStefan Hajnoczi /*
94461007b31SStefan Hajnoczi  * Writes to the file and ensures that no writes are reordered across this
94561007b31SStefan Hajnoczi  * request (acts as a barrier)
94661007b31SStefan Hajnoczi  *
94761007b31SStefan Hajnoczi  * Returns 0 on success, -errno in error cases.
94861007b31SStefan Hajnoczi  */
949d9ca2ea2SKevin Wolf int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
95061007b31SStefan Hajnoczi                      const void *buf, int count)
95161007b31SStefan Hajnoczi {
95261007b31SStefan Hajnoczi     int ret;
95361007b31SStefan Hajnoczi 
954d9ca2ea2SKevin Wolf     ret = bdrv_pwrite(child, offset, buf, count);
95561007b31SStefan Hajnoczi     if (ret < 0) {
95661007b31SStefan Hajnoczi         return ret;
95761007b31SStefan Hajnoczi     }
95861007b31SStefan Hajnoczi 
959d9ca2ea2SKevin Wolf     ret = bdrv_flush(child->bs);
960855a6a93SKevin Wolf     if (ret < 0) {
961855a6a93SKevin Wolf         return ret;
96261007b31SStefan Hajnoczi     }
96361007b31SStefan Hajnoczi 
96461007b31SStefan Hajnoczi     return 0;
96561007b31SStefan Hajnoczi }
96661007b31SStefan Hajnoczi 
/* Lets a coroutine wait for an AIO-style callback and collect its result */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to wake when the AIO completes */
    int ret;                /* result code filled in by the callback */
} CoroutineIOCompletion;
97108844473SKevin Wolf 
/* AIO completion callback: record the result and wake the waiting coroutine */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
97908844473SKevin Wolf 
/*
 * Forward a read request to the driver of @bs.
 *
 * Preference order: the byte-based .bdrv_co_preadv callback, then the
 * AIO-style .bdrv_aio_preadv (bridged back into this coroutine via
 * bdrv_co_io_em_complete()), and finally the legacy sector-based
 * .bdrv_co_readv, which requires a sector-aligned request.
 */
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    /* Callers must have filtered out flags the driver layer can't handle */
    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        /* Device has been closed/ejected in the meantime */
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            /* Sleep until bdrv_co_io_em_complete() wakes us with co.ret */
            qemu_coroutine_yield();
            return co.ret;
        }
    }

    /* Legacy sector-based fallback; request must be sector-aligned */
    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
    assert(drv->bdrv_co_readv);

    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}
1025166fe960SKevin Wolf 
/*
 * Forward a write request to the driver of @bs.
 *
 * Preference order mirrors bdrv_driver_preadv(): byte-based
 * .bdrv_co_pwritev, then AIO-style .bdrv_aio_pwritev, then legacy
 * sector-based .bdrv_co_writev.  Each path only passes the flags in
 * bs->supported_write_flags to the driver; whatever remains (notably
 * BDRV_REQ_FUA) is emulated afterwards with an explicit flush.
 */
static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    /* Callers must have filtered out flags the driver layer can't handle */
    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        /* Device has been closed/ejected in the meantime */
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        /* Keep only the flags the driver did NOT handle, for emulation */
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            /* Sleep until bdrv_co_io_em_complete() wakes us with co.ret */
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    /* Legacy sector-based fallback; request must be sector-aligned */
    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    /* Emulate FUA with a full flush if the driver didn't handle it */
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}
108778a07294SKevin Wolf 
108829a298afSPavel Butsykin static int coroutine_fn
108929a298afSPavel Butsykin bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
109029a298afSPavel Butsykin                                uint64_t bytes, QEMUIOVector *qiov)
109129a298afSPavel Butsykin {
109229a298afSPavel Butsykin     BlockDriver *drv = bs->drv;
109329a298afSPavel Butsykin 
1094d470ad42SMax Reitz     if (!drv) {
1095d470ad42SMax Reitz         return -ENOMEDIUM;
1096d470ad42SMax Reitz     }
1097d470ad42SMax Reitz 
109829a298afSPavel Butsykin     if (!drv->bdrv_co_pwritev_compressed) {
109929a298afSPavel Butsykin         return -ENOTSUP;
110029a298afSPavel Butsykin     }
110129a298afSPavel Butsykin 
110229a298afSPavel Butsykin     return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
110329a298afSPavel Butsykin }
110429a298afSPavel Butsykin 
110585c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
1106244483e6SKevin Wolf         int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
110761007b31SStefan Hajnoczi {
110885c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
110985c97ca7SKevin Wolf 
111061007b31SStefan Hajnoczi     /* Perform I/O through a temporary buffer so that users who scribble over
111161007b31SStefan Hajnoczi      * their read buffer while the operation is in progress do not end up
111261007b31SStefan Hajnoczi      * modifying the image file.  This is critical for zero-copy guest I/O
111361007b31SStefan Hajnoczi      * where anything might happen inside guest memory.
111461007b31SStefan Hajnoczi      */
111561007b31SStefan Hajnoczi     void *bounce_buffer;
111661007b31SStefan Hajnoczi 
111761007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
1118cb2e2878SEric Blake     QEMUIOVector local_qiov;
1119244483e6SKevin Wolf     int64_t cluster_offset;
11207cfd5275SEric Blake     int64_t cluster_bytes;
112161007b31SStefan Hajnoczi     size_t skip_bytes;
112261007b31SStefan Hajnoczi     int ret;
1123cb2e2878SEric Blake     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1124cb2e2878SEric Blake                                     BDRV_REQUEST_MAX_BYTES);
1125cb2e2878SEric Blake     unsigned int progress = 0;
112661007b31SStefan Hajnoczi 
1127d470ad42SMax Reitz     if (!drv) {
1128d470ad42SMax Reitz         return -ENOMEDIUM;
1129d470ad42SMax Reitz     }
1130d470ad42SMax Reitz 
11311bf03e66SKevin Wolf     /* FIXME We cannot require callers to have write permissions when all they
11321bf03e66SKevin Wolf      * are doing is a read request. If we did things right, write permissions
11331bf03e66SKevin Wolf      * would be obtained anyway, but internally by the copy-on-read code. As
1134765d9df9SEric Blake      * long as it is implemented here rather than in a separate filter driver,
11351bf03e66SKevin Wolf      * the copy-on-read code doesn't have its own BdrvChild, however, for which
11361bf03e66SKevin Wolf      * it could request permissions. Therefore we have to bypass the permission
11371bf03e66SKevin Wolf      * system for the moment. */
11381bf03e66SKevin Wolf     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1139afa4b293SKevin Wolf 
114061007b31SStefan Hajnoczi     /* Cover entire cluster so no additional backing file I/O is required when
1141cb2e2878SEric Blake      * allocating cluster in the image file.  Note that this value may exceed
1142cb2e2878SEric Blake      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1143cb2e2878SEric Blake      * is one reason we loop rather than doing it all at once.
114461007b31SStefan Hajnoczi      */
1145244483e6SKevin Wolf     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1146cb2e2878SEric Blake     skip_bytes = offset - cluster_offset;
114761007b31SStefan Hajnoczi 
1148244483e6SKevin Wolf     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1149244483e6SKevin Wolf                                    cluster_offset, cluster_bytes);
115061007b31SStefan Hajnoczi 
1151cb2e2878SEric Blake     bounce_buffer = qemu_try_blockalign(bs,
1152cb2e2878SEric Blake                                         MIN(MIN(max_transfer, cluster_bytes),
1153cb2e2878SEric Blake                                             MAX_BOUNCE_BUFFER));
115461007b31SStefan Hajnoczi     if (bounce_buffer == NULL) {
115561007b31SStefan Hajnoczi         ret = -ENOMEM;
115661007b31SStefan Hajnoczi         goto err;
115761007b31SStefan Hajnoczi     }
115861007b31SStefan Hajnoczi 
1159cb2e2878SEric Blake     while (cluster_bytes) {
1160cb2e2878SEric Blake         int64_t pnum;
116161007b31SStefan Hajnoczi 
1162cb2e2878SEric Blake         ret = bdrv_is_allocated(bs, cluster_offset,
1163cb2e2878SEric Blake                                 MIN(cluster_bytes, max_transfer), &pnum);
1164cb2e2878SEric Blake         if (ret < 0) {
1165cb2e2878SEric Blake             /* Safe to treat errors in querying allocation as if
1166cb2e2878SEric Blake              * unallocated; we'll probably fail again soon on the
1167cb2e2878SEric Blake              * read, but at least that will set a decent errno.
1168cb2e2878SEric Blake              */
1169cb2e2878SEric Blake             pnum = MIN(cluster_bytes, max_transfer);
1170cb2e2878SEric Blake         }
1171cb2e2878SEric Blake 
1172b0ddcbbbSKevin Wolf         /* Stop at EOF if the image ends in the middle of the cluster */
1173b0ddcbbbSKevin Wolf         if (ret == 0 && pnum == 0) {
1174b0ddcbbbSKevin Wolf             assert(progress >= bytes);
1175b0ddcbbbSKevin Wolf             break;
1176b0ddcbbbSKevin Wolf         }
1177b0ddcbbbSKevin Wolf 
1178cb2e2878SEric Blake         assert(skip_bytes < pnum);
1179cb2e2878SEric Blake 
1180cb2e2878SEric Blake         if (ret <= 0) {
1181cb2e2878SEric Blake             /* Must copy-on-read; use the bounce buffer */
11820d93ed08SVladimir Sementsov-Ogievskiy             pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
11830d93ed08SVladimir Sementsov-Ogievskiy             qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
1184cb2e2878SEric Blake 
1185cb2e2878SEric Blake             ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1186cb2e2878SEric Blake                                      &local_qiov, 0);
118761007b31SStefan Hajnoczi             if (ret < 0) {
118861007b31SStefan Hajnoczi                 goto err;
118961007b31SStefan Hajnoczi             }
119061007b31SStefan Hajnoczi 
1191d855ebcdSEric Blake             bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1192c1499a5eSEric Blake             if (drv->bdrv_co_pwrite_zeroes &&
1193cb2e2878SEric Blake                 buffer_is_zero(bounce_buffer, pnum)) {
1194a604fa2bSEric Blake                 /* FIXME: Should we (perhaps conditionally) be setting
1195a604fa2bSEric Blake                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1196a604fa2bSEric Blake                  * that still correctly reads as zero? */
11977adcf59fSMax Reitz                 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
11987adcf59fSMax Reitz                                                BDRV_REQ_WRITE_UNCHANGED);
119961007b31SStefan Hajnoczi             } else {
1200cb2e2878SEric Blake                 /* This does not change the data on the disk, it is not
1201cb2e2878SEric Blake                  * necessary to flush even in cache=writethrough mode.
120261007b31SStefan Hajnoczi                  */
1203cb2e2878SEric Blake                 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
12047adcf59fSMax Reitz                                           &local_qiov,
12057adcf59fSMax Reitz                                           BDRV_REQ_WRITE_UNCHANGED);
120661007b31SStefan Hajnoczi             }
120761007b31SStefan Hajnoczi 
120861007b31SStefan Hajnoczi             if (ret < 0) {
1209cb2e2878SEric Blake                 /* It might be okay to ignore write errors for guest
1210cb2e2878SEric Blake                  * requests.  If this is a deliberate copy-on-read
1211cb2e2878SEric Blake                  * then we don't want to ignore the error.  Simply
1212cb2e2878SEric Blake                  * report it in all cases.
121361007b31SStefan Hajnoczi                  */
121461007b31SStefan Hajnoczi                 goto err;
121561007b31SStefan Hajnoczi             }
121661007b31SStefan Hajnoczi 
1217cb2e2878SEric Blake             qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
1218cb2e2878SEric Blake                                 pnum - skip_bytes);
1219cb2e2878SEric Blake         } else {
1220cb2e2878SEric Blake             /* Read directly into the destination */
1221cb2e2878SEric Blake             qemu_iovec_init(&local_qiov, qiov->niov);
1222cb2e2878SEric Blake             qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
1223cb2e2878SEric Blake             ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
1224cb2e2878SEric Blake                                      &local_qiov, 0);
1225cb2e2878SEric Blake             qemu_iovec_destroy(&local_qiov);
1226cb2e2878SEric Blake             if (ret < 0) {
1227cb2e2878SEric Blake                 goto err;
1228cb2e2878SEric Blake             }
1229cb2e2878SEric Blake         }
1230cb2e2878SEric Blake 
1231cb2e2878SEric Blake         cluster_offset += pnum;
1232cb2e2878SEric Blake         cluster_bytes -= pnum;
1233cb2e2878SEric Blake         progress += pnum - skip_bytes;
1234cb2e2878SEric Blake         skip_bytes = 0;
1235cb2e2878SEric Blake     }
1236cb2e2878SEric Blake     ret = 0;
123761007b31SStefan Hajnoczi 
123861007b31SStefan Hajnoczi err:
123961007b31SStefan Hajnoczi     qemu_vfree(bounce_buffer);
124061007b31SStefan Hajnoczi     return ret;
124161007b31SStefan Hajnoczi }
124261007b31SStefan Hajnoczi 
124361007b31SStefan Hajnoczi /*
124461007b31SStefan Hajnoczi  * Forwards an already correctly aligned request to the BlockDriver. This
12451a62d0acSEric Blake  * handles copy on read, zeroing after EOF, and fragmentation of large
12461a62d0acSEric Blake  * reads; any other features must be implemented by the caller.
124761007b31SStefan Hajnoczi  */
124885c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
124961007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
125061007b31SStefan Hajnoczi     int64_t align, QEMUIOVector *qiov, int flags)
125161007b31SStefan Hajnoczi {
125285c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
1253c9d20029SKevin Wolf     int64_t total_bytes, max_bytes;
12541a62d0acSEric Blake     int ret = 0;
12551a62d0acSEric Blake     uint64_t bytes_remaining = bytes;
12561a62d0acSEric Blake     int max_transfer;
125761007b31SStefan Hajnoczi 
125849c07526SKevin Wolf     assert(is_power_of_2(align));
125949c07526SKevin Wolf     assert((offset & (align - 1)) == 0);
126049c07526SKevin Wolf     assert((bytes & (align - 1)) == 0);
126161007b31SStefan Hajnoczi     assert(!qiov || bytes == qiov->size);
1262abb06c5aSDaniel P. Berrange     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
12631a62d0acSEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
12641a62d0acSEric Blake                                    align);
1265a604fa2bSEric Blake 
1266a604fa2bSEric Blake     /* TODO: We would need a per-BDS .supported_read_flags and
1267a604fa2bSEric Blake      * potential fallback support, if we ever implement any read flags
1268a604fa2bSEric Blake      * to pass through to drivers.  For now, there aren't any
1269a604fa2bSEric Blake      * passthrough flags.  */
1270a604fa2bSEric Blake     assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
127161007b31SStefan Hajnoczi 
127261007b31SStefan Hajnoczi     /* Handle Copy on Read and associated serialisation */
127361007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
127461007b31SStefan Hajnoczi         /* If we touch the same cluster it counts as an overlap.  This
127561007b31SStefan Hajnoczi          * guarantees that allocating writes will be serialized and not race
127661007b31SStefan Hajnoczi          * with each other for the same cluster.  For example, in copy-on-read
127761007b31SStefan Hajnoczi          * it ensures that the CoR read and write operations are atomic and
127861007b31SStefan Hajnoczi          * guest writes cannot interleave between them. */
127961007b31SStefan Hajnoczi         mark_request_serialising(req, bdrv_get_cluster_size(bs));
128061007b31SStefan Hajnoczi     }
128161007b31SStefan Hajnoczi 
128209d2f948SVladimir Sementsov-Ogievskiy     /* BDRV_REQ_SERIALISING is only for write operation */
128309d2f948SVladimir Sementsov-Ogievskiy     assert(!(flags & BDRV_REQ_SERIALISING));
128409d2f948SVladimir Sementsov-Ogievskiy 
128561408b25SFam Zheng     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
128661007b31SStefan Hajnoczi         wait_serialising_requests(req);
128761408b25SFam Zheng     }
128861007b31SStefan Hajnoczi 
128961007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
1290d6a644bbSEric Blake         int64_t pnum;
129161007b31SStefan Hajnoczi 
129288e63df2SEric Blake         ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
129361007b31SStefan Hajnoczi         if (ret < 0) {
129461007b31SStefan Hajnoczi             goto out;
129561007b31SStefan Hajnoczi         }
129661007b31SStefan Hajnoczi 
129788e63df2SEric Blake         if (!ret || pnum != bytes) {
129885c97ca7SKevin Wolf             ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
129961007b31SStefan Hajnoczi             goto out;
130061007b31SStefan Hajnoczi         }
130161007b31SStefan Hajnoczi     }
130261007b31SStefan Hajnoczi 
13031a62d0acSEric Blake     /* Forward the request to the BlockDriver, possibly fragmenting it */
130449c07526SKevin Wolf     total_bytes = bdrv_getlength(bs);
130549c07526SKevin Wolf     if (total_bytes < 0) {
130649c07526SKevin Wolf         ret = total_bytes;
130761007b31SStefan Hajnoczi         goto out;
130861007b31SStefan Hajnoczi     }
130961007b31SStefan Hajnoczi 
131049c07526SKevin Wolf     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
13111a62d0acSEric Blake     if (bytes <= max_bytes && bytes <= max_transfer) {
1312166fe960SKevin Wolf         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
13131a62d0acSEric Blake         goto out;
131461007b31SStefan Hajnoczi     }
131561007b31SStefan Hajnoczi 
13161a62d0acSEric Blake     while (bytes_remaining) {
13171a62d0acSEric Blake         int num;
13181a62d0acSEric Blake 
13191a62d0acSEric Blake         if (max_bytes) {
13201a62d0acSEric Blake             QEMUIOVector local_qiov;
13211a62d0acSEric Blake 
13221a62d0acSEric Blake             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
13231a62d0acSEric Blake             assert(num);
13241a62d0acSEric Blake             qemu_iovec_init(&local_qiov, qiov->niov);
13251a62d0acSEric Blake             qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
13261a62d0acSEric Blake 
13271a62d0acSEric Blake             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
13281a62d0acSEric Blake                                      num, &local_qiov, 0);
13291a62d0acSEric Blake             max_bytes -= num;
13301a62d0acSEric Blake             qemu_iovec_destroy(&local_qiov);
13311a62d0acSEric Blake         } else {
13321a62d0acSEric Blake             num = bytes_remaining;
13331a62d0acSEric Blake             ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
13341a62d0acSEric Blake                                     bytes_remaining);
13351a62d0acSEric Blake         }
13361a62d0acSEric Blake         if (ret < 0) {
13371a62d0acSEric Blake             goto out;
13381a62d0acSEric Blake         }
13391a62d0acSEric Blake         bytes_remaining -= num;
134061007b31SStefan Hajnoczi     }
134161007b31SStefan Hajnoczi 
134261007b31SStefan Hajnoczi out:
13431a62d0acSEric Blake     return ret < 0 ? ret : 0;
134461007b31SStefan Hajnoczi }
134561007b31SStefan Hajnoczi 
134661007b31SStefan Hajnoczi /*
134761007b31SStefan Hajnoczi  * Handle a read request in coroutine context
134861007b31SStefan Hajnoczi  */
1349a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child,
135061007b31SStefan Hajnoczi     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
135161007b31SStefan Hajnoczi     BdrvRequestFlags flags)
135261007b31SStefan Hajnoczi {
1353a03ef88fSKevin Wolf     BlockDriverState *bs = child->bs;
135461007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
135561007b31SStefan Hajnoczi     BdrvTrackedRequest req;
135661007b31SStefan Hajnoczi 
1357a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
135861007b31SStefan Hajnoczi     uint8_t *head_buf = NULL;
135961007b31SStefan Hajnoczi     uint8_t *tail_buf = NULL;
136061007b31SStefan Hajnoczi     QEMUIOVector local_qiov;
136161007b31SStefan Hajnoczi     bool use_local_qiov = false;
136261007b31SStefan Hajnoczi     int ret;
136361007b31SStefan Hajnoczi 
1364f42cf447SDaniel P. Berrange     trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1365f42cf447SDaniel P. Berrange 
136661007b31SStefan Hajnoczi     if (!drv) {
136761007b31SStefan Hajnoczi         return -ENOMEDIUM;
136861007b31SStefan Hajnoczi     }
136961007b31SStefan Hajnoczi 
137061007b31SStefan Hajnoczi     ret = bdrv_check_byte_request(bs, offset, bytes);
137161007b31SStefan Hajnoczi     if (ret < 0) {
137261007b31SStefan Hajnoczi         return ret;
137361007b31SStefan Hajnoczi     }
137461007b31SStefan Hajnoczi 
137599723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
137699723548SPaolo Bonzini 
13779568b511SWen Congyang     /* Don't do copy-on-read if we read data before write operation */
1378d3faa13eSPaolo Bonzini     if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
137961007b31SStefan Hajnoczi         flags |= BDRV_REQ_COPY_ON_READ;
138061007b31SStefan Hajnoczi     }
138161007b31SStefan Hajnoczi 
138261007b31SStefan Hajnoczi     /* Align read if necessary by padding qiov */
138361007b31SStefan Hajnoczi     if (offset & (align - 1)) {
138461007b31SStefan Hajnoczi         head_buf = qemu_blockalign(bs, align);
138561007b31SStefan Hajnoczi         qemu_iovec_init(&local_qiov, qiov->niov + 2);
138661007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
138761007b31SStefan Hajnoczi         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
138861007b31SStefan Hajnoczi         use_local_qiov = true;
138961007b31SStefan Hajnoczi 
139061007b31SStefan Hajnoczi         bytes += offset & (align - 1);
139161007b31SStefan Hajnoczi         offset = offset & ~(align - 1);
139261007b31SStefan Hajnoczi     }
139361007b31SStefan Hajnoczi 
139461007b31SStefan Hajnoczi     if ((offset + bytes) & (align - 1)) {
139561007b31SStefan Hajnoczi         if (!use_local_qiov) {
139661007b31SStefan Hajnoczi             qemu_iovec_init(&local_qiov, qiov->niov + 1);
139761007b31SStefan Hajnoczi             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
139861007b31SStefan Hajnoczi             use_local_qiov = true;
139961007b31SStefan Hajnoczi         }
140061007b31SStefan Hajnoczi         tail_buf = qemu_blockalign(bs, align);
140161007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, tail_buf,
140261007b31SStefan Hajnoczi                        align - ((offset + bytes) & (align - 1)));
140361007b31SStefan Hajnoczi 
140461007b31SStefan Hajnoczi         bytes = ROUND_UP(bytes, align);
140561007b31SStefan Hajnoczi     }
140661007b31SStefan Hajnoczi 
1407ebde595cSFam Zheng     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
140885c97ca7SKevin Wolf     ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
140961007b31SStefan Hajnoczi                               use_local_qiov ? &local_qiov : qiov,
141061007b31SStefan Hajnoczi                               flags);
141161007b31SStefan Hajnoczi     tracked_request_end(&req);
141299723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
141361007b31SStefan Hajnoczi 
141461007b31SStefan Hajnoczi     if (use_local_qiov) {
141561007b31SStefan Hajnoczi         qemu_iovec_destroy(&local_qiov);
141661007b31SStefan Hajnoczi         qemu_vfree(head_buf);
141761007b31SStefan Hajnoczi         qemu_vfree(tail_buf);
141861007b31SStefan Hajnoczi     }
141961007b31SStefan Hajnoczi 
142061007b31SStefan Hajnoczi     return ret;
142161007b31SStefan Hajnoczi }
142261007b31SStefan Hajnoczi 
1423d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1424f5a5ca79SManos Pitsidianakis     int64_t offset, int bytes, BdrvRequestFlags flags)
142561007b31SStefan Hajnoczi {
142661007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
142761007b31SStefan Hajnoczi     QEMUIOVector qiov;
14280d93ed08SVladimir Sementsov-Ogievskiy     void *buf = NULL;
142961007b31SStefan Hajnoczi     int ret = 0;
1430465fe887SEric Blake     bool need_flush = false;
1431443668caSDenis V. Lunev     int head = 0;
1432443668caSDenis V. Lunev     int tail = 0;
143361007b31SStefan Hajnoczi 
1434cf081fcaSEric Blake     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1435a5b8dd2cSEric Blake     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1436a5b8dd2cSEric Blake                         bs->bl.request_alignment);
1437cb2e2878SEric Blake     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1438cf081fcaSEric Blake 
1439d470ad42SMax Reitz     if (!drv) {
1440d470ad42SMax Reitz         return -ENOMEDIUM;
1441d470ad42SMax Reitz     }
1442d470ad42SMax Reitz 
1443fe0480d6SKevin Wolf     if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1444fe0480d6SKevin Wolf         return -ENOTSUP;
1445fe0480d6SKevin Wolf     }
1446fe0480d6SKevin Wolf 
1447b8d0a980SEric Blake     assert(alignment % bs->bl.request_alignment == 0);
1448b8d0a980SEric Blake     head = offset % alignment;
1449f5a5ca79SManos Pitsidianakis     tail = (offset + bytes) % alignment;
1450b8d0a980SEric Blake     max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1451b8d0a980SEric Blake     assert(max_write_zeroes >= bs->bl.request_alignment);
145261007b31SStefan Hajnoczi 
1453f5a5ca79SManos Pitsidianakis     while (bytes > 0 && !ret) {
1454f5a5ca79SManos Pitsidianakis         int num = bytes;
145561007b31SStefan Hajnoczi 
145661007b31SStefan Hajnoczi         /* Align request.  Block drivers can expect the "bulk" of the request
1457443668caSDenis V. Lunev          * to be aligned, and that unaligned requests do not cross cluster
1458443668caSDenis V. Lunev          * boundaries.
145961007b31SStefan Hajnoczi          */
1460443668caSDenis V. Lunev         if (head) {
1461b2f95feeSEric Blake             /* Make a small request up to the first aligned sector. For
1462b2f95feeSEric Blake              * convenience, limit this request to max_transfer even if
1463b2f95feeSEric Blake              * we don't need to fall back to writes.  */
1464f5a5ca79SManos Pitsidianakis             num = MIN(MIN(bytes, max_transfer), alignment - head);
1465b2f95feeSEric Blake             head = (head + num) % alignment;
1466b2f95feeSEric Blake             assert(num < max_write_zeroes);
1467d05aa8bbSEric Blake         } else if (tail && num > alignment) {
1468443668caSDenis V. Lunev             /* Shorten the request to the last aligned sector.  */
1469443668caSDenis V. Lunev             num -= tail;
147061007b31SStefan Hajnoczi         }
147161007b31SStefan Hajnoczi 
147261007b31SStefan Hajnoczi         /* limit request size */
147361007b31SStefan Hajnoczi         if (num > max_write_zeroes) {
147461007b31SStefan Hajnoczi             num = max_write_zeroes;
147561007b31SStefan Hajnoczi         }
147661007b31SStefan Hajnoczi 
147761007b31SStefan Hajnoczi         ret = -ENOTSUP;
147861007b31SStefan Hajnoczi         /* First try the efficient write zeroes operation */
1479d05aa8bbSEric Blake         if (drv->bdrv_co_pwrite_zeroes) {
1480d05aa8bbSEric Blake             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1481d05aa8bbSEric Blake                                              flags & bs->supported_zero_flags);
1482d05aa8bbSEric Blake             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1483d05aa8bbSEric Blake                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1484d05aa8bbSEric Blake                 need_flush = true;
1485d05aa8bbSEric Blake             }
1486465fe887SEric Blake         } else {
1487465fe887SEric Blake             assert(!bs->supported_zero_flags);
148861007b31SStefan Hajnoczi         }
148961007b31SStefan Hajnoczi 
1490118f9944SAndrey Shinkevich         if (ret < 0 && !(flags & BDRV_REQ_NO_FALLBACK)) {
149161007b31SStefan Hajnoczi             /* Fall back to bounce buffer if write zeroes is unsupported */
1492465fe887SEric Blake             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1493465fe887SEric Blake 
1494465fe887SEric Blake             if ((flags & BDRV_REQ_FUA) &&
1495465fe887SEric Blake                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1496465fe887SEric Blake                 /* No need for bdrv_driver_pwrite() to do a fallback
1497465fe887SEric Blake                  * flush on each chunk; use just one at the end */
1498465fe887SEric Blake                 write_flags &= ~BDRV_REQ_FUA;
1499465fe887SEric Blake                 need_flush = true;
1500465fe887SEric Blake             }
15015def6b80SEric Blake             num = MIN(num, max_transfer);
15020d93ed08SVladimir Sementsov-Ogievskiy             if (buf == NULL) {
15030d93ed08SVladimir Sementsov-Ogievskiy                 buf = qemu_try_blockalign0(bs, num);
15040d93ed08SVladimir Sementsov-Ogievskiy                 if (buf == NULL) {
150561007b31SStefan Hajnoczi                     ret = -ENOMEM;
150661007b31SStefan Hajnoczi                     goto fail;
150761007b31SStefan Hajnoczi                 }
150861007b31SStefan Hajnoczi             }
15090d93ed08SVladimir Sementsov-Ogievskiy             qemu_iovec_init_buf(&qiov, buf, num);
151061007b31SStefan Hajnoczi 
1511d05aa8bbSEric Blake             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
151261007b31SStefan Hajnoczi 
151361007b31SStefan Hajnoczi             /* Keep bounce buffer around if it is big enough for all
151461007b31SStefan Hajnoczi              * all future requests.
151561007b31SStefan Hajnoczi              */
15165def6b80SEric Blake             if (num < max_transfer) {
15170d93ed08SVladimir Sementsov-Ogievskiy                 qemu_vfree(buf);
15180d93ed08SVladimir Sementsov-Ogievskiy                 buf = NULL;
151961007b31SStefan Hajnoczi             }
152061007b31SStefan Hajnoczi         }
152161007b31SStefan Hajnoczi 
1522d05aa8bbSEric Blake         offset += num;
1523f5a5ca79SManos Pitsidianakis         bytes -= num;
152461007b31SStefan Hajnoczi     }
152561007b31SStefan Hajnoczi 
152661007b31SStefan Hajnoczi fail:
1527465fe887SEric Blake     if (ret == 0 && need_flush) {
1528465fe887SEric Blake         ret = bdrv_co_flush(bs);
1529465fe887SEric Blake     }
15300d93ed08SVladimir Sementsov-Ogievskiy     qemu_vfree(buf);
153161007b31SStefan Hajnoczi     return ret;
153261007b31SStefan Hajnoczi }
153361007b31SStefan Hajnoczi 
153485fe2479SFam Zheng static inline int coroutine_fn
153585fe2479SFam Zheng bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
153685fe2479SFam Zheng                           BdrvTrackedRequest *req, int flags)
153785fe2479SFam Zheng {
153885fe2479SFam Zheng     BlockDriverState *bs = child->bs;
153985fe2479SFam Zheng     bool waited;
154085fe2479SFam Zheng     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
154185fe2479SFam Zheng 
154285fe2479SFam Zheng     if (bs->read_only) {
154385fe2479SFam Zheng         return -EPERM;
154485fe2479SFam Zheng     }
154585fe2479SFam Zheng 
154685fe2479SFam Zheng     /* BDRV_REQ_NO_SERIALISING is only for read operation */
154785fe2479SFam Zheng     assert(!(flags & BDRV_REQ_NO_SERIALISING));
154885fe2479SFam Zheng     assert(!(bs->open_flags & BDRV_O_INACTIVE));
154985fe2479SFam Zheng     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
155085fe2479SFam Zheng     assert(!(flags & ~BDRV_REQ_MASK));
155185fe2479SFam Zheng 
155285fe2479SFam Zheng     if (flags & BDRV_REQ_SERIALISING) {
155385fe2479SFam Zheng         mark_request_serialising(req, bdrv_get_cluster_size(bs));
155485fe2479SFam Zheng     }
155585fe2479SFam Zheng 
155685fe2479SFam Zheng     waited = wait_serialising_requests(req);
155785fe2479SFam Zheng 
155885fe2479SFam Zheng     assert(!waited || !req->serialising ||
155985fe2479SFam Zheng            is_request_serialising_and_aligned(req));
156085fe2479SFam Zheng     assert(req->overlap_offset <= offset);
156185fe2479SFam Zheng     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1562cd47d792SFam Zheng     assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
156385fe2479SFam Zheng 
1564cd47d792SFam Zheng     switch (req->type) {
1565cd47d792SFam Zheng     case BDRV_TRACKED_WRITE:
1566cd47d792SFam Zheng     case BDRV_TRACKED_DISCARD:
156785fe2479SFam Zheng         if (flags & BDRV_REQ_WRITE_UNCHANGED) {
156885fe2479SFam Zheng             assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
156985fe2479SFam Zheng         } else {
157085fe2479SFam Zheng             assert(child->perm & BLK_PERM_WRITE);
157185fe2479SFam Zheng         }
1572cd47d792SFam Zheng         return notifier_with_return_list_notify(&bs->before_write_notifiers,
1573cd47d792SFam Zheng                                                 req);
1574cd47d792SFam Zheng     case BDRV_TRACKED_TRUNCATE:
1575cd47d792SFam Zheng         assert(child->perm & BLK_PERM_RESIZE);
1576cd47d792SFam Zheng         return 0;
1577cd47d792SFam Zheng     default:
1578cd47d792SFam Zheng         abort();
1579cd47d792SFam Zheng     }
158085fe2479SFam Zheng }
158185fe2479SFam Zheng 
158285fe2479SFam Zheng static inline void coroutine_fn
158385fe2479SFam Zheng bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
158485fe2479SFam Zheng                          BdrvTrackedRequest *req, int ret)
158585fe2479SFam Zheng {
158685fe2479SFam Zheng     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
158785fe2479SFam Zheng     BlockDriverState *bs = child->bs;
158885fe2479SFam Zheng 
158985fe2479SFam Zheng     atomic_inc(&bs->write_gen);
159085fe2479SFam Zheng 
159100695c27SFam Zheng     /*
159200695c27SFam Zheng      * Discard cannot extend the image, but in error handling cases, such as
159300695c27SFam Zheng      * when reverting a qcow2 cluster allocation, the discarded range can pass
159400695c27SFam Zheng      * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD
159500695c27SFam Zheng      * here. Instead, just skip it, since semantically a discard request
159600695c27SFam Zheng      * beyond EOF cannot expand the image anyway.
159700695c27SFam Zheng      */
15987f8f03efSFam Zheng     if (ret == 0 &&
1599cd47d792SFam Zheng         (req->type == BDRV_TRACKED_TRUNCATE ||
1600cd47d792SFam Zheng          end_sector > bs->total_sectors) &&
160100695c27SFam Zheng         req->type != BDRV_TRACKED_DISCARD) {
16027f8f03efSFam Zheng         bs->total_sectors = end_sector;
16037f8f03efSFam Zheng         bdrv_parent_cb_resize(bs);
16047f8f03efSFam Zheng         bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
160585fe2479SFam Zheng     }
160600695c27SFam Zheng     if (req->bytes) {
160700695c27SFam Zheng         switch (req->type) {
160800695c27SFam Zheng         case BDRV_TRACKED_WRITE:
160900695c27SFam Zheng             stat64_max(&bs->wr_highest_offset, offset + bytes);
161000695c27SFam Zheng             /* fall through, to set dirty bits */
161100695c27SFam Zheng         case BDRV_TRACKED_DISCARD:
16127f8f03efSFam Zheng             bdrv_set_dirty(bs, offset, bytes);
161300695c27SFam Zheng             break;
161400695c27SFam Zheng         default:
161500695c27SFam Zheng             break;
161600695c27SFam Zheng         }
161700695c27SFam Zheng     }
161885fe2479SFam Zheng }
161985fe2479SFam Zheng 
162061007b31SStefan Hajnoczi /*
162104ed95f4SEric Blake  * Forwards an already correctly aligned write request to the BlockDriver,
162204ed95f4SEric Blake  * after possibly fragmenting it.
162361007b31SStefan Hajnoczi  */
162485c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
162561007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1626cff86b38SEric Blake     int64_t align, QEMUIOVector *qiov, int flags)
162761007b31SStefan Hajnoczi {
162885c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
162961007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
163061007b31SStefan Hajnoczi     int ret;
163161007b31SStefan Hajnoczi 
163204ed95f4SEric Blake     uint64_t bytes_remaining = bytes;
163304ed95f4SEric Blake     int max_transfer;
163461007b31SStefan Hajnoczi 
1635d470ad42SMax Reitz     if (!drv) {
1636d470ad42SMax Reitz         return -ENOMEDIUM;
1637d470ad42SMax Reitz     }
1638d470ad42SMax Reitz 
1639d6883bc9SVladimir Sementsov-Ogievskiy     if (bdrv_has_readonly_bitmaps(bs)) {
1640d6883bc9SVladimir Sementsov-Ogievskiy         return -EPERM;
1641d6883bc9SVladimir Sementsov-Ogievskiy     }
1642d6883bc9SVladimir Sementsov-Ogievskiy 
1643cff86b38SEric Blake     assert(is_power_of_2(align));
1644cff86b38SEric Blake     assert((offset & (align - 1)) == 0);
1645cff86b38SEric Blake     assert((bytes & (align - 1)) == 0);
164661007b31SStefan Hajnoczi     assert(!qiov || bytes == qiov->size);
164704ed95f4SEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
164804ed95f4SEric Blake                                    align);
164961007b31SStefan Hajnoczi 
165085fe2479SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
165161007b31SStefan Hajnoczi 
165261007b31SStefan Hajnoczi     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1653c1499a5eSEric Blake         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
165461007b31SStefan Hajnoczi         qemu_iovec_is_zero(qiov)) {
165561007b31SStefan Hajnoczi         flags |= BDRV_REQ_ZERO_WRITE;
165661007b31SStefan Hajnoczi         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
165761007b31SStefan Hajnoczi             flags |= BDRV_REQ_MAY_UNMAP;
165861007b31SStefan Hajnoczi         }
165961007b31SStefan Hajnoczi     }
166061007b31SStefan Hajnoczi 
166161007b31SStefan Hajnoczi     if (ret < 0) {
166261007b31SStefan Hajnoczi         /* Do nothing, write notifier decided to fail this request */
166361007b31SStefan Hajnoczi     } else if (flags & BDRV_REQ_ZERO_WRITE) {
16649a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
16659896c876SKevin Wolf         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
16663ea1a091SPavel Butsykin     } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
16673ea1a091SPavel Butsykin         ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
166804ed95f4SEric Blake     } else if (bytes <= max_transfer) {
16699a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV);
167078a07294SKevin Wolf         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
167104ed95f4SEric Blake     } else {
167204ed95f4SEric Blake         bdrv_debug_event(bs, BLKDBG_PWRITEV);
167304ed95f4SEric Blake         while (bytes_remaining) {
167404ed95f4SEric Blake             int num = MIN(bytes_remaining, max_transfer);
167504ed95f4SEric Blake             QEMUIOVector local_qiov;
167604ed95f4SEric Blake             int local_flags = flags;
167704ed95f4SEric Blake 
167804ed95f4SEric Blake             assert(num);
167904ed95f4SEric Blake             if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
168004ed95f4SEric Blake                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
168104ed95f4SEric Blake                 /* If FUA is going to be emulated by flush, we only
168204ed95f4SEric Blake                  * need to flush on the last iteration */
168304ed95f4SEric Blake                 local_flags &= ~BDRV_REQ_FUA;
168404ed95f4SEric Blake             }
168504ed95f4SEric Blake             qemu_iovec_init(&local_qiov, qiov->niov);
168604ed95f4SEric Blake             qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
168704ed95f4SEric Blake 
168804ed95f4SEric Blake             ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
168904ed95f4SEric Blake                                       num, &local_qiov, local_flags);
169004ed95f4SEric Blake             qemu_iovec_destroy(&local_qiov);
169104ed95f4SEric Blake             if (ret < 0) {
169204ed95f4SEric Blake                 break;
169304ed95f4SEric Blake             }
169404ed95f4SEric Blake             bytes_remaining -= num;
169504ed95f4SEric Blake         }
169661007b31SStefan Hajnoczi     }
16979a4f4c31SKevin Wolf     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
169861007b31SStefan Hajnoczi 
169961007b31SStefan Hajnoczi     if (ret >= 0) {
170004ed95f4SEric Blake         ret = 0;
170161007b31SStefan Hajnoczi     }
170285fe2479SFam Zheng     bdrv_co_write_req_finish(child, offset, bytes, req, ret);
170361007b31SStefan Hajnoczi 
170461007b31SStefan Hajnoczi     return ret;
170561007b31SStefan Hajnoczi }
170661007b31SStefan Hajnoczi 
/*
 * Zero-write slow path used by bdrv_co_pwritev() when BDRV_REQ_ZERO_WRITE
 * is set.  Splits the arbitrary byte range [offset, offset + bytes) into an
 * unaligned head, an aligned middle, and an unaligned tail.  The unaligned
 * parts are handled with a read-modify-write cycle through a bounce buffer
 * (so the flag is stripped and they become plain data writes); the aligned
 * middle is passed down as a real zero write (qiov == NULL).
 *
 * @req must be the already-begun tracked request for the whole range.
 * Returns 0 on success, negative errno on failure.
 */
static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    BlockDriverState *bs = child->bs;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    uint64_t align = bs->bl.request_alignment;
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    /* Distance from the aligned boundaries to the request's edges */
    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);


    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        /* A single bounce buffer of 'align' bytes serves head and tail */
        buf = qemu_blockalign(bs, align);
        qemu_iovec_init_buf(&local_qiov, buf, align);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        /* Zero only the request-covered middle; the read-in head (and, for a
         * sub-block request, tail) of the buffer is written back unchanged */
        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
                                   align, &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        /* Zero the leading 'bytes'; keep the read-in tail padding intact */
        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}
17909eeb6dd1SFam Zheng 
/*
 * Handle a write request in coroutine context.
 *
 * If the request is not aligned to bs->bl.request_alignment, the head and/or
 * tail blocks are brought in with a read-modify-write cycle: they are read
 * into bounce buffers, the buffers are spliced around the caller's qiov in
 * local_qiov, and the widened, aligned request is submitted as one write.
 * BDRV_REQ_ZERO_WRITE requests (qiov == NULL) take the separate
 * bdrv_co_do_zero_pwritev() path instead.
 *
 * Returns 0 on success, negative errno on failure.
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        /* Zero writes carry no data qiov; handled entirely in the helper */
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;

        /* Serialise against overlapping requests before reading the head */
        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init_buf(&head_qiov, head_buf, align);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        /* Prepend the read-in head bytes to the caller's data */
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        /* Widen the request to start on the alignment boundary */
        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        /* If we had to wait here, local_qiov must not have been built yet,
         * or the head bounce buffer could have been invalidated meanwhile */
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_init_buf(&tail_qiov, tail_buf, align);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        /* Append the read-in bytes that lie past the end of the request */
        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
191361007b31SStefan Hajnoczi 
1914a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1915f5a5ca79SManos Pitsidianakis                                        int bytes, BdrvRequestFlags flags)
191661007b31SStefan Hajnoczi {
1917f5a5ca79SManos Pitsidianakis     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
191861007b31SStefan Hajnoczi 
1919a03ef88fSKevin Wolf     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
192061007b31SStefan Hajnoczi         flags &= ~BDRV_REQ_MAY_UNMAP;
192161007b31SStefan Hajnoczi     }
192261007b31SStefan Hajnoczi 
1923f5a5ca79SManos Pitsidianakis     return bdrv_co_pwritev(child, offset, bytes, NULL,
192461007b31SStefan Hajnoczi                            BDRV_REQ_ZERO_WRITE | flags);
192561007b31SStefan Hajnoczi }
192661007b31SStefan Hajnoczi 
19274085f5c7SJohn Snow /*
19284085f5c7SJohn Snow  * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
19294085f5c7SJohn Snow  */
19304085f5c7SJohn Snow int bdrv_flush_all(void)
19314085f5c7SJohn Snow {
19324085f5c7SJohn Snow     BdrvNextIterator it;
19334085f5c7SJohn Snow     BlockDriverState *bs = NULL;
19344085f5c7SJohn Snow     int result = 0;
19354085f5c7SJohn Snow 
19364085f5c7SJohn Snow     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
19374085f5c7SJohn Snow         AioContext *aio_context = bdrv_get_aio_context(bs);
19384085f5c7SJohn Snow         int ret;
19394085f5c7SJohn Snow 
19404085f5c7SJohn Snow         aio_context_acquire(aio_context);
19414085f5c7SJohn Snow         ret = bdrv_flush(bs);
19424085f5c7SJohn Snow         if (ret < 0 && !result) {
19434085f5c7SJohn Snow             result = ret;
19444085f5c7SJohn Snow         }
19454085f5c7SJohn Snow         aio_context_release(aio_context);
19464085f5c7SJohn Snow     }
19474085f5c7SJohn Snow 
19484085f5c7SJohn Snow     return result;
19494085f5c7SJohn Snow }
19504085f5c7SJohn Snow 
19514085f5c7SJohn Snow 
/* Parameter/result bundle passed through the coroutine entry point of the
 * synchronous bdrv_common_block_status_above() wrapper. */
typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;       /* top node of the query */
    BlockDriverState *base;     /* exclusive bottom of the backing chain walk */
    bool want_zero;             /* query for accurate zero/mapping info? */
    int64_t offset;             /* start of the queried byte range */
    int64_t bytes;              /* length of the queried byte range */
    int64_t *pnum;              /* out: bytes with the same status */
    int64_t *map;               /* out: host mapping (may be NULL) */
    BlockDriverState **file;    /* out: node the mapping refers to (may be NULL) */
    int ret;                    /* out: BDRV_BLOCK_* flags or -errno */
    bool done;                  /* set when the coroutine has finished */
} BdrvCoBlockStatusData;
196461007b31SStefan Hajnoczi 
19653e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
19663e4d0e72SEric Blake                                                 bool want_zero,
19673e4d0e72SEric Blake                                                 int64_t offset,
19683e4d0e72SEric Blake                                                 int64_t bytes,
19693e4d0e72SEric Blake                                                 int64_t *pnum,
19703e4d0e72SEric Blake                                                 int64_t *map,
1971f7cc69b3SManos Pitsidianakis                                                 BlockDriverState **file)
1972f7cc69b3SManos Pitsidianakis {
1973f7cc69b3SManos Pitsidianakis     assert(bs->file && bs->file->bs);
19743e4d0e72SEric Blake     *pnum = bytes;
19753e4d0e72SEric Blake     *map = offset;
1976f7cc69b3SManos Pitsidianakis     *file = bs->file->bs;
19773e4d0e72SEric Blake     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
1978f7cc69b3SManos Pitsidianakis }
1979f7cc69b3SManos Pitsidianakis 
19803e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
19813e4d0e72SEric Blake                                                    bool want_zero,
19823e4d0e72SEric Blake                                                    int64_t offset,
19833e4d0e72SEric Blake                                                    int64_t bytes,
19843e4d0e72SEric Blake                                                    int64_t *pnum,
19853e4d0e72SEric Blake                                                    int64_t *map,
1986f7cc69b3SManos Pitsidianakis                                                    BlockDriverState **file)
1987f7cc69b3SManos Pitsidianakis {
1988f7cc69b3SManos Pitsidianakis     assert(bs->backing && bs->backing->bs);
19893e4d0e72SEric Blake     *pnum = bytes;
19903e4d0e72SEric Blake     *map = offset;
1991f7cc69b3SManos Pitsidianakis     *file = bs->backing->bs;
19923e4d0e72SEric Blake     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
1993f7cc69b3SManos Pitsidianakis }
1994f7cc69b3SManos Pitsidianakis 
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    /* Clamp 'bytes' to the remaining image length */
    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_block_status) {
        /* Driver cannot report status: assume everything is allocated data */
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            /* Protocol drivers map guest offsets 1:1 onto themselves */
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
    if (ret < 0) {
        *pnum = 0;
        goto out;
    }

    /*
     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
     */
    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
           align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        local_map += offset - aligned_offset;
    }

    if (ret & BDRV_BLOCK_RAW) {
        /* Filter node: the answer lives in the node it pointed us at */
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else if (want_zero) {
        /* Unallocated here: check whether the range still reads as zero */
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t size2 = bdrv_getlength(bs2);

            /* Reads past the end of the backing file yield zeroes */
            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (want_zero && local_file && local_file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        /* Refine the answer by also querying the node holding the data */
        int64_t file_pnum;
        int ret2;

        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

out:
    bdrv_dec_in_flight(bs);
    if (ret >= 0 && offset + *pnum == total_size) {
        ret |= BDRV_BLOCK_EOF;
    }
early_out:
    if (file) {
        *file = local_file;
    }
    if (map) {
        *map = local_map;
    }
    return ret;
}
216761007b31SStefan Hajnoczi 
21685b648c67SEric Blake static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2169ba3f0e25SFam Zheng                                                    BlockDriverState *base,
2170c9ce8c4dSEric Blake                                                    bool want_zero,
21715b648c67SEric Blake                                                    int64_t offset,
21725b648c67SEric Blake                                                    int64_t bytes,
21735b648c67SEric Blake                                                    int64_t *pnum,
21745b648c67SEric Blake                                                    int64_t *map,
217567a0fd2aSFam Zheng                                                    BlockDriverState **file)
2176ba3f0e25SFam Zheng {
2177ba3f0e25SFam Zheng     BlockDriverState *p;
21785b648c67SEric Blake     int ret = 0;
2179c61e684eSEric Blake     bool first = true;
2180ba3f0e25SFam Zheng 
2181ba3f0e25SFam Zheng     assert(bs != base);
2182760e0063SKevin Wolf     for (p = bs; p != base; p = backing_bs(p)) {
21835b648c67SEric Blake         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
21845b648c67SEric Blake                                    file);
2185c61e684eSEric Blake         if (ret < 0) {
2186c61e684eSEric Blake             break;
2187c61e684eSEric Blake         }
2188c61e684eSEric Blake         if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2189c61e684eSEric Blake             /*
2190c61e684eSEric Blake              * Reading beyond the end of the file continues to read
2191c61e684eSEric Blake              * zeroes, but we can only widen the result to the
2192c61e684eSEric Blake              * unallocated length we learned from an earlier
2193c61e684eSEric Blake              * iteration.
2194c61e684eSEric Blake              */
21955b648c67SEric Blake             *pnum = bytes;
2196c61e684eSEric Blake         }
2197c61e684eSEric Blake         if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2198ba3f0e25SFam Zheng             break;
2199ba3f0e25SFam Zheng         }
22005b648c67SEric Blake         /* [offset, pnum] unallocated on this layer, which could be only
22015b648c67SEric Blake          * the first part of [offset, bytes].  */
22025b648c67SEric Blake         bytes = MIN(bytes, *pnum);
2203c61e684eSEric Blake         first = false;
2204ba3f0e25SFam Zheng     }
2205ba3f0e25SFam Zheng     return ret;
2206ba3f0e25SFam Zheng }
2207ba3f0e25SFam Zheng 
220831826642SEric Blake /* Coroutine wrapper for bdrv_block_status_above() */
22095b648c67SEric Blake static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
221061007b31SStefan Hajnoczi {
22114bcd936eSEric Blake     BdrvCoBlockStatusData *data = opaque;
221261007b31SStefan Hajnoczi 
22135b648c67SEric Blake     data->ret = bdrv_co_block_status_above(data->bs, data->base,
2214c9ce8c4dSEric Blake                                            data->want_zero,
22155b648c67SEric Blake                                            data->offset, data->bytes,
22165b648c67SEric Blake                                            data->pnum, data->map, data->file);
221761007b31SStefan Hajnoczi     data->done = true;
22184720cbeeSKevin Wolf     aio_wait_kick();
221961007b31SStefan Hajnoczi }
222061007b31SStefan Hajnoczi 
/*
 * Synchronous wrapper around bdrv_co_block_status_above().
 *
 * If called inside a coroutine, runs the query directly; otherwise spawns
 * a coroutine and polls until it completes.  'data' lives on this stack
 * frame, which the poll below keeps alive until the coroutine is done.
 *
 * See bdrv_co_block_status_above() for details.
 */
static int bdrv_common_block_status_above(BlockDriverState *bs,
                                          BlockDriverState *base,
                                          bool want_zero, int64_t offset,
                                          int64_t bytes, int64_t *pnum,
                                          int64_t *map,
                                          BlockDriverState **file)
{
    Coroutine *co;
    BdrvCoBlockStatusData data = {
        .bs = bs,
        .base = base,
        .want_zero = want_zero,
        .offset = offset,
        .bytes = bytes,
        .pnum = pnum,
        .map = map,
        .file = file,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_block_status_above_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
        bdrv_coroutine_enter(bs, co);
        /* Drive the event loop until the coroutine has set data.done */
        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
}
225661007b31SStefan Hajnoczi 
/*
 * Public block-status query over the chain [@base, @bs]; always requests
 * accurate zero information (want_zero == true).  See
 * bdrv_co_block_status_above() for the meaning of @pnum, @map and @file.
 */
int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file)
{
    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
                                          pnum, map, file);
}
2264c9ce8c4dSEric Blake 
/*
 * Convenience wrapper: block status of @bs and its immediate backing file
 * only (base = backing_bs(bs)).
 */
int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    return bdrv_block_status_above(bs, backing_bs(bs),
                                   offset, bytes, pnum, map, file);
}
2271ba3f0e25SFam Zheng 
2272d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2273d6a644bbSEric Blake                                    int64_t bytes, int64_t *pnum)
227461007b31SStefan Hajnoczi {
22757ddb99b9SEric Blake     int ret;
22767ddb99b9SEric Blake     int64_t dummy;
2277d6a644bbSEric Blake 
22787ddb99b9SEric Blake     ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
22797ddb99b9SEric Blake                                          bytes, pnum ? pnum : &dummy, NULL,
2280298a1665SEric Blake                                          NULL);
228161007b31SStefan Hajnoczi     if (ret < 0) {
228261007b31SStefan Hajnoczi         return ret;
228361007b31SStefan Hajnoczi     }
228461007b31SStefan Hajnoczi     return !!(ret & BDRV_BLOCK_ALLOCATED);
228561007b31SStefan Hajnoczi }
228661007b31SStefan Hajnoczi 
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * offset is allocated in any image of the chain.  Return false otherwise,
 * or negative errno on failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum)
{
    BlockDriverState *intermediate;
    int ret;
    /* Tracks the shortest unallocated run seen so far across the chain */
    int64_t n = bytes;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int64_t pnum_inter;
        int64_t size_inter;

        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            /* Allocated in this layer: report its run length and stop */
            *pnum = pnum_inter;
            return 1;
        }

        size_inter = bdrv_getlength(intermediate);
        if (size_inter < 0) {
            return size_inter;
        }
        /*
         * Clamp n to this layer's unallocated run, except when the run is
         * cut short only by a lower layer's EOF (reads there fall through
         * to the backing file, so the run may effectively continue).
         */
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}
234061007b31SStefan Hajnoczi 
/* Argument bundle for the bdrv_rw_vmstate() coroutine wrapper. */
typedef struct BdrvVmstateCo {
    BlockDriverState    *bs;        /* node to load/save vmstate on */
    QEMUIOVector        *qiov;      /* data buffer(s) */
    int64_t             pos;        /* byte offset within the vmstate area */
    bool                is_read;    /* true: load vmstate, false: save */
    int                 ret;        /* result; -EINPROGRESS until done */
} BdrvVmstateCo;
23481a8ae822SKevin Wolf 
/*
 * Load or save VM state at @pos on @bs.
 *
 * Uses the driver's bdrv_load_vmstate/bdrv_save_vmstate hooks if present,
 * otherwise recurses into bs->file.  Returns 0 or positive on success,
 * -ENOMEDIUM without a driver, -ENOTSUP when no layer supports vmstate.
 */
static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
    int ret = -ENOTSUP;

    /* Keep the node busy (e.g. against drain) for the whole operation */
    bdrv_inc_in_flight(bs);

    if (!drv) {
        ret = -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        /* A driver providing load is assumed to provide save as well */
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
    } else if (bs->file) {
        /* Delegate to the protocol/child node */
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    bdrv_dec_in_flight(bs);
    return ret;
}
23731a8ae822SKevin Wolf 
/* Coroutine entry point: unpack BdrvVmstateCo and run the vmstate I/O.
 * Storing co->ret doubles as the completion signal (it starts as
 * -EINPROGRESS); the kick wakes the polling caller. */
static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
    aio_wait_kick();
}
23801a8ae822SKevin Wolf 
/*
 * Synchronous wrapper around bdrv_co_rw_vmstate(): runs directly when
 * already in coroutine context, otherwise spawns a coroutine and polls
 * until its result replaces the -EINPROGRESS sentinel.
 */
static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs         = bs,
            .qiov       = qiov,
            .pos        = pos,
            .is_read    = is_read,
            .ret        = -EINPROGRESS,  /* overwritten on completion */
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
        return data.ret;
    }
}
24021a8ae822SKevin Wolf 
240361007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
240461007b31SStefan Hajnoczi                       int64_t pos, int size)
240561007b31SStefan Hajnoczi {
24060d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2407b433d942SKevin Wolf     int ret;
240861007b31SStefan Hajnoczi 
2409b433d942SKevin Wolf     ret = bdrv_writev_vmstate(bs, &qiov, pos);
2410b433d942SKevin Wolf     if (ret < 0) {
2411b433d942SKevin Wolf         return ret;
2412b433d942SKevin Wolf     }
2413b433d942SKevin Wolf 
2414b433d942SKevin Wolf     return size;
241561007b31SStefan Hajnoczi }
241661007b31SStefan Hajnoczi 
/* Vectored VM-state write; synchronous front-end to bdrv_co_rw_vmstate(). */
int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}
242161007b31SStefan Hajnoczi 
242261007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
242361007b31SStefan Hajnoczi                       int64_t pos, int size)
242461007b31SStefan Hajnoczi {
24250d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2426b433d942SKevin Wolf     int ret;
24275ddda0b8SKevin Wolf 
2428b433d942SKevin Wolf     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2429b433d942SKevin Wolf     if (ret < 0) {
2430b433d942SKevin Wolf         return ret;
2431b433d942SKevin Wolf     }
2432b433d942SKevin Wolf 
2433b433d942SKevin Wolf     return size;
24345ddda0b8SKevin Wolf }
24355ddda0b8SKevin Wolf 
/* Vectored VM-state read; synchronous front-end to bdrv_co_rw_vmstate(). */
int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}
244061007b31SStefan Hajnoczi 
244161007b31SStefan Hajnoczi /**************************************************************/
244261007b31SStefan Hajnoczi /* async I/Os */
244361007b31SStefan Hajnoczi 
/*
 * Synchronously cancel @acb: request asynchronous cancellation, then poll
 * the relevant AioContext until we hold the last reference (refcnt == 1),
 * i.e. until the request has completed or been cancelled.
 */
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    /* Pin the ACB so it cannot be freed while we poll on it */
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            /* No context to poll: cannot make progress */
            abort();
        }
    }
    qemu_aio_unref(acb);
}
246461007b31SStefan Hajnoczi 
/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 * In either case the completion callback must be called. */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}
247461007b31SStefan Hajnoczi 
247561007b31SStefan Hajnoczi /**************************************************************/
247661007b31SStefan Hajnoczi /* Coroutine block device emulation */
247761007b31SStefan Hajnoczi 
/* Argument bundle for the bdrv_flush() coroutine wrapper. */
typedef struct FlushCo {
    BlockDriverState *bs;   /* node to flush */
    int ret;                /* result; NOT_DONE until the coroutine finishes */
} FlushCo;
2482e293b7a3SKevin Wolf 
2483e293b7a3SKevin Wolf 
/* Coroutine entry point for bdrv_flush(): run the flush and record the
 * result (which also ends the caller's BDRV_POLL_WHILE loop). */
static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
    aio_wait_kick();
}
249161007b31SStefan Hajnoczi 
/*
 * Flush @bs to stable storage.
 *
 * Concurrent flushes are serialized through bs->active_flush_req and
 * bs->flush_queue (protected by bs->reqs_lock).  The write generation
 * (bs->write_gen vs bs->flushed_gen) lets a flush be skipped entirely when
 * nothing was written since the last successful flush.
 *
 * Returns 0 on success, negative errno on failure.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    /* Nothing to do without a medium, read-only, or for SG passthrough */
    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    /* Snapshot the generation this flush is responsible for */
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* Callback-style driver: bridge via CoroutineIOCompletion */
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            /* Reentered by bdrv_co_io_em_complete() when the AIO finishes */
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}
260361007b31SStefan Hajnoczi 
/*
 * Synchronous wrapper around bdrv_co_flush(); usable both inside and
 * outside coroutine context.  Returns 0 on success, negative errno on
 * failure.
 */
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,    /* sentinel: overwritten by the coroutine */
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}
262361007b31SStefan Hajnoczi 
/* Argument bundle for the bdrv_pdiscard() coroutine wrapper. */
typedef struct DiscardCo {
    BdrvChild *child;   /* child whose bs is discarded */
    int64_t offset;     /* start of the range, in bytes */
    int bytes;          /* length of the range, in bytes */
    int ret;            /* result; NOT_DONE until the coroutine finishes */
} DiscardCo;
/* Coroutine entry point for bdrv_pdiscard(): run the discard and record the
 * result (which also ends the caller's BDRV_POLL_WHILE loop). */
static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
    aio_wait_kick();
}
263761007b31SStefan Hajnoczi 
/*
 * Discard (unmap) @bytes bytes at @offset on @child->bs.
 *
 * The request is fragmented on pdiscard alignment boundaries and fed to the
 * driver's bdrv_co_pdiscard or bdrv_aio_pdiscard hook.  Returns 0 on
 * success (including when discard is disabled or unsupported, since discard
 * is advisory), negative errno on failure.  -ENOTSUP from the driver is
 * tolerated and skipped.
 */
int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;
    BlockDriverState *bs = child->bs;

    if (!bs || !bs->drv) {
        return -ENOMEDIUM;
    }

    /* Discarding would lose data still referenced by read-only bitmaps */
    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    /* Driver has no discard support at all: silently succeed (advisory) */
    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        /* The driver may have disappeared while we yielded */
        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            /* Callback-style driver: bridge via CoroutineIOCompletion */
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
275161007b31SStefan Hajnoczi 
/*
 * Synchronous wrapper around bdrv_co_pdiscard(); usable both inside and
 * outside coroutine context.
 */
int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes)
{
    Coroutine *co;
    DiscardCo rwco = {
        .child = child,
        .offset = offset,
        .bytes = bytes,
        .ret = NOT_DONE,    /* sentinel: overwritten by the coroutine */
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}
277361007b31SStefan Hajnoczi 
/*
 * Issue ioctl @req with argument @buf on @bs, preferring the driver's
 * coroutine hook and falling back to the AIO hook.  Returns the driver's
 * result, or -ENOTSUP when neither hook exists or the AIO submission fails.
 */
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        /* Reentered by bdrv_co_io_em_complete() when the AIO finishes */
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}
28025c5ae76aSFam Zheng 
280361007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size)
280461007b31SStefan Hajnoczi {
280561007b31SStefan Hajnoczi     return qemu_memalign(bdrv_opt_mem_align(bs), size);
280661007b31SStefan Hajnoczi }
280761007b31SStefan Hajnoczi 
280861007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size)
280961007b31SStefan Hajnoczi {
281061007b31SStefan Hajnoczi     return memset(qemu_blockalign(bs, size), 0, size);
281161007b31SStefan Hajnoczi }
281261007b31SStefan Hajnoczi 
281361007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
281461007b31SStefan Hajnoczi {
281561007b31SStefan Hajnoczi     size_t align = bdrv_opt_mem_align(bs);
281661007b31SStefan Hajnoczi 
281761007b31SStefan Hajnoczi     /* Ensure that NULL is never returned on success */
281861007b31SStefan Hajnoczi     assert(align > 0);
281961007b31SStefan Hajnoczi     if (size == 0) {
282061007b31SStefan Hajnoczi         size = align;
282161007b31SStefan Hajnoczi     }
282261007b31SStefan Hajnoczi 
282361007b31SStefan Hajnoczi     return qemu_try_memalign(align, size);
282461007b31SStefan Hajnoczi }
282561007b31SStefan Hajnoczi 
282661007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
282761007b31SStefan Hajnoczi {
282861007b31SStefan Hajnoczi     void *mem = qemu_try_blockalign(bs, size);
282961007b31SStefan Hajnoczi 
283061007b31SStefan Hajnoczi     if (mem) {
283161007b31SStefan Hajnoczi         memset(mem, 0, size);
283261007b31SStefan Hajnoczi     }
283361007b31SStefan Hajnoczi 
283461007b31SStefan Hajnoczi     return mem;
283561007b31SStefan Hajnoczi }
283661007b31SStefan Hajnoczi 
283761007b31SStefan Hajnoczi /*
283861007b31SStefan Hajnoczi  * Check if all memory in this vector is sector aligned.
283961007b31SStefan Hajnoczi  */
284061007b31SStefan Hajnoczi bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
284161007b31SStefan Hajnoczi {
284261007b31SStefan Hajnoczi     int i;
28434196d2f0SDenis V. Lunev     size_t alignment = bdrv_min_mem_align(bs);
284461007b31SStefan Hajnoczi 
284561007b31SStefan Hajnoczi     for (i = 0; i < qiov->niov; i++) {
284661007b31SStefan Hajnoczi         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
284761007b31SStefan Hajnoczi             return false;
284861007b31SStefan Hajnoczi         }
284961007b31SStefan Hajnoczi         if (qiov->iov[i].iov_len % alignment) {
285061007b31SStefan Hajnoczi             return false;
285161007b31SStefan Hajnoczi         }
285261007b31SStefan Hajnoczi     }
285361007b31SStefan Hajnoczi 
285461007b31SStefan Hajnoczi     return true;
285561007b31SStefan Hajnoczi }
285661007b31SStefan Hajnoczi 
285761007b31SStefan Hajnoczi void bdrv_add_before_write_notifier(BlockDriverState *bs,
285861007b31SStefan Hajnoczi                                     NotifierWithReturn *notifier)
285961007b31SStefan Hajnoczi {
286061007b31SStefan Hajnoczi     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
286161007b31SStefan Hajnoczi }
286261007b31SStefan Hajnoczi 
286361007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs)
286461007b31SStefan Hajnoczi {
28656b98bd64SPaolo Bonzini     BdrvChild *child;
28666b98bd64SPaolo Bonzini 
28676b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
28686b98bd64SPaolo Bonzini         bdrv_io_plug(child->bs);
28696b98bd64SPaolo Bonzini     }
28706b98bd64SPaolo Bonzini 
2871850d54a2SPaolo Bonzini     if (atomic_fetch_inc(&bs->io_plugged) == 0) {
287261007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
287361007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_plug) {
287461007b31SStefan Hajnoczi             drv->bdrv_io_plug(bs);
28756b98bd64SPaolo Bonzini         }
287661007b31SStefan Hajnoczi     }
287761007b31SStefan Hajnoczi }
287861007b31SStefan Hajnoczi 
287961007b31SStefan Hajnoczi void bdrv_io_unplug(BlockDriverState *bs)
288061007b31SStefan Hajnoczi {
28816b98bd64SPaolo Bonzini     BdrvChild *child;
28826b98bd64SPaolo Bonzini 
28836b98bd64SPaolo Bonzini     assert(bs->io_plugged);
2884850d54a2SPaolo Bonzini     if (atomic_fetch_dec(&bs->io_plugged) == 1) {
288561007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
288661007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_unplug) {
288761007b31SStefan Hajnoczi             drv->bdrv_io_unplug(bs);
288861007b31SStefan Hajnoczi         }
288961007b31SStefan Hajnoczi     }
289061007b31SStefan Hajnoczi 
28916b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
28926b98bd64SPaolo Bonzini         bdrv_io_unplug(child->bs);
28936b98bd64SPaolo Bonzini     }
28946b98bd64SPaolo Bonzini }
289523d0ba93SFam Zheng 
289623d0ba93SFam Zheng void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
289723d0ba93SFam Zheng {
289823d0ba93SFam Zheng     BdrvChild *child;
289923d0ba93SFam Zheng 
290023d0ba93SFam Zheng     if (bs->drv && bs->drv->bdrv_register_buf) {
290123d0ba93SFam Zheng         bs->drv->bdrv_register_buf(bs, host, size);
290223d0ba93SFam Zheng     }
290323d0ba93SFam Zheng     QLIST_FOREACH(child, &bs->children, next) {
290423d0ba93SFam Zheng         bdrv_register_buf(child->bs, host, size);
290523d0ba93SFam Zheng     }
290623d0ba93SFam Zheng }
290723d0ba93SFam Zheng 
290823d0ba93SFam Zheng void bdrv_unregister_buf(BlockDriverState *bs, void *host)
290923d0ba93SFam Zheng {
291023d0ba93SFam Zheng     BdrvChild *child;
291123d0ba93SFam Zheng 
291223d0ba93SFam Zheng     if (bs->drv && bs->drv->bdrv_unregister_buf) {
291323d0ba93SFam Zheng         bs->drv->bdrv_unregister_buf(bs, host);
291423d0ba93SFam Zheng     }
291523d0ba93SFam Zheng     QLIST_FOREACH(child, &bs->children, next) {
291623d0ba93SFam Zheng         bdrv_unregister_buf(child->bs, host);
291723d0ba93SFam Zheng     }
291823d0ba93SFam Zheng }
2919fcc67678SFam Zheng 
/*
 * Common implementation of bdrv_co_copy_range_from() and
 * bdrv_co_copy_range_to().
 *
 * Copies @bytes from @src at @src_offset to @dst at @dst_offset by calling
 * the driver's copy-range callback on exactly one side: the source side when
 * @recurse_src is true, the destination side otherwise.  The side that is
 * recursed into performs the request tracking and in-flight accounting.
 *
 * Returns 0 on success or a negative errno:
 *   -ENOMEDIUM if either child is missing or has no medium,
 *   -ENOTSUP if either driver lacks copy-range support or is encrypted.
 */
static int coroutine_fn bdrv_co_copy_range_internal(
        BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
        uint64_t dst_offset, uint64_t bytes,
        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
        bool recurse_src)
{
    BdrvTrackedRequest req;
    int ret;

    /* TODO We can support BDRV_REQ_NO_FALLBACK here */
    assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
    assert(!(write_flags & BDRV_REQ_NO_FALLBACK));

    if (!dst || !dst->bs) {
        return -ENOMEDIUM;
    }
    /* Also fails with -ENOMEDIUM when no medium is inserted, which
     * guarantees dst->bs->drv is non-NULL below. */
    ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
    if (ret) {
        return ret;
    }
    /* A zero-write doesn't need the source at all; turn it into a plain
     * write-zeroes request on the destination. */
    if (write_flags & BDRV_REQ_ZERO_WRITE) {
        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
    }

    if (!src || !src->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
    if (ret) {
        return ret;
    }

    /* Both drivers must implement their side of copy-range, and neither
     * image may be encrypted, otherwise the caller falls back to a
     * bounce-buffer read/write. */
    if (!src->bs->drv->bdrv_co_copy_range_from
        || !dst->bs->drv->bdrv_co_copy_range_to
        || src->bs->encrypted || dst->bs->encrypted) {
        return -ENOTSUP;
    }

    if (recurse_src) {
        /* Track this as a read on the source node. */
        bdrv_inc_in_flight(src->bs);
        tracked_request_begin(&req, src->bs, src_offset, bytes,
                              BDRV_TRACKED_READ);

        /* BDRV_REQ_SERIALISING is only for write operation */
        assert(!(read_flags & BDRV_REQ_SERIALISING));
        if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
            wait_serialising_requests(&req);
        }

        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                    src, src_offset,
                                                    dst, dst_offset,
                                                    bytes,
                                                    read_flags, write_flags);

        tracked_request_end(&req);
        bdrv_dec_in_flight(src->bs);
    } else {
        /* Track this as a write on the destination node; write preparation
         * handles serialisation, before-write notifiers and permissions. */
        bdrv_inc_in_flight(dst->bs);
        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
                              BDRV_TRACKED_WRITE);
        ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
                                        write_flags);
        if (!ret) {
            ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
                                                      src, src_offset,
                                                      dst, dst_offset,
                                                      bytes,
                                                      read_flags, write_flags);
        }
        /* Finish even on failure so dirty bitmaps etc. stay consistent. */
        bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
        tracked_request_end(&req);
        bdrv_dec_in_flight(dst->bs);
    }

    return ret;
}
2997fcc67678SFam Zheng 
2998fcc67678SFam Zheng /* Copy range from @src to @dst.
2999fcc67678SFam Zheng  *
3000fcc67678SFam Zheng  * See the comment of bdrv_co_copy_range for the parameter and return value
3001fcc67678SFam Zheng  * semantics. */
3002fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3003fcc67678SFam Zheng                                          BdrvChild *dst, uint64_t dst_offset,
300467b51fb9SVladimir Sementsov-Ogievskiy                                          uint64_t bytes,
300567b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags read_flags,
300667b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags write_flags)
3007fcc67678SFam Zheng {
3008ecc983a5SFam Zheng     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3009ecc983a5SFam Zheng                                   read_flags, write_flags);
3010fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
301167b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, true);
3012fcc67678SFam Zheng }
3013fcc67678SFam Zheng 
3014fcc67678SFam Zheng /* Copy range from @src to @dst.
3015fcc67678SFam Zheng  *
3016fcc67678SFam Zheng  * See the comment of bdrv_co_copy_range for the parameter and return value
3017fcc67678SFam Zheng  * semantics. */
3018fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3019fcc67678SFam Zheng                                        BdrvChild *dst, uint64_t dst_offset,
302067b51fb9SVladimir Sementsov-Ogievskiy                                        uint64_t bytes,
302167b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags read_flags,
302267b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags write_flags)
3023fcc67678SFam Zheng {
3024ecc983a5SFam Zheng     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3025ecc983a5SFam Zheng                                 read_flags, write_flags);
3026fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
302767b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, false);
3028fcc67678SFam Zheng }
3029fcc67678SFam Zheng 
3030fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3031fcc67678SFam Zheng                                     BdrvChild *dst, uint64_t dst_offset,
303267b51fb9SVladimir Sementsov-Ogievskiy                                     uint64_t bytes, BdrvRequestFlags read_flags,
303367b51fb9SVladimir Sementsov-Ogievskiy                                     BdrvRequestFlags write_flags)
3034fcc67678SFam Zheng {
303537aec7d7SFam Zheng     return bdrv_co_copy_range_from(src, src_offset,
3036fcc67678SFam Zheng                                    dst, dst_offset,
303767b51fb9SVladimir Sementsov-Ogievskiy                                    bytes, read_flags, write_flags);
3038fcc67678SFam Zheng }
30393d9f2d2aSKevin Wolf 
30403d9f2d2aSKevin Wolf static void bdrv_parent_cb_resize(BlockDriverState *bs)
30413d9f2d2aSKevin Wolf {
30423d9f2d2aSKevin Wolf     BdrvChild *c;
30433d9f2d2aSKevin Wolf     QLIST_FOREACH(c, &bs->parents, next_parent) {
30443d9f2d2aSKevin Wolf         if (c->role->resize) {
30453d9f2d2aSKevin Wolf             c->role->resize(c);
30463d9f2d2aSKevin Wolf         }
30473d9f2d2aSKevin Wolf     }
30483d9f2d2aSKevin Wolf }
30493d9f2d2aSKevin Wolf 
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 *
 * Grows or shrinks the image behind @child to @offset bytes, using @prealloc
 * for any newly added area.  Tracks the grown region as a write request so
 * that concurrent writers cannot race with preallocation, and refreshes the
 * cached sector count afterwards.  Returns 0 on success or a negative errno
 * (with @errp set).
 */
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
                                  PreallocMode prealloc, Error **errp)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int64_t old_size, new_bytes;
    int ret;


    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        error_setg(errp, "No medium inserted");
        return -ENOMEDIUM;
    }
    if (offset < 0) {
        error_setg(errp, "Image size cannot be negative");
        return -EINVAL;
    }

    old_size = bdrv_getlength(bs);
    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get old image size");
        return old_size;
    }

    /* new_bytes is the size of the area that growing adds; zero when the
     * image shrinks or stays the same size. */
    if (offset > old_size) {
        new_bytes = offset - old_size;
    } else {
        new_bytes = 0;
    }

    /* Track only the grown area, [offset - new_bytes, offset). */
    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
                          BDRV_TRACKED_TRUNCATE);

    /* If we are growing the image and potentially using preallocation for the
     * new area, we need to make sure that no write requests are made to it
     * concurrently or they might be overwritten by preallocation. */
    if (new_bytes) {
        mark_request_serialising(&req, 1);
    }
    if (bs->read_only) {
        error_setg(errp, "Image is read-only");
        ret = -EACCES;
        goto out;
    }
    ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
                                    0);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Failed to prepare request for truncation");
        goto out;
    }

    if (!drv->bdrv_co_truncate) {
        /* Filter drivers without their own truncate forward to bs->file. */
        if (bs->file && drv->is_filter) {
            ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
            goto out;
        }
        error_setg(errp, "Image format driver does not support resize");
        ret = -ENOTSUP;
        goto out;
    }

    ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
    if (ret < 0) {
        goto out;
    }
    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
    } else {
        /* The driver may have rounded the size; report what it settled on. */
        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    }
    /* It's possible that truncation succeeded but refresh_total_sectors
     * failed, but the latter doesn't affect how we should finish the request.
     * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
    bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}
31393d9f2d2aSKevin Wolf 
/* Argument bundle for running bdrv_co_truncate() inside a coroutine */
typedef struct TruncateCo {
    BdrvChild *child;      /* node whose image is resized */
    int64_t offset;        /* requested new size in bytes */
    PreallocMode prealloc; /* preallocation mode for a grown area */
    Error **errp;          /* destination for error details */
    int ret;               /* NOT_DONE until the coroutine completes */
} TruncateCo;
31473d9f2d2aSKevin Wolf 
31483d9f2d2aSKevin Wolf static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
31493d9f2d2aSKevin Wolf {
31503d9f2d2aSKevin Wolf     TruncateCo *tco = opaque;
31513d9f2d2aSKevin Wolf     tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
31523d9f2d2aSKevin Wolf                                 tco->errp);
31534720cbeeSKevin Wolf     aio_wait_kick();
31543d9f2d2aSKevin Wolf }
31553d9f2d2aSKevin Wolf 
31563d9f2d2aSKevin Wolf int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
31573d9f2d2aSKevin Wolf                   Error **errp)
31583d9f2d2aSKevin Wolf {
31593d9f2d2aSKevin Wolf     Coroutine *co;
31603d9f2d2aSKevin Wolf     TruncateCo tco = {
31613d9f2d2aSKevin Wolf         .child      = child,
31623d9f2d2aSKevin Wolf         .offset     = offset,
31633d9f2d2aSKevin Wolf         .prealloc   = prealloc,
31643d9f2d2aSKevin Wolf         .errp       = errp,
31653d9f2d2aSKevin Wolf         .ret        = NOT_DONE,
31663d9f2d2aSKevin Wolf     };
31673d9f2d2aSKevin Wolf 
31683d9f2d2aSKevin Wolf     if (qemu_in_coroutine()) {
31693d9f2d2aSKevin Wolf         /* Fast-path if already in coroutine context */
31703d9f2d2aSKevin Wolf         bdrv_truncate_co_entry(&tco);
31713d9f2d2aSKevin Wolf     } else {
31723d9f2d2aSKevin Wolf         co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
31734720cbeeSKevin Wolf         bdrv_coroutine_enter(child->bs, co);
31743d9f2d2aSKevin Wolf         BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
31753d9f2d2aSKevin Wolf     }
31763d9f2d2aSKevin Wolf 
31773d9f2d2aSKevin Wolf     return tco.ret;
31783d9f2d2aSKevin Wolf }
3179