xref: /qemu/block/io.c (revision cfe29d8294e06420e15d4938421ae006c8ac49e7)
/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                               bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                             bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->role->drained_poll) {
        return c->role->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    if (c->role->drained_begin) {
        c->role->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}
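
/*
 * Merge semantics at a glance (illustrative values, not taken from any real
 * driver): MAX() widens alignment and optimum hints, while MIN_NON_ZERO()
 * tightens caps but treats 0 as "unlimited".  E.g. merging a parent with
 * max_transfer = 0 (no limit) and a child with max_transfer = 65536 yields
 * 65536, and merging opt_mem_alignment 512 and 4096 yields 4096.
 */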

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have disabled it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
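
/*
 * Usage sketch (hypothetical caller): enable and disable must always be
 * paired, because the flag is a counter rather than a boolean:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... copy-on-read remains in effect here, even if another user
 *         enables and disables it concurrently ...
 *     bdrv_disable_copy_on_read(bs);
 */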

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup.  */
    atomic_mb_set(&data->done, true);
    bdrv_dec_in_flight(bs);

    if (data->begin) {
        g_free(data);
    }
}

/* Invoke the BlockDriver.bdrv_co_drain_begin/end callback on @bs */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
}
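
/*
 * Note on the ownership of BdrvCoDrainData above (a summary of the code, not
 * a behaviour change): for drain_begin the caller does not wait, so the
 * coroutine itself frees @data after marking it done; for drain_end the
 * caller polls until the callback has finished and then frees @data itself.
 */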

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
    };
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL, false);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL, false);
}
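
/*
 * Typical drained-section pattern (a sketch of intended usage, not taken
 * from a specific caller):
 *
 *     bdrv_drained_begin(bs);
 *     ... modify the graph, or perform other work that must not race
 *         with in-flight I/O on bs ...
 *     bdrv_drained_end(bs);
 *
 * The subtree variants additionally quiesce all of bs's children.
 */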

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}
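
/*
 * Illustration (made-up numbers): with align = 4096, a request at offset
 * 5000 with 100 bytes gets overlap_offset = 4096 and overlap_bytes =
 * ROUND_UP(5100, 4096) - 4096 = 4096, i.e. the serialised region is
 * widened to the aligned window containing the request.
 */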

static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
{
    /*
     * If the request is serialising, overlap_offset and overlap_bytes are set,
     * so we can check if the request is aligned. Otherwise, we don't care and
     * return false.
     */

    return req->serialising && (req->offset == req->overlap_offset) &&
           (req->bytes == req->overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
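
/*
 * Worked example (hypothetical values): with cluster_size = 65536, a region
 * at offset 70000 with 1000 bytes is rounded to *cluster_offset = 65536 and
 * *cluster_bytes = 65536, i.e. the whole cluster containing the region.
 */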

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}
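
/*
 * Example (illustrative numbers): a request with overlap range [4096, 8192)
 * overlaps [8191, 8193) but not [8192, 12288), since the ranges are
 * half-open and only genuine byte intersection counts.
 */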

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}
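
/*
 * Note on the NOT_DONE sentinel (a summary, no behaviour change): rwco.ret
 * starts out as NOT_DONE (0x7fffffff), a value no request can legitimately
 * return, so the BDRV_POLL_WHILE() above can tell "still running" apart
 * from any real result, including 0 and negative errnos.
 */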

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        offset += bytes;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}
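
/*
 * Usage sketch (hypothetical caller): persist a metadata update before
 * issuing writes that depend on it being stable on disk:
 *
 *     ret = bdrv_pwrite_sync(child, hdr_offset, &hdr, sizeof(hdr));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ... later writes can now assume the header has reached the disk ...
 */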

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
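
/*
 * The pattern used by the callers below: submit an AIO request with
 * bdrv_co_io_em_complete() as its completion callback, yield the current
 * coroutine, and read co.ret once the callback has woken it up again.
 */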
103008844473SKevin Wolf 
1031166fe960SKevin Wolf static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
1032166fe960SKevin Wolf                                            uint64_t offset, uint64_t bytes,
1033166fe960SKevin Wolf                                            QEMUIOVector *qiov, int flags)
1034166fe960SKevin Wolf {
1035166fe960SKevin Wolf     BlockDriver *drv = bs->drv;
10363fb06697SKevin Wolf     int64_t sector_num;
10373fb06697SKevin Wolf     unsigned int nb_sectors;
10383fb06697SKevin Wolf 
1039fa166538SEric Blake     assert(!(flags & ~BDRV_REQ_MASK));
1040fa166538SEric Blake 
1041d470ad42SMax Reitz     if (!drv) {
1042d470ad42SMax Reitz         return -ENOMEDIUM;
1043d470ad42SMax Reitz     }
1044d470ad42SMax Reitz 
10453fb06697SKevin Wolf     if (drv->bdrv_co_preadv) {
10463fb06697SKevin Wolf         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
10473fb06697SKevin Wolf     }
10483fb06697SKevin Wolf 
1049edfab6a0SEric Blake     if (drv->bdrv_aio_preadv) {
105008844473SKevin Wolf         BlockAIOCB *acb;
105108844473SKevin Wolf         CoroutineIOCompletion co = {
105208844473SKevin Wolf             .coroutine = qemu_coroutine_self(),
105308844473SKevin Wolf         };
105408844473SKevin Wolf 
1055e31f6864SEric Blake         acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
105608844473SKevin Wolf                                    bdrv_co_io_em_complete, &co);
105708844473SKevin Wolf         if (acb == NULL) {
105808844473SKevin Wolf             return -EIO;
105908844473SKevin Wolf         } else {
106008844473SKevin Wolf             qemu_coroutine_yield();
106108844473SKevin Wolf             return co.ret;
106208844473SKevin Wolf         }
106308844473SKevin Wolf     }
1064edfab6a0SEric Blake 
1065edfab6a0SEric Blake     sector_num = offset >> BDRV_SECTOR_BITS;
1066edfab6a0SEric Blake     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1067edfab6a0SEric Blake 
1068edfab6a0SEric Blake     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1069edfab6a0SEric Blake     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1070edfab6a0SEric Blake     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
1071edfab6a0SEric Blake     assert(drv->bdrv_co_readv);
1072edfab6a0SEric Blake 
1073edfab6a0SEric Blake     return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1074166fe960SKevin Wolf }
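
/*
 * Worked example of the legacy fallback above, with BDRV_SECTOR_BITS = 9
 * (so BDRV_SECTOR_SIZE = 512): a 64 KiB read at offset 1 MiB becomes
 *
 *     sector_num = 1048576 >> 9 = 2048
 *     nb_sectors =   65536 >> 9 =  128
 *
 * Sub-sector requests never reach this path: the asserts above enforce
 * 512-byte alignment, and bdrv_co_preadv() has already padded the request
 * to bs->bl.request_alignment before calling into the driver.
 */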
1075166fe960SKevin Wolf 
107678a07294SKevin Wolf static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
107778a07294SKevin Wolf                                             uint64_t offset, uint64_t bytes,
107878a07294SKevin Wolf                                             QEMUIOVector *qiov, int flags)
107978a07294SKevin Wolf {
108078a07294SKevin Wolf     BlockDriver *drv = bs->drv;
10813fb06697SKevin Wolf     int64_t sector_num;
10823fb06697SKevin Wolf     unsigned int nb_sectors;
108378a07294SKevin Wolf     int ret;
108478a07294SKevin Wolf 
1085fa166538SEric Blake     assert(!(flags & ~BDRV_REQ_MASK));
1086fa166538SEric Blake 
1087d470ad42SMax Reitz     if (!drv) {
1088d470ad42SMax Reitz         return -ENOMEDIUM;
1089d470ad42SMax Reitz     }
1090d470ad42SMax Reitz 
10913fb06697SKevin Wolf     if (drv->bdrv_co_pwritev) {
1092515c2f43SKevin Wolf         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
1093515c2f43SKevin Wolf                                    flags & bs->supported_write_flags);
1094515c2f43SKevin Wolf         flags &= ~bs->supported_write_flags;
10953fb06697SKevin Wolf         goto emulate_flags;
10963fb06697SKevin Wolf     }
10973fb06697SKevin Wolf 
1098edfab6a0SEric Blake     if (drv->bdrv_aio_pwritev) {
109908844473SKevin Wolf         BlockAIOCB *acb;
110008844473SKevin Wolf         CoroutineIOCompletion co = {
110108844473SKevin Wolf             .coroutine = qemu_coroutine_self(),
110208844473SKevin Wolf         };
110308844473SKevin Wolf 
1104e31f6864SEric Blake         acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
1105e31f6864SEric Blake                                     flags & bs->supported_write_flags,
110608844473SKevin Wolf                                     bdrv_co_io_em_complete, &co);
1107e31f6864SEric Blake         flags &= ~bs->supported_write_flags;
110808844473SKevin Wolf         if (acb == NULL) {
11093fb06697SKevin Wolf             ret = -EIO;
111008844473SKevin Wolf         } else {
111108844473SKevin Wolf             qemu_coroutine_yield();
11123fb06697SKevin Wolf             ret = co.ret;
111308844473SKevin Wolf         }
1114edfab6a0SEric Blake         goto emulate_flags;
1115edfab6a0SEric Blake     }
1116edfab6a0SEric Blake 
1117edfab6a0SEric Blake     sector_num = offset >> BDRV_SECTOR_BITS;
1118edfab6a0SEric Blake     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1119edfab6a0SEric Blake 
1120edfab6a0SEric Blake     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1121edfab6a0SEric Blake     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1122edfab6a0SEric Blake     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
1123edfab6a0SEric Blake 
1124e18a58b4SEric Blake     assert(drv->bdrv_co_writev);
1125e18a58b4SEric Blake     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
1126edfab6a0SEric Blake                               flags & bs->supported_write_flags);
1127edfab6a0SEric Blake     flags &= ~bs->supported_write_flags;
112878a07294SKevin Wolf 
11293fb06697SKevin Wolf emulate_flags:
11304df863f3SEric Blake     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
113178a07294SKevin Wolf         ret = bdrv_co_flush(bs);
113278a07294SKevin Wolf     }
113378a07294SKevin Wolf 
113478a07294SKevin Wolf     return ret;
113578a07294SKevin Wolf }
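
/*
 * All three branches above follow the same flag discipline: only
 * flags & bs->supported_write_flags is handed to the driver, and those
 * bits are then cleared, so a driver is never given a flag it does not
 * understand.  Of the bits that can survive into emulate_flags, only
 * BDRV_REQ_FUA needs active emulation:
 *
 *     supported_write_flags has BDRV_REQ_FUA  ->  driver persists the
 *                                                 data itself, no flush
 *     supported_write_flags lacks it          ->  BDRV_REQ_FUA survives,
 *                                                 bdrv_co_flush() runs
 *                                                 after a successful write
 */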
113678a07294SKevin Wolf 
113729a298afSPavel Butsykin static int coroutine_fn
113829a298afSPavel Butsykin bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
113929a298afSPavel Butsykin                                uint64_t bytes, QEMUIOVector *qiov)
114029a298afSPavel Butsykin {
114129a298afSPavel Butsykin     BlockDriver *drv = bs->drv;
114229a298afSPavel Butsykin 
1143d470ad42SMax Reitz     if (!drv) {
1144d470ad42SMax Reitz         return -ENOMEDIUM;
1145d470ad42SMax Reitz     }
1146d470ad42SMax Reitz 
114729a298afSPavel Butsykin     if (!drv->bdrv_co_pwritev_compressed) {
114829a298afSPavel Butsykin         return -ENOTSUP;
114929a298afSPavel Butsykin     }
115029a298afSPavel Butsykin 
115129a298afSPavel Butsykin     return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
115229a298afSPavel Butsykin }
115329a298afSPavel Butsykin 
115485c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
1155244483e6SKevin Wolf         int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
115661007b31SStefan Hajnoczi {
115785c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
115885c97ca7SKevin Wolf 
115961007b31SStefan Hajnoczi     /* Perform I/O through a temporary buffer so that users who scribble over
116061007b31SStefan Hajnoczi      * their read buffer while the operation is in progress do not end up
116161007b31SStefan Hajnoczi      * modifying the image file.  This is critical for zero-copy guest I/O
116261007b31SStefan Hajnoczi      * where anything might happen inside guest memory.
116361007b31SStefan Hajnoczi      */
116461007b31SStefan Hajnoczi     void *bounce_buffer;
116561007b31SStefan Hajnoczi 
116661007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
116761007b31SStefan Hajnoczi     struct iovec iov;
1168cb2e2878SEric Blake     QEMUIOVector local_qiov;
1169244483e6SKevin Wolf     int64_t cluster_offset;
11707cfd5275SEric Blake     int64_t cluster_bytes;
117161007b31SStefan Hajnoczi     size_t skip_bytes;
117261007b31SStefan Hajnoczi     int ret;
1173cb2e2878SEric Blake     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1174cb2e2878SEric Blake                                     BDRV_REQUEST_MAX_BYTES);
1175cb2e2878SEric Blake     unsigned int progress = 0;
117661007b31SStefan Hajnoczi 
1177d470ad42SMax Reitz     if (!drv) {
1178d470ad42SMax Reitz         return -ENOMEDIUM;
1179d470ad42SMax Reitz     }
1180d470ad42SMax Reitz 
11811bf03e66SKevin Wolf     /* FIXME We cannot require callers to have write permissions when all they
11821bf03e66SKevin Wolf      * are doing is a read request. If we did things right, write permissions
11831bf03e66SKevin Wolf      * would be obtained anyway, but internally by the copy-on-read code. As
1184765d9df9SEric Blake      * long as it is implemented here rather than in a separate filter driver,
11851bf03e66SKevin Wolf      * the copy-on-read code doesn't have its own BdrvChild, however, for which
11861bf03e66SKevin Wolf      * it could request permissions. Therefore we have to bypass the permission
11871bf03e66SKevin Wolf      * system for the moment. */
11881bf03e66SKevin Wolf     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1189afa4b293SKevin Wolf 
119061007b31SStefan Hajnoczi     /* Cover the entire cluster so no additional backing file I/O is required
1191cb2e2878SEric Blake      * when allocating the cluster in the image file.  Note that this value
1192cb2e2878SEric Blake      * may exceed BDRV_REQUEST_MAX_BYTES (even when the original read did
1193cb2e2878SEric Blake      * not), which is one reason we loop rather than doing it all at once.
119461007b31SStefan Hajnoczi      */
1195244483e6SKevin Wolf     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1196cb2e2878SEric Blake     skip_bytes = offset - cluster_offset;
119761007b31SStefan Hajnoczi 
1198244483e6SKevin Wolf     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1199244483e6SKevin Wolf                                    cluster_offset, cluster_bytes);
120061007b31SStefan Hajnoczi 
1201cb2e2878SEric Blake     bounce_buffer = qemu_try_blockalign(bs,
1202cb2e2878SEric Blake                                         MIN(MIN(max_transfer, cluster_bytes),
1203cb2e2878SEric Blake                                             MAX_BOUNCE_BUFFER));
120461007b31SStefan Hajnoczi     if (bounce_buffer == NULL) {
120561007b31SStefan Hajnoczi         ret = -ENOMEM;
120661007b31SStefan Hajnoczi         goto err;
120761007b31SStefan Hajnoczi     }
120861007b31SStefan Hajnoczi 
1209cb2e2878SEric Blake     while (cluster_bytes) {
1210cb2e2878SEric Blake         int64_t pnum;
121161007b31SStefan Hajnoczi 
1212cb2e2878SEric Blake         ret = bdrv_is_allocated(bs, cluster_offset,
1213cb2e2878SEric Blake                                 MIN(cluster_bytes, max_transfer), &pnum);
1214cb2e2878SEric Blake         if (ret < 0) {
1215cb2e2878SEric Blake             /* Safe to treat errors in querying allocation as if
1216cb2e2878SEric Blake              * unallocated; we'll probably fail again soon on the
1217cb2e2878SEric Blake              * read, but at least that will set a decent errno.
1218cb2e2878SEric Blake              */
1219cb2e2878SEric Blake             pnum = MIN(cluster_bytes, max_transfer);
1220cb2e2878SEric Blake         }
1221cb2e2878SEric Blake 
1222b0ddcbbbSKevin Wolf         /* Stop at EOF if the image ends in the middle of the cluster */
1223b0ddcbbbSKevin Wolf         if (ret == 0 && pnum == 0) {
1224b0ddcbbbSKevin Wolf             assert(progress >= bytes);
1225b0ddcbbbSKevin Wolf             break;
1226b0ddcbbbSKevin Wolf         }
1227b0ddcbbbSKevin Wolf 
1228cb2e2878SEric Blake         assert(skip_bytes < pnum);
1229cb2e2878SEric Blake 
1230cb2e2878SEric Blake         if (ret <= 0) {
1231cb2e2878SEric Blake             /* Must copy-on-read; use the bounce buffer */
1232cb2e2878SEric Blake             iov.iov_base = bounce_buffer;
1233cb2e2878SEric Blake             iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1234cb2e2878SEric Blake             qemu_iovec_init_external(&local_qiov, &iov, 1);
1235cb2e2878SEric Blake 
1236cb2e2878SEric Blake             ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1237cb2e2878SEric Blake                                      &local_qiov, 0);
123861007b31SStefan Hajnoczi             if (ret < 0) {
123961007b31SStefan Hajnoczi                 goto err;
124061007b31SStefan Hajnoczi             }
124161007b31SStefan Hajnoczi 
1242d855ebcdSEric Blake             bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1243c1499a5eSEric Blake             if (drv->bdrv_co_pwrite_zeroes &&
1244cb2e2878SEric Blake                 buffer_is_zero(bounce_buffer, pnum)) {
1245a604fa2bSEric Blake                 /* FIXME: Should we (perhaps conditionally) be setting
1246a604fa2bSEric Blake                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1247a604fa2bSEric Blake                  * that still correctly reads as zero? */
12487adcf59fSMax Reitz                 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
12497adcf59fSMax Reitz                                                BDRV_REQ_WRITE_UNCHANGED);
125061007b31SStefan Hajnoczi             } else {
1251cb2e2878SEric Blake                 /* This does not change the data on the disk, so it is not
1252cb2e2878SEric Blake                  * necessary to flush even in cache=writethrough mode.
125361007b31SStefan Hajnoczi                  */
1254cb2e2878SEric Blake                 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
12557adcf59fSMax Reitz                                           &local_qiov,
12567adcf59fSMax Reitz                                           BDRV_REQ_WRITE_UNCHANGED);
125761007b31SStefan Hajnoczi             }
125861007b31SStefan Hajnoczi 
125961007b31SStefan Hajnoczi             if (ret < 0) {
1260cb2e2878SEric Blake                 /* It might be okay to ignore write errors for guest
1261cb2e2878SEric Blake                  * requests.  If this is a deliberate copy-on-read
1262cb2e2878SEric Blake                  * then we don't want to ignore the error.  Simply
1263cb2e2878SEric Blake                  * report it in all cases.
126461007b31SStefan Hajnoczi                  */
126561007b31SStefan Hajnoczi                 goto err;
126661007b31SStefan Hajnoczi             }
126761007b31SStefan Hajnoczi 
1268cb2e2878SEric Blake             qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
1269cb2e2878SEric Blake                                 pnum - skip_bytes);
1270cb2e2878SEric Blake         } else {
1271cb2e2878SEric Blake             /* Read directly into the destination */
1272cb2e2878SEric Blake             qemu_iovec_init(&local_qiov, qiov->niov);
1273cb2e2878SEric Blake             qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
1274cb2e2878SEric Blake             ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
1275cb2e2878SEric Blake                                      &local_qiov, 0);
1276cb2e2878SEric Blake             qemu_iovec_destroy(&local_qiov);
1277cb2e2878SEric Blake             if (ret < 0) {
1278cb2e2878SEric Blake                 goto err;
1279cb2e2878SEric Blake             }
1280cb2e2878SEric Blake         }
1281cb2e2878SEric Blake 
1282cb2e2878SEric Blake         cluster_offset += pnum;
1283cb2e2878SEric Blake         cluster_bytes -= pnum;
1284cb2e2878SEric Blake         progress += pnum - skip_bytes;
1285cb2e2878SEric Blake         skip_bytes = 0;
1286cb2e2878SEric Blake     }
1287cb2e2878SEric Blake     ret = 0;
128861007b31SStefan Hajnoczi 
128961007b31SStefan Hajnoczi err:
129061007b31SStefan Hajnoczi     qemu_vfree(bounce_buffer);
129161007b31SStefan Hajnoczi     return ret;
129261007b31SStefan Hajnoczi }
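
/*
 * Worked example of the rounding done above (hypothetical geometry):
 * with 64 KiB clusters, a 4 KiB copy-on-read request at offset 70 KiB
 * (71680) is widened to
 *
 *     cluster_offset = 65536    (start rounded down to the cluster)
 *     cluster_bytes  = 65536    (the request fits in one cluster)
 *     skip_bytes     = 6144     (offset - cluster_offset)
 *
 * The loop then reads whole clusters through the bounce buffer, writes
 * them back so the cluster is allocated in the top image, and copies
 * only the pnum - skip_bytes slice the guest asked for into qiov.
 */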
129361007b31SStefan Hajnoczi 
129461007b31SStefan Hajnoczi /*
129561007b31SStefan Hajnoczi  * Forwards an already correctly aligned request to the BlockDriver. This
12961a62d0acSEric Blake  * handles copy on read, zeroing after EOF, and fragmentation of large
12971a62d0acSEric Blake  * reads; any other features must be implemented by the caller.
129861007b31SStefan Hajnoczi  */
129985c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
130061007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
130161007b31SStefan Hajnoczi     int64_t align, QEMUIOVector *qiov, int flags)
130261007b31SStefan Hajnoczi {
130385c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
1304c9d20029SKevin Wolf     int64_t total_bytes, max_bytes;
13051a62d0acSEric Blake     int ret = 0;
13061a62d0acSEric Blake     uint64_t bytes_remaining = bytes;
13071a62d0acSEric Blake     int max_transfer;
130861007b31SStefan Hajnoczi 
130949c07526SKevin Wolf     assert(is_power_of_2(align));
131049c07526SKevin Wolf     assert((offset & (align - 1)) == 0);
131149c07526SKevin Wolf     assert((bytes & (align - 1)) == 0);
131261007b31SStefan Hajnoczi     assert(!qiov || bytes == qiov->size);
1313abb06c5aSDaniel P. Berrange     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
13141a62d0acSEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
13151a62d0acSEric Blake                                    align);
1316a604fa2bSEric Blake 
1317a604fa2bSEric Blake     /* TODO: We would need a per-BDS .supported_read_flags and
1318a604fa2bSEric Blake      * potential fallback support, if we ever implement any read flags
1319a604fa2bSEric Blake      * to pass through to drivers.  For now, there aren't any
1320a604fa2bSEric Blake      * passthrough flags.  */
1321a604fa2bSEric Blake     assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
132261007b31SStefan Hajnoczi 
132361007b31SStefan Hajnoczi     /* Handle Copy on Read and associated serialisation */
132461007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
132561007b31SStefan Hajnoczi         /* If we touch the same cluster it counts as an overlap.  This
132661007b31SStefan Hajnoczi          * guarantees that allocating writes will be serialized and not race
132761007b31SStefan Hajnoczi          * with each other for the same cluster.  For example, in copy-on-read
132861007b31SStefan Hajnoczi          * it ensures that the CoR read and write operations are atomic and
132961007b31SStefan Hajnoczi          * guest writes cannot interleave between them. */
133061007b31SStefan Hajnoczi         mark_request_serialising(req, bdrv_get_cluster_size(bs));
133161007b31SStefan Hajnoczi     }
133261007b31SStefan Hajnoczi 
133309d2f948SVladimir Sementsov-Ogievskiy     /* BDRV_REQ_SERIALISING is only for write operations */
133409d2f948SVladimir Sementsov-Ogievskiy     assert(!(flags & BDRV_REQ_SERIALISING));
133509d2f948SVladimir Sementsov-Ogievskiy 
133661408b25SFam Zheng     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
133761007b31SStefan Hajnoczi         wait_serialising_requests(req);
133861408b25SFam Zheng     }
133961007b31SStefan Hajnoczi 
134061007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
1341d6a644bbSEric Blake         int64_t pnum;
134261007b31SStefan Hajnoczi 
134388e63df2SEric Blake         ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
134461007b31SStefan Hajnoczi         if (ret < 0) {
134561007b31SStefan Hajnoczi             goto out;
134661007b31SStefan Hajnoczi         }
134761007b31SStefan Hajnoczi 
134888e63df2SEric Blake         if (!ret || pnum != bytes) {
134985c97ca7SKevin Wolf             ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
135061007b31SStefan Hajnoczi             goto out;
135161007b31SStefan Hajnoczi         }
135261007b31SStefan Hajnoczi     }
135361007b31SStefan Hajnoczi 
13541a62d0acSEric Blake     /* Forward the request to the BlockDriver, possibly fragmenting it */
135549c07526SKevin Wolf     total_bytes = bdrv_getlength(bs);
135649c07526SKevin Wolf     if (total_bytes < 0) {
135749c07526SKevin Wolf         ret = total_bytes;
135861007b31SStefan Hajnoczi         goto out;
135961007b31SStefan Hajnoczi     }
136061007b31SStefan Hajnoczi 
136149c07526SKevin Wolf     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
13621a62d0acSEric Blake     if (bytes <= max_bytes && bytes <= max_transfer) {
1363166fe960SKevin Wolf         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
13641a62d0acSEric Blake         goto out;
136561007b31SStefan Hajnoczi     }
136661007b31SStefan Hajnoczi 
13671a62d0acSEric Blake     while (bytes_remaining) {
13681a62d0acSEric Blake         int num;
13691a62d0acSEric Blake 
13701a62d0acSEric Blake         if (max_bytes) {
13711a62d0acSEric Blake             QEMUIOVector local_qiov;
13721a62d0acSEric Blake 
13731a62d0acSEric Blake             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
13741a62d0acSEric Blake             assert(num);
13751a62d0acSEric Blake             qemu_iovec_init(&local_qiov, qiov->niov);
13761a62d0acSEric Blake             qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
13771a62d0acSEric Blake 
13781a62d0acSEric Blake             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
13791a62d0acSEric Blake                                      num, &local_qiov, 0);
13801a62d0acSEric Blake             max_bytes -= num;
13811a62d0acSEric Blake             qemu_iovec_destroy(&local_qiov);
13821a62d0acSEric Blake         } else {
13831a62d0acSEric Blake             num = bytes_remaining;
13841a62d0acSEric Blake             ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
13851a62d0acSEric Blake                                     bytes_remaining);
13861a62d0acSEric Blake         }
13871a62d0acSEric Blake         if (ret < 0) {
13881a62d0acSEric Blake             goto out;
13891a62d0acSEric Blake         }
13901a62d0acSEric Blake         bytes_remaining -= num;
139161007b31SStefan Hajnoczi     }
139261007b31SStefan Hajnoczi 
139361007b31SStefan Hajnoczi out:
13941a62d0acSEric Blake     return ret < 0 ? ret : 0;
139561007b31SStefan Hajnoczi }
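
/*
 * Sketch of the fragmentation and zero-fill arithmetic above, with
 * hypothetical numbers: max_transfer = 2 MiB, image size total_bytes =
 * 3 MiB, and an aligned 4 MiB read at offset 0 (so max_bytes = 3 MiB):
 *
 *     iteration 1: num = MIN(4M, MIN(3M, 2M)) = 2M    driver read
 *     iteration 2: num = MIN(2M, MIN(1M, 2M)) = 1M    driver read
 *     iteration 3: max_bytes == 0, num = 1M           qemu_iovec_memset
 *
 * so the tail beyond end-of-file reads back as zeroes without any driver
 * involvement, as the function comment promises.
 */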
139661007b31SStefan Hajnoczi 
139761007b31SStefan Hajnoczi /*
139861007b31SStefan Hajnoczi  * Handle a read request in coroutine context
139961007b31SStefan Hajnoczi  */
1400a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child,
140161007b31SStefan Hajnoczi     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
140261007b31SStefan Hajnoczi     BdrvRequestFlags flags)
140361007b31SStefan Hajnoczi {
1404a03ef88fSKevin Wolf     BlockDriverState *bs = child->bs;
140561007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
140661007b31SStefan Hajnoczi     BdrvTrackedRequest req;
140761007b31SStefan Hajnoczi 
1408a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
140961007b31SStefan Hajnoczi     uint8_t *head_buf = NULL;
141061007b31SStefan Hajnoczi     uint8_t *tail_buf = NULL;
141161007b31SStefan Hajnoczi     QEMUIOVector local_qiov;
141261007b31SStefan Hajnoczi     bool use_local_qiov = false;
141361007b31SStefan Hajnoczi     int ret;
141461007b31SStefan Hajnoczi 
1415f42cf447SDaniel P. Berrange     trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1416f42cf447SDaniel P. Berrange 
141761007b31SStefan Hajnoczi     if (!drv) {
141861007b31SStefan Hajnoczi         return -ENOMEDIUM;
141961007b31SStefan Hajnoczi     }
142061007b31SStefan Hajnoczi 
142161007b31SStefan Hajnoczi     ret = bdrv_check_byte_request(bs, offset, bytes);
142261007b31SStefan Hajnoczi     if (ret < 0) {
142361007b31SStefan Hajnoczi         return ret;
142461007b31SStefan Hajnoczi     }
142561007b31SStefan Hajnoczi 
142699723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
142799723548SPaolo Bonzini 
14289568b511SWen Congyang     /* Don't do copy-on-read when reading data ahead of a write operation */
1429d3faa13eSPaolo Bonzini     if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
143061007b31SStefan Hajnoczi         flags |= BDRV_REQ_COPY_ON_READ;
143161007b31SStefan Hajnoczi     }
143261007b31SStefan Hajnoczi 
143361007b31SStefan Hajnoczi     /* Align read if necessary by padding qiov */
143461007b31SStefan Hajnoczi     if (offset & (align - 1)) {
143561007b31SStefan Hajnoczi         head_buf = qemu_blockalign(bs, align);
143661007b31SStefan Hajnoczi         qemu_iovec_init(&local_qiov, qiov->niov + 2);
143761007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
143861007b31SStefan Hajnoczi         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
143961007b31SStefan Hajnoczi         use_local_qiov = true;
144061007b31SStefan Hajnoczi 
144161007b31SStefan Hajnoczi         bytes += offset & (align - 1);
144261007b31SStefan Hajnoczi         offset = offset & ~(align - 1);
144361007b31SStefan Hajnoczi     }
144461007b31SStefan Hajnoczi 
144561007b31SStefan Hajnoczi     if ((offset + bytes) & (align - 1)) {
144661007b31SStefan Hajnoczi         if (!use_local_qiov) {
144761007b31SStefan Hajnoczi             qemu_iovec_init(&local_qiov, qiov->niov + 1);
144861007b31SStefan Hajnoczi             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
144961007b31SStefan Hajnoczi             use_local_qiov = true;
145061007b31SStefan Hajnoczi         }
145161007b31SStefan Hajnoczi         tail_buf = qemu_blockalign(bs, align);
145261007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, tail_buf,
145361007b31SStefan Hajnoczi                        align - ((offset + bytes) & (align - 1)));
145461007b31SStefan Hajnoczi 
145561007b31SStefan Hajnoczi         bytes = ROUND_UP(bytes, align);
145661007b31SStefan Hajnoczi     }
145761007b31SStefan Hajnoczi 
1458ebde595cSFam Zheng     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
145985c97ca7SKevin Wolf     ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
146061007b31SStefan Hajnoczi                               use_local_qiov ? &local_qiov : qiov,
146161007b31SStefan Hajnoczi                               flags);
146261007b31SStefan Hajnoczi     tracked_request_end(&req);
146399723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
146461007b31SStefan Hajnoczi 
146561007b31SStefan Hajnoczi     if (use_local_qiov) {
146661007b31SStefan Hajnoczi         qemu_iovec_destroy(&local_qiov);
146761007b31SStefan Hajnoczi         qemu_vfree(head_buf);
146861007b31SStefan Hajnoczi         qemu_vfree(tail_buf);
146961007b31SStefan Hajnoczi     }
147061007b31SStefan Hajnoczi 
147161007b31SStefan Hajnoczi     return ret;
147261007b31SStefan Hajnoczi }
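
/*
 * Worked example of the padding above (hypothetical request): with
 * align = 512, offset = 1000 and bytes = 2000,
 *
 *     head: offset & 511 = 488, so a 488-byte head_buf is prepended;
 *           offset becomes 512 and bytes becomes 2488
 *     tail: (512 + 2488) & 511 = 440, so a 72-byte tail_buf is appended
 *           and bytes is rounded up to 2560
 *
 * One aligned read covers 512..3072; the caller's qiov receives exactly
 * its original 1000..3000 window, and the padding buffers are freed.
 */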
147361007b31SStefan Hajnoczi 
1474d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1475f5a5ca79SManos Pitsidianakis     int64_t offset, int bytes, BdrvRequestFlags flags)
147661007b31SStefan Hajnoczi {
147761007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
147861007b31SStefan Hajnoczi     QEMUIOVector qiov;
147961007b31SStefan Hajnoczi     struct iovec iov = {0};
148061007b31SStefan Hajnoczi     int ret = 0;
1481465fe887SEric Blake     bool need_flush = false;
1482443668caSDenis V. Lunev     int head = 0;
1483443668caSDenis V. Lunev     int tail = 0;
148461007b31SStefan Hajnoczi 
1485cf081fcaSEric Blake     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1486a5b8dd2cSEric Blake     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1487a5b8dd2cSEric Blake                         bs->bl.request_alignment);
1488cb2e2878SEric Blake     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1489cf081fcaSEric Blake 
1490d470ad42SMax Reitz     if (!drv) {
1491d470ad42SMax Reitz         return -ENOMEDIUM;
1492d470ad42SMax Reitz     }
1493d470ad42SMax Reitz 
1494b8d0a980SEric Blake     assert(alignment % bs->bl.request_alignment == 0);
1495b8d0a980SEric Blake     head = offset % alignment;
1496f5a5ca79SManos Pitsidianakis     tail = (offset + bytes) % alignment;
1497b8d0a980SEric Blake     max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1498b8d0a980SEric Blake     assert(max_write_zeroes >= bs->bl.request_alignment);
149961007b31SStefan Hajnoczi 
1500f5a5ca79SManos Pitsidianakis     while (bytes > 0 && !ret) {
1501f5a5ca79SManos Pitsidianakis         int num = bytes;
150261007b31SStefan Hajnoczi 
150361007b31SStefan Hajnoczi         /* Align the request.  Block drivers can expect the "bulk" of the
1504443668caSDenis V. Lunev          * request to be aligned, and unaligned requests not to cross cluster
1505443668caSDenis V. Lunev          * boundaries.
150661007b31SStefan Hajnoczi          */
1507443668caSDenis V. Lunev         if (head) {
1508b2f95feeSEric Blake             /* Make a small request up to the first aligned sector. For
1509b2f95feeSEric Blake              * convenience, limit this request to max_transfer even if
1510b2f95feeSEric Blake              * we don't need to fall back to writes.  */
1511f5a5ca79SManos Pitsidianakis             num = MIN(MIN(bytes, max_transfer), alignment - head);
1512b2f95feeSEric Blake             head = (head + num) % alignment;
1513b2f95feeSEric Blake             assert(num < max_write_zeroes);
1514d05aa8bbSEric Blake         } else if (tail && num > alignment) {
1515443668caSDenis V. Lunev             /* Shorten the request to the last aligned sector.  */
1516443668caSDenis V. Lunev             num -= tail;
151761007b31SStefan Hajnoczi         }
151861007b31SStefan Hajnoczi 
151961007b31SStefan Hajnoczi         /* limit request size */
152061007b31SStefan Hajnoczi         if (num > max_write_zeroes) {
152161007b31SStefan Hajnoczi             num = max_write_zeroes;
152261007b31SStefan Hajnoczi         }
152361007b31SStefan Hajnoczi 
152461007b31SStefan Hajnoczi         ret = -ENOTSUP;
152561007b31SStefan Hajnoczi         /* First try the efficient write zeroes operation */
1526d05aa8bbSEric Blake         if (drv->bdrv_co_pwrite_zeroes) {
1527d05aa8bbSEric Blake             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1528d05aa8bbSEric Blake                                              flags & bs->supported_zero_flags);
1529d05aa8bbSEric Blake             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1530d05aa8bbSEric Blake                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1531d05aa8bbSEric Blake                 need_flush = true;
1532d05aa8bbSEric Blake             }
1533465fe887SEric Blake         } else {
1534465fe887SEric Blake             assert(!bs->supported_zero_flags);
153561007b31SStefan Hajnoczi         }
153661007b31SStefan Hajnoczi 
153761007b31SStefan Hajnoczi         if (ret == -ENOTSUP) {
153861007b31SStefan Hajnoczi             /* Fall back to bounce buffer if write zeroes is unsupported */
1539465fe887SEric Blake             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1540465fe887SEric Blake 
1541465fe887SEric Blake             if ((flags & BDRV_REQ_FUA) &&
1542465fe887SEric Blake                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1543465fe887SEric Blake                 /* No need for bdrv_driver_pwritev() to do a fallback
1544465fe887SEric Blake                  * flush on each chunk; use just one at the end */
1545465fe887SEric Blake                 write_flags &= ~BDRV_REQ_FUA;
1546465fe887SEric Blake                 need_flush = true;
1547465fe887SEric Blake             }
15485def6b80SEric Blake             num = MIN(num, max_transfer);
1549d05aa8bbSEric Blake             iov.iov_len = num;
155061007b31SStefan Hajnoczi             if (iov.iov_base == NULL) {
1551d05aa8bbSEric Blake                 iov.iov_base = qemu_try_blockalign(bs, num);
155261007b31SStefan Hajnoczi                 if (iov.iov_base == NULL) {
155361007b31SStefan Hajnoczi                     ret = -ENOMEM;
155461007b31SStefan Hajnoczi                     goto fail;
155561007b31SStefan Hajnoczi                 }
1556d05aa8bbSEric Blake                 memset(iov.iov_base, 0, num);
155761007b31SStefan Hajnoczi             }
155861007b31SStefan Hajnoczi             qemu_iovec_init_external(&qiov, &iov, 1);
155961007b31SStefan Hajnoczi 
1560d05aa8bbSEric Blake             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
156161007b31SStefan Hajnoczi 
156261007b31SStefan Hajnoczi             /* Keep the bounce buffer around if it is big enough for
156361007b31SStefan Hajnoczi              * all future requests.
156461007b31SStefan Hajnoczi              */
15655def6b80SEric Blake             if (num < max_transfer) {
156661007b31SStefan Hajnoczi                 qemu_vfree(iov.iov_base);
156761007b31SStefan Hajnoczi                 iov.iov_base = NULL;
156861007b31SStefan Hajnoczi             }
156961007b31SStefan Hajnoczi         }
157061007b31SStefan Hajnoczi 
1571d05aa8bbSEric Blake         offset += num;
1572f5a5ca79SManos Pitsidianakis         bytes -= num;
157361007b31SStefan Hajnoczi     }
157461007b31SStefan Hajnoczi 
157561007b31SStefan Hajnoczi fail:
1576465fe887SEric Blake     if (ret == 0 && need_flush) {
1577465fe887SEric Blake         ret = bdrv_co_flush(bs);
1578465fe887SEric Blake     }
157961007b31SStefan Hajnoczi     qemu_vfree(iov.iov_base);
158061007b31SStefan Hajnoczi     return ret;
158161007b31SStefan Hajnoczi }
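
/*
 * Worked example of the head/tail splitting above (hypothetical numbers,
 * max_transfer assumed large): alignment = 4096, offset = 2048,
 * bytes = 65536.
 *
 *     pass 1: head = 2048, num = MIN(65536, 4096 - 2048) = 2048
 *     pass 2: head = 0, tail = 2048, num = 63488 - 2048 = 61440
 *     pass 3: num = 2048 (the unaligned tail, under one alignment block)
 *
 * Every pass still tries drv->bdrv_co_pwrite_zeroes first; the split only
 * guarantees that an unaligned request never crosses an alignment
 * boundary, which is what drivers are entitled to expect.
 */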
158261007b31SStefan Hajnoczi 
158385fe2479SFam Zheng static inline int coroutine_fn
158485fe2479SFam Zheng bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
158585fe2479SFam Zheng                           BdrvTrackedRequest *req, int flags)
158685fe2479SFam Zheng {
158785fe2479SFam Zheng     BlockDriverState *bs = child->bs;
158885fe2479SFam Zheng     bool waited;
158985fe2479SFam Zheng     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
159085fe2479SFam Zheng 
159185fe2479SFam Zheng     if (bs->read_only) {
159285fe2479SFam Zheng         return -EPERM;
159385fe2479SFam Zheng     }
159485fe2479SFam Zheng 
159585fe2479SFam Zheng     /* BDRV_REQ_NO_SERIALISING is only for read operations */
159685fe2479SFam Zheng     assert(!(flags & BDRV_REQ_NO_SERIALISING));
159785fe2479SFam Zheng     assert(!(bs->open_flags & BDRV_O_INACTIVE));
159885fe2479SFam Zheng     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
159985fe2479SFam Zheng     assert(!(flags & ~BDRV_REQ_MASK));
160085fe2479SFam Zheng 
160185fe2479SFam Zheng     if (flags & BDRV_REQ_SERIALISING) {
160285fe2479SFam Zheng         mark_request_serialising(req, bdrv_get_cluster_size(bs));
160385fe2479SFam Zheng     }
160485fe2479SFam Zheng 
160585fe2479SFam Zheng     waited = wait_serialising_requests(req);
160685fe2479SFam Zheng 
160785fe2479SFam Zheng     assert(!waited || !req->serialising ||
160885fe2479SFam Zheng            is_request_serialising_and_aligned(req));
160985fe2479SFam Zheng     assert(req->overlap_offset <= offset);
161085fe2479SFam Zheng     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1611cd47d792SFam Zheng     assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
161285fe2479SFam Zheng 
1613cd47d792SFam Zheng     switch (req->type) {
1614cd47d792SFam Zheng     case BDRV_TRACKED_WRITE:
1615cd47d792SFam Zheng     case BDRV_TRACKED_DISCARD:
161685fe2479SFam Zheng         if (flags & BDRV_REQ_WRITE_UNCHANGED) {
161785fe2479SFam Zheng             assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
161885fe2479SFam Zheng         } else {
161985fe2479SFam Zheng             assert(child->perm & BLK_PERM_WRITE);
162085fe2479SFam Zheng         }
1621cd47d792SFam Zheng         return notifier_with_return_list_notify(&bs->before_write_notifiers,
1622cd47d792SFam Zheng                                                 req);
1623cd47d792SFam Zheng     case BDRV_TRACKED_TRUNCATE:
1624cd47d792SFam Zheng         assert(child->perm & BLK_PERM_RESIZE);
1625cd47d792SFam Zheng         return 0;
1626cd47d792SFam Zheng     default:
1627cd47d792SFam Zheng         abort();
1628cd47d792SFam Zheng     }
162985fe2479SFam Zheng }
163085fe2479SFam Zheng 
163185fe2479SFam Zheng static inline void coroutine_fn
163285fe2479SFam Zheng bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
163385fe2479SFam Zheng                          BdrvTrackedRequest *req, int ret)
163485fe2479SFam Zheng {
163585fe2479SFam Zheng     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
163685fe2479SFam Zheng     BlockDriverState *bs = child->bs;
163785fe2479SFam Zheng 
163885fe2479SFam Zheng     atomic_inc(&bs->write_gen);
163985fe2479SFam Zheng 
164000695c27SFam Zheng     /*
164100695c27SFam Zheng      * Discard cannot extend the image, but in error handling cases, such as
164200695c27SFam Zheng      * when reverting a qcow2 cluster allocation, the discarded range can
164300695c27SFam Zheng      * extend past the end of the image file, so we cannot assert about
164400695c27SFam Zheng      * BDRV_TRACKED_DISCARD here.  Instead, just skip it, since semantically
164500695c27SFam Zheng      * a discard request beyond EOF cannot expand the image anyway.
164600695c27SFam Zheng      */
16477f8f03efSFam Zheng     if (ret == 0 &&
1648cd47d792SFam Zheng         (req->type == BDRV_TRACKED_TRUNCATE ||
1649cd47d792SFam Zheng          end_sector > bs->total_sectors) &&
165000695c27SFam Zheng         req->type != BDRV_TRACKED_DISCARD) {
16517f8f03efSFam Zheng         bs->total_sectors = end_sector;
16527f8f03efSFam Zheng         bdrv_parent_cb_resize(bs);
16537f8f03efSFam Zheng         bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
165485fe2479SFam Zheng     }
165500695c27SFam Zheng     if (req->bytes) {
165600695c27SFam Zheng         switch (req->type) {
165700695c27SFam Zheng         case BDRV_TRACKED_WRITE:
165800695c27SFam Zheng             stat64_max(&bs->wr_highest_offset, offset + bytes);
165900695c27SFam Zheng             /* fall through, to set dirty bits */
166000695c27SFam Zheng         case BDRV_TRACKED_DISCARD:
16617f8f03efSFam Zheng             bdrv_set_dirty(bs, offset, bytes);
166200695c27SFam Zheng             break;
166300695c27SFam Zheng         default:
166400695c27SFam Zheng             break;
166500695c27SFam Zheng         }
166600695c27SFam Zheng     }
166785fe2479SFam Zheng }
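
/*
 * Sketch of how the pair above brackets every write-style request (the
 * callers below follow this shape; error paths abbreviated):
 *
 *     ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
 *     if (!ret) {
 *         ret = ...issue the driver request...;
 *     }
 *     bdrv_co_write_req_finish(child, offset, bytes, req, ret);
 *
 * Note that finish runs even on failure: bs->write_gen is bumped and,
 * for write and discard requests, the dirty bitmap is updated for the
 * tracked range, while bs->total_sectors only grows when ret == 0.
 */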
166885fe2479SFam Zheng 
166961007b31SStefan Hajnoczi /*
167004ed95f4SEric Blake  * Forwards an already correctly aligned write request to the BlockDriver,
167104ed95f4SEric Blake  * after possibly fragmenting it.
167261007b31SStefan Hajnoczi  */
167385c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
167461007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1675cff86b38SEric Blake     int64_t align, QEMUIOVector *qiov, int flags)
167661007b31SStefan Hajnoczi {
167785c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
167861007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
167961007b31SStefan Hajnoczi     int ret;
168061007b31SStefan Hajnoczi 
168104ed95f4SEric Blake     uint64_t bytes_remaining = bytes;
168204ed95f4SEric Blake     int max_transfer;
168361007b31SStefan Hajnoczi 
1684d470ad42SMax Reitz     if (!drv) {
1685d470ad42SMax Reitz         return -ENOMEDIUM;
1686d470ad42SMax Reitz     }
1687d470ad42SMax Reitz 
1688d6883bc9SVladimir Sementsov-Ogievskiy     if (bdrv_has_readonly_bitmaps(bs)) {
1689d6883bc9SVladimir Sementsov-Ogievskiy         return -EPERM;
1690d6883bc9SVladimir Sementsov-Ogievskiy     }
1691d6883bc9SVladimir Sementsov-Ogievskiy 
1692cff86b38SEric Blake     assert(is_power_of_2(align));
1693cff86b38SEric Blake     assert((offset & (align - 1)) == 0);
1694cff86b38SEric Blake     assert((bytes & (align - 1)) == 0);
169561007b31SStefan Hajnoczi     assert(!qiov || bytes == qiov->size);
169604ed95f4SEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
169704ed95f4SEric Blake                                    align);
169861007b31SStefan Hajnoczi 
169985fe2479SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
170061007b31SStefan Hajnoczi 
170161007b31SStefan Hajnoczi     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1702c1499a5eSEric Blake         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
170361007b31SStefan Hajnoczi         qemu_iovec_is_zero(qiov)) {
170461007b31SStefan Hajnoczi         flags |= BDRV_REQ_ZERO_WRITE;
170561007b31SStefan Hajnoczi         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
170661007b31SStefan Hajnoczi             flags |= BDRV_REQ_MAY_UNMAP;
170761007b31SStefan Hajnoczi         }
170861007b31SStefan Hajnoczi     }
170961007b31SStefan Hajnoczi 
171061007b31SStefan Hajnoczi     if (ret < 0) {
171161007b31SStefan Hajnoczi         /* Do nothing, write notifier decided to fail this request */
171261007b31SStefan Hajnoczi     } else if (flags & BDRV_REQ_ZERO_WRITE) {
17139a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
17149896c876SKevin Wolf         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
17153ea1a091SPavel Butsykin     } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
17163ea1a091SPavel Butsykin         ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
171704ed95f4SEric Blake     } else if (bytes <= max_transfer) {
17189a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV);
171978a07294SKevin Wolf         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
172004ed95f4SEric Blake     } else {
172104ed95f4SEric Blake         bdrv_debug_event(bs, BLKDBG_PWRITEV);
172204ed95f4SEric Blake         while (bytes_remaining) {
172304ed95f4SEric Blake             int num = MIN(bytes_remaining, max_transfer);
172404ed95f4SEric Blake             QEMUIOVector local_qiov;
172504ed95f4SEric Blake             int local_flags = flags;
172604ed95f4SEric Blake 
172704ed95f4SEric Blake             assert(num);
172804ed95f4SEric Blake             if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
172904ed95f4SEric Blake                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
173004ed95f4SEric Blake                 /* If FUA is going to be emulated by flush, we only
173104ed95f4SEric Blake                  * need to flush on the last iteration */
173204ed95f4SEric Blake                 local_flags &= ~BDRV_REQ_FUA;
173304ed95f4SEric Blake             }
173404ed95f4SEric Blake             qemu_iovec_init(&local_qiov, qiov->niov);
173504ed95f4SEric Blake             qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
173604ed95f4SEric Blake 
173704ed95f4SEric Blake             ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
173804ed95f4SEric Blake                                       num, &local_qiov, local_flags);
173904ed95f4SEric Blake             qemu_iovec_destroy(&local_qiov);
174004ed95f4SEric Blake             if (ret < 0) {
174104ed95f4SEric Blake                 break;
174204ed95f4SEric Blake             }
174304ed95f4SEric Blake             bytes_remaining -= num;
174404ed95f4SEric Blake         }
174561007b31SStefan Hajnoczi     }
17469a4f4c31SKevin Wolf     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
174761007b31SStefan Hajnoczi 
174861007b31SStefan Hajnoczi     if (ret >= 0) {
174904ed95f4SEric Blake         ret = 0;
175061007b31SStefan Hajnoczi     }
175185fe2479SFam Zheng     bdrv_co_write_req_finish(child, offset, bytes, req, ret);
175261007b31SStefan Hajnoczi 
175361007b31SStefan Hajnoczi     return ret;
175461007b31SStefan Hajnoczi }
175561007b31SStefan Hajnoczi 
175685c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
17579eeb6dd1SFam Zheng                                                 int64_t offset,
17589eeb6dd1SFam Zheng                                                 unsigned int bytes,
17599eeb6dd1SFam Zheng                                                 BdrvRequestFlags flags,
17609eeb6dd1SFam Zheng                                                 BdrvTrackedRequest *req)
17619eeb6dd1SFam Zheng {
176285c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
17639eeb6dd1SFam Zheng     uint8_t *buf = NULL;
17649eeb6dd1SFam Zheng     QEMUIOVector local_qiov;
17659eeb6dd1SFam Zheng     struct iovec iov;
1766a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
17679eeb6dd1SFam Zheng     unsigned int head_padding_bytes, tail_padding_bytes;
17689eeb6dd1SFam Zheng     int ret = 0;
17699eeb6dd1SFam Zheng 
17709eeb6dd1SFam Zheng     head_padding_bytes = offset & (align - 1);
1771f13ce1beSDenis V. Lunev     tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
17729eeb6dd1SFam Zheng 
17739eeb6dd1SFam Zheng 
17749eeb6dd1SFam Zheng     assert(flags & BDRV_REQ_ZERO_WRITE);
17759eeb6dd1SFam Zheng     if (head_padding_bytes || tail_padding_bytes) {
17769eeb6dd1SFam Zheng         buf = qemu_blockalign(bs, align);
17779eeb6dd1SFam Zheng         iov = (struct iovec) {
17789eeb6dd1SFam Zheng             .iov_base   = buf,
17799eeb6dd1SFam Zheng             .iov_len    = align,
17809eeb6dd1SFam Zheng         };
17819eeb6dd1SFam Zheng         qemu_iovec_init_external(&local_qiov, &iov, 1);
17829eeb6dd1SFam Zheng     }
17839eeb6dd1SFam Zheng     if (head_padding_bytes) {
17849eeb6dd1SFam Zheng         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
17859eeb6dd1SFam Zheng 
17869eeb6dd1SFam Zheng         /* RMW the unaligned part before head. */
17879eeb6dd1SFam Zheng         mark_request_serialising(req, align);
17889eeb6dd1SFam Zheng         wait_serialising_requests(req);
17899a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
179085c97ca7SKevin Wolf         ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
17919eeb6dd1SFam Zheng                                   align, &local_qiov, 0);
17929eeb6dd1SFam Zheng         if (ret < 0) {
17939eeb6dd1SFam Zheng             goto fail;
17949eeb6dd1SFam Zheng         }
17959a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
17969eeb6dd1SFam Zheng 
17979eeb6dd1SFam Zheng         memset(buf + head_padding_bytes, 0, zero_bytes);
179885c97ca7SKevin Wolf         ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1799cff86b38SEric Blake                                    align, &local_qiov,
18009eeb6dd1SFam Zheng                                    flags & ~BDRV_REQ_ZERO_WRITE);
18019eeb6dd1SFam Zheng         if (ret < 0) {
18029eeb6dd1SFam Zheng             goto fail;
18039eeb6dd1SFam Zheng         }
18049eeb6dd1SFam Zheng         offset += zero_bytes;
18059eeb6dd1SFam Zheng         bytes -= zero_bytes;
18069eeb6dd1SFam Zheng     }
18079eeb6dd1SFam Zheng 
18089eeb6dd1SFam Zheng     assert(!bytes || (offset & (align - 1)) == 0);
18099eeb6dd1SFam Zheng     if (bytes >= align) {
18109eeb6dd1SFam Zheng         /* Write the aligned part in the middle. */
18119eeb6dd1SFam Zheng         uint64_t aligned_bytes = bytes & ~(align - 1);
181285c97ca7SKevin Wolf         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
18139eeb6dd1SFam Zheng                                    NULL, flags);
18149eeb6dd1SFam Zheng         if (ret < 0) {
18159eeb6dd1SFam Zheng             goto fail;
18169eeb6dd1SFam Zheng         }
18179eeb6dd1SFam Zheng         bytes -= aligned_bytes;
18189eeb6dd1SFam Zheng         offset += aligned_bytes;
18199eeb6dd1SFam Zheng     }
18209eeb6dd1SFam Zheng 
18219eeb6dd1SFam Zheng     assert(!bytes || (offset & (align - 1)) == 0);
18229eeb6dd1SFam Zheng     if (bytes) {
18239eeb6dd1SFam Zheng         assert(align == tail_padding_bytes + bytes);
18249eeb6dd1SFam Zheng         /* RMW the unaligned part after tail. */
18259eeb6dd1SFam Zheng         mark_request_serialising(req, align);
18269eeb6dd1SFam Zheng         wait_serialising_requests(req);
18279a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
182885c97ca7SKevin Wolf         ret = bdrv_aligned_preadv(child, req, offset, align,
18299eeb6dd1SFam Zheng                                   align, &local_qiov, 0);
18309eeb6dd1SFam Zheng         if (ret < 0) {
18319eeb6dd1SFam Zheng             goto fail;
18329eeb6dd1SFam Zheng         }
18339a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
18349eeb6dd1SFam Zheng 
18359eeb6dd1SFam Zheng         memset(buf, 0, bytes);
183685c97ca7SKevin Wolf         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
18379eeb6dd1SFam Zheng                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
18389eeb6dd1SFam Zheng     }
18399eeb6dd1SFam Zheng fail:
18409eeb6dd1SFam Zheng     qemu_vfree(buf);
18419eeb6dd1SFam Zheng     return ret;
18429eeb6dd1SFam Zheng 
18439eeb6dd1SFam Zheng }
18449eeb6dd1SFam Zheng 
184561007b31SStefan Hajnoczi /*
184661007b31SStefan Hajnoczi  * Handle a write request in coroutine context
184761007b31SStefan Hajnoczi  */
1848a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
184961007b31SStefan Hajnoczi     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
185061007b31SStefan Hajnoczi     BdrvRequestFlags flags)
185161007b31SStefan Hajnoczi {
1852a03ef88fSKevin Wolf     BlockDriverState *bs = child->bs;
185361007b31SStefan Hajnoczi     BdrvTrackedRequest req;
1854a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
185561007b31SStefan Hajnoczi     uint8_t *head_buf = NULL;
185661007b31SStefan Hajnoczi     uint8_t *tail_buf = NULL;
185761007b31SStefan Hajnoczi     QEMUIOVector local_qiov;
185861007b31SStefan Hajnoczi     bool use_local_qiov = false;
185961007b31SStefan Hajnoczi     int ret;
186061007b31SStefan Hajnoczi 
1861f42cf447SDaniel P. Berrange     trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
1862f42cf447SDaniel P. Berrange 
186361007b31SStefan Hajnoczi     if (!bs->drv) {
186461007b31SStefan Hajnoczi         return -ENOMEDIUM;
186561007b31SStefan Hajnoczi     }
186661007b31SStefan Hajnoczi 
186761007b31SStefan Hajnoczi     ret = bdrv_check_byte_request(bs, offset, bytes);
186861007b31SStefan Hajnoczi     if (ret < 0) {
186961007b31SStefan Hajnoczi         return ret;
187061007b31SStefan Hajnoczi     }
187161007b31SStefan Hajnoczi 
187299723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
187361007b31SStefan Hajnoczi     /*
187461007b31SStefan Hajnoczi      * Align write if necessary by performing a read-modify-write cycle.
187561007b31SStefan Hajnoczi      * Pad qiov with the read parts and be sure to have a tracked request not
187661007b31SStefan Hajnoczi      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
187761007b31SStefan Hajnoczi      */
1878ebde595cSFam Zheng     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
187961007b31SStefan Hajnoczi 
188018a59f03SAnton Nefedov     if (flags & BDRV_REQ_ZERO_WRITE) {
188185c97ca7SKevin Wolf         ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
18829eeb6dd1SFam Zheng         goto out;
18839eeb6dd1SFam Zheng     }
18849eeb6dd1SFam Zheng 
188561007b31SStefan Hajnoczi     if (offset & (align - 1)) {
188661007b31SStefan Hajnoczi         QEMUIOVector head_qiov;
188761007b31SStefan Hajnoczi         struct iovec head_iov;
188861007b31SStefan Hajnoczi 
188961007b31SStefan Hajnoczi         mark_request_serialising(&req, align);
189061007b31SStefan Hajnoczi         wait_serialising_requests(&req);
189161007b31SStefan Hajnoczi 
189261007b31SStefan Hajnoczi         head_buf = qemu_blockalign(bs, align);
189361007b31SStefan Hajnoczi         head_iov = (struct iovec) {
189461007b31SStefan Hajnoczi             .iov_base   = head_buf,
189561007b31SStefan Hajnoczi             .iov_len    = align,
189661007b31SStefan Hajnoczi         };
189761007b31SStefan Hajnoczi         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
189861007b31SStefan Hajnoczi 
18999a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
190085c97ca7SKevin Wolf         ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
190161007b31SStefan Hajnoczi                                   align, &head_qiov, 0);
190261007b31SStefan Hajnoczi         if (ret < 0) {
190361007b31SStefan Hajnoczi             goto fail;
190461007b31SStefan Hajnoczi         }
19059a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
190661007b31SStefan Hajnoczi 
190761007b31SStefan Hajnoczi         qemu_iovec_init(&local_qiov, qiov->niov + 2);
190861007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
190961007b31SStefan Hajnoczi         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
191061007b31SStefan Hajnoczi         use_local_qiov = true;
191161007b31SStefan Hajnoczi 
191261007b31SStefan Hajnoczi         bytes += offset & (align - 1);
191361007b31SStefan Hajnoczi         offset = offset & ~(align - 1);
1914117bc3faSPeter Lieven 
1915117bc3faSPeter Lieven         /* We have read the tail already if the request is smaller
1916117bc3faSPeter Lieven          * than one aligned block.
1917117bc3faSPeter Lieven          */
1918117bc3faSPeter Lieven         if (bytes < align) {
1919117bc3faSPeter Lieven             qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1920117bc3faSPeter Lieven             bytes = align;
1921117bc3faSPeter Lieven         }
192261007b31SStefan Hajnoczi     }
192361007b31SStefan Hajnoczi 
192461007b31SStefan Hajnoczi     if ((offset + bytes) & (align - 1)) {
192561007b31SStefan Hajnoczi         QEMUIOVector tail_qiov;
192661007b31SStefan Hajnoczi         struct iovec tail_iov;
192761007b31SStefan Hajnoczi         size_t tail_bytes;
192861007b31SStefan Hajnoczi         bool waited;
192961007b31SStefan Hajnoczi 
193061007b31SStefan Hajnoczi         mark_request_serialising(&req, align);
193161007b31SStefan Hajnoczi         waited = wait_serialising_requests(&req);
193261007b31SStefan Hajnoczi         assert(!waited || !use_local_qiov);
193361007b31SStefan Hajnoczi 
193461007b31SStefan Hajnoczi         tail_buf = qemu_blockalign(bs, align);
193561007b31SStefan Hajnoczi         tail_iov = (struct iovec) {
193661007b31SStefan Hajnoczi             .iov_base   = tail_buf,
193761007b31SStefan Hajnoczi             .iov_len    = align,
193861007b31SStefan Hajnoczi         };
193961007b31SStefan Hajnoczi         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
194061007b31SStefan Hajnoczi 
19419a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
194285c97ca7SKevin Wolf         ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
194385c97ca7SKevin Wolf                                   align, align, &tail_qiov, 0);
194461007b31SStefan Hajnoczi         if (ret < 0) {
194561007b31SStefan Hajnoczi             goto fail;
194661007b31SStefan Hajnoczi         }
19479a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
194861007b31SStefan Hajnoczi 
194961007b31SStefan Hajnoczi         if (!use_local_qiov) {
195061007b31SStefan Hajnoczi             qemu_iovec_init(&local_qiov, qiov->niov + 1);
195161007b31SStefan Hajnoczi             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
195261007b31SStefan Hajnoczi             use_local_qiov = true;
195361007b31SStefan Hajnoczi         }
195461007b31SStefan Hajnoczi 
195561007b31SStefan Hajnoczi         tail_bytes = (offset + bytes) & (align - 1);
195661007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
195761007b31SStefan Hajnoczi 
195861007b31SStefan Hajnoczi         bytes = ROUND_UP(bytes, align);
195961007b31SStefan Hajnoczi     }
196061007b31SStefan Hajnoczi 
196185c97ca7SKevin Wolf     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
196261007b31SStefan Hajnoczi                                use_local_qiov ? &local_qiov : qiov,
196361007b31SStefan Hajnoczi                                flags);
196461007b31SStefan Hajnoczi 
196561007b31SStefan Hajnoczi fail:
196661007b31SStefan Hajnoczi 
196761007b31SStefan Hajnoczi     if (use_local_qiov) {
196861007b31SStefan Hajnoczi         qemu_iovec_destroy(&local_qiov);
196961007b31SStefan Hajnoczi     }
197061007b31SStefan Hajnoczi     qemu_vfree(head_buf);
197161007b31SStefan Hajnoczi     qemu_vfree(tail_buf);
19729eeb6dd1SFam Zheng out:
19739eeb6dd1SFam Zheng     tracked_request_end(&req);
197499723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
197561007b31SStefan Hajnoczi     return ret;
197661007b31SStefan Hajnoczi }
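
/*
 * The RMW above mirrors the read-side padding: each unaligned edge is
 * first read in full as an aligned block, the block is spliced around the
 * guest data in local_qiov, and one widened, aligned request is passed to
 * bdrv_aligned_pwritev().  Marking the request serialising before each
 * padding read is what prevents a concurrent write from slipping in
 * between the read and the write-back of the same block.
 */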
197761007b31SStefan Hajnoczi 
1978a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1979f5a5ca79SManos Pitsidianakis                                        int bytes, BdrvRequestFlags flags)
198061007b31SStefan Hajnoczi {
1981f5a5ca79SManos Pitsidianakis     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
198261007b31SStefan Hajnoczi 
1983a03ef88fSKevin Wolf     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
198461007b31SStefan Hajnoczi         flags &= ~BDRV_REQ_MAY_UNMAP;
198561007b31SStefan Hajnoczi     }
198661007b31SStefan Hajnoczi 
1987f5a5ca79SManos Pitsidianakis     return bdrv_co_pwritev(child, offset, bytes, NULL,
198861007b31SStefan Hajnoczi                            BDRV_REQ_ZERO_WRITE | flags);
198961007b31SStefan Hajnoczi }
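
/*
 * Illustrative call (hypothetical values): zero a 1 MiB region, letting
 * the driver unmap it when the image was opened with BDRV_O_UNMAP;
 * otherwise BDRV_REQ_MAY_UNMAP is stripped above and the range is merely
 * guaranteed to read back as zeroes:
 *
 *     ret = bdrv_co_pwrite_zeroes(child, 0, 1048576, BDRV_REQ_MAY_UNMAP);
 */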
199061007b31SStefan Hajnoczi 
19914085f5c7SJohn Snow /*
19924085f5c7SJohn Snow  * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
19934085f5c7SJohn Snow  */
19944085f5c7SJohn Snow int bdrv_flush_all(void)
19954085f5c7SJohn Snow {
19964085f5c7SJohn Snow     BdrvNextIterator it;
19974085f5c7SJohn Snow     BlockDriverState *bs = NULL;
19984085f5c7SJohn Snow     int result = 0;
19994085f5c7SJohn Snow 
20004085f5c7SJohn Snow     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
20014085f5c7SJohn Snow         AioContext *aio_context = bdrv_get_aio_context(bs);
20024085f5c7SJohn Snow         int ret;
20034085f5c7SJohn Snow 
20044085f5c7SJohn Snow         aio_context_acquire(aio_context);
20054085f5c7SJohn Snow         ret = bdrv_flush(bs);
20064085f5c7SJohn Snow         if (ret < 0 && !result) {
20074085f5c7SJohn Snow             result = ret;
20084085f5c7SJohn Snow         }
20094085f5c7SJohn Snow         aio_context_release(aio_context);
20104085f5c7SJohn Snow     }
20114085f5c7SJohn Snow 
20124085f5c7SJohn Snow     return result;
20134085f5c7SJohn Snow }
20144085f5c7SJohn Snow 
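/*
 * The loop above is the file's standard whole-tree iteration: visit
 * every node, bracket each per-node call with its AioContext lock, and
 * remember only the first error.  A condensed sketch of the same
 * pattern for a hypothetical per-node operation:
 */
static int example_for_each_bs(int (*fn)(BlockDriverState *bs))
{
    BdrvNextIterator it;
    BlockDriverState *bs;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = fn(bs);
        aio_context_release(aio_context);
        if (ret < 0 && !result) {
            result = ret;
        }
    }
    return result;
}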
20154085f5c7SJohn Snow 
20164bcd936eSEric Blake typedef struct BdrvCoBlockStatusData {
201761007b31SStefan Hajnoczi     BlockDriverState *bs;
201861007b31SStefan Hajnoczi     BlockDriverState *base;
2019c9ce8c4dSEric Blake     bool want_zero;
20204bcd936eSEric Blake     int64_t offset;
20214bcd936eSEric Blake     int64_t bytes;
20224bcd936eSEric Blake     int64_t *pnum;
20234bcd936eSEric Blake     int64_t *map;
2024c9ce8c4dSEric Blake     BlockDriverState **file;
20254bcd936eSEric Blake     int ret;
202661007b31SStefan Hajnoczi     bool done;
20274bcd936eSEric Blake } BdrvCoBlockStatusData;
202861007b31SStefan Hajnoczi 
20293e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
20303e4d0e72SEric Blake                                                 bool want_zero,
20313e4d0e72SEric Blake                                                 int64_t offset,
20323e4d0e72SEric Blake                                                 int64_t bytes,
20333e4d0e72SEric Blake                                                 int64_t *pnum,
20343e4d0e72SEric Blake                                                 int64_t *map,
2035f7cc69b3SManos Pitsidianakis                                                 BlockDriverState **file)
2036f7cc69b3SManos Pitsidianakis {
2037f7cc69b3SManos Pitsidianakis     assert(bs->file && bs->file->bs);
20383e4d0e72SEric Blake     *pnum = bytes;
20393e4d0e72SEric Blake     *map = offset;
2040f7cc69b3SManos Pitsidianakis     *file = bs->file->bs;
20413e4d0e72SEric Blake     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2042f7cc69b3SManos Pitsidianakis }
2043f7cc69b3SManos Pitsidianakis 
20443e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
20453e4d0e72SEric Blake                                                    bool want_zero,
20463e4d0e72SEric Blake                                                    int64_t offset,
20473e4d0e72SEric Blake                                                    int64_t bytes,
20483e4d0e72SEric Blake                                                    int64_t *pnum,
20493e4d0e72SEric Blake                                                    int64_t *map,
2050f7cc69b3SManos Pitsidianakis                                                    BlockDriverState **file)
2051f7cc69b3SManos Pitsidianakis {
2052f7cc69b3SManos Pitsidianakis     assert(bs->backing && bs->backing->bs);
20533e4d0e72SEric Blake     *pnum = bytes;
20543e4d0e72SEric Blake     *map = offset;
2055f7cc69b3SManos Pitsidianakis     *file = bs->backing->bs;
20563e4d0e72SEric Blake     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2057f7cc69b3SManos Pitsidianakis }
2058f7cc69b3SManos Pitsidianakis 
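/*
 * Hypothetical wiring (sketch, not in this file): a driver that stores
 * all of its data in bs->file can forward block-status queries
 * wholesale by pointing its callback at one of the helpers above.
 */
static BlockDriver example_passthrough_driver = {
    .format_name          = "example-passthrough",
    .bdrv_co_block_status = bdrv_co_block_status_from_file,
};
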
205961007b31SStefan Hajnoczi /*
206061007b31SStefan Hajnoczi  * Returns the allocation status of the specified byte range.
206161007b31SStefan Hajnoczi  * Drivers that do not implement the functionality are assumed not to support
206261007b31SStefan Hajnoczi  * backing files, hence the entire range is reported as allocated.
206361007b31SStefan Hajnoczi  *
206486a3d5c6SEric Blake  * If 'want_zero' is true, the caller is querying for mapping
206586a3d5c6SEric Blake  * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
206686a3d5c6SEric Blake  * _ZERO where possible; otherwise, the result favors larger 'pnum',
206786a3d5c6SEric Blake  * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2068c9ce8c4dSEric Blake  *
20692e8bc787SEric Blake  * If 'offset' is beyond the end of the disk image the return value is
2070fb0d8654SEric Blake  * BDRV_BLOCK_EOF and 'pnum' is set to 0.
207161007b31SStefan Hajnoczi  *
20722e8bc787SEric Blake  * 'bytes' is the max value 'pnum' should be set to.  If 'bytes' extends
2073fb0d8654SEric Blake  * beyond the end of the disk image it will be clamped; if 'pnum' reaches
2074fb0d8654SEric Blake  * the end of the image, the returned value will include BDRV_BLOCK_EOF.
207567a0fd2aSFam Zheng  *
20762e8bc787SEric Blake  * 'pnum' is set to the number of bytes (including and immediately
20772e8bc787SEric Blake  * following the specified offset) that are easily known to be in the
20782e8bc787SEric Blake  * same allocated/unallocated state.  Note that a second call starting
20792e8bc787SEric Blake  * at the original offset plus returned pnum may have the same status.
20802e8bc787SEric Blake  * The returned value is non-zero on success except at end-of-file.
20812e8bc787SEric Blake  *
20822e8bc787SEric Blake  * Returns negative errno on failure.  Otherwise, if the
20832e8bc787SEric Blake  * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
20842e8bc787SEric Blake  * set to the host mapping and BDS corresponding to the guest offset.
208561007b31SStefan Hajnoczi  */
20862e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
2087c9ce8c4dSEric Blake                                              bool want_zero,
20882e8bc787SEric Blake                                              int64_t offset, int64_t bytes,
20892e8bc787SEric Blake                                              int64_t *pnum, int64_t *map,
209067a0fd2aSFam Zheng                                              BlockDriverState **file)
209161007b31SStefan Hajnoczi {
20922e8bc787SEric Blake     int64_t total_size;
20932e8bc787SEric Blake     int64_t n; /* bytes */
2094efa6e2edSEric Blake     int ret;
20952e8bc787SEric Blake     int64_t local_map = 0;
2096298a1665SEric Blake     BlockDriverState *local_file = NULL;
2097efa6e2edSEric Blake     int64_t aligned_offset, aligned_bytes;
2098efa6e2edSEric Blake     uint32_t align;
209961007b31SStefan Hajnoczi 
2100298a1665SEric Blake     assert(pnum);
2101298a1665SEric Blake     *pnum = 0;
21022e8bc787SEric Blake     total_size = bdrv_getlength(bs);
21032e8bc787SEric Blake     if (total_size < 0) {
21042e8bc787SEric Blake         ret = total_size;
2105298a1665SEric Blake         goto early_out;
210661007b31SStefan Hajnoczi     }
210761007b31SStefan Hajnoczi 
21082e8bc787SEric Blake     if (offset >= total_size) {
2109298a1665SEric Blake         ret = BDRV_BLOCK_EOF;
2110298a1665SEric Blake         goto early_out;
211161007b31SStefan Hajnoczi     }
21122e8bc787SEric Blake     if (!bytes) {
2113298a1665SEric Blake         ret = 0;
2114298a1665SEric Blake         goto early_out;
21159cdcfd9fSEric Blake     }
211661007b31SStefan Hajnoczi 
21172e8bc787SEric Blake     n = total_size - offset;
21182e8bc787SEric Blake     if (n < bytes) {
21192e8bc787SEric Blake         bytes = n;
212061007b31SStefan Hajnoczi     }
212161007b31SStefan Hajnoczi 
2122d470ad42SMax Reitz     /* Must be non-NULL or bdrv_getlength() would have failed */
2123d470ad42SMax Reitz     assert(bs->drv);
2124636cb512SEric Blake     if (!bs->drv->bdrv_co_block_status) {
21252e8bc787SEric Blake         *pnum = bytes;
212661007b31SStefan Hajnoczi         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
21272e8bc787SEric Blake         if (offset + bytes == total_size) {
2128fb0d8654SEric Blake             ret |= BDRV_BLOCK_EOF;
2129fb0d8654SEric Blake         }
213061007b31SStefan Hajnoczi         if (bs->drv->protocol_name) {
21312e8bc787SEric Blake             ret |= BDRV_BLOCK_OFFSET_VALID;
21322e8bc787SEric Blake             local_map = offset;
2133298a1665SEric Blake             local_file = bs;
213461007b31SStefan Hajnoczi         }
2135298a1665SEric Blake         goto early_out;
213661007b31SStefan Hajnoczi     }
213761007b31SStefan Hajnoczi 
213899723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
2139efa6e2edSEric Blake 
2140efa6e2edSEric Blake     /* Round out to request_alignment boundaries */
214186a3d5c6SEric Blake     align = bs->bl.request_alignment;
2142efa6e2edSEric Blake     aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2143efa6e2edSEric Blake     aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2144efa6e2edSEric Blake 
214586a3d5c6SEric Blake     ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
214686a3d5c6SEric Blake                                         aligned_bytes, pnum, &local_map,
214786a3d5c6SEric Blake                                         &local_file);
214886a3d5c6SEric Blake     if (ret < 0) {
214986a3d5c6SEric Blake         *pnum = 0;
215086a3d5c6SEric Blake         goto out;
215186a3d5c6SEric Blake     }
2152efa6e2edSEric Blake 
2153efa6e2edSEric Blake     /*
2154636cb512SEric Blake      * The driver's result must be a non-zero multiple of request_alignment.
2155efa6e2edSEric Blake      * Clamp pnum and adjust map to original request.
2156efa6e2edSEric Blake      */
2157636cb512SEric Blake     assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2158636cb512SEric Blake            align > offset - aligned_offset);
2159efa6e2edSEric Blake     *pnum -= offset - aligned_offset;
2160efa6e2edSEric Blake     if (*pnum > bytes) {
2161efa6e2edSEric Blake         *pnum = bytes;
2162efa6e2edSEric Blake     }
2163efa6e2edSEric Blake     if (ret & BDRV_BLOCK_OFFSET_VALID) {
2164efa6e2edSEric Blake         local_map += offset - aligned_offset;
2165efa6e2edSEric Blake     }
216661007b31SStefan Hajnoczi 
216761007b31SStefan Hajnoczi     if (ret & BDRV_BLOCK_RAW) {
2168298a1665SEric Blake         assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
21692e8bc787SEric Blake         ret = bdrv_co_block_status(local_file, want_zero, local_map,
21702e8bc787SEric Blake                                    *pnum, pnum, &local_map, &local_file);
217199723548SPaolo Bonzini         goto out;
217261007b31SStefan Hajnoczi     }
217361007b31SStefan Hajnoczi 
217461007b31SStefan Hajnoczi     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
217561007b31SStefan Hajnoczi         ret |= BDRV_BLOCK_ALLOCATED;
2176c9ce8c4dSEric Blake     } else if (want_zero) {
217761007b31SStefan Hajnoczi         if (bdrv_unallocated_blocks_are_zero(bs)) {
217861007b31SStefan Hajnoczi             ret |= BDRV_BLOCK_ZERO;
2179760e0063SKevin Wolf         } else if (bs->backing) {
2180760e0063SKevin Wolf             BlockDriverState *bs2 = bs->backing->bs;
21812e8bc787SEric Blake             int64_t size2 = bdrv_getlength(bs2);
2182c9ce8c4dSEric Blake 
21832e8bc787SEric Blake             if (size2 >= 0 && offset >= size2) {
218461007b31SStefan Hajnoczi                 ret |= BDRV_BLOCK_ZERO;
218561007b31SStefan Hajnoczi             }
218661007b31SStefan Hajnoczi         }
218761007b31SStefan Hajnoczi     }
218861007b31SStefan Hajnoczi 
2189c9ce8c4dSEric Blake     if (want_zero && local_file && local_file != bs &&
219061007b31SStefan Hajnoczi         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
219161007b31SStefan Hajnoczi         (ret & BDRV_BLOCK_OFFSET_VALID)) {
21922e8bc787SEric Blake         int64_t file_pnum;
21932e8bc787SEric Blake         int ret2;
219461007b31SStefan Hajnoczi 
21952e8bc787SEric Blake         ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
21962e8bc787SEric Blake                                     *pnum, &file_pnum, NULL, NULL);
219761007b31SStefan Hajnoczi         if (ret2 >= 0) {
219861007b31SStefan Hajnoczi             /* Ignore errors.  This is just providing extra information;
219961007b31SStefan Hajnoczi              * it is useful but not necessary.
220061007b31SStefan Hajnoczi              */
2201c61e684eSEric Blake             if (ret2 & BDRV_BLOCK_EOF &&
2202c61e684eSEric Blake                 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2203c61e684eSEric Blake                 /*
2204c61e684eSEric Blake                  * It is valid for the format block driver to read
2205c61e684eSEric Blake                  * beyond the end of the underlying file's current
2206c61e684eSEric Blake                  * size; such areas read as zero.
2207c61e684eSEric Blake                  */
220861007b31SStefan Hajnoczi                 ret |= BDRV_BLOCK_ZERO;
220961007b31SStefan Hajnoczi             } else {
221061007b31SStefan Hajnoczi                 /* Limit request to the range reported by the protocol driver */
221161007b31SStefan Hajnoczi                 *pnum = file_pnum;
221261007b31SStefan Hajnoczi                 ret |= (ret2 & BDRV_BLOCK_ZERO);
221361007b31SStefan Hajnoczi             }
221461007b31SStefan Hajnoczi         }
221561007b31SStefan Hajnoczi     }
221661007b31SStefan Hajnoczi 
221799723548SPaolo Bonzini out:
221899723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
22192e8bc787SEric Blake     if (ret >= 0 && offset + *pnum == total_size) {
2220fb0d8654SEric Blake         ret |= BDRV_BLOCK_EOF;
2221fb0d8654SEric Blake     }
2222298a1665SEric Blake early_out:
2223298a1665SEric Blake     if (file) {
2224298a1665SEric Blake         *file = local_file;
2225298a1665SEric Blake     }
22262e8bc787SEric Blake     if (map) {
22272e8bc787SEric Blake         *map = local_map;
22282e8bc787SEric Blake     }
222961007b31SStefan Hajnoczi     return ret;
223061007b31SStefan Hajnoczi }
223161007b31SStefan Hajnoczi 
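/*
 * Sketch (not in the original file): decoding a status word returned by
 * bdrv_co_block_status() above.  The BDRV_BLOCK_* values are bit flags,
 * so several may be set at once (e.g. DATA | OFFSET_VALID for a mapped
 * extent).
 */
static const char *example_status_kind(int ret)
{
    if (ret < 0) {
        return "error";
    }
    if (ret & BDRV_BLOCK_ZERO) {
        return "reads as zeroes";
    }
    if (ret & BDRV_BLOCK_DATA) {
        return "data";
    }
    return "unallocated";
}
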
22325b648c67SEric Blake static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2233ba3f0e25SFam Zheng                                                    BlockDriverState *base,
2234c9ce8c4dSEric Blake                                                    bool want_zero,
22355b648c67SEric Blake                                                    int64_t offset,
22365b648c67SEric Blake                                                    int64_t bytes,
22375b648c67SEric Blake                                                    int64_t *pnum,
22385b648c67SEric Blake                                                    int64_t *map,
223967a0fd2aSFam Zheng                                                    BlockDriverState **file)
2240ba3f0e25SFam Zheng {
2241ba3f0e25SFam Zheng     BlockDriverState *p;
22425b648c67SEric Blake     int ret = 0;
2243c61e684eSEric Blake     bool first = true;
2244ba3f0e25SFam Zheng 
2245ba3f0e25SFam Zheng     assert(bs != base);
2246760e0063SKevin Wolf     for (p = bs; p != base; p = backing_bs(p)) {
22475b648c67SEric Blake         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
22485b648c67SEric Blake                                    file);
2249c61e684eSEric Blake         if (ret < 0) {
2250c61e684eSEric Blake             break;
2251c61e684eSEric Blake         }
2252c61e684eSEric Blake         if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2253c61e684eSEric Blake             /*
2254c61e684eSEric Blake              * Reading beyond the end of the file continues to read
2255c61e684eSEric Blake              * zeroes, but we can only widen the result to the
2256c61e684eSEric Blake              * unallocated length we learned from an earlier
2257c61e684eSEric Blake              * iteration.
2258c61e684eSEric Blake              */
22595b648c67SEric Blake             *pnum = bytes;
2260c61e684eSEric Blake         }
2261c61e684eSEric Blake         if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2262ba3f0e25SFam Zheng             break;
2263ba3f0e25SFam Zheng         }
22645b648c67SEric Blake         /* [offset, offset + *pnum) is unallocated on this layer, which
22655b648c67SEric Blake          * could be only the first part of [offset, offset + bytes).  */
22665b648c67SEric Blake         bytes = MIN(bytes, *pnum);
2267c61e684eSEric Blake         first = false;
2268ba3f0e25SFam Zheng     }
2269ba3f0e25SFam Zheng     return ret;
2270ba3f0e25SFam Zheng }
2271ba3f0e25SFam Zheng 
227231826642SEric Blake /* Coroutine wrapper for bdrv_block_status_above() */
22735b648c67SEric Blake static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
227461007b31SStefan Hajnoczi {
22754bcd936eSEric Blake     BdrvCoBlockStatusData *data = opaque;
227661007b31SStefan Hajnoczi 
22775b648c67SEric Blake     data->ret = bdrv_co_block_status_above(data->bs, data->base,
2278c9ce8c4dSEric Blake                                            data->want_zero,
22795b648c67SEric Blake                                            data->offset, data->bytes,
22805b648c67SEric Blake                                            data->pnum, data->map, data->file);
228161007b31SStefan Hajnoczi     data->done = true;
228261007b31SStefan Hajnoczi }
228361007b31SStefan Hajnoczi 
228461007b31SStefan Hajnoczi /*
22855b648c67SEric Blake  * Synchronous wrapper around bdrv_co_block_status_above().
228661007b31SStefan Hajnoczi  *
22875b648c67SEric Blake  * See bdrv_co_block_status_above() for details.
228861007b31SStefan Hajnoczi  */
22897ddb99b9SEric Blake static int bdrv_common_block_status_above(BlockDriverState *bs,
2290ba3f0e25SFam Zheng                                           BlockDriverState *base,
22917ddb99b9SEric Blake                                           bool want_zero, int64_t offset,
22927ddb99b9SEric Blake                                           int64_t bytes, int64_t *pnum,
22937ddb99b9SEric Blake                                           int64_t *map,
229467a0fd2aSFam Zheng                                           BlockDriverState **file)
229561007b31SStefan Hajnoczi {
229661007b31SStefan Hajnoczi     Coroutine *co;
22974bcd936eSEric Blake     BdrvCoBlockStatusData data = {
229861007b31SStefan Hajnoczi         .bs = bs,
2299ba3f0e25SFam Zheng         .base = base,
2300c9ce8c4dSEric Blake         .want_zero = want_zero,
23017ddb99b9SEric Blake         .offset = offset,
23027ddb99b9SEric Blake         .bytes = bytes,
23037ddb99b9SEric Blake         .pnum = pnum,
23047ddb99b9SEric Blake         .map = map,
2305c9ce8c4dSEric Blake         .file = file,
230661007b31SStefan Hajnoczi         .done = false,
230761007b31SStefan Hajnoczi     };
230861007b31SStefan Hajnoczi 
230961007b31SStefan Hajnoczi     if (qemu_in_coroutine()) {
231061007b31SStefan Hajnoczi         /* Fast-path if already in coroutine context */
23115b648c67SEric Blake         bdrv_block_status_above_co_entry(&data);
231261007b31SStefan Hajnoczi     } else {
23135b648c67SEric Blake         co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
2314e92f0e19SFam Zheng         bdrv_coroutine_enter(bs, co);
231588b062c2SPaolo Bonzini         BDRV_POLL_WHILE(bs, !data.done);
231661007b31SStefan Hajnoczi     }
231761007b31SStefan Hajnoczi     return data.ret;
231861007b31SStefan Hajnoczi }
231961007b31SStefan Hajnoczi 
232031826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
232131826642SEric Blake                             int64_t offset, int64_t bytes, int64_t *pnum,
232231826642SEric Blake                             int64_t *map, BlockDriverState **file)
2323c9ce8c4dSEric Blake {
232431826642SEric Blake     return bdrv_common_block_status_above(bs, base, true, offset, bytes,
232531826642SEric Blake                                           pnum, map, file);
2326c9ce8c4dSEric Blake }
2327c9ce8c4dSEric Blake 
2328237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2329237d78f8SEric Blake                       int64_t *pnum, int64_t *map, BlockDriverState **file)
2330ba3f0e25SFam Zheng {
233131826642SEric Blake     return bdrv_block_status_above(bs, backing_bs(bs),
233231826642SEric Blake                                    offset, bytes, pnum, map, file);
2333ba3f0e25SFam Zheng }
2334ba3f0e25SFam Zheng 
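/*
 * Sketch of a typical caller (hypothetical, in the style of qemu-img
 * map): walk the image extent by extent.  Each call advances by *pnum,
 * which is only zero at end-of-file, so the loop terminates.
 */
static int example_dump_extents(BlockDriverState *bs)
{
    BlockDriverState *file;
    int64_t offset = 0, total, pnum, map;
    int ret;

    total = bdrv_getlength(bs);
    if (total < 0) {
        return total;
    }
    while (offset < total) {
        ret = bdrv_block_status(bs, offset, total - offset, &pnum, &map,
                                &file);
        if (ret < 0) {
            return ret;
        }
        /* report [offset, offset + pnum) using the bits in ret here */
        offset += pnum;
    }
    return 0;
}
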
2335d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2336d6a644bbSEric Blake                                    int64_t bytes, int64_t *pnum)
233761007b31SStefan Hajnoczi {
23387ddb99b9SEric Blake     int ret;
23397ddb99b9SEric Blake     int64_t dummy;
2340d6a644bbSEric Blake 
23417ddb99b9SEric Blake     ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
23427ddb99b9SEric Blake                                          bytes, pnum ? pnum : &dummy, NULL,
2343298a1665SEric Blake                                          NULL);
234461007b31SStefan Hajnoczi     if (ret < 0) {
234561007b31SStefan Hajnoczi         return ret;
234661007b31SStefan Hajnoczi     }
234761007b31SStefan Hajnoczi     return !!(ret & BDRV_BLOCK_ALLOCATED);
234861007b31SStefan Hajnoczi }
234961007b31SStefan Hajnoczi 
235061007b31SStefan Hajnoczi /*
235161007b31SStefan Hajnoczi  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
235261007b31SStefan Hajnoczi  *
235351b0a488SEric Blake  * Return 1 if (a prefix of) the given range is allocated in any image
235451b0a488SEric Blake  * between BASE and TOP (inclusive).  BASE can be NULL to check whether the
235551b0a488SEric Blake  * given offset is allocated in any image of the chain.  Return 0 otherwise,
2356d6a644bbSEric Blake  * or a negative errno on failure.
235761007b31SStefan Hajnoczi  *
235851b0a488SEric Blake  * 'pnum' is set to the number of bytes (including and immediately
235951b0a488SEric Blake  * following the specified offset) that are known to be in the same
236051b0a488SEric Blake  * allocated/unallocated state.  Note that a subsequent call starting
236151b0a488SEric Blake  * at 'offset + *pnum' may return the same allocation status (in other
236251b0a488SEric Blake  * words, the result is not necessarily the maximum possible range);
236351b0a488SEric Blake  * but 'pnum' will only be 0 when end of file is reached.
236461007b31SStefan Hajnoczi  *
236561007b31SStefan Hajnoczi  */
236661007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top,
236761007b31SStefan Hajnoczi                             BlockDriverState *base,
236851b0a488SEric Blake                             int64_t offset, int64_t bytes, int64_t *pnum)
236961007b31SStefan Hajnoczi {
237061007b31SStefan Hajnoczi     BlockDriverState *intermediate;
237151b0a488SEric Blake     int ret;
237251b0a488SEric Blake     int64_t n = bytes;
237361007b31SStefan Hajnoczi 
237461007b31SStefan Hajnoczi     intermediate = top;
237561007b31SStefan Hajnoczi     while (intermediate && intermediate != base) {
2376d6a644bbSEric Blake         int64_t pnum_inter;
2377c00716beSEric Blake         int64_t size_inter;
2378d6a644bbSEric Blake 
237951b0a488SEric Blake         ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
238061007b31SStefan Hajnoczi         if (ret < 0) {
238161007b31SStefan Hajnoczi             return ret;
2382d6a644bbSEric Blake         }
2383d6a644bbSEric Blake         if (ret) {
238451b0a488SEric Blake             *pnum = pnum_inter;
238561007b31SStefan Hajnoczi             return 1;
238661007b31SStefan Hajnoczi         }
238761007b31SStefan Hajnoczi 
238851b0a488SEric Blake         size_inter = bdrv_getlength(intermediate);
2389c00716beSEric Blake         if (size_inter < 0) {
2390c00716beSEric Blake             return size_inter;
2391c00716beSEric Blake         }
239251b0a488SEric Blake         if (n > pnum_inter &&
239351b0a488SEric Blake             (intermediate == top || offset + pnum_inter < size_inter)) {
239451b0a488SEric Blake             n = pnum_inter;
239561007b31SStefan Hajnoczi         }
239661007b31SStefan Hajnoczi 
2397760e0063SKevin Wolf         intermediate = backing_bs(intermediate);
239861007b31SStefan Hajnoczi     }
239961007b31SStefan Hajnoczi 
240061007b31SStefan Hajnoczi     *pnum = n;
240161007b31SStefan Hajnoczi     return 0;
240261007b31SStefan Hajnoczi }
240361007b31SStefan Hajnoczi 
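/*
 * Sketch (hypothetical caller): tallying how much of a range any layer
 * above 'base' allocates, chunk by chunk.  Per the contract above,
 * *pnum is only zero once end of file is reached.
 */
static int64_t example_count_allocated_above(BlockDriverState *top,
                                             BlockDriverState *base,
                                             int64_t offset, int64_t bytes)
{
    int64_t total = 0;

    while (bytes > 0) {
        int64_t pnum;
        int ret = bdrv_is_allocated_above(top, base, offset, bytes, &pnum);

        if (ret < 0) {
            return ret;
        }
        if (pnum == 0) {
            break; /* end of file reached */
        }
        if (ret) {
            total += pnum;
        }
        offset += pnum;
        bytes -= pnum;
    }
    return total;
}
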
24041a8ae822SKevin Wolf typedef struct BdrvVmstateCo {
24051a8ae822SKevin Wolf     BlockDriverState    *bs;
24061a8ae822SKevin Wolf     QEMUIOVector        *qiov;
24071a8ae822SKevin Wolf     int64_t             pos;
24081a8ae822SKevin Wolf     bool                is_read;
24091a8ae822SKevin Wolf     int                 ret;
24101a8ae822SKevin Wolf } BdrvVmstateCo;
24111a8ae822SKevin Wolf 
24121a8ae822SKevin Wolf static int coroutine_fn
24131a8ae822SKevin Wolf bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
24141a8ae822SKevin Wolf                    bool is_read)
24151a8ae822SKevin Wolf {
24161a8ae822SKevin Wolf     BlockDriver *drv = bs->drv;
2417dc88a467SStefan Hajnoczi     int ret = -ENOTSUP;
2418dc88a467SStefan Hajnoczi 
2419dc88a467SStefan Hajnoczi     bdrv_inc_in_flight(bs);
24201a8ae822SKevin Wolf 
24211a8ae822SKevin Wolf     if (!drv) {
2422dc88a467SStefan Hajnoczi         ret = -ENOMEDIUM;
24231a8ae822SKevin Wolf     } else if (drv->bdrv_load_vmstate) {
2424dc88a467SStefan Hajnoczi         if (is_read) {
2425dc88a467SStefan Hajnoczi             ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2426dc88a467SStefan Hajnoczi         } else {
2427dc88a467SStefan Hajnoczi             ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2428dc88a467SStefan Hajnoczi         }
24291a8ae822SKevin Wolf     } else if (bs->file) {
2430dc88a467SStefan Hajnoczi         ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
24311a8ae822SKevin Wolf     }
24321a8ae822SKevin Wolf 
2433dc88a467SStefan Hajnoczi     bdrv_dec_in_flight(bs);
2434dc88a467SStefan Hajnoczi     return ret;
24351a8ae822SKevin Wolf }
24361a8ae822SKevin Wolf 
24371a8ae822SKevin Wolf static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
24381a8ae822SKevin Wolf {
24391a8ae822SKevin Wolf     BdrvVmstateCo *co = opaque;
24401a8ae822SKevin Wolf     co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
24411a8ae822SKevin Wolf }
24421a8ae822SKevin Wolf 
24431a8ae822SKevin Wolf static inline int
24441a8ae822SKevin Wolf bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
24451a8ae822SKevin Wolf                 bool is_read)
24461a8ae822SKevin Wolf {
24471a8ae822SKevin Wolf     if (qemu_in_coroutine()) {
24481a8ae822SKevin Wolf         return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
24491a8ae822SKevin Wolf     } else {
24501a8ae822SKevin Wolf         BdrvVmstateCo data = {
24511a8ae822SKevin Wolf             .bs         = bs,
24521a8ae822SKevin Wolf             .qiov       = qiov,
24531a8ae822SKevin Wolf             .pos        = pos,
24541a8ae822SKevin Wolf             .is_read    = is_read,
24551a8ae822SKevin Wolf             .ret        = -EINPROGRESS,
24561a8ae822SKevin Wolf         };
24570b8b8753SPaolo Bonzini         Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
24581a8ae822SKevin Wolf 
2459e92f0e19SFam Zheng         bdrv_coroutine_enter(bs, co);
2460ea17c9d2SStefan Hajnoczi         BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
24611a8ae822SKevin Wolf         return data.ret;
24621a8ae822SKevin Wolf     }
24631a8ae822SKevin Wolf }
24641a8ae822SKevin Wolf 
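/*
 * The helper above follows the file's recurring pattern for exposing a
 * coroutine_fn synchronously: run directly when already in coroutine
 * context, otherwise spawn a coroutine and poll until it signals
 * completion.  A condensed sketch with hypothetical names:
 */
typedef struct ExampleCo {
    BlockDriverState *bs;
    int ret;
} ExampleCo;

static void coroutine_fn example_co_entry(void *opaque)
{
    ExampleCo *e = opaque;

    e->ret = 0; /* a real wrapper would call some bdrv_co_*() here */
}

static int example_sync_wrapper(BlockDriverState *bs)
{
    ExampleCo e = { .bs = bs, .ret = -EINPROGRESS };

    if (qemu_in_coroutine()) {
        example_co_entry(&e); /* fast path: no new coroutine needed */
    } else {
        Coroutine *co = qemu_coroutine_create(example_co_entry, &e);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, e.ret == -EINPROGRESS);
    }
    return e.ret;
}
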
246561007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
246661007b31SStefan Hajnoczi                       int64_t pos, int size)
246761007b31SStefan Hajnoczi {
246861007b31SStefan Hajnoczi     QEMUIOVector qiov;
246961007b31SStefan Hajnoczi     struct iovec iov = {
247061007b31SStefan Hajnoczi         .iov_base   = (void *) buf,
247161007b31SStefan Hajnoczi         .iov_len    = size,
247261007b31SStefan Hajnoczi     };
2473b433d942SKevin Wolf     int ret;
247461007b31SStefan Hajnoczi 
247561007b31SStefan Hajnoczi     qemu_iovec_init_external(&qiov, &iov, 1);
2476b433d942SKevin Wolf 
2477b433d942SKevin Wolf     ret = bdrv_writev_vmstate(bs, &qiov, pos);
2478b433d942SKevin Wolf     if (ret < 0) {
2479b433d942SKevin Wolf         return ret;
2480b433d942SKevin Wolf     }
2481b433d942SKevin Wolf 
2482b433d942SKevin Wolf     return size;
248361007b31SStefan Hajnoczi }
248461007b31SStefan Hajnoczi 
248561007b31SStefan Hajnoczi int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
248661007b31SStefan Hajnoczi {
24871a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, false);
248861007b31SStefan Hajnoczi }
248961007b31SStefan Hajnoczi 
249061007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
249161007b31SStefan Hajnoczi                       int64_t pos, int size)
249261007b31SStefan Hajnoczi {
24935ddda0b8SKevin Wolf     QEMUIOVector qiov;
24945ddda0b8SKevin Wolf     struct iovec iov = {
24955ddda0b8SKevin Wolf         .iov_base   = buf,
24965ddda0b8SKevin Wolf         .iov_len    = size,
24975ddda0b8SKevin Wolf     };
2498b433d942SKevin Wolf     int ret;
24995ddda0b8SKevin Wolf 
25005ddda0b8SKevin Wolf     qemu_iovec_init_external(&qiov, &iov, 1);
2501b433d942SKevin Wolf     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2502b433d942SKevin Wolf     if (ret < 0) {
2503b433d942SKevin Wolf         return ret;
2504b433d942SKevin Wolf     }
2505b433d942SKevin Wolf 
2506b433d942SKevin Wolf     return size;
25075ddda0b8SKevin Wolf }
25085ddda0b8SKevin Wolf 
25095ddda0b8SKevin Wolf int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
25105ddda0b8SKevin Wolf {
25111a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, true);
251261007b31SStefan Hajnoczi }
251361007b31SStefan Hajnoczi 
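/*
 * Sketch (hypothetical caller): round-tripping a small blob through the
 * vmstate area.  Both wrappers above return 'size' on success and a
 * negative errno on failure.
 */
static int example_vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[16] = "vmstate example";
    uint8_t in[16];
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
    if (ret < 0) {
        return ret;
    }
    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    return ret < 0 ? ret : 0;
}
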
251461007b31SStefan Hajnoczi /**************************************************************/
251561007b31SStefan Hajnoczi /* async I/Os */
251661007b31SStefan Hajnoczi 
251761007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb)
251861007b31SStefan Hajnoczi {
251961007b31SStefan Hajnoczi     qemu_aio_ref(acb);
252061007b31SStefan Hajnoczi     bdrv_aio_cancel_async(acb);
252161007b31SStefan Hajnoczi     while (acb->refcnt > 1) {
252261007b31SStefan Hajnoczi         if (acb->aiocb_info->get_aio_context) {
252361007b31SStefan Hajnoczi             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
252461007b31SStefan Hajnoczi         } else if (acb->bs) {
25252f47da5fSPaolo Bonzini             /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
25262f47da5fSPaolo Bonzini              * assert that we're not using an I/O thread.  Thread-safe
25272f47da5fSPaolo Bonzini              * code should use bdrv_aio_cancel_async exclusively.
25282f47da5fSPaolo Bonzini              */
25292f47da5fSPaolo Bonzini             assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
253061007b31SStefan Hajnoczi             aio_poll(bdrv_get_aio_context(acb->bs), true);
253161007b31SStefan Hajnoczi         } else {
253261007b31SStefan Hajnoczi             abort();
253361007b31SStefan Hajnoczi         }
253461007b31SStefan Hajnoczi     }
253561007b31SStefan Hajnoczi     qemu_aio_unref(acb);
253661007b31SStefan Hajnoczi }
253761007b31SStefan Hajnoczi 
253861007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements
253961007b31SStefan Hajnoczi  * cancel_async; otherwise we do nothing and let the request complete normally.
254061007b31SStefan Hajnoczi  * In either case the completion callback must be called. */
254161007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb)
254261007b31SStefan Hajnoczi {
254361007b31SStefan Hajnoczi     if (acb->aiocb_info->cancel_async) {
254461007b31SStefan Hajnoczi         acb->aiocb_info->cancel_async(acb);
254561007b31SStefan Hajnoczi     }
254661007b31SStefan Hajnoczi }
254761007b31SStefan Hajnoczi 
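/*
 * Sketch (hypothetical caller): thread-safe code should prefer the
 * async variant, per the comment in bdrv_aio_cancel() above, and rely
 * on the completion callback still being invoked exactly once.
 */
static void example_cancel(BlockAIOCB *acb)
{
    bdrv_aio_cancel_async(acb); /* request may still complete normally */
}
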
254861007b31SStefan Hajnoczi /**************************************************************/
254961007b31SStefan Hajnoczi /* Coroutine block device emulation */
255061007b31SStefan Hajnoczi 
2551e293b7a3SKevin Wolf typedef struct FlushCo {
2552e293b7a3SKevin Wolf     BlockDriverState *bs;
2553e293b7a3SKevin Wolf     int ret;
2554e293b7a3SKevin Wolf } FlushCo;
2555e293b7a3SKevin Wolf 
2556e293b7a3SKevin Wolf 
255761007b31SStefan Hajnoczi static void coroutine_fn bdrv_flush_co_entry(void *opaque)
255861007b31SStefan Hajnoczi {
2559e293b7a3SKevin Wolf     FlushCo *rwco = opaque;
256061007b31SStefan Hajnoczi 
256161007b31SStefan Hajnoczi     rwco->ret = bdrv_co_flush(rwco->bs);
256261007b31SStefan Hajnoczi }
256361007b31SStefan Hajnoczi 
256461007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
256561007b31SStefan Hajnoczi {
256649ca6259SFam Zheng     int current_gen;
256749ca6259SFam Zheng     int ret = 0;
256861007b31SStefan Hajnoczi 
256999723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
2570c32b82afSPavel Dovgalyuk 
2571e914404eSFam Zheng     if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
257249ca6259SFam Zheng         bdrv_is_sg(bs)) {
257349ca6259SFam Zheng         goto early_exit;
257449ca6259SFam Zheng     }
257549ca6259SFam Zheng 
25763783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
257747fec599SPaolo Bonzini     current_gen = atomic_read(&bs->write_gen);
25783ff2f67aSEvgeny Yakovlev 
25793ff2f67aSEvgeny Yakovlev     /* Wait until any previous flushes are completed */
258099723548SPaolo Bonzini     while (bs->active_flush_req) {
25813783fa3dSPaolo Bonzini         qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
25823ff2f67aSEvgeny Yakovlev     }
25833ff2f67aSEvgeny Yakovlev 
25843783fa3dSPaolo Bonzini     /* Flushes reach this point in nondecreasing current_gen order.  */
258599723548SPaolo Bonzini     bs->active_flush_req = true;
25863783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
25873ff2f67aSEvgeny Yakovlev 
2588c32b82afSPavel Dovgalyuk     /* Write back all layers by calling one driver function */
2589c32b82afSPavel Dovgalyuk     if (bs->drv->bdrv_co_flush) {
2590c32b82afSPavel Dovgalyuk         ret = bs->drv->bdrv_co_flush(bs);
2591c32b82afSPavel Dovgalyuk         goto out;
2592c32b82afSPavel Dovgalyuk     }
2593c32b82afSPavel Dovgalyuk 
259461007b31SStefan Hajnoczi     /* Write back cached data to the OS even with cache=unsafe */
259561007b31SStefan Hajnoczi     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
259661007b31SStefan Hajnoczi     if (bs->drv->bdrv_co_flush_to_os) {
259761007b31SStefan Hajnoczi         ret = bs->drv->bdrv_co_flush_to_os(bs);
259861007b31SStefan Hajnoczi         if (ret < 0) {
2599cdb5e315SFam Zheng             goto out;
260061007b31SStefan Hajnoczi         }
260161007b31SStefan Hajnoczi     }
260261007b31SStefan Hajnoczi 
260361007b31SStefan Hajnoczi     /* But don't actually force it to the disk with cache=unsafe */
260461007b31SStefan Hajnoczi     if (bs->open_flags & BDRV_O_NO_FLUSH) {
260561007b31SStefan Hajnoczi         goto flush_parent;
260661007b31SStefan Hajnoczi     }
260761007b31SStefan Hajnoczi 
26083ff2f67aSEvgeny Yakovlev     /* Check if we really need to flush anything */
26093ff2f67aSEvgeny Yakovlev     if (bs->flushed_gen == current_gen) {
26103ff2f67aSEvgeny Yakovlev         goto flush_parent;
26113ff2f67aSEvgeny Yakovlev     }
26123ff2f67aSEvgeny Yakovlev 
261361007b31SStefan Hajnoczi     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2614d470ad42SMax Reitz     if (!bs->drv) {
2615d470ad42SMax Reitz         /* bs->drv->bdrv_co_flush() might have ejected the BDS
2616d470ad42SMax Reitz          * (even in case of apparent success) */
2617d470ad42SMax Reitz         ret = -ENOMEDIUM;
2618d470ad42SMax Reitz         goto out;
2619d470ad42SMax Reitz     }
262061007b31SStefan Hajnoczi     if (bs->drv->bdrv_co_flush_to_disk) {
262161007b31SStefan Hajnoczi         ret = bs->drv->bdrv_co_flush_to_disk(bs);
262261007b31SStefan Hajnoczi     } else if (bs->drv->bdrv_aio_flush) {
262361007b31SStefan Hajnoczi         BlockAIOCB *acb;
262461007b31SStefan Hajnoczi         CoroutineIOCompletion co = {
262561007b31SStefan Hajnoczi             .coroutine = qemu_coroutine_self(),
262661007b31SStefan Hajnoczi         };
262761007b31SStefan Hajnoczi 
262861007b31SStefan Hajnoczi         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
262961007b31SStefan Hajnoczi         if (acb == NULL) {
263061007b31SStefan Hajnoczi             ret = -EIO;
263161007b31SStefan Hajnoczi         } else {
263261007b31SStefan Hajnoczi             qemu_coroutine_yield();
263361007b31SStefan Hajnoczi             ret = co.ret;
263461007b31SStefan Hajnoczi         }
263561007b31SStefan Hajnoczi     } else {
263661007b31SStefan Hajnoczi         /*
263761007b31SStefan Hajnoczi          * Some block drivers always operate in either writethrough or unsafe
263861007b31SStefan Hajnoczi          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
263961007b31SStefan Hajnoczi          * know how the server works (because the behaviour is hardcoded or
264061007b31SStefan Hajnoczi          * depends on server-side configuration), so we can't ensure that
264161007b31SStefan Hajnoczi          * everything is safe on disk. Returning an error doesn't work because
264261007b31SStefan Hajnoczi          * that would break guests even if the server operates in writethrough
264361007b31SStefan Hajnoczi          * mode.
264461007b31SStefan Hajnoczi          *
264561007b31SStefan Hajnoczi          * Let's hope the user knows what they're doing.
264661007b31SStefan Hajnoczi          */
264761007b31SStefan Hajnoczi         ret = 0;
264861007b31SStefan Hajnoczi     }
26493ff2f67aSEvgeny Yakovlev 
265061007b31SStefan Hajnoczi     if (ret < 0) {
2651cdb5e315SFam Zheng         goto out;
265261007b31SStefan Hajnoczi     }
265361007b31SStefan Hajnoczi 
265461007b31SStefan Hajnoczi     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
265561007b31SStefan Hajnoczi      * set in the case of cache=unsafe, so there are no useless flushes.
265661007b31SStefan Hajnoczi      */
265761007b31SStefan Hajnoczi flush_parent:
2658cdb5e315SFam Zheng     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2659cdb5e315SFam Zheng out:
26603ff2f67aSEvgeny Yakovlev     /* Notify any pending flushes that we have completed */
2661e6af1e08SKevin Wolf     if (ret == 0) {
26623ff2f67aSEvgeny Yakovlev         bs->flushed_gen = current_gen;
2663e6af1e08SKevin Wolf     }
26643783fa3dSPaolo Bonzini 
26653783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
266699723548SPaolo Bonzini     bs->active_flush_req = false;
2667156af3acSDenis V. Lunev     /* Return value is ignored - it's ok if wait queue is empty */
2668156af3acSDenis V. Lunev     qemu_co_queue_next(&bs->flush_queue);
26693783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
26703ff2f67aSEvgeny Yakovlev 
267149ca6259SFam Zheng early_exit:
267299723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
2673cdb5e315SFam Zheng     return ret;
267461007b31SStefan Hajnoczi }
267561007b31SStefan Hajnoczi 
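/*
 * Sketch (hypothetical caller): because of the write_gen/flushed_gen
 * comparison above, a second flush with no intervening writes typically
 * skips the expensive flush-to-disk step at each layer.
 */
static int coroutine_fn example_flush_twice(BlockDriverState *bs)
{
    int ret = bdrv_co_flush(bs);

    if (ret < 0) {
        return ret;
    }
    return bdrv_co_flush(bs); /* generation unchanged: mostly a no-op */
}
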
267661007b31SStefan Hajnoczi int bdrv_flush(BlockDriverState *bs)
267761007b31SStefan Hajnoczi {
267861007b31SStefan Hajnoczi     Coroutine *co;
2679e293b7a3SKevin Wolf     FlushCo flush_co = {
268061007b31SStefan Hajnoczi         .bs = bs,
268161007b31SStefan Hajnoczi         .ret = NOT_DONE,
268261007b31SStefan Hajnoczi     };
268361007b31SStefan Hajnoczi 
268461007b31SStefan Hajnoczi     if (qemu_in_coroutine()) {
268561007b31SStefan Hajnoczi         /* Fast-path if already in coroutine context */
2686e293b7a3SKevin Wolf         bdrv_flush_co_entry(&flush_co);
268761007b31SStefan Hajnoczi     } else {
26880b8b8753SPaolo Bonzini         co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2689e92f0e19SFam Zheng         bdrv_coroutine_enter(bs, co);
269088b062c2SPaolo Bonzini         BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
269161007b31SStefan Hajnoczi     }
269261007b31SStefan Hajnoczi 
2693e293b7a3SKevin Wolf     return flush_co.ret;
269461007b31SStefan Hajnoczi }
269561007b31SStefan Hajnoczi 
269661007b31SStefan Hajnoczi typedef struct DiscardCo {
26970b9fd3f4SFam Zheng     BdrvChild *child;
26980c51a893SEric Blake     int64_t offset;
2699f5a5ca79SManos Pitsidianakis     int bytes;
270061007b31SStefan Hajnoczi     int ret;
270161007b31SStefan Hajnoczi } DiscardCo;
27020c51a893SEric Blake static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
270361007b31SStefan Hajnoczi {
270461007b31SStefan Hajnoczi     DiscardCo *rwco = opaque;
270561007b31SStefan Hajnoczi 
27060b9fd3f4SFam Zheng     rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
270761007b31SStefan Hajnoczi }
270861007b31SStefan Hajnoczi 
27090b9fd3f4SFam Zheng int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
271061007b31SStefan Hajnoczi {
2711b1066c87SFam Zheng     BdrvTrackedRequest req;
27129f1963b3SEric Blake     int max_pdiscard, ret;
27133482b9bcSEric Blake     int head, tail, align;
27140b9fd3f4SFam Zheng     BlockDriverState *bs = child->bs;
271561007b31SStefan Hajnoczi 
27160b9fd3f4SFam Zheng     if (!bs || !bs->drv) {
271761007b31SStefan Hajnoczi         return -ENOMEDIUM;
271861007b31SStefan Hajnoczi     }
271961007b31SStefan Hajnoczi 
2720d6883bc9SVladimir Sementsov-Ogievskiy     if (bdrv_has_readonly_bitmaps(bs)) {
2721d6883bc9SVladimir Sementsov-Ogievskiy         return -EPERM;
2722d6883bc9SVladimir Sementsov-Ogievskiy     }
2723d6883bc9SVladimir Sementsov-Ogievskiy 
2724f5a5ca79SManos Pitsidianakis     ret = bdrv_check_byte_request(bs, offset, bytes);
272561007b31SStefan Hajnoczi     if (ret < 0) {
272661007b31SStefan Hajnoczi         return ret;
272761007b31SStefan Hajnoczi     }
272861007b31SStefan Hajnoczi 
272961007b31SStefan Hajnoczi     /* Do nothing if disabled.  */
273061007b31SStefan Hajnoczi     if (!(bs->open_flags & BDRV_O_UNMAP)) {
273161007b31SStefan Hajnoczi         return 0;
273261007b31SStefan Hajnoczi     }
273361007b31SStefan Hajnoczi 
273402aefe43SEric Blake     if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
273561007b31SStefan Hajnoczi         return 0;
273661007b31SStefan Hajnoczi     }
273761007b31SStefan Hajnoczi 
27383482b9bcSEric Blake     /* Discard is advisory, but some devices track and coalesce
27393482b9bcSEric Blake      * unaligned requests, so we must pass everything down rather than
27403482b9bcSEric Blake      * round here.  Still, most devices will just silently ignore
27413482b9bcSEric Blake      * unaligned requests (by returning -ENOTSUP), so we must fragment
27423482b9bcSEric Blake      * the request accordingly.  */
274302aefe43SEric Blake     align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2744b8d0a980SEric Blake     assert(align % bs->bl.request_alignment == 0);
2745b8d0a980SEric Blake     head = offset % align;
2746f5a5ca79SManos Pitsidianakis     tail = (offset + bytes) % align;
27479f1963b3SEric Blake 
274899723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
2749f5a5ca79SManos Pitsidianakis     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
275050824995SFam Zheng 
275100695c27SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
2752ec050f77SDenis V. Lunev     if (ret < 0) {
2753ec050f77SDenis V. Lunev         goto out;
2754ec050f77SDenis V. Lunev     }
2755ec050f77SDenis V. Lunev 
27569f1963b3SEric Blake     max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
27579f1963b3SEric Blake                                    align);
27583482b9bcSEric Blake     assert(max_pdiscard >= bs->bl.request_alignment);
27599f1963b3SEric Blake 
2760f5a5ca79SManos Pitsidianakis     while (bytes > 0) {
2761f5a5ca79SManos Pitsidianakis         int num = bytes;
27623482b9bcSEric Blake 
27633482b9bcSEric Blake         if (head) {
27643482b9bcSEric Blake             /* Make small requests to get to alignment boundaries. */
2765f5a5ca79SManos Pitsidianakis             num = MIN(bytes, align - head);
27663482b9bcSEric Blake             if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
27673482b9bcSEric Blake                 num %= bs->bl.request_alignment;
27683482b9bcSEric Blake             }
27693482b9bcSEric Blake             head = (head + num) % align;
27703482b9bcSEric Blake             assert(num < max_pdiscard);
27713482b9bcSEric Blake         } else if (tail) {
27723482b9bcSEric Blake             if (num > align) {
27733482b9bcSEric Blake                 /* Shorten the request to the last aligned cluster.  */
27743482b9bcSEric Blake                 num -= tail;
27753482b9bcSEric Blake             } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
27763482b9bcSEric Blake                        tail > bs->bl.request_alignment) {
27773482b9bcSEric Blake                 tail %= bs->bl.request_alignment;
27783482b9bcSEric Blake                 num -= tail;
27793482b9bcSEric Blake             }
27803482b9bcSEric Blake         }
27813482b9bcSEric Blake         /* limit request size */
27823482b9bcSEric Blake         if (num > max_pdiscard) {
27833482b9bcSEric Blake             num = max_pdiscard;
27843482b9bcSEric Blake         }
278561007b31SStefan Hajnoczi 
2786d470ad42SMax Reitz         if (!bs->drv) {
2787d470ad42SMax Reitz             ret = -ENOMEDIUM;
2788d470ad42SMax Reitz             goto out;
2789d470ad42SMax Reitz         }
279047a5486dSEric Blake         if (bs->drv->bdrv_co_pdiscard) {
279147a5486dSEric Blake             ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
279261007b31SStefan Hajnoczi         } else {
279361007b31SStefan Hajnoczi             BlockAIOCB *acb;
279461007b31SStefan Hajnoczi             CoroutineIOCompletion co = {
279561007b31SStefan Hajnoczi                 .coroutine = qemu_coroutine_self(),
279661007b31SStefan Hajnoczi             };
279761007b31SStefan Hajnoczi 
27984da444a0SEric Blake             acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
279961007b31SStefan Hajnoczi                                              bdrv_co_io_em_complete, &co);
280061007b31SStefan Hajnoczi             if (acb == NULL) {
2801b1066c87SFam Zheng                 ret = -EIO;
2802b1066c87SFam Zheng                 goto out;
280361007b31SStefan Hajnoczi             } else {
280461007b31SStefan Hajnoczi                 qemu_coroutine_yield();
280561007b31SStefan Hajnoczi                 ret = co.ret;
280661007b31SStefan Hajnoczi             }
280761007b31SStefan Hajnoczi         }
280861007b31SStefan Hajnoczi         if (ret && ret != -ENOTSUP) {
2809b1066c87SFam Zheng             goto out;
281061007b31SStefan Hajnoczi         }
281161007b31SStefan Hajnoczi 
28129f1963b3SEric Blake         offset += num;
2813f5a5ca79SManos Pitsidianakis         bytes -= num;
281461007b31SStefan Hajnoczi     }
2815b1066c87SFam Zheng     ret = 0;
2816b1066c87SFam Zheng out:
281700695c27SFam Zheng     bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
2818b1066c87SFam Zheng     tracked_request_end(&req);
281999723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
2820b1066c87SFam Zheng     return ret;
282161007b31SStefan Hajnoczi }
282261007b31SStefan Hajnoczi 
28230b9fd3f4SFam Zheng int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes)
282461007b31SStefan Hajnoczi {
282561007b31SStefan Hajnoczi     Coroutine *co;
282661007b31SStefan Hajnoczi     DiscardCo rwco = {
28270b9fd3f4SFam Zheng         .child = child,
28280c51a893SEric Blake         .offset = offset,
2829f5a5ca79SManos Pitsidianakis         .bytes = bytes,
283061007b31SStefan Hajnoczi         .ret = NOT_DONE,
283161007b31SStefan Hajnoczi     };
283261007b31SStefan Hajnoczi 
283361007b31SStefan Hajnoczi     if (qemu_in_coroutine()) {
283461007b31SStefan Hajnoczi         /* Fast-path if already in coroutine context */
28350c51a893SEric Blake         bdrv_pdiscard_co_entry(&rwco);
283661007b31SStefan Hajnoczi     } else {
28370c51a893SEric Blake         co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
28380b9fd3f4SFam Zheng         bdrv_coroutine_enter(child->bs, co);
28390b9fd3f4SFam Zheng         BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
284061007b31SStefan Hajnoczi     }
284161007b31SStefan Hajnoczi 
284261007b31SStefan Hajnoczi     return rwco.ret;
284361007b31SStefan Hajnoczi }
284461007b31SStefan Hajnoczi 
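/*
 * Sketch (hypothetical caller): discard is advisory, so a return of 0
 * does not guarantee deallocation -- it is also the result when
 * BDRV_O_UNMAP is clear or the driver implements no discard callback.
 */
static int example_trim_region(BdrvChild *child, int64_t offset, int bytes)
{
    return bdrv_pdiscard(child, offset, bytes);
}
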
284548af776aSKevin Wolf int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
284661007b31SStefan Hajnoczi {
284761007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
28485c5ae76aSFam Zheng     CoroutineIOCompletion co = {
28495c5ae76aSFam Zheng         .coroutine = qemu_coroutine_self(),
28505c5ae76aSFam Zheng     };
28515c5ae76aSFam Zheng     BlockAIOCB *acb;
285261007b31SStefan Hajnoczi 
285399723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
285416a389dcSKevin Wolf     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
28555c5ae76aSFam Zheng         co.ret = -ENOTSUP;
28565c5ae76aSFam Zheng         goto out;
28575c5ae76aSFam Zheng     }
28585c5ae76aSFam Zheng 
285916a389dcSKevin Wolf     if (drv->bdrv_co_ioctl) {
286016a389dcSKevin Wolf         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
286116a389dcSKevin Wolf     } else {
28625c5ae76aSFam Zheng         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
28635c5ae76aSFam Zheng         if (!acb) {
2864c8a9fd80SFam Zheng             co.ret = -ENOTSUP;
2865c8a9fd80SFam Zheng             goto out;
28665c5ae76aSFam Zheng         }
28675c5ae76aSFam Zheng         qemu_coroutine_yield();
286816a389dcSKevin Wolf     }
28695c5ae76aSFam Zheng out:
287099723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
28715c5ae76aSFam Zheng     return co.ret;
28725c5ae76aSFam Zheng }
28735c5ae76aSFam Zheng 
287461007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size)
287561007b31SStefan Hajnoczi {
287661007b31SStefan Hajnoczi     return qemu_memalign(bdrv_opt_mem_align(bs), size);
287761007b31SStefan Hajnoczi }
287861007b31SStefan Hajnoczi 
287961007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size)
288061007b31SStefan Hajnoczi {
288161007b31SStefan Hajnoczi     return memset(qemu_blockalign(bs, size), 0, size);
288261007b31SStefan Hajnoczi }
288361007b31SStefan Hajnoczi 
288461007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
288561007b31SStefan Hajnoczi {
288661007b31SStefan Hajnoczi     size_t align = bdrv_opt_mem_align(bs);
288761007b31SStefan Hajnoczi 
288861007b31SStefan Hajnoczi     /* Ensure that NULL is never returned on success */
288961007b31SStefan Hajnoczi     assert(align > 0);
289061007b31SStefan Hajnoczi     if (size == 0) {
289161007b31SStefan Hajnoczi         size = align;
289261007b31SStefan Hajnoczi     }
289361007b31SStefan Hajnoczi 
289461007b31SStefan Hajnoczi     return qemu_try_memalign(align, size);
289561007b31SStefan Hajnoczi }
289661007b31SStefan Hajnoczi 
289761007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
289861007b31SStefan Hajnoczi {
289961007b31SStefan Hajnoczi     void *mem = qemu_try_blockalign(bs, size);
290061007b31SStefan Hajnoczi 
290161007b31SStefan Hajnoczi     if (mem) {
290261007b31SStefan Hajnoczi         memset(mem, 0, size);
290361007b31SStefan Hajnoczi     }
290461007b31SStefan Hajnoczi 
290561007b31SStefan Hajnoczi     return mem;
290661007b31SStefan Hajnoczi }
290761007b31SStefan Hajnoczi 
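/*
 * Sketch (hypothetical caller): the try_ variants return NULL instead
 * of aborting, letting callers degrade gracefully; buffers from any of
 * these helpers are released with qemu_vfree().
 */
static int example_with_bounce_buffer(BlockDriverState *bs, size_t size)
{
    void *buf = qemu_try_blockalign(bs, size);

    if (!buf) {
        return -ENOMEM; /* e.g. retry with a smaller request */
    }
    /* ... perform aligned I/O through buf here ... */
    qemu_vfree(buf);
    return 0;
}
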
290861007b31SStefan Hajnoczi /*
290961007b31SStefan Hajnoczi  * Check if all memory in this vector is sector aligned.
291061007b31SStefan Hajnoczi  */
291161007b31SStefan Hajnoczi bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
291261007b31SStefan Hajnoczi {
291361007b31SStefan Hajnoczi     int i;
29144196d2f0SDenis V. Lunev     size_t alignment = bdrv_min_mem_align(bs);
291561007b31SStefan Hajnoczi 
291661007b31SStefan Hajnoczi     for (i = 0; i < qiov->niov; i++) {
291761007b31SStefan Hajnoczi         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
291861007b31SStefan Hajnoczi             return false;
291961007b31SStefan Hajnoczi         }
292061007b31SStefan Hajnoczi         if (qiov->iov[i].iov_len % alignment) {
292161007b31SStefan Hajnoczi             return false;
292261007b31SStefan Hajnoczi         }
292361007b31SStefan Hajnoczi     }
292461007b31SStefan Hajnoczi 
292561007b31SStefan Hajnoczi     return true;
292661007b31SStefan Hajnoczi }
292761007b31SStefan Hajnoczi 
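/*
 * Sketch (hypothetical caller): deciding whether a guest buffer can be
 * used directly or must be bounced through qemu_blockalign().
 */
static bool example_needs_bounce(BlockDriverState *bs, void *buf, size_t len)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len  = len,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return !bdrv_qiov_is_aligned(bs, &qiov);
}
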
292861007b31SStefan Hajnoczi void bdrv_add_before_write_notifier(BlockDriverState *bs,
292961007b31SStefan Hajnoczi                                     NotifierWithReturn *notifier)
293061007b31SStefan Hajnoczi {
293161007b31SStefan Hajnoczi     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
293261007b31SStefan Hajnoczi }
293361007b31SStefan Hajnoczi 
293461007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs)
293561007b31SStefan Hajnoczi {
29366b98bd64SPaolo Bonzini     BdrvChild *child;
29376b98bd64SPaolo Bonzini 
29386b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
29396b98bd64SPaolo Bonzini         bdrv_io_plug(child->bs);
29406b98bd64SPaolo Bonzini     }
29416b98bd64SPaolo Bonzini 
2942850d54a2SPaolo Bonzini     if (atomic_fetch_inc(&bs->io_plugged) == 0) {
294361007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
294461007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_plug) {
294561007b31SStefan Hajnoczi             drv->bdrv_io_plug(bs);
29466b98bd64SPaolo Bonzini         }
294761007b31SStefan Hajnoczi     }
294861007b31SStefan Hajnoczi }
294961007b31SStefan Hajnoczi 
295061007b31SStefan Hajnoczi void bdrv_io_unplug(BlockDriverState *bs)
295161007b31SStefan Hajnoczi {
29526b98bd64SPaolo Bonzini     BdrvChild *child;
29536b98bd64SPaolo Bonzini 
29546b98bd64SPaolo Bonzini     assert(bs->io_plugged);
2955850d54a2SPaolo Bonzini     if (atomic_fetch_dec(&bs->io_plugged) == 1) {
295661007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
295761007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_unplug) {
295861007b31SStefan Hajnoczi             drv->bdrv_io_unplug(bs);
295961007b31SStefan Hajnoczi         }
296061007b31SStefan Hajnoczi     }
296161007b31SStefan Hajnoczi 
29626b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
29636b98bd64SPaolo Bonzini         bdrv_io_unplug(child->bs);
29646b98bd64SPaolo Bonzini     }
29656b98bd64SPaolo Bonzini }
296623d0ba93SFam Zheng 
296723d0ba93SFam Zheng void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
296823d0ba93SFam Zheng {
296923d0ba93SFam Zheng     BdrvChild *child;
297023d0ba93SFam Zheng 
297123d0ba93SFam Zheng     if (bs->drv && bs->drv->bdrv_register_buf) {
297223d0ba93SFam Zheng         bs->drv->bdrv_register_buf(bs, host, size);
297323d0ba93SFam Zheng     }
297423d0ba93SFam Zheng     QLIST_FOREACH(child, &bs->children, next) {
297523d0ba93SFam Zheng         bdrv_register_buf(child->bs, host, size);
297623d0ba93SFam Zheng     }
297723d0ba93SFam Zheng }
297823d0ba93SFam Zheng 
297923d0ba93SFam Zheng void bdrv_unregister_buf(BlockDriverState *bs, void *host)
298023d0ba93SFam Zheng {
298123d0ba93SFam Zheng     BdrvChild *child;
298223d0ba93SFam Zheng 
298323d0ba93SFam Zheng     if (bs->drv && bs->drv->bdrv_unregister_buf) {
298423d0ba93SFam Zheng         bs->drv->bdrv_unregister_buf(bs, host);
298523d0ba93SFam Zheng     }
298623d0ba93SFam Zheng     QLIST_FOREACH(child, &bs->children, next) {
298723d0ba93SFam Zheng         bdrv_unregister_buf(child->bs, host);
298823d0ba93SFam Zheng     }
298923d0ba93SFam Zheng }
2990fcc67678SFam Zheng 
299167b51fb9SVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_co_copy_range_internal(
299267b51fb9SVladimir Sementsov-Ogievskiy         BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
299367b51fb9SVladimir Sementsov-Ogievskiy         uint64_t dst_offset, uint64_t bytes,
299467b51fb9SVladimir Sementsov-Ogievskiy         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
2995fcc67678SFam Zheng         bool recurse_src)
2996fcc67678SFam Zheng {
2997999658a0SVladimir Sementsov-Ogievskiy     BdrvTrackedRequest req;
2998fcc67678SFam Zheng     int ret;
2999fcc67678SFam Zheng 
3000d4d3e5a0SFam Zheng     if (!dst || !dst->bs) {
3001fcc67678SFam Zheng         return -ENOMEDIUM;
3002fcc67678SFam Zheng     }
3003fcc67678SFam Zheng     ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
3004fcc67678SFam Zheng     if (ret) {
3005fcc67678SFam Zheng         return ret;
3006fcc67678SFam Zheng     }
300767b51fb9SVladimir Sementsov-Ogievskiy     if (write_flags & BDRV_REQ_ZERO_WRITE) {
300867b51fb9SVladimir Sementsov-Ogievskiy         return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3009fcc67678SFam Zheng     }
3010fcc67678SFam Zheng 
3011d4d3e5a0SFam Zheng     if (!src || !src->bs) {
3012d4d3e5a0SFam Zheng         return -ENOMEDIUM;
3013d4d3e5a0SFam Zheng     }
3014d4d3e5a0SFam Zheng     ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
3015d4d3e5a0SFam Zheng     if (ret) {
3016d4d3e5a0SFam Zheng         return ret;
3017d4d3e5a0SFam Zheng     }
3018d4d3e5a0SFam Zheng 
3019fcc67678SFam Zheng     if (!src->bs->drv->bdrv_co_copy_range_from
3020fcc67678SFam Zheng         || !dst->bs->drv->bdrv_co_copy_range_to
3021fcc67678SFam Zheng         || src->bs->encrypted || dst->bs->encrypted) {
3022fcc67678SFam Zheng         return -ENOTSUP;
3023fcc67678SFam Zheng     }
3024999658a0SVladimir Sementsov-Ogievskiy 
3025999658a0SVladimir Sementsov-Ogievskiy     if (recurse_src) {
3026d4d3e5a0SFam Zheng         bdrv_inc_in_flight(src->bs);
3027999658a0SVladimir Sementsov-Ogievskiy         tracked_request_begin(&req, src->bs, src_offset, bytes,
3028999658a0SVladimir Sementsov-Ogievskiy                               BDRV_TRACKED_READ);
302937aec7d7SFam Zheng 
303009d2f948SVladimir Sementsov-Ogievskiy         /* BDRV_REQ_SERIALISING is only for write operations */
303109d2f948SVladimir Sementsov-Ogievskiy         assert(!(read_flags & BDRV_REQ_SERIALISING));
303267b51fb9SVladimir Sementsov-Ogievskiy         if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
3033999658a0SVladimir Sementsov-Ogievskiy             wait_serialising_requests(&req);
3034dee12de8SFam Zheng         }
3035999658a0SVladimir Sementsov-Ogievskiy 
303637aec7d7SFam Zheng         ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3037fcc67678SFam Zheng                                                     src, src_offset,
3038fcc67678SFam Zheng                                                     dst, dst_offset,
303967b51fb9SVladimir Sementsov-Ogievskiy                                                     bytes,
304067b51fb9SVladimir Sementsov-Ogievskiy                                                     read_flags, write_flags);
3041999658a0SVladimir Sementsov-Ogievskiy 
3042999658a0SVladimir Sementsov-Ogievskiy         tracked_request_end(&req);
3043999658a0SVladimir Sementsov-Ogievskiy         bdrv_dec_in_flight(src->bs);
3044fcc67678SFam Zheng     } else {
3045999658a0SVladimir Sementsov-Ogievskiy         bdrv_inc_in_flight(dst->bs);
3046999658a0SVladimir Sementsov-Ogievskiy         tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3047999658a0SVladimir Sementsov-Ogievskiy                               BDRV_TRACKED_WRITE);
30480eb1e891SFam Zheng         ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
30490eb1e891SFam Zheng                                         write_flags);
30500eb1e891SFam Zheng         if (!ret) {
305137aec7d7SFam Zheng             ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3052fcc67678SFam Zheng                                                       src, src_offset,
3053fcc67678SFam Zheng                                                       dst, dst_offset,
305467b51fb9SVladimir Sementsov-Ogievskiy                                                       bytes,
305567b51fb9SVladimir Sementsov-Ogievskiy                                                       read_flags, write_flags);
30560eb1e891SFam Zheng         }
30570eb1e891SFam Zheng         bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3058999658a0SVladimir Sementsov-Ogievskiy         tracked_request_end(&req);
3059d4d3e5a0SFam Zheng         bdrv_dec_in_flight(dst->bs);
3060999658a0SVladimir Sementsov-Ogievskiy     }
3061999658a0SVladimir Sementsov-Ogievskiy 
306237aec7d7SFam Zheng     return ret;
3063fcc67678SFam Zheng }
3064fcc67678SFam Zheng 
3065fcc67678SFam Zheng /* Copy range from @src to @dst, driven from the source side: the request
3066fcc67678SFam Zheng  * is tracked on @src and dispatched to the source driver's
3067fcc67678SFam Zheng  * bdrv_co_copy_range_from implementation. See the comment of
3068fcc67678SFam Zheng  * bdrv_co_copy_range for the parameter and return value semantics. */
3069fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3070fcc67678SFam Zheng                                          BdrvChild *dst, uint64_t dst_offset,
307167b51fb9SVladimir Sementsov-Ogievskiy                                          uint64_t bytes,
307267b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags read_flags,
307367b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags write_flags)
3074fcc67678SFam Zheng {
3075ecc983a5SFam Zheng     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3076ecc983a5SFam Zheng                                   read_flags, write_flags);
3077fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
307867b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, true);
3079fcc67678SFam Zheng }
3080fcc67678SFam Zheng 
3081fcc67678SFam Zheng /* Copy range from @src to @dst, driven from the destination side: the
3082fcc67678SFam Zheng  * request is tracked on @dst and dispatched to the destination driver's
3083fcc67678SFam Zheng  * bdrv_co_copy_range_to implementation. See the comment of
3084fcc67678SFam Zheng  * bdrv_co_copy_range for the parameter and return value semantics. */
3085fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3086fcc67678SFam Zheng                                        BdrvChild *dst, uint64_t dst_offset,
308767b51fb9SVladimir Sementsov-Ogievskiy                                        uint64_t bytes,
308867b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags read_flags,
308967b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags write_flags)
3090fcc67678SFam Zheng {
3091ecc983a5SFam Zheng     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3092ecc983a5SFam Zheng                                 read_flags, write_flags);
3093fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
309467b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, false);
3095fcc67678SFam Zheng }
3096fcc67678SFam Zheng 
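/* Copy @bytes bytes from @src at @src_offset to @dst at @dst_offset.
 *
 * @read_flags is applied to the read side and @write_flags to the write side
 * of the operation.  Returns 0 on success and a negative errno value on
 * failure: notably -ENOMEDIUM if either end has no medium attached, and
 * -ENOTSUP if the drivers involved cannot offload the copy, in which case
 * the caller is expected to fall back to a plain read/write pair. */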
3097fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3098fcc67678SFam Zheng                                     BdrvChild *dst, uint64_t dst_offset,
309967b51fb9SVladimir Sementsov-Ogievskiy                                     uint64_t bytes, BdrvRequestFlags read_flags,
310067b51fb9SVladimir Sementsov-Ogievskiy                                     BdrvRequestFlags write_flags)
3101fcc67678SFam Zheng {
310237aec7d7SFam Zheng     return bdrv_co_copy_range_from(src, src_offset,
3103fcc67678SFam Zheng                                    dst, dst_offset,
310467b51fb9SVladimir Sementsov-Ogievskiy                                    bytes, read_flags, write_flags);
3105fcc67678SFam Zheng }
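
/* A hedged usage sketch (hypothetical, not part of the original file):
 * copying one chunk between two children via the offload path above, with
 * the bounce-buffer fallback that callers are expected to provide when
 * bdrv_co_copy_range_internal() reports -ENOTSUP (missing driver hooks or
 * encrypted nodes).  example_copy_chunk() is an invented name. */
static int coroutine_fn example_copy_chunk(BdrvChild *src, BdrvChild *dst,
                                           uint64_t offset, unsigned int bytes)
{
    int ret = bdrv_co_copy_range(src, offset, dst, offset, bytes, 0, 0);

    if (ret == -ENOTSUP) {
        /* Fallback: read into a bounce buffer, then write it back out. */
        void *buf = qemu_blockalign(src->bs, bytes);
        struct iovec iov = { .iov_base = buf, .iov_len = bytes };
        QEMUIOVector qiov;

        qemu_iovec_init_external(&qiov, &iov, 1);
        ret = bdrv_co_preadv(src, offset, bytes, &qiov, 0);
        if (ret >= 0) {
            ret = bdrv_co_pwritev(dst, offset, bytes, &qiov, 0);
        }
        qemu_vfree(buf);
    }
    return ret;
}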
31063d9f2d2aSKevin Wolf 
31073d9f2d2aSKevin Wolf static void bdrv_parent_cb_resize(BlockDriverState *bs)
31083d9f2d2aSKevin Wolf {
31093d9f2d2aSKevin Wolf     BdrvChild *c;
31103d9f2d2aSKevin Wolf     QLIST_FOREACH(c, &bs->parents, next_parent) {
31113d9f2d2aSKevin Wolf         if (c->role->resize) {
31123d9f2d2aSKevin Wolf             c->role->resize(c);
31133d9f2d2aSKevin Wolf         }
31143d9f2d2aSKevin Wolf     }
31153d9f2d2aSKevin Wolf }
31163d9f2d2aSKevin Wolf 
31173d9f2d2aSKevin Wolf /**
31183d9f2d2aSKevin Wolf  * Truncate file to 'offset' bytes (needed only for file protocols)
31193d9f2d2aSKevin Wolf  */
31203d9f2d2aSKevin Wolf int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
31213d9f2d2aSKevin Wolf                                   PreallocMode prealloc, Error **errp)
31223d9f2d2aSKevin Wolf {
31233d9f2d2aSKevin Wolf     BlockDriverState *bs = child->bs;
31243d9f2d2aSKevin Wolf     BlockDriver *drv = bs->drv;
31251bc5f09fSKevin Wolf     BdrvTrackedRequest req;
31261bc5f09fSKevin Wolf     int64_t old_size, new_bytes;
31273d9f2d2aSKevin Wolf     int ret;
31283d9f2d2aSKevin Wolf 
31303d9f2d2aSKevin Wolf     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
31313d9f2d2aSKevin Wolf     if (!drv) {
31323d9f2d2aSKevin Wolf         error_setg(errp, "No medium inserted");
31333d9f2d2aSKevin Wolf         return -ENOMEDIUM;
31343d9f2d2aSKevin Wolf     }
31353d9f2d2aSKevin Wolf     if (offset < 0) {
31363d9f2d2aSKevin Wolf         error_setg(errp, "Image size cannot be negative");
31373d9f2d2aSKevin Wolf         return -EINVAL;
31383d9f2d2aSKevin Wolf     }
31393d9f2d2aSKevin Wolf 
31401bc5f09fSKevin Wolf     old_size = bdrv_getlength(bs);
31411bc5f09fSKevin Wolf     if (old_size < 0) {
31421bc5f09fSKevin Wolf         error_setg_errno(errp, -old_size, "Failed to get old image size");
31431bc5f09fSKevin Wolf         return old_size;
31441bc5f09fSKevin Wolf     }
31451bc5f09fSKevin Wolf 
31461bc5f09fSKevin Wolf     if (offset > old_size) {
31471bc5f09fSKevin Wolf         new_bytes = offset - old_size;
31481bc5f09fSKevin Wolf     } else {
31491bc5f09fSKevin Wolf         new_bytes = 0;
31501bc5f09fSKevin Wolf     }
31511bc5f09fSKevin Wolf 
31523d9f2d2aSKevin Wolf     bdrv_inc_in_flight(bs);
31535416a11eSFam Zheng     tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
31545416a11eSFam Zheng                           BDRV_TRACKED_TRUNCATE);
31551bc5f09fSKevin Wolf 
31561bc5f09fSKevin Wolf     /* If we are growing the image and potentially using preallocation for the
31571bc5f09fSKevin Wolf      * new area, we need to make sure that no write requests are made to it
31581bc5f09fSKevin Wolf      * concurrently or they might be overwritten by preallocation. */
31591bc5f09fSKevin Wolf     if (new_bytes) {
31601bc5f09fSKevin Wolf         mark_request_serialising(&req, 1);
3161cd47d792SFam Zheng     }
3162cd47d792SFam Zheng     if (bs->read_only) {
3163cd47d792SFam Zheng         error_setg(errp, "Image is read-only");
3164cd47d792SFam Zheng         ret = -EACCES;
3165cd47d792SFam Zheng         goto out;
3166cd47d792SFam Zheng     }
3167cd47d792SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3168cd47d792SFam Zheng                                     0);
3169cd47d792SFam Zheng     if (ret < 0) {
3170cd47d792SFam Zheng         error_setg_errno(errp, -ret,
3171cd47d792SFam Zheng                          "Failed to prepare request for truncation");
3172cd47d792SFam Zheng         goto out;
31731bc5f09fSKevin Wolf     }
31743d9f2d2aSKevin Wolf 
31753d9f2d2aSKevin Wolf     if (!drv->bdrv_co_truncate) {
31763d9f2d2aSKevin Wolf         if (bs->file && drv->is_filter) {
31773d9f2d2aSKevin Wolf             ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
31783d9f2d2aSKevin Wolf             goto out;
31793d9f2d2aSKevin Wolf         }
31803d9f2d2aSKevin Wolf         error_setg(errp, "Image format driver does not support resize");
31813d9f2d2aSKevin Wolf         ret = -ENOTSUP;
31823d9f2d2aSKevin Wolf         goto out;
31833d9f2d2aSKevin Wolf     }
31843d9f2d2aSKevin Wolf 
31853d9f2d2aSKevin Wolf     ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
31863d9f2d2aSKevin Wolf     if (ret < 0) {
31873d9f2d2aSKevin Wolf         goto out;
31883d9f2d2aSKevin Wolf     }
31893d9f2d2aSKevin Wolf     ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
31903d9f2d2aSKevin Wolf     if (ret < 0) {
31913d9f2d2aSKevin Wolf         error_setg_errno(errp, -ret, "Could not refresh total sector count");
31923d9f2d2aSKevin Wolf     } else {
31933d9f2d2aSKevin Wolf         offset = bs->total_sectors * BDRV_SECTOR_SIZE;
31943d9f2d2aSKevin Wolf     }
3195cd47d792SFam Zheng     /* It's possible that truncation succeeded but refresh_total_sectors
3196cd47d792SFam Zheng      * failed, but the latter doesn't affect how we should finish the request.
3197cd47d792SFam Zheng      * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
3198cd47d792SFam Zheng     bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
31993d9f2d2aSKevin Wolf 
32003d9f2d2aSKevin Wolf out:
32011bc5f09fSKevin Wolf     tracked_request_end(&req);
32023d9f2d2aSKevin Wolf     bdrv_dec_in_flight(bs);
32031bc5f09fSKevin Wolf 
32043d9f2d2aSKevin Wolf     return ret;
32053d9f2d2aSKevin Wolf }
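
/* A hedged usage sketch (hypothetical, not part of the original file):
 * growing an image from coroutine context.  bdrv_co_truncate() itself
 * serialises against writes to the newly added area, so a caller only picks
 * the target size and a preallocation mode.  example_grow_image() is an
 * invented name. */
static int coroutine_fn example_grow_image(BdrvChild *child, int64_t new_size)
{
    Error *local_err = NULL;
    int ret = bdrv_co_truncate(child, new_size, PREALLOC_MODE_OFF,
                               &local_err);

    if (ret < 0) {
        error_report_err(local_err);  /* or error_propagate() to the caller */
    }
    return ret;
}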
32063d9f2d2aSKevin Wolf 
32073d9f2d2aSKevin Wolf typedef struct TruncateCo {
32083d9f2d2aSKevin Wolf     BdrvChild *child;
32093d9f2d2aSKevin Wolf     int64_t offset;
32103d9f2d2aSKevin Wolf     PreallocMode prealloc;
32113d9f2d2aSKevin Wolf     Error **errp;
32123d9f2d2aSKevin Wolf     int ret;
32133d9f2d2aSKevin Wolf } TruncateCo;
32143d9f2d2aSKevin Wolf 
32153d9f2d2aSKevin Wolf static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
32163d9f2d2aSKevin Wolf {
32173d9f2d2aSKevin Wolf     TruncateCo *tco = opaque;
32183d9f2d2aSKevin Wolf     tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
32193d9f2d2aSKevin Wolf                                 tco->errp);
32203d9f2d2aSKevin Wolf }
32213d9f2d2aSKevin Wolf 
32223d9f2d2aSKevin Wolf int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
32233d9f2d2aSKevin Wolf                   Error **errp)
32243d9f2d2aSKevin Wolf {
32253d9f2d2aSKevin Wolf     Coroutine *co;
32263d9f2d2aSKevin Wolf     TruncateCo tco = {
32273d9f2d2aSKevin Wolf         .child      = child,
32283d9f2d2aSKevin Wolf         .offset     = offset,
32293d9f2d2aSKevin Wolf         .prealloc   = prealloc,
32303d9f2d2aSKevin Wolf         .errp       = errp,
32313d9f2d2aSKevin Wolf         .ret        = NOT_DONE,
32323d9f2d2aSKevin Wolf     };
32333d9f2d2aSKevin Wolf 
32343d9f2d2aSKevin Wolf     if (qemu_in_coroutine()) {
32353d9f2d2aSKevin Wolf         /* Fast-path if already in coroutine context */
32363d9f2d2aSKevin Wolf         bdrv_truncate_co_entry(&tco);
32373d9f2d2aSKevin Wolf     } else {
32383d9f2d2aSKevin Wolf         co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
32393d9f2d2aSKevin Wolf         qemu_coroutine_enter(co);
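        /* The coroutine may yield while waiting for I/O; poll the BDS's
         * AioContext until the entry function has stored its result in
         * tco.ret (NOT_DONE marks it as still running). */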
32403d9f2d2aSKevin Wolf         BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
32413d9f2d2aSKevin Wolf     }
32423d9f2d2aSKevin Wolf 
32433d9f2d2aSKevin Wolf     return tco.ret;
32443d9f2d2aSKevin Wolf }
3245