/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

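/* Notify all parents of @bs, through their BdrvChildRole callbacks, that the
 * node is about to be drained (or is no longer drained), so they stop or
 * resume issuing new requests to it. */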
void bdrv_parent_drained_begin(BlockDriverState *bs)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}

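/* Illustrative sketch: because the flag is a reference count, independent
 * users may overlap safely:
 *
 *     bdrv_enable_copy_on_read(bs);    // user A: 0 -> 1, COR enabled
 *     bdrv_enable_copy_on_read(bs);    // user B: 1 -> 2, still enabled
 *     bdrv_disable_copy_on_read(bs);   // user A: 2 -> 1, still enabled
 *     bdrv_disable_copy_on_read(bs);   // user B: 1 -> 0, COR disabled
 */
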
/* Check if any requests are in-flight (including throttled requests) */
bool bdrv_requests_pending(BlockDriverState *bs)
{
    BdrvChild *child;

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    QLIST_FOREACH(child, &bs->children, next) {
        if (bdrv_requests_pending(child->bs)) {
            return true;
        }
    }

    return false;
}

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    bs->drv->bdrv_co_drain(bs);

    /* Set data->done before reading bs->wakeup.  */
    atomic_mb_set(&data->done, true);
    bdrv_wakeup(bs);
}

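/* Run the driver's .bdrv_co_drain callback in a coroutine and poll until it
 * completes; a no-op if the driver does not implement the callback. */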
static void bdrv_drain_invoke(BlockDriverState *bs)
{
    BdrvCoDrainData data = { .bs = bs, .done = false };

    if (!bs->drv || !bs->drv->bdrv_co_drain) {
        return;
    }

    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
    bdrv_coroutine_enter(bs, data.co);
    BDRV_POLL_WHILE(bs, !data.done);
}

static bool bdrv_drain_recurse(BlockDriverState *bs)
{
    BdrvChild *child, *tmp;
    bool waited;

    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

    /* Ensure any pending metadata writes are submitted to bs->file.  */
    bdrv_drain_invoke(bs);

    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
        BlockDriverState *bs = child->bs;
        bool in_main_loop =
            qemu_get_current_aio_context() == qemu_get_aio_context();
        assert(bs->refcnt > 0);
        if (in_main_loop) {
            /* In case the recursive bdrv_drain_recurse processes a
             * block_job_defer_to_main_loop BH and modifies the graph,
             * let's hold a reference to bs until we are done.
             *
             * IOThread doesn't have such a BH, and it is not safe to call
             * bdrv_unref without BQL, so skip doing it there.
             */
            bdrv_ref(bs);
        }
        waited |= bdrv_drain_recurse(bs);
        if (in_main_loop) {
            bdrv_unref(bs);
        }
    }

    return waited;
}

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    bdrv_dec_in_flight(bs);
    bdrv_drained_begin(bs);
    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued from
     * qemu_co_queue_run_restart(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
    };
    bdrv_inc_in_flight(bs);
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs);
        return;
    }

    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
        bdrv_parent_drained_begin(bs);
    }

    bdrv_drain_recurse(bs);
}

void bdrv_drained_end(BlockDriverState *bs)
{
    assert(bs->quiesce_counter > 0);
    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
        return;
    }

    bdrv_parent_drained_end(bs);
    aio_enable_external(bdrv_get_aio_context(bs));
}

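/* Typical usage of a drained section (illustrative sketch):
 *
 *     bdrv_drained_begin(bs);    // quiesce parents, wait for in-flight I/O
 *     ... manipulate bs or the graph without new requests racing in ...
 *     bdrv_drained_end(bs);
 *
 * Sections may nest; bs->quiesce_counter ensures that only the outermost
 * begin/end pair toggles aio_disable_external()/aio_enable_external() and
 * notifies the parents.
 */
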
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend the block driver's internal I/O until the next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts.  In that case, use
 * bdrv_drain_all() instead.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk; use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool waited = true;
    BlockDriverState *bs;
    BdrvNextIterator it;
    GSList *aio_ctxs = NULL, *ctx;

    block_job_pause_all();

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_parent_drained_begin(bs);
        aio_disable_external(aio_context);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion.  Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
     */
    while (waited) {
        waited = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;

            aio_context_acquire(aio_context);
            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    waited |= bdrv_drain_recurse(bs);
                }
            }
            aio_context_release(aio_context);
        }
    }

    g_slist_free(aio_ctxs);
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs;
    BdrvNextIterator it;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        aio_enable_external(aio_context);
        bdrv_parent_drained_end(bs);
        aio_context_release(aio_context);
    }

    block_job_resume_all();
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, unsigned int bytes,
                            int64_t *cluster_offset,
                            unsigned int *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}

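/* Worked example (illustrative): with bdi.cluster_size = 65536, a request at
 * offset 70000 for 1000 bytes yields cluster_offset = 65536
 * (QEMU_ALIGN_DOWN(70000, 65536)) and cluster_bytes = 65536
 * (QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536)), i.e. exactly one cluster. */
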
static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

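/* bdrv_inc_in_flight()/bdrv_dec_in_flight() bracket every request on @bs so
 * that the drain code can wait for completion with
 * BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); the decrement side
 * calls bdrv_wakeup() to wake any such poller. */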
void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

static void dummy_bh_cb(void *opaque)
{
}

void bdrv_wakeup(BlockDriverState *bs)
{
    /* The barrier (or an atomic op) is in the caller.  */
    if (atomic_read(&bs->wakeup)) {
        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
    }
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
    BlockDriverState *bs = child->bs;
    BlockDriverState *file;
    int n;

    target_sectors = bdrv_nb_sectors(bs);
    if (target_sectors < 0) {
        return target_sectors;
    }

    for (;;) {
        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
        if (nb_sectors <= 0) {
            return 0;
        }
        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
        if (ret < 0) {
            error_report("error getting block status at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            sector_num += n;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS,
                                 n << BDRV_SECTOR_BITS, flags);
        if (ret < 0) {
            error_report("error writing zeroes at sector %" PRId64 ": %s",
                         sector_num, strerror(-ret));
            return ret;
        }
        sector_num += n;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

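/* Illustrative sketch: synchronously read a 512-byte header through some
 * BdrvChild *child:
 *
 *     uint8_t header[512];
 *     int ret = bdrv_pread(child, 0, header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;    // -errno
 *     }
 *     // on success, ret is the number of bytes read (qiov->size)
 */
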
int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base   = (void *) buf,
        .iov_len    = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

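/* Illustrative sketch: persist a (hypothetical) metadata entry before any
 * later write may depend on it:
 *
 *     ret = bdrv_pwrite_sync(child, entry_offset, &entry, sizeof(entry));
 *     if (ret < 0) {
 *         return ret;    // either the write or the flush failed
 *     }
 */
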
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

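/* Completion callback that bridges the callback-based AIO driver interface
 * to coroutines: store the return value and wake the waiting coroutine. */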
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

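/* Forward a read to the driver, preferring the byte-based .bdrv_co_preadv
 * interface and falling back to the sector-based .bdrv_co_readv or, as a
 * last resort, the callback-based .bdrv_aio_readv emulation. */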
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_readv) {
        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }
}

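/* Forward a write to the driver, trying .bdrv_co_pwritev, then
 * .bdrv_co_writev_flags, .bdrv_co_writev and finally the callback-based
 * .bdrv_aio_writev; any BDRV_REQ_FUA left unsupported by the driver is
 * emulated with a bdrv_co_flush() afterwards (see emulate_flags). */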
86578a07294SKevin Wolf static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
86678a07294SKevin Wolf                                             uint64_t offset, uint64_t bytes,
86778a07294SKevin Wolf                                             QEMUIOVector *qiov, int flags)
86878a07294SKevin Wolf {
86978a07294SKevin Wolf     BlockDriver *drv = bs->drv;
8703fb06697SKevin Wolf     int64_t sector_num;
8713fb06697SKevin Wolf     unsigned int nb_sectors;
87278a07294SKevin Wolf     int ret;
87378a07294SKevin Wolf 
874fa166538SEric Blake     assert(!(flags & ~BDRV_REQ_MASK));
875fa166538SEric Blake 
8763fb06697SKevin Wolf     if (drv->bdrv_co_pwritev) {
877515c2f43SKevin Wolf         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
878515c2f43SKevin Wolf                                    flags & bs->supported_write_flags);
879515c2f43SKevin Wolf         flags &= ~bs->supported_write_flags;
8803fb06697SKevin Wolf         goto emulate_flags;
8813fb06697SKevin Wolf     }
8823fb06697SKevin Wolf 
8833fb06697SKevin Wolf     sector_num = offset >> BDRV_SECTOR_BITS;
8843fb06697SKevin Wolf     nb_sectors = bytes >> BDRV_SECTOR_BITS;
8853fb06697SKevin Wolf 
88678a07294SKevin Wolf     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
88778a07294SKevin Wolf     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
88878a07294SKevin Wolf     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
88978a07294SKevin Wolf 
89078a07294SKevin Wolf     if (drv->bdrv_co_writev_flags) {
89178a07294SKevin Wolf         ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
8924df863f3SEric Blake                                         flags & bs->supported_write_flags);
8934df863f3SEric Blake         flags &= ~bs->supported_write_flags;
89408844473SKevin Wolf     } else if (drv->bdrv_co_writev) {
8954df863f3SEric Blake         assert(!bs->supported_write_flags);
89678a07294SKevin Wolf         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
89708844473SKevin Wolf     } else {
89808844473SKevin Wolf         BlockAIOCB *acb;
89908844473SKevin Wolf         CoroutineIOCompletion co = {
90008844473SKevin Wolf             .coroutine = qemu_coroutine_self(),
90108844473SKevin Wolf         };
90208844473SKevin Wolf 
90308844473SKevin Wolf         acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
90408844473SKevin Wolf                                        bdrv_co_io_em_complete, &co);
90508844473SKevin Wolf         if (acb == NULL) {
9063fb06697SKevin Wolf             ret = -EIO;
90708844473SKevin Wolf         } else {
90808844473SKevin Wolf             qemu_coroutine_yield();
9093fb06697SKevin Wolf             ret = co.ret;
91008844473SKevin Wolf         }
91178a07294SKevin Wolf     }
91278a07294SKevin Wolf 
9133fb06697SKevin Wolf emulate_flags:
9144df863f3SEric Blake     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
91578a07294SKevin Wolf         ret = bdrv_co_flush(bs);
91678a07294SKevin Wolf     }
91778a07294SKevin Wolf 
91878a07294SKevin Wolf     return ret;
91978a07294SKevin Wolf }
92078a07294SKevin Wolf 
92129a298afSPavel Butsykin static int coroutine_fn
92229a298afSPavel Butsykin bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
92329a298afSPavel Butsykin                                uint64_t bytes, QEMUIOVector *qiov)
92429a298afSPavel Butsykin {
92529a298afSPavel Butsykin     BlockDriver *drv = bs->drv;
92629a298afSPavel Butsykin 
92729a298afSPavel Butsykin     if (!drv->bdrv_co_pwritev_compressed) {
92829a298afSPavel Butsykin         return -ENOTSUP;
92929a298afSPavel Butsykin     }
93029a298afSPavel Butsykin 
93129a298afSPavel Butsykin     return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
93229a298afSPavel Butsykin }
93329a298afSPavel Butsykin 
93485c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
935244483e6SKevin Wolf         int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
93661007b31SStefan Hajnoczi {
93785c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
93885c97ca7SKevin Wolf 
93961007b31SStefan Hajnoczi     /* Perform I/O through a temporary buffer so that users who scribble over
94061007b31SStefan Hajnoczi      * their read buffer while the operation is in progress do not end up
94161007b31SStefan Hajnoczi      * modifying the image file.  This is critical for zero-copy guest I/O
94261007b31SStefan Hajnoczi      * where anything might happen inside guest memory.
94361007b31SStefan Hajnoczi      */
94461007b31SStefan Hajnoczi     void *bounce_buffer;
94561007b31SStefan Hajnoczi 
94661007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
94761007b31SStefan Hajnoczi     struct iovec iov;
94861007b31SStefan Hajnoczi     QEMUIOVector bounce_qiov;
949244483e6SKevin Wolf     int64_t cluster_offset;
950244483e6SKevin Wolf     unsigned int cluster_bytes;
95161007b31SStefan Hajnoczi     size_t skip_bytes;
95261007b31SStefan Hajnoczi     int ret;
95361007b31SStefan Hajnoczi 
9541bf03e66SKevin Wolf     /* FIXME We cannot require callers to have write permissions when all they
9551bf03e66SKevin Wolf      * are doing is a read request. If we did things right, write permissions
9561bf03e66SKevin Wolf      * would be obtained anyway, but internally by the copy-on-read code. As
957*765d9df9SEric Blake      * long as it is implemented here rather than in a separate filter driver,
9581bf03e66SKevin Wolf      * the copy-on-read code doesn't have its own BdrvChild, however, for which
9591bf03e66SKevin Wolf      * it could request permissions. Therefore we have to bypass the permission
9601bf03e66SKevin Wolf      * system for the moment. */
9611bf03e66SKevin Wolf     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
962afa4b293SKevin Wolf 
96361007b31SStefan Hajnoczi     /* Cover entire cluster so no additional backing file I/O is required when
96461007b31SStefan Hajnoczi      * allocating cluster in the image file.
96561007b31SStefan Hajnoczi      */
966244483e6SKevin Wolf     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
96761007b31SStefan Hajnoczi 
968244483e6SKevin Wolf     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
969244483e6SKevin Wolf                                    cluster_offset, cluster_bytes);
97061007b31SStefan Hajnoczi 
971244483e6SKevin Wolf     iov.iov_len = cluster_bytes;
97261007b31SStefan Hajnoczi     iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
97361007b31SStefan Hajnoczi     if (bounce_buffer == NULL) {
97461007b31SStefan Hajnoczi         ret = -ENOMEM;
97561007b31SStefan Hajnoczi         goto err;
97661007b31SStefan Hajnoczi     }
97761007b31SStefan Hajnoczi 
97861007b31SStefan Hajnoczi     qemu_iovec_init_external(&bounce_qiov, &iov, 1);
97961007b31SStefan Hajnoczi 
980244483e6SKevin Wolf     ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
981166fe960SKevin Wolf                              &bounce_qiov, 0);
98261007b31SStefan Hajnoczi     if (ret < 0) {
98361007b31SStefan Hajnoczi         goto err;
98461007b31SStefan Hajnoczi     }
98561007b31SStefan Hajnoczi 
986c1499a5eSEric Blake     if (drv->bdrv_co_pwrite_zeroes &&
98761007b31SStefan Hajnoczi         buffer_is_zero(bounce_buffer, iov.iov_len)) {
988a604fa2bSEric Blake         /* FIXME: Should we (perhaps conditionally) be setting
989a604fa2bSEric Blake          * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
990a604fa2bSEric Blake          * that still correctly reads as zero? */
991244483e6SKevin Wolf         ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
99261007b31SStefan Hajnoczi     } else {
99361007b31SStefan Hajnoczi         /* This does not change the data on the disk, it is not necessary
99461007b31SStefan Hajnoczi          * to flush even in cache=writethrough mode.
99561007b31SStefan Hajnoczi          */
996244483e6SKevin Wolf         ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
99778a07294SKevin Wolf                                   &bounce_qiov, 0);
99861007b31SStefan Hajnoczi     }
99961007b31SStefan Hajnoczi 
100061007b31SStefan Hajnoczi     if (ret < 0) {
100161007b31SStefan Hajnoczi         /* It might be okay to ignore write errors for guest requests.  If this
100261007b31SStefan Hajnoczi          * is a deliberate copy-on-read then we don't want to ignore the error.
100361007b31SStefan Hajnoczi          * Simply report it in all cases.
100461007b31SStefan Hajnoczi          */
100561007b31SStefan Hajnoczi         goto err;
100661007b31SStefan Hajnoczi     }
100761007b31SStefan Hajnoczi 
1008244483e6SKevin Wolf     skip_bytes = offset - cluster_offset;
1009244483e6SKevin Wolf     qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
101061007b31SStefan Hajnoczi 
101161007b31SStefan Hajnoczi err:
101261007b31SStefan Hajnoczi     qemu_vfree(bounce_buffer);
101361007b31SStefan Hajnoczi     return ret;
101461007b31SStefan Hajnoczi }
101561007b31SStefan Hajnoczi 
101661007b31SStefan Hajnoczi /*
101761007b31SStefan Hajnoczi  * Forwards an already correctly aligned request to the BlockDriver. This
10181a62d0acSEric Blake  * handles copy on read, zeroing after EOF, and fragmentation of large
10191a62d0acSEric Blake  * reads; any other features must be implemented by the caller.
102061007b31SStefan Hajnoczi  */
102185c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
102261007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
102361007b31SStefan Hajnoczi     int64_t align, QEMUIOVector *qiov, int flags)
102461007b31SStefan Hajnoczi {
102585c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
1026c9d20029SKevin Wolf     int64_t total_bytes, max_bytes;
10271a62d0acSEric Blake     int ret = 0;
10281a62d0acSEric Blake     uint64_t bytes_remaining = bytes;
10291a62d0acSEric Blake     int max_transfer;
103061007b31SStefan Hajnoczi 
103149c07526SKevin Wolf     assert(is_power_of_2(align));
103249c07526SKevin Wolf     assert((offset & (align - 1)) == 0);
103349c07526SKevin Wolf     assert((bytes & (align - 1)) == 0);
103461007b31SStefan Hajnoczi     assert(!qiov || bytes == qiov->size);
1035abb06c5aSDaniel P. Berrange     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
10361a62d0acSEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
10371a62d0acSEric Blake                                    align);
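    /* For example, if the driver reports no limit (bs->bl.max_transfer == 0)
     * and align == 512, MIN_NON_ZERO() yields INT_MAX and QEMU_ALIGN_DOWN()
     * rounds that to 2147483136, the largest aligned size a single driver
     * call below may be asked to transfer. */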
1038a604fa2bSEric Blake 
1039a604fa2bSEric Blake     /* TODO: We would need a per-BDS .supported_read_flags and
1040a604fa2bSEric Blake      * potential fallback support, if we ever implement any read flags
1041a604fa2bSEric Blake      * to pass through to drivers.  For now, there aren't any
1042a604fa2bSEric Blake      * passthrough flags.  */
1043a604fa2bSEric Blake     assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
104461007b31SStefan Hajnoczi 
104561007b31SStefan Hajnoczi     /* Handle Copy on Read and associated serialisation */
104661007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
104761007b31SStefan Hajnoczi         /* If we touch the same cluster it counts as an overlap.  This
104861007b31SStefan Hajnoczi          * guarantees that allocating writes will be serialized and not race
104961007b31SStefan Hajnoczi          * with each other for the same cluster.  For example, in copy-on-read
105061007b31SStefan Hajnoczi          * it ensures that the CoR read and write operations are atomic and
105161007b31SStefan Hajnoczi          * guest writes cannot interleave between them. */
105261007b31SStefan Hajnoczi         mark_request_serialising(req, bdrv_get_cluster_size(bs));
105361007b31SStefan Hajnoczi     }
105461007b31SStefan Hajnoczi 
105561408b25SFam Zheng     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
105661007b31SStefan Hajnoczi         wait_serialising_requests(req);
105761408b25SFam Zheng     }
105861007b31SStefan Hajnoczi 
105961007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
1060d6a644bbSEric Blake         /* TODO: Simplify further once bdrv_is_allocated no longer
1061d6a644bbSEric Blake          * requires sector alignment */
1062d6a644bbSEric Blake         int64_t start = QEMU_ALIGN_DOWN(offset, BDRV_SECTOR_SIZE);
1063d6a644bbSEric Blake         int64_t end = QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE);
1064d6a644bbSEric Blake         int64_t pnum;
106561007b31SStefan Hajnoczi 
1066d6a644bbSEric Blake         ret = bdrv_is_allocated(bs, start, end - start, &pnum);
106761007b31SStefan Hajnoczi         if (ret < 0) {
106861007b31SStefan Hajnoczi             goto out;
106961007b31SStefan Hajnoczi         }
107061007b31SStefan Hajnoczi 
1071d6a644bbSEric Blake         if (!ret || pnum != end - start) {
107285c97ca7SKevin Wolf             ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
107361007b31SStefan Hajnoczi             goto out;
107461007b31SStefan Hajnoczi         }
107561007b31SStefan Hajnoczi     }
107661007b31SStefan Hajnoczi 
10771a62d0acSEric Blake     /* Forward the request to the BlockDriver, possibly fragmenting it */
107849c07526SKevin Wolf     total_bytes = bdrv_getlength(bs);
107949c07526SKevin Wolf     if (total_bytes < 0) {
108049c07526SKevin Wolf         ret = total_bytes;
108161007b31SStefan Hajnoczi         goto out;
108261007b31SStefan Hajnoczi     }
108361007b31SStefan Hajnoczi 
108449c07526SKevin Wolf     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
10851a62d0acSEric Blake     if (bytes <= max_bytes && bytes <= max_transfer) {
1086166fe960SKevin Wolf         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
10871a62d0acSEric Blake         goto out;
108861007b31SStefan Hajnoczi     }
108961007b31SStefan Hajnoczi 
10901a62d0acSEric Blake     while (bytes_remaining) {
10911a62d0acSEric Blake         int num;
10921a62d0acSEric Blake 
10931a62d0acSEric Blake         if (max_bytes) {
10941a62d0acSEric Blake             QEMUIOVector local_qiov;
10951a62d0acSEric Blake 
10961a62d0acSEric Blake             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
10971a62d0acSEric Blake             assert(num);
10981a62d0acSEric Blake             qemu_iovec_init(&local_qiov, qiov->niov);
10991a62d0acSEric Blake             qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
11001a62d0acSEric Blake 
11011a62d0acSEric Blake             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
11021a62d0acSEric Blake                                      num, &local_qiov, 0);
11031a62d0acSEric Blake             max_bytes -= num;
11041a62d0acSEric Blake             qemu_iovec_destroy(&local_qiov);
11051a62d0acSEric Blake         } else {
11061a62d0acSEric Blake             num = bytes_remaining;
11071a62d0acSEric Blake             ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
11081a62d0acSEric Blake                                     bytes_remaining);
11091a62d0acSEric Blake         }
11101a62d0acSEric Blake         if (ret < 0) {
11111a62d0acSEric Blake             goto out;
11121a62d0acSEric Blake         }
11131a62d0acSEric Blake         bytes_remaining -= num;
111461007b31SStefan Hajnoczi     }
111561007b31SStefan Hajnoczi 
111661007b31SStefan Hajnoczi out:
11171a62d0acSEric Blake     return ret < 0 ? ret : 0;
111861007b31SStefan Hajnoczi }
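
/*
 * Illustration of the EOF handling above: for an image with
 * total_bytes == 4096 and an aligned request of offset == 3584,
 * bytes == 1024 (align == 512), max_bytes comes out as 512, so the loop
 * issues one 512-byte driver read and then zero-fills the remaining
 * 512 bytes of the qiov with qemu_iovec_memset(); reads past EOF thus
 * return zeroes without involving the driver.
 */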
111961007b31SStefan Hajnoczi 
112061007b31SStefan Hajnoczi /*
112161007b31SStefan Hajnoczi  * Handle a read request in coroutine context
112261007b31SStefan Hajnoczi  */
1123a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child,
112461007b31SStefan Hajnoczi     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
112561007b31SStefan Hajnoczi     BdrvRequestFlags flags)
112661007b31SStefan Hajnoczi {
1127a03ef88fSKevin Wolf     BlockDriverState *bs = child->bs;
112861007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
112961007b31SStefan Hajnoczi     BdrvTrackedRequest req;
113061007b31SStefan Hajnoczi 
1131a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
113261007b31SStefan Hajnoczi     uint8_t *head_buf = NULL;
113361007b31SStefan Hajnoczi     uint8_t *tail_buf = NULL;
113461007b31SStefan Hajnoczi     QEMUIOVector local_qiov;
113561007b31SStefan Hajnoczi     bool use_local_qiov = false;
113661007b31SStefan Hajnoczi     int ret;
113761007b31SStefan Hajnoczi 
1138f42cf447SDaniel P. Berrange     trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1139f42cf447SDaniel P. Berrange 
114061007b31SStefan Hajnoczi     if (!drv) {
114161007b31SStefan Hajnoczi         return -ENOMEDIUM;
114261007b31SStefan Hajnoczi     }
114361007b31SStefan Hajnoczi 
114461007b31SStefan Hajnoczi     ret = bdrv_check_byte_request(bs, offset, bytes);
114561007b31SStefan Hajnoczi     if (ret < 0) {
114661007b31SStefan Hajnoczi         return ret;
114761007b31SStefan Hajnoczi     }
114861007b31SStefan Hajnoczi 
114999723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
115099723548SPaolo Bonzini 
11519568b511SWen Congyang     /* Don't do copy-on-read if we read data before a write operation */
1152d3faa13eSPaolo Bonzini     if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
115361007b31SStefan Hajnoczi         flags |= BDRV_REQ_COPY_ON_READ;
115461007b31SStefan Hajnoczi     }
115561007b31SStefan Hajnoczi 
115661007b31SStefan Hajnoczi     /* Align read if necessary by padding qiov */
115761007b31SStefan Hajnoczi     if (offset & (align - 1)) {
115861007b31SStefan Hajnoczi         head_buf = qemu_blockalign(bs, align);
115961007b31SStefan Hajnoczi         qemu_iovec_init(&local_qiov, qiov->niov + 2);
116061007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
116161007b31SStefan Hajnoczi         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
116261007b31SStefan Hajnoczi         use_local_qiov = true;
116361007b31SStefan Hajnoczi 
116461007b31SStefan Hajnoczi         bytes += offset & (align - 1);
116561007b31SStefan Hajnoczi         offset = offset & ~(align - 1);
116661007b31SStefan Hajnoczi     }
116761007b31SStefan Hajnoczi 
116861007b31SStefan Hajnoczi     if ((offset + bytes) & (align - 1)) {
116961007b31SStefan Hajnoczi         if (!use_local_qiov) {
117061007b31SStefan Hajnoczi             qemu_iovec_init(&local_qiov, qiov->niov + 1);
117161007b31SStefan Hajnoczi             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
117261007b31SStefan Hajnoczi             use_local_qiov = true;
117361007b31SStefan Hajnoczi         }
117461007b31SStefan Hajnoczi         tail_buf = qemu_blockalign(bs, align);
117561007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, tail_buf,
117661007b31SStefan Hajnoczi                        align - ((offset + bytes) & (align - 1)));
117761007b31SStefan Hajnoczi 
117861007b31SStefan Hajnoczi         bytes = ROUND_UP(bytes, align);
117961007b31SStefan Hajnoczi     }
118061007b31SStefan Hajnoczi 
1181ebde595cSFam Zheng     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
118285c97ca7SKevin Wolf     ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
118361007b31SStefan Hajnoczi                               use_local_qiov ? &local_qiov : qiov,
118461007b31SStefan Hajnoczi                               flags);
118561007b31SStefan Hajnoczi     tracked_request_end(&req);
118699723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
118761007b31SStefan Hajnoczi 
118861007b31SStefan Hajnoczi     if (use_local_qiov) {
118961007b31SStefan Hajnoczi         qemu_iovec_destroy(&local_qiov);
119061007b31SStefan Hajnoczi         qemu_vfree(head_buf);
119161007b31SStefan Hajnoczi         qemu_vfree(tail_buf);
119261007b31SStefan Hajnoczi     }
119361007b31SStefan Hajnoczi 
119461007b31SStefan Hajnoczi     return ret;
119561007b31SStefan Hajnoczi }
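
/*
 * Padding example for the alignment code above (illustrative, with
 * bs->bl.request_alignment == 512): a guest read of offset == 1000,
 * bytes == 2000 gains a 488-byte head (1000 & 511) and a 72-byte tail,
 * becoming the aligned request offset == 512, bytes == 2560.  local_qiov
 * chains head_buf, the caller's qiov and tail_buf, so the extra bytes land
 * in scratch memory that is freed once the request completes.
 */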
119661007b31SStefan Hajnoczi 
1197adad6496SKevin Wolf static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
119861007b31SStefan Hajnoczi     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
119961007b31SStefan Hajnoczi     BdrvRequestFlags flags)
120061007b31SStefan Hajnoczi {
120161007b31SStefan Hajnoczi     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
120261007b31SStefan Hajnoczi         return -EINVAL;
120361007b31SStefan Hajnoczi     }
120461007b31SStefan Hajnoczi 
1205a03ef88fSKevin Wolf     return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
120661007b31SStefan Hajnoczi                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
120761007b31SStefan Hajnoczi }
120861007b31SStefan Hajnoczi 
120928b04a8fSKevin Wolf int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
121061007b31SStefan Hajnoczi                                int nb_sectors, QEMUIOVector *qiov)
121161007b31SStefan Hajnoczi {
1212adad6496SKevin Wolf     return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
121361007b31SStefan Hajnoczi }
121461007b31SStefan Hajnoczi 
12155def6b80SEric Blake /* Maximum buffer for write zeroes fallback, in bytes */
12165def6b80SEric Blake #define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
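/* With 512-byte sectors this is 32768 << 9 == 16 MiB. */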
121761007b31SStefan Hajnoczi 
1218d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1219f5a5ca79SManos Pitsidianakis     int64_t offset, int bytes, BdrvRequestFlags flags)
122061007b31SStefan Hajnoczi {
122161007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
122261007b31SStefan Hajnoczi     QEMUIOVector qiov;
122361007b31SStefan Hajnoczi     struct iovec iov = {0};
122461007b31SStefan Hajnoczi     int ret = 0;
1225465fe887SEric Blake     bool need_flush = false;
1226443668caSDenis V. Lunev     int head = 0;
1227443668caSDenis V. Lunev     int tail = 0;
122861007b31SStefan Hajnoczi 
1229cf081fcaSEric Blake     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1230a5b8dd2cSEric Blake     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1231a5b8dd2cSEric Blake                         bs->bl.request_alignment);
1232b2f95feeSEric Blake     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1233b2f95feeSEric Blake                                     MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1234cf081fcaSEric Blake 
1235b8d0a980SEric Blake     assert(alignment % bs->bl.request_alignment == 0);
1236b8d0a980SEric Blake     head = offset % alignment;
1237f5a5ca79SManos Pitsidianakis     tail = (offset + bytes) % alignment;
1238b8d0a980SEric Blake     max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1239b8d0a980SEric Blake     assert(max_write_zeroes >= bs->bl.request_alignment);
124061007b31SStefan Hajnoczi 
1241f5a5ca79SManos Pitsidianakis     while (bytes > 0 && !ret) {
1242f5a5ca79SManos Pitsidianakis         int num = bytes;
124361007b31SStefan Hajnoczi 
124461007b31SStefan Hajnoczi         /* Align request.  Block drivers can expect that the "bulk" of the
1245443668caSDenis V. Lunev          * request is aligned and that unaligned requests do not cross
1246443668caSDenis V. Lunev          * cluster boundaries.
124761007b31SStefan Hajnoczi          */
1248443668caSDenis V. Lunev         if (head) {
1249b2f95feeSEric Blake             /* Make a small request up to the first aligned sector. For
1250b2f95feeSEric Blake              * convenience, limit this request to max_transfer even if
1251b2f95feeSEric Blake              * we don't need to fall back to writes.  */
1252f5a5ca79SManos Pitsidianakis             num = MIN(MIN(bytes, max_transfer), alignment - head);
1253b2f95feeSEric Blake             head = (head + num) % alignment;
1254b2f95feeSEric Blake             assert(num < max_write_zeroes);
1255d05aa8bbSEric Blake         } else if (tail && num > alignment) {
1256443668caSDenis V. Lunev             /* Shorten the request to the last aligned sector.  */
1257443668caSDenis V. Lunev             num -= tail;
125861007b31SStefan Hajnoczi         }
125961007b31SStefan Hajnoczi 
126061007b31SStefan Hajnoczi         /* limit request size */
126161007b31SStefan Hajnoczi         if (num > max_write_zeroes) {
126261007b31SStefan Hajnoczi             num = max_write_zeroes;
126361007b31SStefan Hajnoczi         }
126461007b31SStefan Hajnoczi 
126561007b31SStefan Hajnoczi         ret = -ENOTSUP;
126661007b31SStefan Hajnoczi         /* First try the efficient write zeroes operation */
1267d05aa8bbSEric Blake         if (drv->bdrv_co_pwrite_zeroes) {
1268d05aa8bbSEric Blake             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1269d05aa8bbSEric Blake                                              flags & bs->supported_zero_flags);
1270d05aa8bbSEric Blake             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1271d05aa8bbSEric Blake                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1272d05aa8bbSEric Blake                 need_flush = true;
1273d05aa8bbSEric Blake             }
1274465fe887SEric Blake         } else {
1275465fe887SEric Blake             assert(!bs->supported_zero_flags);
127661007b31SStefan Hajnoczi         }
127761007b31SStefan Hajnoczi 
127861007b31SStefan Hajnoczi         if (ret == -ENOTSUP) {
127961007b31SStefan Hajnoczi             /* Fall back to bounce buffer if write zeroes is unsupported */
1280465fe887SEric Blake             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1281465fe887SEric Blake 
1282465fe887SEric Blake             if ((flags & BDRV_REQ_FUA) &&
1283465fe887SEric Blake                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1284465fe887SEric Blake                 /* No need for bdrv_driver_pwritev() to do a fallback
1285465fe887SEric Blake                  * flush on each chunk; use just one at the end */
1286465fe887SEric Blake                 write_flags &= ~BDRV_REQ_FUA;
1287465fe887SEric Blake                 need_flush = true;
1288465fe887SEric Blake             }
12895def6b80SEric Blake             num = MIN(num, max_transfer);
1290d05aa8bbSEric Blake             iov.iov_len = num;
129161007b31SStefan Hajnoczi             if (iov.iov_base == NULL) {
1292d05aa8bbSEric Blake                 iov.iov_base = qemu_try_blockalign(bs, num);
129361007b31SStefan Hajnoczi                 if (iov.iov_base == NULL) {
129461007b31SStefan Hajnoczi                     ret = -ENOMEM;
129561007b31SStefan Hajnoczi                     goto fail;
129661007b31SStefan Hajnoczi                 }
1297d05aa8bbSEric Blake                 memset(iov.iov_base, 0, num);
129861007b31SStefan Hajnoczi             }
129961007b31SStefan Hajnoczi             qemu_iovec_init_external(&qiov, &iov, 1);
130061007b31SStefan Hajnoczi 
1301d05aa8bbSEric Blake             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
130261007b31SStefan Hajnoczi 
130361007b31SStefan Hajnoczi             /* Keep bounce buffer around if it is big enough for all
130461007b31SStefan Hajnoczi          * future requests.
130561007b31SStefan Hajnoczi              */
13065def6b80SEric Blake             if (num < max_transfer) {
130761007b31SStefan Hajnoczi                 qemu_vfree(iov.iov_base);
130861007b31SStefan Hajnoczi                 iov.iov_base = NULL;
130961007b31SStefan Hajnoczi             }
131061007b31SStefan Hajnoczi         }
131161007b31SStefan Hajnoczi 
1312d05aa8bbSEric Blake         offset += num;
1313f5a5ca79SManos Pitsidianakis         bytes -= num;
131461007b31SStefan Hajnoczi     }
131561007b31SStefan Hajnoczi 
131661007b31SStefan Hajnoczi fail:
1317465fe887SEric Blake     if (ret == 0 && need_flush) {
1318465fe887SEric Blake         ret = bdrv_co_flush(bs);
1319465fe887SEric Blake     }
132061007b31SStefan Hajnoczi     qemu_vfree(iov.iov_base);
132161007b31SStefan Hajnoczi     return ret;
132261007b31SStefan Hajnoczi }
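
/*
 * Head/tail example for the loop above (illustrative): zeroing
 * offset == 2048, bytes == 12288 with alignment == 4096 takes three
 * iterations: a 2048-byte unaligned head covering [2048, 4096), an
 * 8192-byte aligned middle covering [4096, 12288), and a 2048-byte
 * unaligned tail covering [12288, 14336).  Only the aligned middle piece
 * is guaranteed to reach the driver's efficient zeroing path.
 */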
132361007b31SStefan Hajnoczi 
132461007b31SStefan Hajnoczi /*
132504ed95f4SEric Blake  * Forwards an already correctly aligned write request to the BlockDriver,
132604ed95f4SEric Blake  * after possibly fragmenting it.
132761007b31SStefan Hajnoczi  */
132885c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
132961007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1330cff86b38SEric Blake     int64_t align, QEMUIOVector *qiov, int flags)
133161007b31SStefan Hajnoczi {
133285c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
133361007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
133461007b31SStefan Hajnoczi     bool waited;
133561007b31SStefan Hajnoczi     int ret;
133661007b31SStefan Hajnoczi 
13379896c876SKevin Wolf     int64_t start_sector = offset >> BDRV_SECTOR_BITS;
13389896c876SKevin Wolf     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
133904ed95f4SEric Blake     uint64_t bytes_remaining = bytes;
134004ed95f4SEric Blake     int max_transfer;
134161007b31SStefan Hajnoczi 
1342d6883bc9SVladimir Sementsov-Ogievskiy     if (bdrv_has_readonly_bitmaps(bs)) {
1343d6883bc9SVladimir Sementsov-Ogievskiy         return -EPERM;
1344d6883bc9SVladimir Sementsov-Ogievskiy     }
1345d6883bc9SVladimir Sementsov-Ogievskiy 
1346cff86b38SEric Blake     assert(is_power_of_2(align));
1347cff86b38SEric Blake     assert((offset & (align - 1)) == 0);
1348cff86b38SEric Blake     assert((bytes & (align - 1)) == 0);
134961007b31SStefan Hajnoczi     assert(!qiov || bytes == qiov->size);
1350abb06c5aSDaniel P. Berrange     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1351fa166538SEric Blake     assert(!(flags & ~BDRV_REQ_MASK));
135204ed95f4SEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
135304ed95f4SEric Blake                                    align);
135461007b31SStefan Hajnoczi 
135561007b31SStefan Hajnoczi     waited = wait_serialising_requests(req);
135661007b31SStefan Hajnoczi     assert(!waited || !req->serialising);
135761007b31SStefan Hajnoczi     assert(req->overlap_offset <= offset);
135861007b31SStefan Hajnoczi     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1359362b3786SMax Reitz     assert(child->perm & BLK_PERM_WRITE);
1360362b3786SMax Reitz     assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
136161007b31SStefan Hajnoczi 
136261007b31SStefan Hajnoczi     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
136361007b31SStefan Hajnoczi 
136461007b31SStefan Hajnoczi     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1365c1499a5eSEric Blake         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
136661007b31SStefan Hajnoczi         qemu_iovec_is_zero(qiov)) {
136761007b31SStefan Hajnoczi         flags |= BDRV_REQ_ZERO_WRITE;
136861007b31SStefan Hajnoczi         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
136961007b31SStefan Hajnoczi             flags |= BDRV_REQ_MAY_UNMAP;
137061007b31SStefan Hajnoczi         }
137161007b31SStefan Hajnoczi     }
137261007b31SStefan Hajnoczi 
137361007b31SStefan Hajnoczi     if (ret < 0) {
137461007b31SStefan Hajnoczi         /* Do nothing, write notifier decided to fail this request */
137561007b31SStefan Hajnoczi     } else if (flags & BDRV_REQ_ZERO_WRITE) {
13769a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
13779896c876SKevin Wolf         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
13783ea1a091SPavel Butsykin     } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
13793ea1a091SPavel Butsykin         ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
138004ed95f4SEric Blake     } else if (bytes <= max_transfer) {
13819a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV);
138278a07294SKevin Wolf         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
138304ed95f4SEric Blake     } else {
138404ed95f4SEric Blake         bdrv_debug_event(bs, BLKDBG_PWRITEV);
138504ed95f4SEric Blake         while (bytes_remaining) {
138604ed95f4SEric Blake             int num = MIN(bytes_remaining, max_transfer);
138704ed95f4SEric Blake             QEMUIOVector local_qiov;
138804ed95f4SEric Blake             int local_flags = flags;
138904ed95f4SEric Blake 
139004ed95f4SEric Blake             assert(num);
139104ed95f4SEric Blake             if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
139204ed95f4SEric Blake                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
139304ed95f4SEric Blake                 /* If FUA is going to be emulated by flush, we only
139404ed95f4SEric Blake                  * need to flush on the last iteration */
139504ed95f4SEric Blake                 local_flags &= ~BDRV_REQ_FUA;
139604ed95f4SEric Blake             }
139704ed95f4SEric Blake             qemu_iovec_init(&local_qiov, qiov->niov);
139804ed95f4SEric Blake             qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
139904ed95f4SEric Blake 
140004ed95f4SEric Blake             ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
140104ed95f4SEric Blake                                       num, &local_qiov, local_flags);
140204ed95f4SEric Blake             qemu_iovec_destroy(&local_qiov);
140304ed95f4SEric Blake             if (ret < 0) {
140404ed95f4SEric Blake                 break;
140504ed95f4SEric Blake             }
140604ed95f4SEric Blake             bytes_remaining -= num;
140704ed95f4SEric Blake         }
140861007b31SStefan Hajnoczi     }
14099a4f4c31SKevin Wolf     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
141061007b31SStefan Hajnoczi 
141147fec599SPaolo Bonzini     atomic_inc(&bs->write_gen);
14129896c876SKevin Wolf     bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
141361007b31SStefan Hajnoczi 
1414f7946da2SPaolo Bonzini     stat64_max(&bs->wr_highest_offset, offset + bytes);
141561007b31SStefan Hajnoczi 
141661007b31SStefan Hajnoczi     if (ret >= 0) {
14179896c876SKevin Wolf         bs->total_sectors = MAX(bs->total_sectors, end_sector);
141804ed95f4SEric Blake         ret = 0;
141961007b31SStefan Hajnoczi     }
142061007b31SStefan Hajnoczi 
142161007b31SStefan Hajnoczi     return ret;
142261007b31SStefan Hajnoczi }
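
/*
 * FUA example for the fragmentation loop above (illustrative): a 160 KiB
 * BDRV_REQ_FUA write to a driver without native FUA support and with
 * max_transfer == 64 KiB is split into 64 + 64 + 32 KiB chunks.  The first
 * two chunks drop BDRV_REQ_FUA and only the final one keeps it, so the
 * emulated flush happens once rather than per chunk.
 */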
142361007b31SStefan Hajnoczi 
142485c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
14259eeb6dd1SFam Zheng                                                 int64_t offset,
14269eeb6dd1SFam Zheng                                                 unsigned int bytes,
14279eeb6dd1SFam Zheng                                                 BdrvRequestFlags flags,
14289eeb6dd1SFam Zheng                                                 BdrvTrackedRequest *req)
14299eeb6dd1SFam Zheng {
143085c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
14319eeb6dd1SFam Zheng     uint8_t *buf = NULL;
14329eeb6dd1SFam Zheng     QEMUIOVector local_qiov;
14339eeb6dd1SFam Zheng     struct iovec iov;
1434a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
14359eeb6dd1SFam Zheng     unsigned int head_padding_bytes, tail_padding_bytes;
14369eeb6dd1SFam Zheng     int ret = 0;
14379eeb6dd1SFam Zheng 
14389eeb6dd1SFam Zheng     head_padding_bytes = offset & (align - 1);
1439f13ce1beSDenis V. Lunev     tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
14409eeb6dd1SFam Zheng 
14419eeb6dd1SFam Zheng 
14429eeb6dd1SFam Zheng     assert(flags & BDRV_REQ_ZERO_WRITE);
14439eeb6dd1SFam Zheng     if (head_padding_bytes || tail_padding_bytes) {
14449eeb6dd1SFam Zheng         buf = qemu_blockalign(bs, align);
14459eeb6dd1SFam Zheng         iov = (struct iovec) {
14469eeb6dd1SFam Zheng             .iov_base   = buf,
14479eeb6dd1SFam Zheng             .iov_len    = align,
14489eeb6dd1SFam Zheng         };
14499eeb6dd1SFam Zheng         qemu_iovec_init_external(&local_qiov, &iov, 1);
14509eeb6dd1SFam Zheng     }
14519eeb6dd1SFam Zheng     if (head_padding_bytes) {
14529eeb6dd1SFam Zheng         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
14539eeb6dd1SFam Zheng 
14549eeb6dd1SFam Zheng         /* RMW the unaligned part before head. */
14559eeb6dd1SFam Zheng         mark_request_serialising(req, align);
14569eeb6dd1SFam Zheng         wait_serialising_requests(req);
14579a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
145885c97ca7SKevin Wolf         ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
14599eeb6dd1SFam Zheng                                   align, &local_qiov, 0);
14609eeb6dd1SFam Zheng         if (ret < 0) {
14619eeb6dd1SFam Zheng             goto fail;
14629eeb6dd1SFam Zheng         }
14639a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
14649eeb6dd1SFam Zheng 
14659eeb6dd1SFam Zheng         memset(buf + head_padding_bytes, 0, zero_bytes);
146685c97ca7SKevin Wolf         ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1467cff86b38SEric Blake                                    align, &local_qiov,
14689eeb6dd1SFam Zheng                                    flags & ~BDRV_REQ_ZERO_WRITE);
14699eeb6dd1SFam Zheng         if (ret < 0) {
14709eeb6dd1SFam Zheng             goto fail;
14719eeb6dd1SFam Zheng         }
14729eeb6dd1SFam Zheng         offset += zero_bytes;
14739eeb6dd1SFam Zheng         bytes -= zero_bytes;
14749eeb6dd1SFam Zheng     }
14759eeb6dd1SFam Zheng 
14769eeb6dd1SFam Zheng     assert(!bytes || (offset & (align - 1)) == 0);
14779eeb6dd1SFam Zheng     if (bytes >= align) {
14789eeb6dd1SFam Zheng         /* Write the aligned part in the middle. */
14799eeb6dd1SFam Zheng         uint64_t aligned_bytes = bytes & ~(align - 1);
148085c97ca7SKevin Wolf         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
14819eeb6dd1SFam Zheng                                    NULL, flags);
14829eeb6dd1SFam Zheng         if (ret < 0) {
14839eeb6dd1SFam Zheng             goto fail;
14849eeb6dd1SFam Zheng         }
14859eeb6dd1SFam Zheng         bytes -= aligned_bytes;
14869eeb6dd1SFam Zheng         offset += aligned_bytes;
14879eeb6dd1SFam Zheng     }
14889eeb6dd1SFam Zheng 
14899eeb6dd1SFam Zheng     assert(!bytes || (offset & (align - 1)) == 0);
14909eeb6dd1SFam Zheng     if (bytes) {
14919eeb6dd1SFam Zheng         assert(align == tail_padding_bytes + bytes);
14929eeb6dd1SFam Zheng         /* RMW the unaligned part after tail. */
14939eeb6dd1SFam Zheng         mark_request_serialising(req, align);
14949eeb6dd1SFam Zheng         wait_serialising_requests(req);
14959a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
149685c97ca7SKevin Wolf         ret = bdrv_aligned_preadv(child, req, offset, align,
14979eeb6dd1SFam Zheng                                   align, &local_qiov, 0);
14989eeb6dd1SFam Zheng         if (ret < 0) {
14999eeb6dd1SFam Zheng             goto fail;
15009eeb6dd1SFam Zheng         }
15019a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
15029eeb6dd1SFam Zheng 
15039eeb6dd1SFam Zheng         memset(buf, 0, bytes);
150485c97ca7SKevin Wolf         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
15059eeb6dd1SFam Zheng                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
15069eeb6dd1SFam Zheng     }
15079eeb6dd1SFam Zheng fail:
15089eeb6dd1SFam Zheng     qemu_vfree(buf);
15099eeb6dd1SFam Zheng     return ret;
15109eeb6dd1SFam Zheng 
15119eeb6dd1SFam Zheng }
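
/*
 * Worked example for bdrv_co_do_zero_pwritev() (illustrative, align == 512):
 * zeroing offset == 700, bytes == 2000 performs a head RMW of [512, 1024)
 * that clears [700, 1024), an aligned zero write of [1024, 2560), and a
 * tail RMW of [2560, 3072) that clears [2560, 2700).  Data outside
 * [700, 2700) is preserved by the two read-modify-write steps.
 */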
15129eeb6dd1SFam Zheng 
151361007b31SStefan Hajnoczi /*
151461007b31SStefan Hajnoczi  * Handle a write request in coroutine context
151561007b31SStefan Hajnoczi  */
1516a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
151761007b31SStefan Hajnoczi     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
151861007b31SStefan Hajnoczi     BdrvRequestFlags flags)
151961007b31SStefan Hajnoczi {
1520a03ef88fSKevin Wolf     BlockDriverState *bs = child->bs;
152161007b31SStefan Hajnoczi     BdrvTrackedRequest req;
1522a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
152361007b31SStefan Hajnoczi     uint8_t *head_buf = NULL;
152461007b31SStefan Hajnoczi     uint8_t *tail_buf = NULL;
152561007b31SStefan Hajnoczi     QEMUIOVector local_qiov;
152661007b31SStefan Hajnoczi     bool use_local_qiov = false;
152761007b31SStefan Hajnoczi     int ret;
152861007b31SStefan Hajnoczi 
1529f42cf447SDaniel P. Berrange     trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
1530f42cf447SDaniel P. Berrange 
153161007b31SStefan Hajnoczi     if (!bs->drv) {
153261007b31SStefan Hajnoczi         return -ENOMEDIUM;
153361007b31SStefan Hajnoczi     }
153461007b31SStefan Hajnoczi     if (bs->read_only) {
1535eaf5fe2dSPaolo Bonzini         return -EPERM;
153661007b31SStefan Hajnoczi     }
153704c01a5cSKevin Wolf     assert(!(bs->open_flags & BDRV_O_INACTIVE));
153861007b31SStefan Hajnoczi 
153961007b31SStefan Hajnoczi     ret = bdrv_check_byte_request(bs, offset, bytes);
154061007b31SStefan Hajnoczi     if (ret < 0) {
154161007b31SStefan Hajnoczi         return ret;
154261007b31SStefan Hajnoczi     }
154361007b31SStefan Hajnoczi 
154499723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
154561007b31SStefan Hajnoczi     /*
154661007b31SStefan Hajnoczi      * Align write if necessary by performing a read-modify-write cycle.
154761007b31SStefan Hajnoczi      * Pad qiov with the read parts and be sure to have a tracked request not
154861007b31SStefan Hajnoczi      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
154961007b31SStefan Hajnoczi      */
1550ebde595cSFam Zheng     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
155161007b31SStefan Hajnoczi 
15529eeb6dd1SFam Zheng     if (!qiov) {
155385c97ca7SKevin Wolf         ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
15549eeb6dd1SFam Zheng         goto out;
15559eeb6dd1SFam Zheng     }
15569eeb6dd1SFam Zheng 
155761007b31SStefan Hajnoczi     if (offset & (align - 1)) {
155861007b31SStefan Hajnoczi         QEMUIOVector head_qiov;
155961007b31SStefan Hajnoczi         struct iovec head_iov;
156061007b31SStefan Hajnoczi 
156161007b31SStefan Hajnoczi         mark_request_serialising(&req, align);
156261007b31SStefan Hajnoczi         wait_serialising_requests(&req);
156361007b31SStefan Hajnoczi 
156461007b31SStefan Hajnoczi         head_buf = qemu_blockalign(bs, align);
156561007b31SStefan Hajnoczi         head_iov = (struct iovec) {
156661007b31SStefan Hajnoczi             .iov_base   = head_buf,
156761007b31SStefan Hajnoczi             .iov_len    = align,
156861007b31SStefan Hajnoczi         };
156961007b31SStefan Hajnoczi         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
157061007b31SStefan Hajnoczi 
15719a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
157285c97ca7SKevin Wolf         ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
157361007b31SStefan Hajnoczi                                   align, &head_qiov, 0);
157461007b31SStefan Hajnoczi         if (ret < 0) {
157561007b31SStefan Hajnoczi             goto fail;
157661007b31SStefan Hajnoczi         }
15779a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
157861007b31SStefan Hajnoczi 
157961007b31SStefan Hajnoczi         qemu_iovec_init(&local_qiov, qiov->niov + 2);
158061007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
158161007b31SStefan Hajnoczi         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
158261007b31SStefan Hajnoczi         use_local_qiov = true;
158361007b31SStefan Hajnoczi 
158461007b31SStefan Hajnoczi         bytes += offset & (align - 1);
158561007b31SStefan Hajnoczi         offset = offset & ~(align - 1);
1586117bc3faSPeter Lieven 
1587117bc3faSPeter Lieven         /* We have read the tail already if the request is smaller
1588117bc3faSPeter Lieven          * than one aligned block.
1589117bc3faSPeter Lieven          */
1590117bc3faSPeter Lieven         if (bytes < align) {
1591117bc3faSPeter Lieven             qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1592117bc3faSPeter Lieven             bytes = align;
1593117bc3faSPeter Lieven         }
159461007b31SStefan Hajnoczi     }
159561007b31SStefan Hajnoczi 
159661007b31SStefan Hajnoczi     if ((offset + bytes) & (align - 1)) {
159761007b31SStefan Hajnoczi         QEMUIOVector tail_qiov;
159861007b31SStefan Hajnoczi         struct iovec tail_iov;
159961007b31SStefan Hajnoczi         size_t tail_bytes;
160061007b31SStefan Hajnoczi         bool waited;
160161007b31SStefan Hajnoczi 
160261007b31SStefan Hajnoczi         mark_request_serialising(&req, align);
160361007b31SStefan Hajnoczi         waited = wait_serialising_requests(&req);
160461007b31SStefan Hajnoczi         assert(!waited || !use_local_qiov);
160561007b31SStefan Hajnoczi 
160661007b31SStefan Hajnoczi         tail_buf = qemu_blockalign(bs, align);
160761007b31SStefan Hajnoczi         tail_iov = (struct iovec) {
160861007b31SStefan Hajnoczi             .iov_base   = tail_buf,
160961007b31SStefan Hajnoczi             .iov_len    = align,
161061007b31SStefan Hajnoczi         };
161161007b31SStefan Hajnoczi         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
161261007b31SStefan Hajnoczi 
16139a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
161485c97ca7SKevin Wolf         ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
161585c97ca7SKevin Wolf                                   align, align, &tail_qiov, 0);
161661007b31SStefan Hajnoczi         if (ret < 0) {
161761007b31SStefan Hajnoczi             goto fail;
161861007b31SStefan Hajnoczi         }
16199a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
162061007b31SStefan Hajnoczi 
162161007b31SStefan Hajnoczi         if (!use_local_qiov) {
162261007b31SStefan Hajnoczi             qemu_iovec_init(&local_qiov, qiov->niov + 1);
162361007b31SStefan Hajnoczi             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
162461007b31SStefan Hajnoczi             use_local_qiov = true;
162561007b31SStefan Hajnoczi         }
162661007b31SStefan Hajnoczi 
162761007b31SStefan Hajnoczi         tail_bytes = (offset + bytes) & (align - 1);
162861007b31SStefan Hajnoczi         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
162961007b31SStefan Hajnoczi 
163061007b31SStefan Hajnoczi         bytes = ROUND_UP(bytes, align);
163161007b31SStefan Hajnoczi     }
163261007b31SStefan Hajnoczi 
163385c97ca7SKevin Wolf     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
163461007b31SStefan Hajnoczi                                use_local_qiov ? &local_qiov : qiov,
163561007b31SStefan Hajnoczi                                flags);
163661007b31SStefan Hajnoczi 
163761007b31SStefan Hajnoczi fail:
163861007b31SStefan Hajnoczi 
163961007b31SStefan Hajnoczi     if (use_local_qiov) {
164061007b31SStefan Hajnoczi         qemu_iovec_destroy(&local_qiov);
164161007b31SStefan Hajnoczi     }
164261007b31SStefan Hajnoczi     qemu_vfree(head_buf);
164361007b31SStefan Hajnoczi     qemu_vfree(tail_buf);
16449eeb6dd1SFam Zheng out:
16459eeb6dd1SFam Zheng     tracked_request_end(&req);
164699723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
164761007b31SStefan Hajnoczi     return ret;
164861007b31SStefan Hajnoczi }
164961007b31SStefan Hajnoczi 
1650adad6496SKevin Wolf static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
165161007b31SStefan Hajnoczi     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
165261007b31SStefan Hajnoczi     BdrvRequestFlags flags)
165361007b31SStefan Hajnoczi {
165461007b31SStefan Hajnoczi     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
165561007b31SStefan Hajnoczi         return -EINVAL;
165661007b31SStefan Hajnoczi     }
165761007b31SStefan Hajnoczi 
1658a03ef88fSKevin Wolf     return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
165961007b31SStefan Hajnoczi                            nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
166061007b31SStefan Hajnoczi }
166161007b31SStefan Hajnoczi 
166225ec177dSKevin Wolf int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
166361007b31SStefan Hajnoczi     int nb_sectors, QEMUIOVector *qiov)
166461007b31SStefan Hajnoczi {
1665adad6496SKevin Wolf     return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
166661007b31SStefan Hajnoczi }
166761007b31SStefan Hajnoczi 
1668a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1669f5a5ca79SManos Pitsidianakis                                        int bytes, BdrvRequestFlags flags)
167061007b31SStefan Hajnoczi {
1671f5a5ca79SManos Pitsidianakis     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
167261007b31SStefan Hajnoczi 
1673a03ef88fSKevin Wolf     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
167461007b31SStefan Hajnoczi         flags &= ~BDRV_REQ_MAY_UNMAP;
167561007b31SStefan Hajnoczi     }
167661007b31SStefan Hajnoczi 
1677f5a5ca79SManos Pitsidianakis     return bdrv_co_pwritev(child, offset, bytes, NULL,
167861007b31SStefan Hajnoczi                            BDRV_REQ_ZERO_WRITE | flags);
167961007b31SStefan Hajnoczi }
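
/*
 * Note on the flag filtering above: without BDRV_O_UNMAP, the
 * BDRV_REQ_MAY_UNMAP hint is stripped, so the range still reads back as
 * zeroes afterwards but the driver is not permitted to deallocate
 * (hole-punch) it.
 */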
168061007b31SStefan Hajnoczi 
16814085f5c7SJohn Snow /*
16824085f5c7SJohn Snow  * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
16834085f5c7SJohn Snow  */
16844085f5c7SJohn Snow int bdrv_flush_all(void)
16854085f5c7SJohn Snow {
16864085f5c7SJohn Snow     BdrvNextIterator it;
16874085f5c7SJohn Snow     BlockDriverState *bs = NULL;
16884085f5c7SJohn Snow     int result = 0;
16894085f5c7SJohn Snow 
16904085f5c7SJohn Snow     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
16914085f5c7SJohn Snow         AioContext *aio_context = bdrv_get_aio_context(bs);
16924085f5c7SJohn Snow         int ret;
16934085f5c7SJohn Snow 
16944085f5c7SJohn Snow         aio_context_acquire(aio_context);
16954085f5c7SJohn Snow         ret = bdrv_flush(bs);
16964085f5c7SJohn Snow         if (ret < 0 && !result) {
16974085f5c7SJohn Snow             result = ret;
16984085f5c7SJohn Snow         }
16994085f5c7SJohn Snow         aio_context_release(aio_context);
17004085f5c7SJohn Snow     }
17014085f5c7SJohn Snow 
17024085f5c7SJohn Snow     return result;
17034085f5c7SJohn Snow }
17044085f5c7SJohn Snow 
17054085f5c7SJohn Snow 
170661007b31SStefan Hajnoczi typedef struct BdrvCoGetBlockStatusData {
170761007b31SStefan Hajnoczi     BlockDriverState *bs;
170861007b31SStefan Hajnoczi     BlockDriverState *base;
170967a0fd2aSFam Zheng     BlockDriverState **file;
171061007b31SStefan Hajnoczi     int64_t sector_num;
171161007b31SStefan Hajnoczi     int nb_sectors;
171261007b31SStefan Hajnoczi     int *pnum;
171361007b31SStefan Hajnoczi     int64_t ret;
171461007b31SStefan Hajnoczi     bool done;
171561007b31SStefan Hajnoczi } BdrvCoGetBlockStatusData;
171661007b31SStefan Hajnoczi 
1717f7cc69b3SManos Pitsidianakis int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
1718f7cc69b3SManos Pitsidianakis                                                         int64_t sector_num,
1719f7cc69b3SManos Pitsidianakis                                                         int nb_sectors,
1720f7cc69b3SManos Pitsidianakis                                                         int *pnum,
1721f7cc69b3SManos Pitsidianakis                                                         BlockDriverState **file)
1722f7cc69b3SManos Pitsidianakis {
1723f7cc69b3SManos Pitsidianakis     assert(bs->file && bs->file->bs);
1724f7cc69b3SManos Pitsidianakis     *pnum = nb_sectors;
1725f7cc69b3SManos Pitsidianakis     *file = bs->file->bs;
1726f7cc69b3SManos Pitsidianakis     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
1727f7cc69b3SManos Pitsidianakis            (sector_num << BDRV_SECTOR_BITS);
1728f7cc69b3SManos Pitsidianakis }
1729f7cc69b3SManos Pitsidianakis 
1730f7cc69b3SManos Pitsidianakis int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
1731f7cc69b3SManos Pitsidianakis                                                            int64_t sector_num,
1732f7cc69b3SManos Pitsidianakis                                                            int nb_sectors,
1733f7cc69b3SManos Pitsidianakis                                                            int *pnum,
1734f7cc69b3SManos Pitsidianakis                                                            BlockDriverState **file)
1735f7cc69b3SManos Pitsidianakis {
1736f7cc69b3SManos Pitsidianakis     assert(bs->backing && bs->backing->bs);
1737f7cc69b3SManos Pitsidianakis     *pnum = nb_sectors;
1738f7cc69b3SManos Pitsidianakis     *file = bs->backing->bs;
1739f7cc69b3SManos Pitsidianakis     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
1740f7cc69b3SManos Pitsidianakis            (sector_num << BDRV_SECTOR_BITS);
1741f7cc69b3SManos Pitsidianakis }
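
/*
 * Both helpers above answer the query with BDRV_BLOCK_RAW, which makes
 * bdrv_co_get_block_status() below repeat the query on the returned *file
 * (the protocol or backing node) at the same offset.  Filter drivers that
 * add no mapping of their own can use them directly as their
 * .bdrv_co_get_block_status implementation.
 */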
1742f7cc69b3SManos Pitsidianakis 
174361007b31SStefan Hajnoczi /*
174461007b31SStefan Hajnoczi  * Returns the allocation status of the specified sectors.
174561007b31SStefan Hajnoczi  * Drivers not implementing the functionality are assumed to not support
174661007b31SStefan Hajnoczi  * backing files, hence all their sectors are reported as allocated.
174761007b31SStefan Hajnoczi  *
1748fb0d8654SEric Blake  * If 'sector_num' is beyond the end of the disk image, the return value is
1749fb0d8654SEric Blake  * BDRV_BLOCK_EOF and 'pnum' is set to 0.
175061007b31SStefan Hajnoczi  *
175161007b31SStefan Hajnoczi  * 'pnum' is set to the number of sectors (including and immediately following
175261007b31SStefan Hajnoczi  * the specified sector) that are known to be in the same
175361007b31SStefan Hajnoczi  * allocated/unallocated state.
175461007b31SStefan Hajnoczi  *
175561007b31SStefan Hajnoczi  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1756fb0d8654SEric Blake  * beyond the end of the disk image it will be clamped; if 'pnum' is set to
1757fb0d8654SEric Blake  * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
175867a0fd2aSFam Zheng  *
175967a0fd2aSFam Zheng  * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file'
176067a0fd2aSFam Zheng  * points to the BDS which the sector range is allocated in.
176161007b31SStefan Hajnoczi  */
176261007b31SStefan Hajnoczi static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
176361007b31SStefan Hajnoczi                                                      int64_t sector_num,
176467a0fd2aSFam Zheng                                                      int nb_sectors, int *pnum,
176567a0fd2aSFam Zheng                                                      BlockDriverState **file)
176661007b31SStefan Hajnoczi {
176761007b31SStefan Hajnoczi     int64_t total_sectors;
176861007b31SStefan Hajnoczi     int64_t n;
176961007b31SStefan Hajnoczi     int64_t ret, ret2;
177061007b31SStefan Hajnoczi 
177181c219acSEric Blake     *file = NULL;
177261007b31SStefan Hajnoczi     total_sectors = bdrv_nb_sectors(bs);
177361007b31SStefan Hajnoczi     if (total_sectors < 0) {
177461007b31SStefan Hajnoczi         return total_sectors;
177561007b31SStefan Hajnoczi     }
177661007b31SStefan Hajnoczi 
177761007b31SStefan Hajnoczi     if (sector_num >= total_sectors) {
177861007b31SStefan Hajnoczi         *pnum = 0;
1779fb0d8654SEric Blake         return BDRV_BLOCK_EOF;
178061007b31SStefan Hajnoczi     }
178161007b31SStefan Hajnoczi 
178261007b31SStefan Hajnoczi     n = total_sectors - sector_num;
178361007b31SStefan Hajnoczi     if (n < nb_sectors) {
178461007b31SStefan Hajnoczi         nb_sectors = n;
178561007b31SStefan Hajnoczi     }
178661007b31SStefan Hajnoczi 
178761007b31SStefan Hajnoczi     if (!bs->drv->bdrv_co_get_block_status) {
178861007b31SStefan Hajnoczi         *pnum = nb_sectors;
178961007b31SStefan Hajnoczi         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1790fb0d8654SEric Blake         if (sector_num + nb_sectors == total_sectors) {
1791fb0d8654SEric Blake             ret |= BDRV_BLOCK_EOF;
1792fb0d8654SEric Blake         }
179361007b31SStefan Hajnoczi         if (bs->drv->protocol_name) {
179461007b31SStefan Hajnoczi             ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
179581c219acSEric Blake             *file = bs;
179661007b31SStefan Hajnoczi         }
179761007b31SStefan Hajnoczi         return ret;
179861007b31SStefan Hajnoczi     }
179961007b31SStefan Hajnoczi 
180099723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
180167a0fd2aSFam Zheng     ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
180267a0fd2aSFam Zheng                                             file);
180361007b31SStefan Hajnoczi     if (ret < 0) {
180461007b31SStefan Hajnoczi         *pnum = 0;
180599723548SPaolo Bonzini         goto out;
180661007b31SStefan Hajnoczi     }
180761007b31SStefan Hajnoczi 
180861007b31SStefan Hajnoczi     if (ret & BDRV_BLOCK_RAW) {
180981c219acSEric Blake         assert(ret & BDRV_BLOCK_OFFSET_VALID && *file);
1810ee29d6adSEric Blake         ret = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
181167a0fd2aSFam Zheng                                        *pnum, pnum, file);
181299723548SPaolo Bonzini         goto out;
181361007b31SStefan Hajnoczi     }
181461007b31SStefan Hajnoczi 
181561007b31SStefan Hajnoczi     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
181661007b31SStefan Hajnoczi         ret |= BDRV_BLOCK_ALLOCATED;
1817a53f1a95SPaolo Bonzini     } else {
181861007b31SStefan Hajnoczi         if (bdrv_unallocated_blocks_are_zero(bs)) {
181961007b31SStefan Hajnoczi             ret |= BDRV_BLOCK_ZERO;
1820760e0063SKevin Wolf         } else if (bs->backing) {
1821760e0063SKevin Wolf             BlockDriverState *bs2 = bs->backing->bs;
182261007b31SStefan Hajnoczi             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
182361007b31SStefan Hajnoczi             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
182461007b31SStefan Hajnoczi                 ret |= BDRV_BLOCK_ZERO;
182561007b31SStefan Hajnoczi             }
182661007b31SStefan Hajnoczi         }
182761007b31SStefan Hajnoczi     }
182861007b31SStefan Hajnoczi 
1829ac987b30SFam Zheng     if (*file && *file != bs &&
183061007b31SStefan Hajnoczi         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
183161007b31SStefan Hajnoczi         (ret & BDRV_BLOCK_OFFSET_VALID)) {
183267a0fd2aSFam Zheng         BlockDriverState *file2;
183361007b31SStefan Hajnoczi         int file_pnum;
183461007b31SStefan Hajnoczi 
1835ac987b30SFam Zheng         ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
183667a0fd2aSFam Zheng                                         *pnum, &file_pnum, &file2);
183761007b31SStefan Hajnoczi         if (ret2 >= 0) {
183861007b31SStefan Hajnoczi             /* Ignore errors.  This is just providing extra information; it
183961007b31SStefan Hajnoczi              * is useful but not necessary.
184061007b31SStefan Hajnoczi              */
1841c61e684eSEric Blake             if (ret2 & BDRV_BLOCK_EOF &&
1842c61e684eSEric Blake                 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
1843c61e684eSEric Blake                 /*
1844c61e684eSEric Blake                  * It is valid for the format block driver to read
1845c61e684eSEric Blake                  * beyond the end of the underlying file's current
1846c61e684eSEric Blake                  * size; such areas read as zero.
1847c61e684eSEric Blake                  */
184861007b31SStefan Hajnoczi                 ret |= BDRV_BLOCK_ZERO;
184961007b31SStefan Hajnoczi             } else {
185061007b31SStefan Hajnoczi                 /* Limit request to the range reported by the protocol driver */
185161007b31SStefan Hajnoczi                 *pnum = file_pnum;
185261007b31SStefan Hajnoczi                 ret |= (ret2 & BDRV_BLOCK_ZERO);
185361007b31SStefan Hajnoczi             }
185461007b31SStefan Hajnoczi         }
185561007b31SStefan Hajnoczi     }
185661007b31SStefan Hajnoczi 
185799723548SPaolo Bonzini out:
185899723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
1859fb0d8654SEric Blake     if (ret >= 0 && sector_num + *pnum == total_sectors) {
1860fb0d8654SEric Blake         ret |= BDRV_BLOCK_EOF;
1861fb0d8654SEric Blake     }
186261007b31SStefan Hajnoczi     return ret;
186361007b31SStefan Hajnoczi }
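
/*
 * Interpreting the result (typical combinations): data present in this
 * layer yields BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED, protocol drivers
 * additionally OR in BDRV_BLOCK_OFFSET_VALID plus the byte offset, ranges
 * known to read as zeroes carry BDRV_BLOCK_ZERO, and 0 means "look at the
 * backing file".  BDRV_BLOCK_EOF is ORed in whenever the returned range
 * ends at the image's end.
 */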
186461007b31SStefan Hajnoczi 
1865ba3f0e25SFam Zheng static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1866ba3f0e25SFam Zheng         BlockDriverState *base,
1867ba3f0e25SFam Zheng         int64_t sector_num,
1868ba3f0e25SFam Zheng         int nb_sectors,
186967a0fd2aSFam Zheng         int *pnum,
187067a0fd2aSFam Zheng         BlockDriverState **file)
1871ba3f0e25SFam Zheng {
1872ba3f0e25SFam Zheng     BlockDriverState *p;
1873ba3f0e25SFam Zheng     int64_t ret = 0;
1874c61e684eSEric Blake     bool first = true;
1875ba3f0e25SFam Zheng 
1876ba3f0e25SFam Zheng     assert(bs != base);
1877760e0063SKevin Wolf     for (p = bs; p != base; p = backing_bs(p)) {
187867a0fd2aSFam Zheng         ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1879c61e684eSEric Blake         if (ret < 0) {
1880c61e684eSEric Blake             break;
1881c61e684eSEric Blake         }
1882c61e684eSEric Blake         if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
1883c61e684eSEric Blake             /*
1884c61e684eSEric Blake              * Reading beyond the end of the file continues to read
1885c61e684eSEric Blake              * zeroes, but we can only widen the result to the
1886c61e684eSEric Blake              * unallocated length we learned from an earlier
1887c61e684eSEric Blake              * iteration.
1888c61e684eSEric Blake              */
1889c61e684eSEric Blake             *pnum = nb_sectors;
1890c61e684eSEric Blake         }
1891c61e684eSEric Blake         if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
1892ba3f0e25SFam Zheng             break;
1893ba3f0e25SFam Zheng         }
1894ba3f0e25SFam Zheng         /* [sector_num, pnum] unallocated on this layer, which could be only
1895ba3f0e25SFam Zheng          * the first part of [sector_num, nb_sectors].  */
1896ba3f0e25SFam Zheng         nb_sectors = MIN(nb_sectors, *pnum);
1897c61e684eSEric Blake         first = false;
1898ba3f0e25SFam Zheng     }
1899ba3f0e25SFam Zheng     return ret;
1900ba3f0e25SFam Zheng }
1901ba3f0e25SFam Zheng 
1902ba3f0e25SFam Zheng /* Coroutine wrapper for bdrv_get_block_status_above() */
1903ba3f0e25SFam Zheng static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
190461007b31SStefan Hajnoczi {
190561007b31SStefan Hajnoczi     BdrvCoGetBlockStatusData *data = opaque;
190661007b31SStefan Hajnoczi 
1907ba3f0e25SFam Zheng     data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1908ba3f0e25SFam Zheng                                                data->sector_num,
1909ba3f0e25SFam Zheng                                                data->nb_sectors,
191067a0fd2aSFam Zheng                                                data->pnum,
191167a0fd2aSFam Zheng                                                data->file);
191261007b31SStefan Hajnoczi     data->done = true;
191361007b31SStefan Hajnoczi }
191461007b31SStefan Hajnoczi 
191561007b31SStefan Hajnoczi /*
1916ba3f0e25SFam Zheng  * Synchronous wrapper around bdrv_co_get_block_status_above().
191761007b31SStefan Hajnoczi  *
1918ba3f0e25SFam Zheng  * See bdrv_co_get_block_status_above() for details.
191961007b31SStefan Hajnoczi  */
1920ba3f0e25SFam Zheng int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1921ba3f0e25SFam Zheng                                     BlockDriverState *base,
1922ba3f0e25SFam Zheng                                     int64_t sector_num,
192367a0fd2aSFam Zheng                                     int nb_sectors, int *pnum,
192467a0fd2aSFam Zheng                                     BlockDriverState **file)
192561007b31SStefan Hajnoczi {
192661007b31SStefan Hajnoczi     Coroutine *co;
192761007b31SStefan Hajnoczi     BdrvCoGetBlockStatusData data = {
192861007b31SStefan Hajnoczi         .bs = bs,
1929ba3f0e25SFam Zheng         .base = base,
193067a0fd2aSFam Zheng         .file = file,
193161007b31SStefan Hajnoczi         .sector_num = sector_num,
193261007b31SStefan Hajnoczi         .nb_sectors = nb_sectors,
193361007b31SStefan Hajnoczi         .pnum = pnum,
193461007b31SStefan Hajnoczi         .done = false,
193561007b31SStefan Hajnoczi     };
193661007b31SStefan Hajnoczi 
193761007b31SStefan Hajnoczi     if (qemu_in_coroutine()) {
193861007b31SStefan Hajnoczi         /* Fast-path if already in coroutine context */
1939ba3f0e25SFam Zheng         bdrv_get_block_status_above_co_entry(&data);
194061007b31SStefan Hajnoczi     } else {
19410b8b8753SPaolo Bonzini         co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
19420b8b8753SPaolo Bonzini                                    &data);
1943e92f0e19SFam Zheng         bdrv_coroutine_enter(bs, co);
194488b062c2SPaolo Bonzini         BDRV_POLL_WHILE(bs, !data.done);
194561007b31SStefan Hajnoczi     }
194661007b31SStefan Hajnoczi     return data.ret;
194761007b31SStefan Hajnoczi }
194861007b31SStefan Hajnoczi 
1949ba3f0e25SFam Zheng int64_t bdrv_get_block_status(BlockDriverState *bs,
1950ba3f0e25SFam Zheng                               int64_t sector_num,
195167a0fd2aSFam Zheng                               int nb_sectors, int *pnum,
195267a0fd2aSFam Zheng                               BlockDriverState **file)
1953ba3f0e25SFam Zheng {
1954760e0063SKevin Wolf     return bdrv_get_block_status_above(bs, backing_bs(bs),
195567a0fd2aSFam Zheng                                        sector_num, nb_sectors, pnum, file);
1956ba3f0e25SFam Zheng }
1957ba3f0e25SFam Zheng 
1958d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
1959d6a644bbSEric Blake                                    int64_t bytes, int64_t *pnum)
196061007b31SStefan Hajnoczi {
196167a0fd2aSFam Zheng     BlockDriverState *file;
1962d6a644bbSEric Blake     int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1963d6a644bbSEric Blake     int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1964d6a644bbSEric Blake     int64_t ret;
1965d6a644bbSEric Blake     int psectors;
1966d6a644bbSEric Blake 
1967d6a644bbSEric Blake     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1968d6a644bbSEric Blake     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE) && bytes < INT_MAX);
1969d6a644bbSEric Blake     ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &psectors,
197067a0fd2aSFam Zheng                                 &file);
197161007b31SStefan Hajnoczi     if (ret < 0) {
197261007b31SStefan Hajnoczi         return ret;
197361007b31SStefan Hajnoczi     }
1974d6a644bbSEric Blake     if (pnum) {
1975d6a644bbSEric Blake         *pnum = psectors * BDRV_SECTOR_SIZE;
1976d6a644bbSEric Blake     }
197761007b31SStefan Hajnoczi     return !!(ret & BDRV_BLOCK_ALLOCATED);
197861007b31SStefan Hajnoczi }
197961007b31SStefan Hajnoczi 
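/*
 * Editorial sketch (hypothetical helper): walking a whole image with
 * bdrv_is_allocated().  Assumes a sector-aligned image length, which the
 * asserts above require, and caps each query below INT_MAX bytes.
 */
static int coroutine_fn example_count_allocated(BlockDriverState *bs,
                                                int64_t *allocated)
{
    int64_t offset = 0;
    int64_t max_bytes = BDRV_REQUEST_MAX_SECTORS * BDRV_SECTOR_SIZE;
    int64_t len = bdrv_getlength(bs);

    if (len < 0) {
        return len;
    }
    *allocated = 0;
    while (offset < len) {
        int64_t pnum;
        int ret = bdrv_is_allocated(bs, offset, MIN(len - offset, max_bytes),
                                    &pnum);

        if (ret < 0) {
            return ret;
        }
        if (ret) {
            *allocated += pnum;
        }
        offset += pnum;   /* pnum > 0 here since offset < len */
    }
    return 0;
}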
198061007b31SStefan Hajnoczi /*
198161007b31SStefan Hajnoczi  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
198261007b31SStefan Hajnoczi  *
198351b0a488SEric Blake  * Return true if (a prefix of) the given range is allocated in any image
198451b0a488SEric Blake  * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
198551b0a488SEric Blake  * offset is allocated in any image of the chain.  Return false otherwise,
1986d6a644bbSEric Blake  * or negative errno on failure.
198761007b31SStefan Hajnoczi  *
198851b0a488SEric Blake  * 'pnum' is set to the number of bytes (including and immediately
198951b0a488SEric Blake  * following the specified offset) that are known to be in the same
199051b0a488SEric Blake  * allocated/unallocated state.  Note that a subsequent call starting
199151b0a488SEric Blake  * at 'offset + *pnum' may return the same allocation status (in other
199251b0a488SEric Blake  * words, the result is not necessarily the maximum possible range);
199351b0a488SEric Blake  * but 'pnum' will only be 0 when end of file is reached.
199461007b31SStefan Hajnoczi  *
199561007b31SStefan Hajnoczi  */
199661007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top,
199761007b31SStefan Hajnoczi                             BlockDriverState *base,
199851b0a488SEric Blake                             int64_t offset, int64_t bytes, int64_t *pnum)
199961007b31SStefan Hajnoczi {
200061007b31SStefan Hajnoczi     BlockDriverState *intermediate;
200151b0a488SEric Blake     int ret;
200251b0a488SEric Blake     int64_t n = bytes;
200361007b31SStefan Hajnoczi 
200461007b31SStefan Hajnoczi     intermediate = top;
200561007b31SStefan Hajnoczi     while (intermediate && intermediate != base) {
2006d6a644bbSEric Blake         int64_t pnum_inter;
2007c00716beSEric Blake         int64_t size_inter;
2008d6a644bbSEric Blake 
200951b0a488SEric Blake         ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
201061007b31SStefan Hajnoczi         if (ret < 0) {
201161007b31SStefan Hajnoczi             return ret;
2012d6a644bbSEric Blake         }
2013d6a644bbSEric Blake         if (ret) {
201451b0a488SEric Blake             *pnum = pnum_inter;
201561007b31SStefan Hajnoczi             return 1;
201661007b31SStefan Hajnoczi         }
201761007b31SStefan Hajnoczi 
201851b0a488SEric Blake         size_inter = bdrv_getlength(intermediate);
2019c00716beSEric Blake         if (size_inter < 0) {
2020c00716beSEric Blake             return size_inter;
2021c00716beSEric Blake         }
202251b0a488SEric Blake         if (n > pnum_inter &&
202351b0a488SEric Blake             (intermediate == top || offset + pnum_inter < size_inter)) {
202451b0a488SEric Blake             n = pnum_inter;
202561007b31SStefan Hajnoczi         }
202661007b31SStefan Hajnoczi 
2027760e0063SKevin Wolf         intermediate = backing_bs(intermediate);
202861007b31SStefan Hajnoczi     }
202961007b31SStefan Hajnoczi 
203061007b31SStefan Hajnoczi     *pnum = n;
203161007b31SStefan Hajnoczi     return 0;
203261007b31SStefan Hajnoczi }
203361007b31SStefan Hajnoczi 
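/*
 * Editorial sketch (hypothetical helper): how a commit-style caller
 * could use bdrv_is_allocated_above() to measure how much data the
 * layers above 'base' actually contribute to a range.
 */
static int64_t example_bytes_above(BlockDriverState *top,
                                   BlockDriverState *base,
                                   int64_t offset, int64_t bytes)
{
    int64_t total = 0;

    while (bytes > 0) {
        int64_t pnum;
        int ret = bdrv_is_allocated_above(top, base, offset, bytes, &pnum);

        if (ret < 0) {
            return ret;
        }
        if (pnum == 0) {
            break;          /* pnum is only 0 at end of file */
        }
        if (ret) {
            total += pnum;  /* allocated somewhere between top and base */
        }
        offset += pnum;
        bytes -= pnum;
    }
    return total;
}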
20341a8ae822SKevin Wolf typedef struct BdrvVmstateCo {
20351a8ae822SKevin Wolf     BlockDriverState    *bs;
20361a8ae822SKevin Wolf     QEMUIOVector        *qiov;
20371a8ae822SKevin Wolf     int64_t             pos;
20381a8ae822SKevin Wolf     bool                is_read;
20391a8ae822SKevin Wolf     int                 ret;
20401a8ae822SKevin Wolf } BdrvVmstateCo;
20411a8ae822SKevin Wolf 
20421a8ae822SKevin Wolf static int coroutine_fn
20431a8ae822SKevin Wolf bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
20441a8ae822SKevin Wolf                    bool is_read)
20451a8ae822SKevin Wolf {
20461a8ae822SKevin Wolf     BlockDriver *drv = bs->drv;
2047dc88a467SStefan Hajnoczi     int ret = -ENOTSUP;
2048dc88a467SStefan Hajnoczi 
2049dc88a467SStefan Hajnoczi     bdrv_inc_in_flight(bs);
20501a8ae822SKevin Wolf 
20511a8ae822SKevin Wolf     if (!drv) {
2052dc88a467SStefan Hajnoczi         ret = -ENOMEDIUM;
20531a8ae822SKevin Wolf     } else if (drv->bdrv_load_vmstate) {
2054dc88a467SStefan Hajnoczi         if (is_read) {
2055dc88a467SStefan Hajnoczi             ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2056dc88a467SStefan Hajnoczi         } else {
2057dc88a467SStefan Hajnoczi             ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2058dc88a467SStefan Hajnoczi         }
20591a8ae822SKevin Wolf     } else if (bs->file) {
2060dc88a467SStefan Hajnoczi         ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
20611a8ae822SKevin Wolf     }
20621a8ae822SKevin Wolf 
2063dc88a467SStefan Hajnoczi     bdrv_dec_in_flight(bs);
2064dc88a467SStefan Hajnoczi     return ret;
20651a8ae822SKevin Wolf }
20661a8ae822SKevin Wolf 
20671a8ae822SKevin Wolf static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
20681a8ae822SKevin Wolf {
20691a8ae822SKevin Wolf     BdrvVmstateCo *co = opaque;
20701a8ae822SKevin Wolf     co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
20711a8ae822SKevin Wolf }
20721a8ae822SKevin Wolf 
20731a8ae822SKevin Wolf static inline int
20741a8ae822SKevin Wolf bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
20751a8ae822SKevin Wolf                 bool is_read)
20761a8ae822SKevin Wolf {
20771a8ae822SKevin Wolf     if (qemu_in_coroutine()) {
20781a8ae822SKevin Wolf         return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
20791a8ae822SKevin Wolf     } else {
20801a8ae822SKevin Wolf         BdrvVmstateCo data = {
20811a8ae822SKevin Wolf             .bs         = bs,
20821a8ae822SKevin Wolf             .qiov       = qiov,
20831a8ae822SKevin Wolf             .pos        = pos,
20841a8ae822SKevin Wolf             .is_read    = is_read,
20851a8ae822SKevin Wolf             .ret        = -EINPROGRESS,
20861a8ae822SKevin Wolf         };
20870b8b8753SPaolo Bonzini         Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
20881a8ae822SKevin Wolf 
2089e92f0e19SFam Zheng         bdrv_coroutine_enter(bs, co);
2090ea17c9d2SStefan Hajnoczi         BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
20911a8ae822SKevin Wolf         return data.ret;
20921a8ae822SKevin Wolf     }
20931a8ae822SKevin Wolf }
20941a8ae822SKevin Wolf 
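/*
 * Editorial note (illustration, not part of the original file): the
 * in-coroutine fast path followed by create-enter-poll above is the
 * standard pattern this file uses to expose coroutine_fn code through a
 * synchronous interface.  A stripped-down template with hypothetical
 * names:
 */
typedef struct ExampleCo {
    BlockDriverState *bs;
    int ret;
} ExampleCo;

static void coroutine_fn example_co_entry(void *opaque)
{
    ExampleCo *e = opaque;

    e->ret = 0; /* the real coroutine_fn work would go here */
}

static int example_sync_wrapper(BlockDriverState *bs)
{
    ExampleCo e = { .bs = bs, .ret = -EINPROGRESS };

    if (qemu_in_coroutine()) {
        /* Fast path: coroutine_fn code may be called directly */
        example_co_entry(&e);
    } else {
        Coroutine *co = qemu_coroutine_create(example_co_entry, &e);

        bdrv_coroutine_enter(bs, co);           /* run in bs's AioContext */
        BDRV_POLL_WHILE(bs, e.ret == -EINPROGRESS);
    }
    return e.ret;
}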
209561007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
209661007b31SStefan Hajnoczi                       int64_t pos, int size)
209761007b31SStefan Hajnoczi {
209861007b31SStefan Hajnoczi     QEMUIOVector qiov;
209961007b31SStefan Hajnoczi     struct iovec iov = {
210061007b31SStefan Hajnoczi         .iov_base   = (void *) buf,
210161007b31SStefan Hajnoczi         .iov_len    = size,
210261007b31SStefan Hajnoczi     };
2103b433d942SKevin Wolf     int ret;
210461007b31SStefan Hajnoczi 
210561007b31SStefan Hajnoczi     qemu_iovec_init_external(&qiov, &iov, 1);
2106b433d942SKevin Wolf 
2107b433d942SKevin Wolf     ret = bdrv_writev_vmstate(bs, &qiov, pos);
2108b433d942SKevin Wolf     if (ret < 0) {
2109b433d942SKevin Wolf         return ret;
2110b433d942SKevin Wolf     }
2111b433d942SKevin Wolf 
2112b433d942SKevin Wolf     return size;
211361007b31SStefan Hajnoczi }
211461007b31SStefan Hajnoczi 
211561007b31SStefan Hajnoczi int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
211661007b31SStefan Hajnoczi {
21171a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, false);
211861007b31SStefan Hajnoczi }
211961007b31SStefan Hajnoczi 
212061007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
212161007b31SStefan Hajnoczi                       int64_t pos, int size)
212261007b31SStefan Hajnoczi {
21235ddda0b8SKevin Wolf     QEMUIOVector qiov;
21245ddda0b8SKevin Wolf     struct iovec iov = {
21255ddda0b8SKevin Wolf         .iov_base   = buf,
21265ddda0b8SKevin Wolf         .iov_len    = size,
21275ddda0b8SKevin Wolf     };
2128b433d942SKevin Wolf     int ret;
21295ddda0b8SKevin Wolf 
21305ddda0b8SKevin Wolf     qemu_iovec_init_external(&qiov, &iov, 1);
2131b433d942SKevin Wolf     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2132b433d942SKevin Wolf     if (ret < 0) {
2133b433d942SKevin Wolf         return ret;
2134b433d942SKevin Wolf     }
2135b433d942SKevin Wolf 
2136b433d942SKevin Wolf     return size;
21375ddda0b8SKevin Wolf }
21385ddda0b8SKevin Wolf 
21395ddda0b8SKevin Wolf int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
21405ddda0b8SKevin Wolf {
21411a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, true);
214261007b31SStefan Hajnoczi }
214361007b31SStefan Hajnoczi 
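/*
 * Editorial illustration (hypothetical helper): persisting a small blob
 * through the byte-buffer convenience wrapper above.  This only works on
 * top of a driver with vmstate support (e.g. qcow2).
 */
static int example_store_state(BlockDriverState *bs,
                               const uint8_t *blob, int size)
{
    int ret = bdrv_save_vmstate(bs, blob, 0, size);

    /* On success the full 'size' is reported; anything < 0 is an errno */
    return ret < 0 ? ret : 0;
}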
214461007b31SStefan Hajnoczi /**************************************************************/
214561007b31SStefan Hajnoczi /* async I/Os */
214661007b31SStefan Hajnoczi 
214761007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb)
214861007b31SStefan Hajnoczi {
214961007b31SStefan Hajnoczi     qemu_aio_ref(acb);
215061007b31SStefan Hajnoczi     bdrv_aio_cancel_async(acb);
215161007b31SStefan Hajnoczi     while (acb->refcnt > 1) {
215261007b31SStefan Hajnoczi         if (acb->aiocb_info->get_aio_context) {
215361007b31SStefan Hajnoczi             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
215461007b31SStefan Hajnoczi         } else if (acb->bs) {
21552f47da5fSPaolo Bonzini             /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
21562f47da5fSPaolo Bonzini              * assert that we're not using an I/O thread.  Thread-safe
21572f47da5fSPaolo Bonzini              * code should use bdrv_aio_cancel_async exclusively.
21582f47da5fSPaolo Bonzini              */
21592f47da5fSPaolo Bonzini             assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
216061007b31SStefan Hajnoczi             aio_poll(bdrv_get_aio_context(acb->bs), true);
216161007b31SStefan Hajnoczi         } else {
216261007b31SStefan Hajnoczi             abort();
216361007b31SStefan Hajnoczi         }
216461007b31SStefan Hajnoczi     }
216561007b31SStefan Hajnoczi     qemu_aio_unref(acb);
216661007b31SStefan Hajnoczi }
216761007b31SStefan Hajnoczi 
216861007b31SStefan Hajnoczi /* Async version of aio cancel. If the acb implements cancel_async, the caller
216961007b31SStefan Hajnoczi  * is not blocked; otherwise we do nothing and let the request complete
217061007b31SStefan Hajnoczi  * normally. In either case the completion callback must be called. */
217161007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb)
217261007b31SStefan Hajnoczi {
217361007b31SStefan Hajnoczi     if (acb->aiocb_info->cancel_async) {
217461007b31SStefan Hajnoczi         acb->aiocb_info->cancel_async(acb);
217561007b31SStefan Hajnoczi     }
217661007b31SStefan Hajnoczi }
217761007b31SStefan Hajnoczi 
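/*
 * Editorial sketch (hypothetical names, not part of the original file):
 * the shape of an AIOCB whose requests support asynchronous cancel.  The
 * cancel_async callback merely flags the request; the request's normal
 * completion path must still run the completion callback exactly once,
 * as required by the contract above.
 */
typedef struct ExampleAIOCB {
    BlockAIOCB common;
    bool cancelled;
} ExampleAIOCB;

static void example_cancel_async(BlockAIOCB *blockacb)
{
    ExampleAIOCB *acb = container_of(blockacb, ExampleAIOCB, common);

    acb->cancelled = true;   /* completion will report -ECANCELED */
}

static const AIOCBInfo example_aiocb_info = {
    .aiocb_size   = sizeof(ExampleAIOCB),
    .cancel_async = example_cancel_async,
};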
217861007b31SStefan Hajnoczi /**************************************************************/
217961007b31SStefan Hajnoczi /* Coroutine block device emulation */
218061007b31SStefan Hajnoczi 
2181e293b7a3SKevin Wolf typedef struct FlushCo {
2182e293b7a3SKevin Wolf     BlockDriverState *bs;
2183e293b7a3SKevin Wolf     int ret;
2184e293b7a3SKevin Wolf } FlushCo;
2185e293b7a3SKevin Wolf 
2186e293b7a3SKevin Wolf 
218761007b31SStefan Hajnoczi static void coroutine_fn bdrv_flush_co_entry(void *opaque)
218861007b31SStefan Hajnoczi {
2189e293b7a3SKevin Wolf     FlushCo *rwco = opaque;
219061007b31SStefan Hajnoczi 
219161007b31SStefan Hajnoczi     rwco->ret = bdrv_co_flush(rwco->bs);
219261007b31SStefan Hajnoczi }
219361007b31SStefan Hajnoczi 
219461007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
219561007b31SStefan Hajnoczi {
219649ca6259SFam Zheng     int current_gen;
219749ca6259SFam Zheng     int ret = 0;
219861007b31SStefan Hajnoczi 
219999723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
2200c32b82afSPavel Dovgalyuk 
2201e914404eSFam Zheng     if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
220249ca6259SFam Zheng         bdrv_is_sg(bs)) {
220349ca6259SFam Zheng         goto early_exit;
220449ca6259SFam Zheng     }
220549ca6259SFam Zheng 
22063783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
220747fec599SPaolo Bonzini     current_gen = atomic_read(&bs->write_gen);
22083ff2f67aSEvgeny Yakovlev 
22093ff2f67aSEvgeny Yakovlev     /* Wait until any previous flushes are completed */
221099723548SPaolo Bonzini     while (bs->active_flush_req) {
22113783fa3dSPaolo Bonzini         qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
22123ff2f67aSEvgeny Yakovlev     }
22133ff2f67aSEvgeny Yakovlev 
22143783fa3dSPaolo Bonzini     /* Flushes reach this point in nondecreasing current_gen order.  */
221599723548SPaolo Bonzini     bs->active_flush_req = true;
22163783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
22173ff2f67aSEvgeny Yakovlev 
2218c32b82afSPavel Dovgalyuk     /* Write back all layers by calling one driver function */
2219c32b82afSPavel Dovgalyuk     if (bs->drv->bdrv_co_flush) {
2220c32b82afSPavel Dovgalyuk         ret = bs->drv->bdrv_co_flush(bs);
2221c32b82afSPavel Dovgalyuk         goto out;
2222c32b82afSPavel Dovgalyuk     }
2223c32b82afSPavel Dovgalyuk 
222461007b31SStefan Hajnoczi     /* Write back cached data to the OS even with cache=unsafe */
222561007b31SStefan Hajnoczi     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
222661007b31SStefan Hajnoczi     if (bs->drv->bdrv_co_flush_to_os) {
222761007b31SStefan Hajnoczi         ret = bs->drv->bdrv_co_flush_to_os(bs);
222861007b31SStefan Hajnoczi         if (ret < 0) {
2229cdb5e315SFam Zheng             goto out;
223061007b31SStefan Hajnoczi         }
223161007b31SStefan Hajnoczi     }
223261007b31SStefan Hajnoczi 
223361007b31SStefan Hajnoczi     /* But don't actually force it to the disk with cache=unsafe */
223461007b31SStefan Hajnoczi     if (bs->open_flags & BDRV_O_NO_FLUSH) {
223561007b31SStefan Hajnoczi         goto flush_parent;
223661007b31SStefan Hajnoczi     }
223761007b31SStefan Hajnoczi 
22383ff2f67aSEvgeny Yakovlev     /* Check if we really need to flush anything */
22393ff2f67aSEvgeny Yakovlev     if (bs->flushed_gen == current_gen) {
22403ff2f67aSEvgeny Yakovlev         goto flush_parent;
22413ff2f67aSEvgeny Yakovlev     }
22423ff2f67aSEvgeny Yakovlev 
224361007b31SStefan Hajnoczi     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
224461007b31SStefan Hajnoczi     if (bs->drv->bdrv_co_flush_to_disk) {
224561007b31SStefan Hajnoczi         ret = bs->drv->bdrv_co_flush_to_disk(bs);
224661007b31SStefan Hajnoczi     } else if (bs->drv->bdrv_aio_flush) {
224761007b31SStefan Hajnoczi         BlockAIOCB *acb;
224861007b31SStefan Hajnoczi         CoroutineIOCompletion co = {
224961007b31SStefan Hajnoczi             .coroutine = qemu_coroutine_self(),
225061007b31SStefan Hajnoczi         };
225161007b31SStefan Hajnoczi 
225261007b31SStefan Hajnoczi         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
225361007b31SStefan Hajnoczi         if (acb == NULL) {
225461007b31SStefan Hajnoczi             ret = -EIO;
225561007b31SStefan Hajnoczi         } else {
225661007b31SStefan Hajnoczi             qemu_coroutine_yield();
225761007b31SStefan Hajnoczi             ret = co.ret;
225861007b31SStefan Hajnoczi         }
225961007b31SStefan Hajnoczi     } else {
226061007b31SStefan Hajnoczi         /*
226161007b31SStefan Hajnoczi          * Some block drivers always operate in either writethrough or unsafe
226261007b31SStefan Hajnoczi          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
226361007b31SStefan Hajnoczi          * know how the server works (because the behaviour is hardcoded or
226461007b31SStefan Hajnoczi          * depends on server-side configuration), so we can't ensure that
226561007b31SStefan Hajnoczi          * everything is safe on disk. Returning an error doesn't work because
226661007b31SStefan Hajnoczi          * that would break guests even if the server operates in writethrough
226761007b31SStefan Hajnoczi          * mode.
226861007b31SStefan Hajnoczi          *
226961007b31SStefan Hajnoczi          * Let's hope the user knows what they're doing.
227061007b31SStefan Hajnoczi          */
227161007b31SStefan Hajnoczi         ret = 0;
227261007b31SStefan Hajnoczi     }
22733ff2f67aSEvgeny Yakovlev 
227461007b31SStefan Hajnoczi     if (ret < 0) {
2275cdb5e315SFam Zheng         goto out;
227661007b31SStefan Hajnoczi     }
227761007b31SStefan Hajnoczi 
227861007b31SStefan Hajnoczi     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
227961007b31SStefan Hajnoczi      * set in the case of cache=unsafe, so there are no useless flushes.
228061007b31SStefan Hajnoczi      */
228161007b31SStefan Hajnoczi flush_parent:
2282cdb5e315SFam Zheng     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2283cdb5e315SFam Zheng out:
22843ff2f67aSEvgeny Yakovlev     /* Notify any pending flushes that we have completed */
2285e6af1e08SKevin Wolf     if (ret == 0) {
22863ff2f67aSEvgeny Yakovlev         bs->flushed_gen = current_gen;
2287e6af1e08SKevin Wolf     }
22883783fa3dSPaolo Bonzini 
22893783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
229099723548SPaolo Bonzini     bs->active_flush_req = false;
2291156af3acSDenis V. Lunev     /* Return value is ignored - it's ok if wait queue is empty */
2292156af3acSDenis V. Lunev     qemu_co_queue_next(&bs->flush_queue);
22933783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
22943ff2f67aSEvgeny Yakovlev 
229549ca6259SFam Zheng early_exit:
229699723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
2297cdb5e315SFam Zheng     return ret;
229861007b31SStefan Hajnoczi }
229961007b31SStefan Hajnoczi 
230061007b31SStefan Hajnoczi int bdrv_flush(BlockDriverState *bs)
230161007b31SStefan Hajnoczi {
230261007b31SStefan Hajnoczi     Coroutine *co;
2303e293b7a3SKevin Wolf     FlushCo flush_co = {
230461007b31SStefan Hajnoczi         .bs = bs,
230561007b31SStefan Hajnoczi         .ret = NOT_DONE,
230661007b31SStefan Hajnoczi     };
230761007b31SStefan Hajnoczi 
230861007b31SStefan Hajnoczi     if (qemu_in_coroutine()) {
230961007b31SStefan Hajnoczi         /* Fast-path if already in coroutine context */
2310e293b7a3SKevin Wolf         bdrv_flush_co_entry(&flush_co);
231161007b31SStefan Hajnoczi     } else {
23120b8b8753SPaolo Bonzini         co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2313e92f0e19SFam Zheng         bdrv_coroutine_enter(bs, co);
231488b062c2SPaolo Bonzini         BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
231561007b31SStefan Hajnoczi     }
231661007b31SStefan Hajnoczi 
2317e293b7a3SKevin Wolf     return flush_co.ret;
231861007b31SStefan Hajnoczi }
231961007b31SStefan Hajnoczi 
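/*
 * Editorial illustration (hypothetical helper): the write_gen/flushed_gen
 * bookkeeping above means that, with no writes in between, a second
 * flush skips the flush-to-disk step and only re-flushes the protocol
 * layer below.
 */
static int example_double_flush(BlockDriverState *bs)
{
    int ret = bdrv_flush(bs);

    if (ret < 0) {
        return ret;
    }
    /* flushed_gen == write_gen now, so this takes the short path */
    return bdrv_flush(bs);
}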
232061007b31SStefan Hajnoczi typedef struct DiscardCo {
232161007b31SStefan Hajnoczi     BlockDriverState *bs;
23220c51a893SEric Blake     int64_t offset;
2323f5a5ca79SManos Pitsidianakis     int bytes;
232461007b31SStefan Hajnoczi     int ret;
232561007b31SStefan Hajnoczi } DiscardCo;
23260c51a893SEric Blake static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
232761007b31SStefan Hajnoczi {
232861007b31SStefan Hajnoczi     DiscardCo *rwco = opaque;
232961007b31SStefan Hajnoczi 
2330f5a5ca79SManos Pitsidianakis     rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
233161007b31SStefan Hajnoczi }
233261007b31SStefan Hajnoczi 
23339f1963b3SEric Blake int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
2334f5a5ca79SManos Pitsidianakis                                   int bytes)
233561007b31SStefan Hajnoczi {
2336b1066c87SFam Zheng     BdrvTrackedRequest req;
23379f1963b3SEric Blake     int max_pdiscard, ret;
23383482b9bcSEric Blake     int head, tail, align;
233961007b31SStefan Hajnoczi 
234061007b31SStefan Hajnoczi     if (!bs->drv) {
234161007b31SStefan Hajnoczi         return -ENOMEDIUM;
234261007b31SStefan Hajnoczi     }
234361007b31SStefan Hajnoczi 
2344d6883bc9SVladimir Sementsov-Ogievskiy     if (bdrv_has_readonly_bitmaps(bs)) {
2345d6883bc9SVladimir Sementsov-Ogievskiy         return -EPERM;
2346d6883bc9SVladimir Sementsov-Ogievskiy     }
2347d6883bc9SVladimir Sementsov-Ogievskiy 
2348f5a5ca79SManos Pitsidianakis     ret = bdrv_check_byte_request(bs, offset, bytes);
234961007b31SStefan Hajnoczi     if (ret < 0) {
235061007b31SStefan Hajnoczi         return ret;
235161007b31SStefan Hajnoczi     } else if (bs->read_only) {
2352eaf5fe2dSPaolo Bonzini         return -EPERM;
235361007b31SStefan Hajnoczi     }
235404c01a5cSKevin Wolf     assert(!(bs->open_flags & BDRV_O_INACTIVE));
235561007b31SStefan Hajnoczi 
235661007b31SStefan Hajnoczi     /* Do nothing if disabled.  */
235761007b31SStefan Hajnoczi     if (!(bs->open_flags & BDRV_O_UNMAP)) {
235861007b31SStefan Hajnoczi         return 0;
235961007b31SStefan Hajnoczi     }
236061007b31SStefan Hajnoczi 
236102aefe43SEric Blake     if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
236261007b31SStefan Hajnoczi         return 0;
236361007b31SStefan Hajnoczi     }
236461007b31SStefan Hajnoczi 
23653482b9bcSEric Blake     /* Discard is advisory, but some devices track and coalesce
23663482b9bcSEric Blake      * unaligned requests, so we must pass everything down rather than
23673482b9bcSEric Blake      * round here.  Still, most devices will just silently ignore
23683482b9bcSEric Blake      * unaligned requests (by returning -ENOTSUP), so we must fragment
23693482b9bcSEric Blake      * the request accordingly.  */
237002aefe43SEric Blake     align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2371b8d0a980SEric Blake     assert(align % bs->bl.request_alignment == 0);
2372b8d0a980SEric Blake     head = offset % align;
2373f5a5ca79SManos Pitsidianakis     tail = (offset + bytes) % align;
23749f1963b3SEric Blake 
237599723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
2376f5a5ca79SManos Pitsidianakis     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
237750824995SFam Zheng 
2378ec050f77SDenis V. Lunev     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2379ec050f77SDenis V. Lunev     if (ret < 0) {
2380ec050f77SDenis V. Lunev         goto out;
2381ec050f77SDenis V. Lunev     }
2382ec050f77SDenis V. Lunev 
23839f1963b3SEric Blake     max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
23849f1963b3SEric Blake                                    align);
23853482b9bcSEric Blake     assert(max_pdiscard >= bs->bl.request_alignment);
23869f1963b3SEric Blake 
2387f5a5ca79SManos Pitsidianakis     while (bytes > 0) {
2388f5a5ca79SManos Pitsidianakis         int num = bytes;
23893482b9bcSEric Blake 
23903482b9bcSEric Blake         if (head) {
23913482b9bcSEric Blake             /* Make small requests to get to alignment boundaries. */
2392f5a5ca79SManos Pitsidianakis             num = MIN(bytes, align - head);
23933482b9bcSEric Blake             if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
23943482b9bcSEric Blake                 num %= bs->bl.request_alignment;
23953482b9bcSEric Blake             }
23963482b9bcSEric Blake             head = (head + num) % align;
23973482b9bcSEric Blake             assert(num < max_pdiscard);
23983482b9bcSEric Blake         } else if (tail) {
23993482b9bcSEric Blake             if (num > align) {
24003482b9bcSEric Blake                 /* Shorten the request to the last aligned cluster.  */
24013482b9bcSEric Blake                 num -= tail;
24023482b9bcSEric Blake             } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
24033482b9bcSEric Blake                        tail > bs->bl.request_alignment) {
24043482b9bcSEric Blake                 tail %= bs->bl.request_alignment;
24053482b9bcSEric Blake                 num -= tail;
24063482b9bcSEric Blake             }
24073482b9bcSEric Blake         }
24083482b9bcSEric Blake         /* limit request size */
24093482b9bcSEric Blake         if (num > max_pdiscard) {
24103482b9bcSEric Blake             num = max_pdiscard;
24113482b9bcSEric Blake         }
241261007b31SStefan Hajnoczi 
241347a5486dSEric Blake         if (bs->drv->bdrv_co_pdiscard) {
241447a5486dSEric Blake             ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
241561007b31SStefan Hajnoczi         } else {
241661007b31SStefan Hajnoczi             BlockAIOCB *acb;
241761007b31SStefan Hajnoczi             CoroutineIOCompletion co = {
241861007b31SStefan Hajnoczi                 .coroutine = qemu_coroutine_self(),
241961007b31SStefan Hajnoczi             };
242061007b31SStefan Hajnoczi 
24214da444a0SEric Blake             acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
242261007b31SStefan Hajnoczi                                              bdrv_co_io_em_complete, &co);
242361007b31SStefan Hajnoczi             if (acb == NULL) {
2424b1066c87SFam Zheng                 ret = -EIO;
2425b1066c87SFam Zheng                 goto out;
242661007b31SStefan Hajnoczi             } else {
242761007b31SStefan Hajnoczi                 qemu_coroutine_yield();
242861007b31SStefan Hajnoczi                 ret = co.ret;
242961007b31SStefan Hajnoczi             }
243061007b31SStefan Hajnoczi         }
243161007b31SStefan Hajnoczi         if (ret && ret != -ENOTSUP) {
2432b1066c87SFam Zheng             goto out;
243361007b31SStefan Hajnoczi         }
243461007b31SStefan Hajnoczi 
24359f1963b3SEric Blake         offset += num;
2436f5a5ca79SManos Pitsidianakis         bytes -= num;
243761007b31SStefan Hajnoczi     }
2438b1066c87SFam Zheng     ret = 0;
2439b1066c87SFam Zheng out:
244047fec599SPaolo Bonzini     atomic_inc(&bs->write_gen);
2441968d8b06SDenis V. Lunev     bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
2442968d8b06SDenis V. Lunev                    req.bytes >> BDRV_SECTOR_BITS);
2443b1066c87SFam Zheng     tracked_request_end(&req);
244499723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
2445b1066c87SFam Zheng     return ret;
244661007b31SStefan Hajnoczi }
244761007b31SStefan Hajnoczi 
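/*
 * Editorial worked example for the head/tail logic above (hypothetical
 * numbers): with pdiscard_alignment = 64 KiB, request_alignment = 512,
 * offset = 60 KiB and bytes = 72 KiB, we get align = 64 KiB,
 * head = 60 KiB and tail = (60 + 72) % 64 = 4 KiB.  The loop then issues
 * three fragments: 4 KiB to reach the first 64 KiB boundary, one fully
 * aligned 64 KiB discard, and the trailing 4 KiB.
 */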
2448f5a5ca79SManos Pitsidianakis int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
244961007b31SStefan Hajnoczi {
245061007b31SStefan Hajnoczi     Coroutine *co;
245161007b31SStefan Hajnoczi     DiscardCo rwco = {
245261007b31SStefan Hajnoczi         .bs = bs,
24530c51a893SEric Blake         .offset = offset,
2454f5a5ca79SManos Pitsidianakis         .bytes = bytes,
245561007b31SStefan Hajnoczi         .ret = NOT_DONE,
245661007b31SStefan Hajnoczi     };
245761007b31SStefan Hajnoczi 
245861007b31SStefan Hajnoczi     if (qemu_in_coroutine()) {
245961007b31SStefan Hajnoczi         /* Fast-path if already in coroutine context */
24600c51a893SEric Blake         bdrv_pdiscard_co_entry(&rwco);
246161007b31SStefan Hajnoczi     } else {
24620c51a893SEric Blake         co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2463e92f0e19SFam Zheng         bdrv_coroutine_enter(bs, co);
246488b062c2SPaolo Bonzini         BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
246561007b31SStefan Hajnoczi     }
246661007b31SStefan Hajnoczi 
246761007b31SStefan Hajnoczi     return rwco.ret;
246861007b31SStefan Hajnoczi }
246961007b31SStefan Hajnoczi 
247048af776aSKevin Wolf int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
247161007b31SStefan Hajnoczi {
247261007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
24735c5ae76aSFam Zheng     CoroutineIOCompletion co = {
24745c5ae76aSFam Zheng         .coroutine = qemu_coroutine_self(),
24755c5ae76aSFam Zheng     };
24765c5ae76aSFam Zheng     BlockAIOCB *acb;
247761007b31SStefan Hajnoczi 
247899723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
247916a389dcSKevin Wolf     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
24805c5ae76aSFam Zheng         co.ret = -ENOTSUP;
24815c5ae76aSFam Zheng         goto out;
24825c5ae76aSFam Zheng     }
24835c5ae76aSFam Zheng 
248416a389dcSKevin Wolf     if (drv->bdrv_co_ioctl) {
248516a389dcSKevin Wolf         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
248616a389dcSKevin Wolf     } else {
24875c5ae76aSFam Zheng         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
24885c5ae76aSFam Zheng         if (!acb) {
2489c8a9fd80SFam Zheng             co.ret = -ENOTSUP;
2490c8a9fd80SFam Zheng             goto out;
24915c5ae76aSFam Zheng         }
24925c5ae76aSFam Zheng         qemu_coroutine_yield();
249316a389dcSKevin Wolf     }
24945c5ae76aSFam Zheng out:
249599723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
24965c5ae76aSFam Zheng     return co.ret;
24975c5ae76aSFam Zheng }
24985c5ae76aSFam Zheng 
249961007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size)
250061007b31SStefan Hajnoczi {
250161007b31SStefan Hajnoczi     return qemu_memalign(bdrv_opt_mem_align(bs), size);
250261007b31SStefan Hajnoczi }
250361007b31SStefan Hajnoczi 
250461007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size)
250561007b31SStefan Hajnoczi {
250661007b31SStefan Hajnoczi     return memset(qemu_blockalign(bs, size), 0, size);
250761007b31SStefan Hajnoczi }
250861007b31SStefan Hajnoczi 
250961007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
251061007b31SStefan Hajnoczi {
251161007b31SStefan Hajnoczi     size_t align = bdrv_opt_mem_align(bs);
251261007b31SStefan Hajnoczi 
251361007b31SStefan Hajnoczi     /* Ensure that NULL is never returned on success */
251461007b31SStefan Hajnoczi     assert(align > 0);
251561007b31SStefan Hajnoczi     if (size == 0) {
251661007b31SStefan Hajnoczi         size = align;
251761007b31SStefan Hajnoczi     }
251861007b31SStefan Hajnoczi 
251961007b31SStefan Hajnoczi     return qemu_try_memalign(align, size);
252061007b31SStefan Hajnoczi }
252161007b31SStefan Hajnoczi 
252261007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
252361007b31SStefan Hajnoczi {
252461007b31SStefan Hajnoczi     void *mem = qemu_try_blockalign(bs, size);
252561007b31SStefan Hajnoczi 
252661007b31SStefan Hajnoczi     if (mem) {
252761007b31SStefan Hajnoczi         memset(mem, 0, size);
252861007b31SStefan Hajnoczi     }
252961007b31SStefan Hajnoczi 
253061007b31SStefan Hajnoczi     return mem;
253161007b31SStefan Hajnoczi }
253261007b31SStefan Hajnoczi 
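/*
 * Editorial sketch (hypothetical helper): the try-variants let callers
 * fail with -ENOMEM instead of aborting the way qemu_blockalign() does
 * on allocation failure.  Buffers are released with qemu_vfree().
 */
static int example_with_bounce_buffer(BlockDriverState *bs, size_t size)
{
    uint8_t *buf = qemu_try_blockalign0(bs, size);

    if (buf == NULL) {
        return -ENOMEM;
    }
    /* ... perform aligned I/O with buf ... */
    qemu_vfree(buf);
    return 0;
}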
253361007b31SStefan Hajnoczi /*
253461007b31SStefan Hajnoczi  * Check if all memory in this vector is aligned to bdrv_min_mem_align(bs).
253561007b31SStefan Hajnoczi  */
253661007b31SStefan Hajnoczi bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
253761007b31SStefan Hajnoczi {
253861007b31SStefan Hajnoczi     int i;
25394196d2f0SDenis V. Lunev     size_t alignment = bdrv_min_mem_align(bs);
254061007b31SStefan Hajnoczi 
254161007b31SStefan Hajnoczi     for (i = 0; i < qiov->niov; i++) {
254261007b31SStefan Hajnoczi         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
254361007b31SStefan Hajnoczi             return false;
254461007b31SStefan Hajnoczi         }
254561007b31SStefan Hajnoczi         if (qiov->iov[i].iov_len % alignment) {
254661007b31SStefan Hajnoczi             return false;
254761007b31SStefan Hajnoczi         }
254861007b31SStefan Hajnoczi     }
254961007b31SStefan Hajnoczi 
255061007b31SStefan Hajnoczi     return true;
255161007b31SStefan Hajnoczi }
255261007b31SStefan Hajnoczi 
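/*
 * Editorial sketch (hypothetical helper): the usual pattern around
 * bdrv_qiov_is_aligned() -- copy through an aligned bounce buffer when a
 * guest-supplied vector cannot be passed to the host directly.  Note the
 * sketch returns NULL both when no bounce is needed and on allocation
 * failure; a real caller would distinguish the two.
 */
static void *example_bounce_for_write(BlockDriverState *bs,
                                      QEMUIOVector *qiov)
{
    void *bounce;

    if (bdrv_qiov_is_aligned(bs, qiov)) {
        return NULL;                        /* use qiov directly */
    }
    bounce = qemu_try_blockalign(bs, qiov->size);
    if (bounce != NULL) {
        qemu_iovec_to_buf(qiov, 0, bounce, qiov->size);
    }
    return bounce;                          /* free with qemu_vfree() */
}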
255361007b31SStefan Hajnoczi void bdrv_add_before_write_notifier(BlockDriverState *bs,
255461007b31SStefan Hajnoczi                                     NotifierWithReturn *notifier)
255561007b31SStefan Hajnoczi {
255661007b31SStefan Hajnoczi     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
255761007b31SStefan Hajnoczi }
255861007b31SStefan Hajnoczi 
255961007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs)
256061007b31SStefan Hajnoczi {
25616b98bd64SPaolo Bonzini     BdrvChild *child;
25626b98bd64SPaolo Bonzini 
25636b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
25646b98bd64SPaolo Bonzini         bdrv_io_plug(child->bs);
25656b98bd64SPaolo Bonzini     }
25666b98bd64SPaolo Bonzini 
2567850d54a2SPaolo Bonzini     if (atomic_fetch_inc(&bs->io_plugged) == 0) {
256861007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
256961007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_plug) {
257061007b31SStefan Hajnoczi             drv->bdrv_io_plug(bs);
25716b98bd64SPaolo Bonzini         }
257261007b31SStefan Hajnoczi     }
257361007b31SStefan Hajnoczi }
257461007b31SStefan Hajnoczi 
257561007b31SStefan Hajnoczi void bdrv_io_unplug(BlockDriverState *bs)
257661007b31SStefan Hajnoczi {
25776b98bd64SPaolo Bonzini     BdrvChild *child;
25786b98bd64SPaolo Bonzini 
25796b98bd64SPaolo Bonzini     assert(bs->io_plugged);
2580850d54a2SPaolo Bonzini     if (atomic_fetch_dec(&bs->io_plugged) == 1) {
258161007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
258261007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_unplug) {
258361007b31SStefan Hajnoczi             drv->bdrv_io_unplug(bs);
258461007b31SStefan Hajnoczi         }
258561007b31SStefan Hajnoczi     }
258661007b31SStefan Hajnoczi 
25876b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
25886b98bd64SPaolo Bonzini         bdrv_io_unplug(child->bs);
25896b98bd64SPaolo Bonzini     }
25906b98bd64SPaolo Bonzini }
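/*
 * Editorial illustration (hypothetical helper): batching request
 * submission with the recursive plug/unplug pair above.  A plugged
 * driver may queue requests instead of issuing each one to the host;
 * the final unplug kicks the whole batch.
 */
static void example_plugged_region(BlockDriverState *bs)
{
    bdrv_io_plug(bs);      /* also plugs all children */
    /*
     * Submit several requests here, e.g. via blk_aio_preadv() on an
     * attached BlockBackend.
     */
    bdrv_io_unplug(bs);    /* last unplug invokes drv->bdrv_io_unplug() */
}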
2591