xref: /qemu/block/io.c (revision a2adbbf603cee443ca923f6e8546267a706567d5)
161007b31SStefan Hajnoczi /*
261007b31SStefan Hajnoczi  * Block layer I/O functions
361007b31SStefan Hajnoczi  *
461007b31SStefan Hajnoczi  * Copyright (c) 2003 Fabrice Bellard
561007b31SStefan Hajnoczi  *
661007b31SStefan Hajnoczi  * Permission is hereby granted, free of charge, to any person obtaining a copy
761007b31SStefan Hajnoczi  * of this software and associated documentation files (the "Software"), to deal
861007b31SStefan Hajnoczi  * in the Software without restriction, including without limitation the rights
961007b31SStefan Hajnoczi  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1061007b31SStefan Hajnoczi  * copies of the Software, and to permit persons to whom the Software is
1161007b31SStefan Hajnoczi  * furnished to do so, subject to the following conditions:
1261007b31SStefan Hajnoczi  *
1361007b31SStefan Hajnoczi  * The above copyright notice and this permission notice shall be included in
1461007b31SStefan Hajnoczi  * all copies or substantial portions of the Software.
1561007b31SStefan Hajnoczi  *
1661007b31SStefan Hajnoczi  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1761007b31SStefan Hajnoczi  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1861007b31SStefan Hajnoczi  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
1961007b31SStefan Hajnoczi  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2061007b31SStefan Hajnoczi  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2161007b31SStefan Hajnoczi  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2261007b31SStefan Hajnoczi  * THE SOFTWARE.
2361007b31SStefan Hajnoczi  */
2461007b31SStefan Hajnoczi 
2580c71a24SPeter Maydell #include "qemu/osdep.h"
2661007b31SStefan Hajnoczi #include "trace.h"
277f0e9da6SMax Reitz #include "sysemu/block-backend.h"
287719f3c9SStefan Hajnoczi #include "block/aio-wait.h"
2961007b31SStefan Hajnoczi #include "block/blockjob.h"
30f321dcb5SPaolo Bonzini #include "block/blockjob_int.h"
3161007b31SStefan Hajnoczi #include "block/block_int.h"
32f348b6d1SVeronia Bahaa #include "qemu/cutils.h"
33da34e65cSMarkus Armbruster #include "qapi/error.h"
34d49b6836SMarkus Armbruster #include "qemu/error-report.h"
35db725815SMarkus Armbruster #include "qemu/main-loop.h"
36c8aa7895SPavel Dovgalyuk #include "sysemu/replay.h"
3761007b31SStefan Hajnoczi 
38cb2e2878SEric Blake /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
39cb2e2878SEric Blake #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
40cb2e2878SEric Blake 
417f8f03efSFam Zheng static void bdrv_parent_cb_resize(BlockDriverState *bs);
42d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
43f5a5ca79SManos Pitsidianakis     int64_t offset, int bytes, BdrvRequestFlags flags);
4461007b31SStefan Hajnoczi 
45f4c8a43bSMax Reitz static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
466cd5c9d7SKevin Wolf                                       bool ignore_bds_parents)
4761007b31SStefan Hajnoczi {
4802d21300SKevin Wolf     BdrvChild *c, *next;
4927ccdd52SKevin Wolf 
5002d21300SKevin Wolf     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
51bd86fb99SMax Reitz         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
520152bf40SKevin Wolf             continue;
530152bf40SKevin Wolf         }
544be6a6d1SKevin Wolf         bdrv_parent_drained_begin_single(c, false);
55ce0f1412SPaolo Bonzini     }
56ce0f1412SPaolo Bonzini }
57ce0f1412SPaolo Bonzini 
/*
 * End one quiesce section for parent @c without polling; any background
 * operation the parent's drained_end callback schedules is accounted for
 * in *drained_end_counter (incremented when scheduled, decremented once
 * it settles).
 */
static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
                                                   int *drained_end_counter)
{
    /* Must balance a previous bdrv_parent_drained_begin_single() */
    assert(c->parent_quiesce_counter > 0);
    c->parent_quiesce_counter--;
    if (c->klass->drained_end) {
        c->klass->drained_end(c, drained_end_counter);
    }
}
67804db8eaSMax Reitz 
/*
 * End one quiesce section for parent @c and poll until all background
 * operations triggered by that drained_end have settled.
 */
void bdrv_parent_drained_end_single(BdrvChild *c)
{
    int drained_end_counter = 0;
    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
    BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
}
74e037c09cSMax Reitz 
75f4c8a43bSMax Reitz static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
76e037c09cSMax Reitz                                     bool ignore_bds_parents,
77e037c09cSMax Reitz                                     int *drained_end_counter)
78ce0f1412SPaolo Bonzini {
7961ad631cSMax Reitz     BdrvChild *c;
8027ccdd52SKevin Wolf 
8161ad631cSMax Reitz     QLIST_FOREACH(c, &bs->parents, next_parent) {
82bd86fb99SMax Reitz         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
830152bf40SKevin Wolf             continue;
840152bf40SKevin Wolf         }
85e037c09cSMax Reitz         bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
86c2066af0SKevin Wolf     }
8761007b31SStefan Hajnoczi }
8861007b31SStefan Hajnoczi 
/* Returns true if parent @c still has activity a drain must wait for */
static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->klass->drained_poll) {
        return c->klass->drained_poll(c);
    }
    /* Parents without a drained_poll callback are never considered busy */
    return false;
}
964be6a6d1SKevin Wolf 
976cd5c9d7SKevin Wolf static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
986cd5c9d7SKevin Wolf                                      bool ignore_bds_parents)
9989bd0305SKevin Wolf {
10089bd0305SKevin Wolf     BdrvChild *c, *next;
10189bd0305SKevin Wolf     bool busy = false;
10289bd0305SKevin Wolf 
10389bd0305SKevin Wolf     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
104bd86fb99SMax Reitz         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
10589bd0305SKevin Wolf             continue;
10689bd0305SKevin Wolf         }
1074be6a6d1SKevin Wolf         busy |= bdrv_parent_drained_poll_single(c);
10889bd0305SKevin Wolf     }
10989bd0305SKevin Wolf 
11089bd0305SKevin Wolf     return busy;
11189bd0305SKevin Wolf }
11289bd0305SKevin Wolf 
1134be6a6d1SKevin Wolf void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
1144be6a6d1SKevin Wolf {
115804db8eaSMax Reitz     c->parent_quiesce_counter++;
116bd86fb99SMax Reitz     if (c->klass->drained_begin) {
117bd86fb99SMax Reitz         c->klass->drained_begin(c);
1184be6a6d1SKevin Wolf     }
1194be6a6d1SKevin Wolf     if (poll) {
1204be6a6d1SKevin Wolf         BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
1214be6a6d1SKevin Wolf     }
1224be6a6d1SKevin Wolf }
1234be6a6d1SKevin Wolf 
124d9e0dfa2SEric Blake static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
125d9e0dfa2SEric Blake {
126d9e0dfa2SEric Blake     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
127d9e0dfa2SEric Blake     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
128d9e0dfa2SEric Blake     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
129d9e0dfa2SEric Blake                                  src->opt_mem_alignment);
130d9e0dfa2SEric Blake     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
131d9e0dfa2SEric Blake                                  src->min_mem_alignment);
132d9e0dfa2SEric Blake     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
133d9e0dfa2SEric Blake }
134d9e0dfa2SEric Blake 
/*
 * Recompute bs->bl from scratch: start from driver-interface defaults,
 * recursively refresh and merge in the limits of bs->file and bs->backing,
 * then let the driver's own .bdrv_refresh_limits() override the result.
 *
 * On error, *errp is set and bs->bl may be left partially initialized.
 */
void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        /* Node has no driver attached; all limits stay zeroed */
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv ||
                                drv->bdrv_co_preadv_part) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = qemu_real_host_page_size;

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}
18161007b31SStefan Hajnoczi 
/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    /* Atomic: may race with I/O paths reading bs->copy_on_read */
    atomic_inc(&bs->copy_on_read);
}
19161007b31SStefan Hajnoczi 
/* Drop one copy-on-read reference; pairs with bdrv_enable_copy_on_read() */
void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    /* The counter must never go negative: every disable needs an enable */
    assert(old >= 1);
}
19761007b31SStefan Hajnoczi 
/* State handed between the requesting coroutine and the drain BH/coroutine */
typedef struct {
    Coroutine *co;            /* coroutine to wake once the work is done */
    BlockDriverState *bs;     /* node being drained; NULL means drain_all */
    bool done;                /* set when the scheduled work has completed */
    bool begin;               /* true: drained_begin, false: drained_end */
    bool recursive;           /* also drain the whole subtree of @bs */
    bool poll;                /* poll for request completion (begin only) */
    BdrvChild *parent;        /* parent link to skip while draining */
    bool ignore_bds_parents;  /* skip parents that are BDS nodes */
    int *drained_end_counter; /* pending background drained_end operations */
} BdrvCoDrainData;
20961124f03SPaolo Bonzini 
/*
 * Coroutine context in which the driver's drain callback actually runs;
 * scheduled by bdrv_drain_invoke().  Owns and frees @opaque.
 */
static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
    atomic_mb_set(&data->done, true);
    if (!data->begin) {
        /* Balances the atomic_inc() done in bdrv_drain_invoke() */
        atomic_dec(data->drained_end_counter);
    }
    bdrv_dec_in_flight(bs);

    g_free(data);
}
23061124f03SPaolo Bonzini 
/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
                              int *drained_end_counter)
{
    BdrvCoDrainData *data;

    /* Nothing to do if the driver doesn't implement the matching callback */
    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    /* Freed by bdrv_drain_invoke_entry() when the callback has run */
    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin,
        .drained_end_counter = drained_end_counter,
    };

    if (!begin) {
        /* Decremented again in bdrv_drain_invoke_entry() once the driver
         * callback has finished */
        atomic_inc(drained_end_counter);
    }

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
}
26061124f03SPaolo Bonzini 
/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    /* Parents (e.g. jobs, devices) may still have activity pending */
    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    /* Requests in flight on the node itself */
    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}
286fe4f0614SKevin Wolf 
/*
 * Drain polling for a top-level drained section: BDS parents are drained
 * in their own right, so they are never ignored here.
 */
static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}
2921cc8e54aSKevin Wolf 
293b0165585SKevin Wolf static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
2946cd5c9d7SKevin Wolf                                   BdrvChild *parent, bool ignore_bds_parents,
2956cd5c9d7SKevin Wolf                                   bool poll);
296b0165585SKevin Wolf static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
2978e1da77eSMax Reitz                                 BdrvChild *parent, bool ignore_bds_parents,
2988e1da77eSMax Reitz                                 int *drained_end_counter);
2990152bf40SKevin Wolf 
/*
 * Bottom half that performs the actual drain outside of coroutine context
 * on behalf of bdrv_co_yield_to_drain(), then wakes the waiting coroutine.
 */
static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            /* drained_end_counter is only meaningful on the end path */
            assert(!data->drained_end_counter);
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            /* The end path never polls itself; the original caller does */
            assert(!data->poll);
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents,
                                data->drained_end_counter);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        /* bs == NULL requests a drain of all nodes; only begin supported */
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}
341a77fd4bbSFam Zheng 
/*
 * A coroutine must not poll for drain completion itself: schedule
 * bdrv_co_drain_bh_cb() as a BH, yield, and let the BH carry out the
 * drain and wake us once it is finished.  @bs == NULL requests
 * bdrv_drain_all_begin().
 */
static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll,
                                                int *drained_end_counter)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    /* @data lives on our stack; valid because we don't return until done */
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
        .drained_end_counter = drained_end_counter,
    };

    if (bs) {
        /* Keep the node busy until the BH has run */
        bdrv_inc_in_flight(bs);
    }
    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs),
                                     bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}
378a77fd4bbSFam Zheng 
/*
 * Quiesce @bs (its parents and its driver) without polling for request
 * completion.  Must be called outside of coroutine context.
 */
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        /* First drained section on this node: stop external events */
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    /* NULL counter: begin never schedules background completions */
    bdrv_drain_invoke(bs, true, NULL);
}
392dcf94a23SKevin Wolf 
/*
 * Begin a drained section on @bs — optionally on its whole subtree
 * (@recursive) — skipping @parent.  If @poll, wait until all affected
 * requests have completed before returning.
 */
static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        /* Coroutines must not poll directly; defer the work to a BH */
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll, NULL);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            /* Children are quiesced without polling; we poll once below */
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}
4306820643fSKevin Wolf 
/* Public entry point: drain @bs only (non-recursive), polling until quiet */
void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}
4350152bf40SKevin Wolf 
/* Public entry point: drain @bs and its whole subtree of children */
void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}
440b0165585SKevin Wolf 
/**
 * This function does not poll, nor must any of its recursively called
 * functions.  The *drained_end_counter pointee will be incremented
 * once for every background operation scheduled, and decremented once
 * the operation settles.  Therefore, the pointer must remain valid
 * until the pointee reaches 0.  That implies that whoever sets up the
 * pointee has to poll until it is 0.
 *
 * We use atomic operations to access *drained_end_counter, because
 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 *     @bs may contain nodes in different AioContexts,
 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 *     regardless of which AioContext they are in.
 */
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents,
                                int *drained_end_counter)
{
    BdrvChild *child;
    int old_quiesce_counter;

    assert(drained_end_counter != NULL);

    if (qemu_in_coroutine()) {
        /* Coroutines must not run the drain directly; defer to a BH */
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false, drained_end_counter);
        return;
    }
    /* Must balance a previous bdrv_do_drained_begin() */
    assert(bs->quiesce_counter > 0);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, drained_end_counter);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
                            drained_end_counter);

    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
    if (old_quiesce_counter == 1) {
        /* Last drained section ended: accept external events again */
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
                                drained_end_counter);
        }
    }
}
4906820643fSKevin Wolf 
/*
 * End a drained section on @bs and poll until all background drained_end
 * operations counted in drained_end_counter have settled.
 */
void bdrv_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}
497e037c09cSMax Reitz 
/* Like bdrv_drained_end(), but the caller must poll *drained_end_counter */
void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
{
    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
}
502b0165585SKevin Wolf 
/* Recursive counterpart of bdrv_drained_end(): ends the subtree drain */
void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    int drained_end_counter = 0;
    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
}
5090152bf40SKevin Wolf 
510d736f119SKevin Wolf void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
511d736f119SKevin Wolf {
512d736f119SKevin Wolf     int i;
513d736f119SKevin Wolf 
514d736f119SKevin Wolf     for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
5156cd5c9d7SKevin Wolf         bdrv_do_drained_begin(child->bs, true, child, false, true);
516d736f119SKevin Wolf     }
517d736f119SKevin Wolf }
518d736f119SKevin Wolf 
519d736f119SKevin Wolf void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
520d736f119SKevin Wolf {
521e037c09cSMax Reitz     int drained_end_counter = 0;
522d736f119SKevin Wolf     int i;
523d736f119SKevin Wolf 
524d736f119SKevin Wolf     for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
525e037c09cSMax Reitz         bdrv_do_drained_end(child->bs, true, child, false,
526e037c09cSMax Reitz                             &drained_end_counter);
527d736f119SKevin Wolf     }
528e037c09cSMax Reitz 
529e037c09cSMax Reitz     BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
530d736f119SKevin Wolf }
531d736f119SKevin Wolf 
/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    /* begin/end back to back: drains, then immediately resumes */
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
545b6e84c97SPaolo Bonzini 
/* Synchronous drain: wait for all pending requests on @bs, then resume I/O */
void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
55161007b31SStefan Hajnoczi 
552c13ad59fSKevin Wolf static void bdrv_drain_assert_idle(BlockDriverState *bs)
553c13ad59fSKevin Wolf {
554c13ad59fSKevin Wolf     BdrvChild *child, *next;
555c13ad59fSKevin Wolf 
556c13ad59fSKevin Wolf     assert(atomic_read(&bs->in_flight) == 0);
557c13ad59fSKevin Wolf     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
558c13ad59fSKevin Wolf         bdrv_drain_assert_idle(child->bs);
559c13ad59fSKevin Wolf     }
560c13ad59fSKevin Wolf }
561c13ad59fSKevin Wolf 
5620f12264eSKevin Wolf unsigned int bdrv_drain_all_count = 0;
5630f12264eSKevin Wolf 
5640f12264eSKevin Wolf static bool bdrv_drain_all_poll(void)
5650f12264eSKevin Wolf {
5660f12264eSKevin Wolf     BlockDriverState *bs = NULL;
5670f12264eSKevin Wolf     bool result = false;
5680f12264eSKevin Wolf 
5690f12264eSKevin Wolf     /* bdrv_drain_poll() can't make changes to the graph and we are holding the
5700f12264eSKevin Wolf      * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
5710f12264eSKevin Wolf     while ((bs = bdrv_next_all_states(bs))) {
5720f12264eSKevin Wolf         AioContext *aio_context = bdrv_get_aio_context(bs);
5730f12264eSKevin Wolf         aio_context_acquire(aio_context);
5740f12264eSKevin Wolf         result |= bdrv_drain_poll(bs, false, NULL, true);
5750f12264eSKevin Wolf         aio_context_release(aio_context);
5760f12264eSKevin Wolf     }
5770f12264eSKevin Wolf 
5780f12264eSKevin Wolf     return result;
5790f12264eSKevin Wolf }
5800f12264eSKevin Wolf 
58161007b31SStefan Hajnoczi /*
58261007b31SStefan Hajnoczi  * Wait for pending requests to complete across all BlockDriverStates
58361007b31SStefan Hajnoczi  *
58461007b31SStefan Hajnoczi  * This function does not flush data to disk, use bdrv_flush_all() for that
58561007b31SStefan Hajnoczi  * after calling this function.
586c0778f66SAlberto Garcia  *
587c0778f66SAlberto Garcia  * This pauses all block jobs and disables external clients. It must
588c0778f66SAlberto Garcia  * be paired with bdrv_drain_all_end().
589c0778f66SAlberto Garcia  *
590c0778f66SAlberto Garcia  * NOTE: no new block jobs or BlockDriverStates can be created between
591c0778f66SAlberto Garcia  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
59261007b31SStefan Hajnoczi  */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        /* We cannot poll from coroutine context; hand the whole drain over
         * to the main loop and yield until it has finished. */
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
        return;
    }

    /*
     * bdrv queue is managed by record/replay,
     * waiting for finishing the I/O requests may
     * be infinite
     */
    if (replay_events_enabled()) {
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    /* The poll loop above finished, so no node may have requests in flight
     * any more; check that in debug builds. */
    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}
634c0778f66SAlberto Garcia 
/* End a drain section started by bdrv_drain_all_begin(). */
void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;
    /* Incremented by pending asynchronous drained_end work; polled below
     * until it drops back to zero. */
    int drained_end_counter = 0;

    /*
     * bdrv queue is managed by record/replay,
     * waiting for finishing the I/O requests may
     * be endless
     */
    if (replay_events_enabled()) {
        return;
    }

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
        aio_context_release(aio_context);
    }

    /* Wait until all scheduled drained_end work has run */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}
66361007b31SStefan Hajnoczi 
/* Drain and immediately un-drain all nodes: waits for all in-flight
 * requests across all BlockDriverStates to complete. */
void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
669c0778f66SAlberto Garcia 
67061007b31SStefan Hajnoczi /**
67161007b31SStefan Hajnoczi  * Remove an active request from the tracked requests list
67261007b31SStefan Hajnoczi  *
67361007b31SStefan Hajnoczi  * This function should be called when a tracked request is completing.
67461007b31SStefan Hajnoczi  */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        /* Balances the atomic_inc in bdrv_mark_request_serialising() */
        atomic_dec(&req->bs->serialising_in_flight);
    }

    /* Unlink under reqs_lock and wake every request waiting on this one */
    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}
68661007b31SStefan Hajnoczi 
68761007b31SStefan Hajnoczi /**
68861007b31SStefan Hajnoczi  * Add an active request to the tracked requests list
68961007b31SStefan Hajnoczi  */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    /* The request must not wrap past the end of the 64-bit offset space */
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        /* Initially the overlap window is just the request itself; it may
         * be widened later by bdrv_mark_request_serialising(). */
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    /* Publish the request on the per-BDS list under reqs_lock */
    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}
71561007b31SStefan Hajnoczi 
7163ba0e1a0SPaolo Bonzini static bool tracked_request_overlaps(BdrvTrackedRequest *req,
7173ba0e1a0SPaolo Bonzini                                      int64_t offset, uint64_t bytes)
7183ba0e1a0SPaolo Bonzini {
7193ba0e1a0SPaolo Bonzini     /*        aaaa   bbbb */
7203ba0e1a0SPaolo Bonzini     if (offset >= req->overlap_offset + req->overlap_bytes) {
7213ba0e1a0SPaolo Bonzini         return false;
7223ba0e1a0SPaolo Bonzini     }
7233ba0e1a0SPaolo Bonzini     /* bbbb   aaaa        */
7243ba0e1a0SPaolo Bonzini     if (req->overlap_offset >= offset + bytes) {
7253ba0e1a0SPaolo Bonzini         return false;
7263ba0e1a0SPaolo Bonzini     }
7273ba0e1a0SPaolo Bonzini     return true;
7283ba0e1a0SPaolo Bonzini }
7293ba0e1a0SPaolo Bonzini 
/* Wait, with bs->reqs_lock held, until no other serialising request
 * overlaps @self's overlap window.  Returns true if we had to wait at
 * least once.  The lock is dropped while sleeping in the wait queue. */
static bool coroutine_fn
bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
                                      BdrvTrackedRequest *self)
{
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            /* Only serialising requests conflict with each other */
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    /* The list may have changed while we slept: rescan */
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
    } while (retry);
    return waited;
}
7693ba0e1a0SPaolo Bonzini 
/* Mark @req as serialising, widen its overlap window to @align boundaries
 * (@align must be a power of two) and wait for conflicting serialising
 * requests.  Returns true if waiting was necessary. */
bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    BlockDriverState *bs = req->bs;
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;
    bool waited;

    qemu_co_mutex_lock(&bs->reqs_lock);
    if (!req->serialising) {
        /* Balanced by the atomic_dec in tracked_request_end() */
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    /* The overlap window may only ever grow, never shrink */
    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
    waited = bdrv_wait_serialising_requests_locked(bs, req);
    qemu_co_mutex_unlock(&bs->reqs_lock);
    return waited;
}
79009d2f948SVladimir Sementsov-Ogievskiy 
79161007b31SStefan Hajnoczi /**
792c28107e9SMax Reitz  * Return the tracked request on @bs for the current coroutine, or
793c28107e9SMax Reitz  * NULL if there is none.
794c28107e9SMax Reitz  */
795c28107e9SMax Reitz BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
796c28107e9SMax Reitz {
797c28107e9SMax Reitz     BdrvTrackedRequest *req;
798c28107e9SMax Reitz     Coroutine *self = qemu_coroutine_self();
799c28107e9SMax Reitz 
800c28107e9SMax Reitz     QLIST_FOREACH(req, &bs->tracked_requests, list) {
801c28107e9SMax Reitz         if (req->co == self) {
802c28107e9SMax Reitz             return req;
803c28107e9SMax Reitz         }
804c28107e9SMax Reitz     }
805c28107e9SMax Reitz 
806c28107e9SMax Reitz     return NULL;
807c28107e9SMax Reitz }
808c28107e9SMax Reitz 
809c28107e9SMax Reitz /**
810244483e6SKevin Wolf  * Round a region to cluster boundaries
811244483e6SKevin Wolf  */
812244483e6SKevin Wolf void bdrv_round_to_clusters(BlockDriverState *bs,
8137cfd5275SEric Blake                             int64_t offset, int64_t bytes,
814244483e6SKevin Wolf                             int64_t *cluster_offset,
8157cfd5275SEric Blake                             int64_t *cluster_bytes)
816244483e6SKevin Wolf {
817244483e6SKevin Wolf     BlockDriverInfo bdi;
818244483e6SKevin Wolf 
819244483e6SKevin Wolf     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
820244483e6SKevin Wolf         *cluster_offset = offset;
821244483e6SKevin Wolf         *cluster_bytes = bytes;
822244483e6SKevin Wolf     } else {
823244483e6SKevin Wolf         int64_t c = bdi.cluster_size;
824244483e6SKevin Wolf         *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
825244483e6SKevin Wolf         *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
826244483e6SKevin Wolf     }
827244483e6SKevin Wolf }
828244483e6SKevin Wolf 
82961007b31SStefan Hajnoczi static int bdrv_get_cluster_size(BlockDriverState *bs)
83061007b31SStefan Hajnoczi {
83161007b31SStefan Hajnoczi     BlockDriverInfo bdi;
83261007b31SStefan Hajnoczi     int ret;
83361007b31SStefan Hajnoczi 
83461007b31SStefan Hajnoczi     ret = bdrv_get_info(bs, &bdi);
83561007b31SStefan Hajnoczi     if (ret < 0 || bdi.cluster_size == 0) {
836a5b8dd2cSEric Blake         return bs->bl.request_alignment;
83761007b31SStefan Hajnoczi     } else {
83861007b31SStefan Hajnoczi         return bdi.cluster_size;
83961007b31SStefan Hajnoczi     }
84061007b31SStefan Hajnoczi }
84161007b31SStefan Hajnoczi 
/* Account one more in-flight request on @bs; paired with
 * bdrv_dec_in_flight(). */
void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}
84699723548SPaolo Bonzini 
/* Kick any AIO_WAIT_WHILE() poller so it re-evaluates its condition */
void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}
851c9d1a561SPaolo Bonzini 
/* Drop one in-flight request on @bs and wake pollers that may be waiting
 * for the counter to reach zero. */
void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}
85799723548SPaolo Bonzini 
/* Wait for serialising requests overlapping @self.  Fast path: return
 * false immediately when no serialising request is in flight at all. */
static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    waited = bdrv_wait_serialising_requests_locked(bs, self);
    qemu_co_mutex_unlock(&bs->reqs_lock);

    return waited;
}
87361007b31SStefan Hajnoczi 
87461007b31SStefan Hajnoczi static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
87561007b31SStefan Hajnoczi                                    size_t size)
87661007b31SStefan Hajnoczi {
87741ae31e3SAlberto Garcia     if (size > BDRV_REQUEST_MAX_BYTES) {
87861007b31SStefan Hajnoczi         return -EIO;
87961007b31SStefan Hajnoczi     }
88061007b31SStefan Hajnoczi 
88161007b31SStefan Hajnoczi     if (!bdrv_is_inserted(bs)) {
88261007b31SStefan Hajnoczi         return -ENOMEDIUM;
88361007b31SStefan Hajnoczi     }
88461007b31SStefan Hajnoczi 
88561007b31SStefan Hajnoczi     if (offset < 0) {
88661007b31SStefan Hajnoczi         return -EIO;
88761007b31SStefan Hajnoczi     }
88861007b31SStefan Hajnoczi 
88961007b31SStefan Hajnoczi     return 0;
89061007b31SStefan Hajnoczi }
89161007b31SStefan Hajnoczi 
/* Request entry point to be run inside a coroutine */
typedef int coroutine_fn BdrvRequestEntry(void *opaque);
/* State shared between bdrv_run_co() and bdrv_run_co_entry() */
typedef struct BdrvRunCo {
    BdrvRequestEntry *entry; /* function to execute in coroutine context */
    void *opaque;            /* argument forwarded to @entry */
    int ret;                 /* return value of @entry */
    bool done;               /* set once @entry has returned */
    Coroutine *co; /* Coroutine, running bdrv_run_co_entry, for debugging */
} BdrvRunCo;
9007d2410ceSVladimir Sementsov-Ogievskiy 
9017d2410ceSVladimir Sementsov-Ogievskiy static void coroutine_fn bdrv_run_co_entry(void *opaque)
9027d2410ceSVladimir Sementsov-Ogievskiy {
9037d2410ceSVladimir Sementsov-Ogievskiy     BdrvRunCo *arg = opaque;
9047d2410ceSVladimir Sementsov-Ogievskiy 
9057d2410ceSVladimir Sementsov-Ogievskiy     arg->ret = arg->entry(arg->opaque);
9067d2410ceSVladimir Sementsov-Ogievskiy     arg->done = true;
9077d2410ceSVladimir Sementsov-Ogievskiy     aio_wait_kick();
9087d2410ceSVladimir Sementsov-Ogievskiy }
9097d2410ceSVladimir Sementsov-Ogievskiy 
9107d2410ceSVladimir Sementsov-Ogievskiy static int bdrv_run_co(BlockDriverState *bs, BdrvRequestEntry *entry,
9117d2410ceSVladimir Sementsov-Ogievskiy                        void *opaque)
9127d2410ceSVladimir Sementsov-Ogievskiy {
9137d2410ceSVladimir Sementsov-Ogievskiy     if (qemu_in_coroutine()) {
9147d2410ceSVladimir Sementsov-Ogievskiy         /* Fast-path if already in coroutine context */
9157d2410ceSVladimir Sementsov-Ogievskiy         return entry(opaque);
9167d2410ceSVladimir Sementsov-Ogievskiy     } else {
9177d2410ceSVladimir Sementsov-Ogievskiy         BdrvRunCo s = { .entry = entry, .opaque = opaque };
9187d2410ceSVladimir Sementsov-Ogievskiy 
9197d2410ceSVladimir Sementsov-Ogievskiy         s.co = qemu_coroutine_create(bdrv_run_co_entry, &s);
9207d2410ceSVladimir Sementsov-Ogievskiy         bdrv_coroutine_enter(bs, s.co);
9217d2410ceSVladimir Sementsov-Ogievskiy 
9227d2410ceSVladimir Sementsov-Ogievskiy         BDRV_POLL_WHILE(bs, !s.done);
9237d2410ceSVladimir Sementsov-Ogievskiy 
9247d2410ceSVladimir Sementsov-Ogievskiy         return s.ret;
9257d2410ceSVladimir Sementsov-Ogievskiy     }
9267d2410ceSVladimir Sementsov-Ogievskiy }
9277d2410ceSVladimir Sementsov-Ogievskiy 
/* Argument bundle carried into bdrv_rw_co_entry() */
typedef struct RwCo {
    BdrvChild *child;       /* child to read from or write to */
    int64_t offset;         /* byte offset of the request */
    QEMUIOVector *qiov;     /* data buffer; qiov->size is the length */
    bool is_write;          /* false: preadv, true: pwritev */
    BdrvRequestFlags flags; /* BDRV_REQ_* flags passed through */
} RwCo;
93561007b31SStefan Hajnoczi 
9367d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_rw_co_entry(void *opaque)
93761007b31SStefan Hajnoczi {
93861007b31SStefan Hajnoczi     RwCo *rwco = opaque;
93961007b31SStefan Hajnoczi 
94061007b31SStefan Hajnoczi     if (!rwco->is_write) {
9417d2410ceSVladimir Sementsov-Ogievskiy         return bdrv_co_preadv(rwco->child, rwco->offset,
94261007b31SStefan Hajnoczi                               rwco->qiov->size, rwco->qiov,
94361007b31SStefan Hajnoczi                               rwco->flags);
94461007b31SStefan Hajnoczi     } else {
9457d2410ceSVladimir Sementsov-Ogievskiy         return bdrv_co_pwritev(rwco->child, rwco->offset,
94661007b31SStefan Hajnoczi                                rwco->qiov->size, rwco->qiov,
94761007b31SStefan Hajnoczi                                rwco->flags);
94861007b31SStefan Hajnoczi     }
94961007b31SStefan Hajnoczi }
95061007b31SStefan Hajnoczi 
95161007b31SStefan Hajnoczi /*
95261007b31SStefan Hajnoczi  * Process a vectored synchronous request using coroutines
95361007b31SStefan Hajnoczi  */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    /* Bundle the arguments so they can travel into the coroutine */
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .flags = flags,
    };

    return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco);
}
96861007b31SStefan Hajnoczi 
/* Synchronously write @bytes zeroes at @offset.  See bdrv_pwrite() for
 * the return codes. */
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    /* Zero writes carry no data; a NULL-based dummy qiov sets the length */
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);

    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}
97761007b31SStefan Hajnoczi 
97861007b31SStefan Hajnoczi /*
97974021bc4SEric Blake  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
98061007b31SStefan Hajnoczi  * The operation is sped up by checking the block status and only writing
98161007b31SStefan Hajnoczi  * zeroes to the device if they currently do not return zeroes. Optional
98274021bc4SEric Blake  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
983465fe887SEric Blake  * BDRV_REQ_FUA).
98461007b31SStefan Hajnoczi  *
985f4649069SEric Blake  * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
98661007b31SStefan Hajnoczi  */
987720ff280SKevin Wolf int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
98861007b31SStefan Hajnoczi {
989237d78f8SEric Blake     int ret;
990237d78f8SEric Blake     int64_t target_size, bytes, offset = 0;
991720ff280SKevin Wolf     BlockDriverState *bs = child->bs;
99261007b31SStefan Hajnoczi 
9937286d610SEric Blake     target_size = bdrv_getlength(bs);
9947286d610SEric Blake     if (target_size < 0) {
9957286d610SEric Blake         return target_size;
99661007b31SStefan Hajnoczi     }
99761007b31SStefan Hajnoczi 
99861007b31SStefan Hajnoczi     for (;;) {
9997286d610SEric Blake         bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
10007286d610SEric Blake         if (bytes <= 0) {
100161007b31SStefan Hajnoczi             return 0;
100261007b31SStefan Hajnoczi         }
1003237d78f8SEric Blake         ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
100461007b31SStefan Hajnoczi         if (ret < 0) {
100561007b31SStefan Hajnoczi             return ret;
100661007b31SStefan Hajnoczi         }
100761007b31SStefan Hajnoczi         if (ret & BDRV_BLOCK_ZERO) {
1008237d78f8SEric Blake             offset += bytes;
100961007b31SStefan Hajnoczi             continue;
101061007b31SStefan Hajnoczi         }
1011237d78f8SEric Blake         ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
101261007b31SStefan Hajnoczi         if (ret < 0) {
101361007b31SStefan Hajnoczi             return ret;
101461007b31SStefan Hajnoczi         }
1015237d78f8SEric Blake         offset += bytes;
101661007b31SStefan Hajnoczi     }
101761007b31SStefan Hajnoczi }
101861007b31SStefan Hajnoczi 
1019f4649069SEric Blake /* return < 0 if error. See bdrv_pwrite() for the return codes */
1020cf2ab8fcSKevin Wolf int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
1021f1e84741SKevin Wolf {
1022f1e84741SKevin Wolf     int ret;
1023f1e84741SKevin Wolf 
1024e293b7a3SKevin Wolf     ret = bdrv_prwv_co(child, offset, qiov, false, 0);
1025f1e84741SKevin Wolf     if (ret < 0) {
1026f1e84741SKevin Wolf         return ret;
1027f1e84741SKevin Wolf     }
1028f1e84741SKevin Wolf 
1029f1e84741SKevin Wolf     return qiov->size;
1030f1e84741SKevin Wolf }
1031f1e84741SKevin Wolf 
10322e11d756SAlberto Garcia /* See bdrv_pwrite() for the return codes */
1033cf2ab8fcSKevin Wolf int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
103461007b31SStefan Hajnoczi {
10350d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
103661007b31SStefan Hajnoczi 
103761007b31SStefan Hajnoczi     if (bytes < 0) {
103861007b31SStefan Hajnoczi         return -EINVAL;
103961007b31SStefan Hajnoczi     }
104061007b31SStefan Hajnoczi 
1041cf2ab8fcSKevin Wolf     return bdrv_preadv(child, offset, &qiov);
104261007b31SStefan Hajnoczi }
104361007b31SStefan Hajnoczi 
1044d9ca2ea2SKevin Wolf int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
104561007b31SStefan Hajnoczi {
104661007b31SStefan Hajnoczi     int ret;
104761007b31SStefan Hajnoczi 
1048e293b7a3SKevin Wolf     ret = bdrv_prwv_co(child, offset, qiov, true, 0);
104961007b31SStefan Hajnoczi     if (ret < 0) {
105061007b31SStefan Hajnoczi         return ret;
105161007b31SStefan Hajnoczi     }
105261007b31SStefan Hajnoczi 
105361007b31SStefan Hajnoczi     return qiov->size;
105461007b31SStefan Hajnoczi }
105561007b31SStefan Hajnoczi 
10562e11d756SAlberto Garcia /* Return no. of bytes on success or < 0 on error. Important errors are:
10572e11d756SAlberto Garcia   -EIO         generic I/O error (may happen for all errors)
10582e11d756SAlberto Garcia   -ENOMEDIUM   No media inserted.
10592e11d756SAlberto Garcia   -EINVAL      Invalid offset or number of bytes
10602e11d756SAlberto Garcia   -EACCES      Trying to write a read-only device
10612e11d756SAlberto Garcia */
1062d9ca2ea2SKevin Wolf int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
106361007b31SStefan Hajnoczi {
10640d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
106561007b31SStefan Hajnoczi 
106661007b31SStefan Hajnoczi     if (bytes < 0) {
106761007b31SStefan Hajnoczi         return -EINVAL;
106861007b31SStefan Hajnoczi     }
106961007b31SStefan Hajnoczi 
1070d9ca2ea2SKevin Wolf     return bdrv_pwritev(child, offset, &qiov);
107161007b31SStefan Hajnoczi }
107261007b31SStefan Hajnoczi 
107361007b31SStefan Hajnoczi /*
107461007b31SStefan Hajnoczi  * Writes to the file and ensures that no writes are reordered across this
107561007b31SStefan Hajnoczi  * request (acts as a barrier)
107661007b31SStefan Hajnoczi  *
107761007b31SStefan Hajnoczi  * Returns 0 on success, -errno in error cases.
107861007b31SStefan Hajnoczi  */
1079d9ca2ea2SKevin Wolf int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
108061007b31SStefan Hajnoczi                      const void *buf, int count)
108161007b31SStefan Hajnoczi {
108261007b31SStefan Hajnoczi     int ret;
108361007b31SStefan Hajnoczi 
1084d9ca2ea2SKevin Wolf     ret = bdrv_pwrite(child, offset, buf, count);
108561007b31SStefan Hajnoczi     if (ret < 0) {
108661007b31SStefan Hajnoczi         return ret;
108761007b31SStefan Hajnoczi     }
108861007b31SStefan Hajnoczi 
1089d9ca2ea2SKevin Wolf     ret = bdrv_flush(child->bs);
1090855a6a93SKevin Wolf     if (ret < 0) {
1091855a6a93SKevin Wolf         return ret;
109261007b31SStefan Hajnoczi     }
109361007b31SStefan Hajnoczi 
109461007b31SStefan Hajnoczi     return 0;
109561007b31SStefan Hajnoczi }
109661007b31SStefan Hajnoczi 
/* Completion state used to emulate coroutine I/O on top of AIO drivers */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine; /* coroutine to wake when the AIO request ends */
    int ret;              /* result of the AIO request */
} CoroutineIOCompletion;
110108844473SKevin Wolf 
/* AIO completion callback: record the result and wake the coroutine that
 * is waiting in bdrv_driver_preadv()/pwritev(). */
static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}
110908844473SKevin Wolf 
/* Dispatch a read to the driver's best available interface, in order of
 * preference: bdrv_co_preadv_part, bdrv_co_preadv, bdrv_aio_preadv,
 * bdrv_co_readv (sector-based fallback). */
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov,
                                           size_t qiov_offset, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    QEMUIOVector local_qiov;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv_part) {
        /* Driver handles the qiov offset itself; no slicing needed */
        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
                                        flags);
    }

    if (qiov_offset > 0 || bytes != qiov->size) {
        /* Remaining interfaces take a plain qiov: slice out the relevant
         * part; destroyed again at "out" below. */
        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
        qiov = &local_qiov;
    }

    if (drv->bdrv_co_preadv) {
        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
        goto out;
    }

    if (drv->bdrv_aio_preadv) {
        /* Emulate coroutine behaviour on top of the AIO interface: yield
         * here, bdrv_co_io_em_complete() wakes us up with the result. */
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
            goto out;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
            goto out;
        }
    }

    /* Last resort: the legacy sector-based read interface */
    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }

    return ret;
}
1178166fe960SKevin Wolf 
117978a07294SKevin Wolf static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
118078a07294SKevin Wolf                                             uint64_t offset, uint64_t bytes,
1181ac850bf0SVladimir Sementsov-Ogievskiy                                             QEMUIOVector *qiov,
1182ac850bf0SVladimir Sementsov-Ogievskiy                                             size_t qiov_offset, int flags)
118378a07294SKevin Wolf {
118478a07294SKevin Wolf     BlockDriver *drv = bs->drv;
11853fb06697SKevin Wolf     int64_t sector_num;
11863fb06697SKevin Wolf     unsigned int nb_sectors;
1187ac850bf0SVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
118878a07294SKevin Wolf     int ret;
118978a07294SKevin Wolf 
1190fa166538SEric Blake     assert(!(flags & ~BDRV_REQ_MASK));
1191fe0480d6SKevin Wolf     assert(!(flags & BDRV_REQ_NO_FALLBACK));
1192fa166538SEric Blake 
1193d470ad42SMax Reitz     if (!drv) {
1194d470ad42SMax Reitz         return -ENOMEDIUM;
1195d470ad42SMax Reitz     }
1196d470ad42SMax Reitz 
1197ac850bf0SVladimir Sementsov-Ogievskiy     if (drv->bdrv_co_pwritev_part) {
1198ac850bf0SVladimir Sementsov-Ogievskiy         ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
1199ac850bf0SVladimir Sementsov-Ogievskiy                                         flags & bs->supported_write_flags);
1200ac850bf0SVladimir Sementsov-Ogievskiy         flags &= ~bs->supported_write_flags;
1201ac850bf0SVladimir Sementsov-Ogievskiy         goto emulate_flags;
1202ac850bf0SVladimir Sementsov-Ogievskiy     }
1203ac850bf0SVladimir Sementsov-Ogievskiy 
1204ac850bf0SVladimir Sementsov-Ogievskiy     if (qiov_offset > 0 || bytes != qiov->size) {
1205ac850bf0SVladimir Sementsov-Ogievskiy         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1206ac850bf0SVladimir Sementsov-Ogievskiy         qiov = &local_qiov;
1207ac850bf0SVladimir Sementsov-Ogievskiy     }
1208ac850bf0SVladimir Sementsov-Ogievskiy 
12093fb06697SKevin Wolf     if (drv->bdrv_co_pwritev) {
1210515c2f43SKevin Wolf         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
1211515c2f43SKevin Wolf                                    flags & bs->supported_write_flags);
1212515c2f43SKevin Wolf         flags &= ~bs->supported_write_flags;
12133fb06697SKevin Wolf         goto emulate_flags;
12143fb06697SKevin Wolf     }
12153fb06697SKevin Wolf 
1216edfab6a0SEric Blake     if (drv->bdrv_aio_pwritev) {
121708844473SKevin Wolf         BlockAIOCB *acb;
121808844473SKevin Wolf         CoroutineIOCompletion co = {
121908844473SKevin Wolf             .coroutine = qemu_coroutine_self(),
122008844473SKevin Wolf         };
122108844473SKevin Wolf 
1222e31f6864SEric Blake         acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
1223e31f6864SEric Blake                                     flags & bs->supported_write_flags,
122408844473SKevin Wolf                                     bdrv_co_io_em_complete, &co);
1225e31f6864SEric Blake         flags &= ~bs->supported_write_flags;
122608844473SKevin Wolf         if (acb == NULL) {
12273fb06697SKevin Wolf             ret = -EIO;
122808844473SKevin Wolf         } else {
122908844473SKevin Wolf             qemu_coroutine_yield();
12303fb06697SKevin Wolf             ret = co.ret;
123108844473SKevin Wolf         }
1232edfab6a0SEric Blake         goto emulate_flags;
1233edfab6a0SEric Blake     }
1234edfab6a0SEric Blake 
1235edfab6a0SEric Blake     sector_num = offset >> BDRV_SECTOR_BITS;
1236edfab6a0SEric Blake     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1237edfab6a0SEric Blake 
12381bbbf32dSNir Soffer     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
12391bbbf32dSNir Soffer     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
124041ae31e3SAlberto Garcia     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1241edfab6a0SEric Blake 
1242e18a58b4SEric Blake     assert(drv->bdrv_co_writev);
1243e18a58b4SEric Blake     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
1244edfab6a0SEric Blake                               flags & bs->supported_write_flags);
1245edfab6a0SEric Blake     flags &= ~bs->supported_write_flags;
124678a07294SKevin Wolf 
12473fb06697SKevin Wolf emulate_flags:
12484df863f3SEric Blake     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
124978a07294SKevin Wolf         ret = bdrv_co_flush(bs);
125078a07294SKevin Wolf     }
125178a07294SKevin Wolf 
1252ac850bf0SVladimir Sementsov-Ogievskiy     if (qiov == &local_qiov) {
1253ac850bf0SVladimir Sementsov-Ogievskiy         qemu_iovec_destroy(&local_qiov);
1254ac850bf0SVladimir Sementsov-Ogievskiy     }
1255ac850bf0SVladimir Sementsov-Ogievskiy 
125678a07294SKevin Wolf     return ret;
125778a07294SKevin Wolf }
125878a07294SKevin Wolf 
125929a298afSPavel Butsykin static int coroutine_fn
126029a298afSPavel Butsykin bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1261ac850bf0SVladimir Sementsov-Ogievskiy                                uint64_t bytes, QEMUIOVector *qiov,
1262ac850bf0SVladimir Sementsov-Ogievskiy                                size_t qiov_offset)
126329a298afSPavel Butsykin {
126429a298afSPavel Butsykin     BlockDriver *drv = bs->drv;
1265ac850bf0SVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
1266ac850bf0SVladimir Sementsov-Ogievskiy     int ret;
126729a298afSPavel Butsykin 
1268d470ad42SMax Reitz     if (!drv) {
1269d470ad42SMax Reitz         return -ENOMEDIUM;
1270d470ad42SMax Reitz     }
1271d470ad42SMax Reitz 
1272ac850bf0SVladimir Sementsov-Ogievskiy     if (!block_driver_can_compress(drv)) {
127329a298afSPavel Butsykin         return -ENOTSUP;
127429a298afSPavel Butsykin     }
127529a298afSPavel Butsykin 
1276ac850bf0SVladimir Sementsov-Ogievskiy     if (drv->bdrv_co_pwritev_compressed_part) {
1277ac850bf0SVladimir Sementsov-Ogievskiy         return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
1278ac850bf0SVladimir Sementsov-Ogievskiy                                                     qiov, qiov_offset);
1279ac850bf0SVladimir Sementsov-Ogievskiy     }
1280ac850bf0SVladimir Sementsov-Ogievskiy 
1281ac850bf0SVladimir Sementsov-Ogievskiy     if (qiov_offset == 0) {
128229a298afSPavel Butsykin         return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
128329a298afSPavel Butsykin     }
128429a298afSPavel Butsykin 
1285ac850bf0SVladimir Sementsov-Ogievskiy     qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1286ac850bf0SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
1287ac850bf0SVladimir Sementsov-Ogievskiy     qemu_iovec_destroy(&local_qiov);
1288ac850bf0SVladimir Sementsov-Ogievskiy 
1289ac850bf0SVladimir Sementsov-Ogievskiy     return ret;
1290ac850bf0SVladimir Sementsov-Ogievskiy }
1291ac850bf0SVladimir Sementsov-Ogievskiy 
129285c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
12933299e5ecSVladimir Sementsov-Ogievskiy         int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
12941143ec5eSVladimir Sementsov-Ogievskiy         size_t qiov_offset, int flags)
129561007b31SStefan Hajnoczi {
129685c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
129785c97ca7SKevin Wolf 
129861007b31SStefan Hajnoczi     /* Perform I/O through a temporary buffer so that users who scribble over
129961007b31SStefan Hajnoczi      * their read buffer while the operation is in progress do not end up
130061007b31SStefan Hajnoczi      * modifying the image file.  This is critical for zero-copy guest I/O
130161007b31SStefan Hajnoczi      * where anything might happen inside guest memory.
130261007b31SStefan Hajnoczi      */
13032275cc90SVladimir Sementsov-Ogievskiy     void *bounce_buffer = NULL;
130461007b31SStefan Hajnoczi 
130561007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
1306244483e6SKevin Wolf     int64_t cluster_offset;
13077cfd5275SEric Blake     int64_t cluster_bytes;
130861007b31SStefan Hajnoczi     size_t skip_bytes;
130961007b31SStefan Hajnoczi     int ret;
1310cb2e2878SEric Blake     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1311cb2e2878SEric Blake                                     BDRV_REQUEST_MAX_BYTES);
1312cb2e2878SEric Blake     unsigned int progress = 0;
13138644476eSMax Reitz     bool skip_write;
131461007b31SStefan Hajnoczi 
1315d470ad42SMax Reitz     if (!drv) {
1316d470ad42SMax Reitz         return -ENOMEDIUM;
1317d470ad42SMax Reitz     }
1318d470ad42SMax Reitz 
13198644476eSMax Reitz     /*
13208644476eSMax Reitz      * Do not write anything when the BDS is inactive.  That is not
13218644476eSMax Reitz      * allowed, and it would not help.
13228644476eSMax Reitz      */
13238644476eSMax Reitz     skip_write = (bs->open_flags & BDRV_O_INACTIVE);
13248644476eSMax Reitz 
13251bf03e66SKevin Wolf     /* FIXME We cannot require callers to have write permissions when all they
13261bf03e66SKevin Wolf      * are doing is a read request. If we did things right, write permissions
13271bf03e66SKevin Wolf      * would be obtained anyway, but internally by the copy-on-read code. As
1328765d9df9SEric Blake      * long as it is implemented here rather than in a separate filter driver,
13291bf03e66SKevin Wolf      * the copy-on-read code doesn't have its own BdrvChild, however, for which
13301bf03e66SKevin Wolf      * it could request permissions. Therefore we have to bypass the permission
13311bf03e66SKevin Wolf      * system for the moment. */
13321bf03e66SKevin Wolf     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1333afa4b293SKevin Wolf 
133461007b31SStefan Hajnoczi     /* Cover entire cluster so no additional backing file I/O is required when
1335cb2e2878SEric Blake      * allocating cluster in the image file.  Note that this value may exceed
1336cb2e2878SEric Blake      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1337cb2e2878SEric Blake      * is one reason we loop rather than doing it all at once.
133861007b31SStefan Hajnoczi      */
1339244483e6SKevin Wolf     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1340cb2e2878SEric Blake     skip_bytes = offset - cluster_offset;
134161007b31SStefan Hajnoczi 
1342244483e6SKevin Wolf     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1343244483e6SKevin Wolf                                    cluster_offset, cluster_bytes);
134461007b31SStefan Hajnoczi 
1345cb2e2878SEric Blake     while (cluster_bytes) {
1346cb2e2878SEric Blake         int64_t pnum;
134761007b31SStefan Hajnoczi 
13488644476eSMax Reitz         if (skip_write) {
13498644476eSMax Reitz             ret = 1; /* "already allocated", so nothing will be copied */
13508644476eSMax Reitz             pnum = MIN(cluster_bytes, max_transfer);
13518644476eSMax Reitz         } else {
1352cb2e2878SEric Blake             ret = bdrv_is_allocated(bs, cluster_offset,
1353cb2e2878SEric Blake                                     MIN(cluster_bytes, max_transfer), &pnum);
1354cb2e2878SEric Blake             if (ret < 0) {
13558644476eSMax Reitz                 /*
13568644476eSMax Reitz                  * Safe to treat errors in querying allocation as if
1357cb2e2878SEric Blake                  * unallocated; we'll probably fail again soon on the
1358cb2e2878SEric Blake                  * read, but at least that will set a decent errno.
1359cb2e2878SEric Blake                  */
1360cb2e2878SEric Blake                 pnum = MIN(cluster_bytes, max_transfer);
1361cb2e2878SEric Blake             }
1362cb2e2878SEric Blake 
1363b0ddcbbbSKevin Wolf             /* Stop at EOF if the image ends in the middle of the cluster */
1364b0ddcbbbSKevin Wolf             if (ret == 0 && pnum == 0) {
1365b0ddcbbbSKevin Wolf                 assert(progress >= bytes);
1366b0ddcbbbSKevin Wolf                 break;
1367b0ddcbbbSKevin Wolf             }
1368b0ddcbbbSKevin Wolf 
1369cb2e2878SEric Blake             assert(skip_bytes < pnum);
13708644476eSMax Reitz         }
1371cb2e2878SEric Blake 
1372cb2e2878SEric Blake         if (ret <= 0) {
13731143ec5eSVladimir Sementsov-Ogievskiy             QEMUIOVector local_qiov;
13741143ec5eSVladimir Sementsov-Ogievskiy 
1375cb2e2878SEric Blake             /* Must copy-on-read; use the bounce buffer */
13760d93ed08SVladimir Sementsov-Ogievskiy             pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
13772275cc90SVladimir Sementsov-Ogievskiy             if (!bounce_buffer) {
13782275cc90SVladimir Sementsov-Ogievskiy                 int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
13792275cc90SVladimir Sementsov-Ogievskiy                 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
13802275cc90SVladimir Sementsov-Ogievskiy                 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
13812275cc90SVladimir Sementsov-Ogievskiy 
13822275cc90SVladimir Sementsov-Ogievskiy                 bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
13832275cc90SVladimir Sementsov-Ogievskiy                 if (!bounce_buffer) {
13842275cc90SVladimir Sementsov-Ogievskiy                     ret = -ENOMEM;
13852275cc90SVladimir Sementsov-Ogievskiy                     goto err;
13862275cc90SVladimir Sementsov-Ogievskiy                 }
13872275cc90SVladimir Sementsov-Ogievskiy             }
13880d93ed08SVladimir Sementsov-Ogievskiy             qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
1389cb2e2878SEric Blake 
1390cb2e2878SEric Blake             ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1391ac850bf0SVladimir Sementsov-Ogievskiy                                      &local_qiov, 0, 0);
139261007b31SStefan Hajnoczi             if (ret < 0) {
139361007b31SStefan Hajnoczi                 goto err;
139461007b31SStefan Hajnoczi             }
139561007b31SStefan Hajnoczi 
1396d855ebcdSEric Blake             bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1397c1499a5eSEric Blake             if (drv->bdrv_co_pwrite_zeroes &&
1398cb2e2878SEric Blake                 buffer_is_zero(bounce_buffer, pnum)) {
1399a604fa2bSEric Blake                 /* FIXME: Should we (perhaps conditionally) be setting
1400a604fa2bSEric Blake                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1401a604fa2bSEric Blake                  * that still correctly reads as zero? */
14027adcf59fSMax Reitz                 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
14037adcf59fSMax Reitz                                                BDRV_REQ_WRITE_UNCHANGED);
140461007b31SStefan Hajnoczi             } else {
1405cb2e2878SEric Blake                 /* This does not change the data on the disk, it is not
1406cb2e2878SEric Blake                  * necessary to flush even in cache=writethrough mode.
140761007b31SStefan Hajnoczi                  */
1408cb2e2878SEric Blake                 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1409ac850bf0SVladimir Sementsov-Ogievskiy                                           &local_qiov, 0,
14107adcf59fSMax Reitz                                           BDRV_REQ_WRITE_UNCHANGED);
141161007b31SStefan Hajnoczi             }
141261007b31SStefan Hajnoczi 
141361007b31SStefan Hajnoczi             if (ret < 0) {
1414cb2e2878SEric Blake                 /* It might be okay to ignore write errors for guest
1415cb2e2878SEric Blake                  * requests.  If this is a deliberate copy-on-read
1416cb2e2878SEric Blake                  * then we don't want to ignore the error.  Simply
1417cb2e2878SEric Blake                  * report it in all cases.
141861007b31SStefan Hajnoczi                  */
141961007b31SStefan Hajnoczi                 goto err;
142061007b31SStefan Hajnoczi             }
142161007b31SStefan Hajnoczi 
14223299e5ecSVladimir Sementsov-Ogievskiy             if (!(flags & BDRV_REQ_PREFETCH)) {
14231143ec5eSVladimir Sementsov-Ogievskiy                 qemu_iovec_from_buf(qiov, qiov_offset + progress,
14241143ec5eSVladimir Sementsov-Ogievskiy                                     bounce_buffer + skip_bytes,
14254ab78b19SVladimir Sementsov-Ogievskiy                                     MIN(pnum - skip_bytes, bytes - progress));
14263299e5ecSVladimir Sementsov-Ogievskiy             }
14273299e5ecSVladimir Sementsov-Ogievskiy         } else if (!(flags & BDRV_REQ_PREFETCH)) {
1428cb2e2878SEric Blake             /* Read directly into the destination */
14291143ec5eSVladimir Sementsov-Ogievskiy             ret = bdrv_driver_preadv(bs, offset + progress,
14301143ec5eSVladimir Sementsov-Ogievskiy                                      MIN(pnum - skip_bytes, bytes - progress),
14311143ec5eSVladimir Sementsov-Ogievskiy                                      qiov, qiov_offset + progress, 0);
1432cb2e2878SEric Blake             if (ret < 0) {
1433cb2e2878SEric Blake                 goto err;
1434cb2e2878SEric Blake             }
1435cb2e2878SEric Blake         }
1436cb2e2878SEric Blake 
1437cb2e2878SEric Blake         cluster_offset += pnum;
1438cb2e2878SEric Blake         cluster_bytes -= pnum;
1439cb2e2878SEric Blake         progress += pnum - skip_bytes;
1440cb2e2878SEric Blake         skip_bytes = 0;
1441cb2e2878SEric Blake     }
1442cb2e2878SEric Blake     ret = 0;
144361007b31SStefan Hajnoczi 
144461007b31SStefan Hajnoczi err:
144561007b31SStefan Hajnoczi     qemu_vfree(bounce_buffer);
144661007b31SStefan Hajnoczi     return ret;
144761007b31SStefan Hajnoczi }
144861007b31SStefan Hajnoczi 
144961007b31SStefan Hajnoczi /*
145061007b31SStefan Hajnoczi  * Forwards an already correctly aligned request to the BlockDriver. This
14511a62d0acSEric Blake  * handles copy on read, zeroing after EOF, and fragmentation of large
14521a62d0acSEric Blake  * reads; any other features must be implemented by the caller.
145361007b31SStefan Hajnoczi  */
145485c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
145561007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
145665cd4424SVladimir Sementsov-Ogievskiy     int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
145761007b31SStefan Hajnoczi {
145885c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
1459c9d20029SKevin Wolf     int64_t total_bytes, max_bytes;
14601a62d0acSEric Blake     int ret = 0;
14611a62d0acSEric Blake     uint64_t bytes_remaining = bytes;
14621a62d0acSEric Blake     int max_transfer;
146361007b31SStefan Hajnoczi 
146449c07526SKevin Wolf     assert(is_power_of_2(align));
146549c07526SKevin Wolf     assert((offset & (align - 1)) == 0);
146649c07526SKevin Wolf     assert((bytes & (align - 1)) == 0);
1467abb06c5aSDaniel P. Berrange     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
14681a62d0acSEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
14691a62d0acSEric Blake                                    align);
1470a604fa2bSEric Blake 
1471a604fa2bSEric Blake     /* TODO: We would need a per-BDS .supported_read_flags and
1472a604fa2bSEric Blake      * potential fallback support, if we ever implement any read flags
1473a604fa2bSEric Blake      * to pass through to drivers.  For now, there aren't any
1474a604fa2bSEric Blake      * passthrough flags.  */
1475c53cb427SPaolo Bonzini     assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));
147661007b31SStefan Hajnoczi 
147761007b31SStefan Hajnoczi     /* Handle Copy on Read and associated serialisation */
147861007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
147961007b31SStefan Hajnoczi         /* If we touch the same cluster it counts as an overlap.  This
148061007b31SStefan Hajnoczi          * guarantees that allocating writes will be serialized and not race
148161007b31SStefan Hajnoczi          * with each other for the same cluster.  For example, in copy-on-read
148261007b31SStefan Hajnoczi          * it ensures that the CoR read and write operations are atomic and
148361007b31SStefan Hajnoczi          * guest writes cannot interleave between them. */
1484304d9d7fSMax Reitz         bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
148518fbd0deSPaolo Bonzini     } else {
1486304d9d7fSMax Reitz         bdrv_wait_serialising_requests(req);
148718fbd0deSPaolo Bonzini     }
148861007b31SStefan Hajnoczi 
148961007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
1490d6a644bbSEric Blake         int64_t pnum;
149161007b31SStefan Hajnoczi 
149288e63df2SEric Blake         ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
149361007b31SStefan Hajnoczi         if (ret < 0) {
149461007b31SStefan Hajnoczi             goto out;
149561007b31SStefan Hajnoczi         }
149661007b31SStefan Hajnoczi 
149788e63df2SEric Blake         if (!ret || pnum != bytes) {
149865cd4424SVladimir Sementsov-Ogievskiy             ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
149965cd4424SVladimir Sementsov-Ogievskiy                                            qiov, qiov_offset, flags);
15003299e5ecSVladimir Sementsov-Ogievskiy             goto out;
15013299e5ecSVladimir Sementsov-Ogievskiy         } else if (flags & BDRV_REQ_PREFETCH) {
150261007b31SStefan Hajnoczi             goto out;
150361007b31SStefan Hajnoczi         }
150461007b31SStefan Hajnoczi     }
150561007b31SStefan Hajnoczi 
15061a62d0acSEric Blake     /* Forward the request to the BlockDriver, possibly fragmenting it */
150749c07526SKevin Wolf     total_bytes = bdrv_getlength(bs);
150849c07526SKevin Wolf     if (total_bytes < 0) {
150949c07526SKevin Wolf         ret = total_bytes;
151061007b31SStefan Hajnoczi         goto out;
151161007b31SStefan Hajnoczi     }
151261007b31SStefan Hajnoczi 
151349c07526SKevin Wolf     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
15141a62d0acSEric Blake     if (bytes <= max_bytes && bytes <= max_transfer) {
151565cd4424SVladimir Sementsov-Ogievskiy         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
15161a62d0acSEric Blake         goto out;
151761007b31SStefan Hajnoczi     }
151861007b31SStefan Hajnoczi 
15191a62d0acSEric Blake     while (bytes_remaining) {
15201a62d0acSEric Blake         int num;
15211a62d0acSEric Blake 
15221a62d0acSEric Blake         if (max_bytes) {
15231a62d0acSEric Blake             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
15241a62d0acSEric Blake             assert(num);
15251a62d0acSEric Blake 
15261a62d0acSEric Blake             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
152765cd4424SVladimir Sementsov-Ogievskiy                                      num, qiov, bytes - bytes_remaining, 0);
15281a62d0acSEric Blake             max_bytes -= num;
15291a62d0acSEric Blake         } else {
15301a62d0acSEric Blake             num = bytes_remaining;
15311a62d0acSEric Blake             ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
15321a62d0acSEric Blake                                     bytes_remaining);
15331a62d0acSEric Blake         }
15341a62d0acSEric Blake         if (ret < 0) {
15351a62d0acSEric Blake             goto out;
15361a62d0acSEric Blake         }
15371a62d0acSEric Blake         bytes_remaining -= num;
153861007b31SStefan Hajnoczi     }
153961007b31SStefan Hajnoczi 
154061007b31SStefan Hajnoczi out:
15411a62d0acSEric Blake     return ret < 0 ? ret : 0;
154261007b31SStefan Hajnoczi }
154361007b31SStefan Hajnoczi 
154461007b31SStefan Hajnoczi /*
15457a3f542fSVladimir Sementsov-Ogievskiy  * Request padding
15467a3f542fSVladimir Sementsov-Ogievskiy  *
15477a3f542fSVladimir Sementsov-Ogievskiy  *  |<---- align ----->|                     |<----- align ---->|
15487a3f542fSVladimir Sementsov-Ogievskiy  *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
15497a3f542fSVladimir Sementsov-Ogievskiy  *  |          |       |                     |     |            |
15507a3f542fSVladimir Sementsov-Ogievskiy  * -*----------$-------*-------- ... --------*-----$------------*---
15517a3f542fSVladimir Sementsov-Ogievskiy  *  |          |       |                     |     |            |
15527a3f542fSVladimir Sementsov-Ogievskiy  *  |          offset  |                     |     end          |
15537a3f542fSVladimir Sementsov-Ogievskiy  *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
15547a3f542fSVladimir Sementsov-Ogievskiy  *  [buf   ... )                             [tail_buf          )
15557a3f542fSVladimir Sementsov-Ogievskiy  *
15567a3f542fSVladimir Sementsov-Ogievskiy  * @buf is an aligned allocation needed to store @head and @tail paddings. @head
15577a3f542fSVladimir Sementsov-Ogievskiy  * is placed at the beginning of @buf and @tail at the @end.
15587a3f542fSVladimir Sementsov-Ogievskiy  *
15597a3f542fSVladimir Sementsov-Ogievskiy  * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk
15607a3f542fSVladimir Sementsov-Ogievskiy  * around tail, if tail exists.
15617a3f542fSVladimir Sementsov-Ogievskiy  *
15627a3f542fSVladimir Sementsov-Ogievskiy  * @merge_reads is true for small requests,
15637a3f542fSVladimir Sementsov-Ogievskiy  * if @buf_len == @head + bytes + @tail. In this case it is possible that both
15647a3f542fSVladimir Sementsov-Ogievskiy  * head and tail exist but @buf_len == align and @tail_buf == @buf.
156561007b31SStefan Hajnoczi  */
15667a3f542fSVladimir Sementsov-Ogievskiy typedef struct BdrvRequestPadding {
15677a3f542fSVladimir Sementsov-Ogievskiy     uint8_t *buf;
15687a3f542fSVladimir Sementsov-Ogievskiy     size_t buf_len;
15697a3f542fSVladimir Sementsov-Ogievskiy     uint8_t *tail_buf;
15707a3f542fSVladimir Sementsov-Ogievskiy     size_t head;
15717a3f542fSVladimir Sementsov-Ogievskiy     size_t tail;
15727a3f542fSVladimir Sementsov-Ogievskiy     bool merge_reads;
15737a3f542fSVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
15747a3f542fSVladimir Sementsov-Ogievskiy } BdrvRequestPadding;
15757a3f542fSVladimir Sementsov-Ogievskiy 
15767a3f542fSVladimir Sementsov-Ogievskiy static bool bdrv_init_padding(BlockDriverState *bs,
15777a3f542fSVladimir Sementsov-Ogievskiy                               int64_t offset, int64_t bytes,
15787a3f542fSVladimir Sementsov-Ogievskiy                               BdrvRequestPadding *pad)
15797a3f542fSVladimir Sementsov-Ogievskiy {
15807a3f542fSVladimir Sementsov-Ogievskiy     uint64_t align = bs->bl.request_alignment;
15817a3f542fSVladimir Sementsov-Ogievskiy     size_t sum;
15827a3f542fSVladimir Sementsov-Ogievskiy 
15837a3f542fSVladimir Sementsov-Ogievskiy     memset(pad, 0, sizeof(*pad));
15847a3f542fSVladimir Sementsov-Ogievskiy 
15857a3f542fSVladimir Sementsov-Ogievskiy     pad->head = offset & (align - 1);
15867a3f542fSVladimir Sementsov-Ogievskiy     pad->tail = ((offset + bytes) & (align - 1));
15877a3f542fSVladimir Sementsov-Ogievskiy     if (pad->tail) {
15887a3f542fSVladimir Sementsov-Ogievskiy         pad->tail = align - pad->tail;
15897a3f542fSVladimir Sementsov-Ogievskiy     }
15907a3f542fSVladimir Sementsov-Ogievskiy 
1591ac9d00bfSVladimir Sementsov-Ogievskiy     if (!pad->head && !pad->tail) {
15927a3f542fSVladimir Sementsov-Ogievskiy         return false;
15937a3f542fSVladimir Sementsov-Ogievskiy     }
15947a3f542fSVladimir Sementsov-Ogievskiy 
1595ac9d00bfSVladimir Sementsov-Ogievskiy     assert(bytes); /* Nothing good in aligning zero-length requests */
1596ac9d00bfSVladimir Sementsov-Ogievskiy 
15977a3f542fSVladimir Sementsov-Ogievskiy     sum = pad->head + bytes + pad->tail;
15987a3f542fSVladimir Sementsov-Ogievskiy     pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
15997a3f542fSVladimir Sementsov-Ogievskiy     pad->buf = qemu_blockalign(bs, pad->buf_len);
16007a3f542fSVladimir Sementsov-Ogievskiy     pad->merge_reads = sum == pad->buf_len;
16017a3f542fSVladimir Sementsov-Ogievskiy     if (pad->tail) {
16027a3f542fSVladimir Sementsov-Ogievskiy         pad->tail_buf = pad->buf + pad->buf_len - align;
16037a3f542fSVladimir Sementsov-Ogievskiy     }
16047a3f542fSVladimir Sementsov-Ogievskiy 
16057a3f542fSVladimir Sementsov-Ogievskiy     return true;
16067a3f542fSVladimir Sementsov-Ogievskiy }
16077a3f542fSVladimir Sementsov-Ogievskiy 
16087a3f542fSVladimir Sementsov-Ogievskiy static int bdrv_padding_rmw_read(BdrvChild *child,
16097a3f542fSVladimir Sementsov-Ogievskiy                                  BdrvTrackedRequest *req,
16107a3f542fSVladimir Sementsov-Ogievskiy                                  BdrvRequestPadding *pad,
16117a3f542fSVladimir Sementsov-Ogievskiy                                  bool zero_middle)
16127a3f542fSVladimir Sementsov-Ogievskiy {
16137a3f542fSVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
16147a3f542fSVladimir Sementsov-Ogievskiy     BlockDriverState *bs = child->bs;
16157a3f542fSVladimir Sementsov-Ogievskiy     uint64_t align = bs->bl.request_alignment;
16167a3f542fSVladimir Sementsov-Ogievskiy     int ret;
16177a3f542fSVladimir Sementsov-Ogievskiy 
16187a3f542fSVladimir Sementsov-Ogievskiy     assert(req->serialising && pad->buf);
16197a3f542fSVladimir Sementsov-Ogievskiy 
16207a3f542fSVladimir Sementsov-Ogievskiy     if (pad->head || pad->merge_reads) {
16217a3f542fSVladimir Sementsov-Ogievskiy         uint64_t bytes = pad->merge_reads ? pad->buf_len : align;
16227a3f542fSVladimir Sementsov-Ogievskiy 
16237a3f542fSVladimir Sementsov-Ogievskiy         qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
16247a3f542fSVladimir Sementsov-Ogievskiy 
16257a3f542fSVladimir Sementsov-Ogievskiy         if (pad->head) {
16267a3f542fSVladimir Sementsov-Ogievskiy             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
16277a3f542fSVladimir Sementsov-Ogievskiy         }
16287a3f542fSVladimir Sementsov-Ogievskiy         if (pad->merge_reads && pad->tail) {
16297a3f542fSVladimir Sementsov-Ogievskiy             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
16307a3f542fSVladimir Sementsov-Ogievskiy         }
16317a3f542fSVladimir Sementsov-Ogievskiy         ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
163265cd4424SVladimir Sementsov-Ogievskiy                                   align, &local_qiov, 0, 0);
16337a3f542fSVladimir Sementsov-Ogievskiy         if (ret < 0) {
16347a3f542fSVladimir Sementsov-Ogievskiy             return ret;
16357a3f542fSVladimir Sementsov-Ogievskiy         }
16367a3f542fSVladimir Sementsov-Ogievskiy         if (pad->head) {
16377a3f542fSVladimir Sementsov-Ogievskiy             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
16387a3f542fSVladimir Sementsov-Ogievskiy         }
16397a3f542fSVladimir Sementsov-Ogievskiy         if (pad->merge_reads && pad->tail) {
16407a3f542fSVladimir Sementsov-Ogievskiy             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
16417a3f542fSVladimir Sementsov-Ogievskiy         }
16427a3f542fSVladimir Sementsov-Ogievskiy 
16437a3f542fSVladimir Sementsov-Ogievskiy         if (pad->merge_reads) {
16447a3f542fSVladimir Sementsov-Ogievskiy             goto zero_mem;
16457a3f542fSVladimir Sementsov-Ogievskiy         }
16467a3f542fSVladimir Sementsov-Ogievskiy     }
16477a3f542fSVladimir Sementsov-Ogievskiy 
16487a3f542fSVladimir Sementsov-Ogievskiy     if (pad->tail) {
16497a3f542fSVladimir Sementsov-Ogievskiy         qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
16507a3f542fSVladimir Sementsov-Ogievskiy 
16517a3f542fSVladimir Sementsov-Ogievskiy         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
16527a3f542fSVladimir Sementsov-Ogievskiy         ret = bdrv_aligned_preadv(
16537a3f542fSVladimir Sementsov-Ogievskiy                 child, req,
16547a3f542fSVladimir Sementsov-Ogievskiy                 req->overlap_offset + req->overlap_bytes - align,
165565cd4424SVladimir Sementsov-Ogievskiy                 align, align, &local_qiov, 0, 0);
16567a3f542fSVladimir Sementsov-Ogievskiy         if (ret < 0) {
16577a3f542fSVladimir Sementsov-Ogievskiy             return ret;
16587a3f542fSVladimir Sementsov-Ogievskiy         }
16597a3f542fSVladimir Sementsov-Ogievskiy         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
16607a3f542fSVladimir Sementsov-Ogievskiy     }
16617a3f542fSVladimir Sementsov-Ogievskiy 
16627a3f542fSVladimir Sementsov-Ogievskiy zero_mem:
16637a3f542fSVladimir Sementsov-Ogievskiy     if (zero_middle) {
16647a3f542fSVladimir Sementsov-Ogievskiy         memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
16657a3f542fSVladimir Sementsov-Ogievskiy     }
16667a3f542fSVladimir Sementsov-Ogievskiy 
16677a3f542fSVladimir Sementsov-Ogievskiy     return 0;
16687a3f542fSVladimir Sementsov-Ogievskiy }
16697a3f542fSVladimir Sementsov-Ogievskiy 
16707a3f542fSVladimir Sementsov-Ogievskiy static void bdrv_padding_destroy(BdrvRequestPadding *pad)
16717a3f542fSVladimir Sementsov-Ogievskiy {
16727a3f542fSVladimir Sementsov-Ogievskiy     if (pad->buf) {
16737a3f542fSVladimir Sementsov-Ogievskiy         qemu_vfree(pad->buf);
16747a3f542fSVladimir Sementsov-Ogievskiy         qemu_iovec_destroy(&pad->local_qiov);
16757a3f542fSVladimir Sementsov-Ogievskiy     }
16767a3f542fSVladimir Sementsov-Ogievskiy }
16777a3f542fSVladimir Sementsov-Ogievskiy 
16787a3f542fSVladimir Sementsov-Ogievskiy /*
16797a3f542fSVladimir Sementsov-Ogievskiy  * bdrv_pad_request
16807a3f542fSVladimir Sementsov-Ogievskiy  *
16817a3f542fSVladimir Sementsov-Ogievskiy  * Exchange request parameters with padded request if needed. Don't include RMW
16827a3f542fSVladimir Sementsov-Ogievskiy  * read of padding, bdrv_padding_rmw_read() should be called separately if
16837a3f542fSVladimir Sementsov-Ogievskiy  * needed.
16847a3f542fSVladimir Sementsov-Ogievskiy  *
16857a3f542fSVladimir Sementsov-Ogievskiy  * All parameters except @bs are in-out: they represent original request at
16867a3f542fSVladimir Sementsov-Ogievskiy  * function call and padded (if padding needed) at function finish.
16877a3f542fSVladimir Sementsov-Ogievskiy  *
16887a3f542fSVladimir Sementsov-Ogievskiy  * Function always succeeds.
16897a3f542fSVladimir Sementsov-Ogievskiy  */
16901acc3466SVladimir Sementsov-Ogievskiy static bool bdrv_pad_request(BlockDriverState *bs,
16911acc3466SVladimir Sementsov-Ogievskiy                              QEMUIOVector **qiov, size_t *qiov_offset,
16927a3f542fSVladimir Sementsov-Ogievskiy                              int64_t *offset, unsigned int *bytes,
16937a3f542fSVladimir Sementsov-Ogievskiy                              BdrvRequestPadding *pad)
16947a3f542fSVladimir Sementsov-Ogievskiy {
16957a3f542fSVladimir Sementsov-Ogievskiy     if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
16967a3f542fSVladimir Sementsov-Ogievskiy         return false;
16977a3f542fSVladimir Sementsov-Ogievskiy     }
16987a3f542fSVladimir Sementsov-Ogievskiy 
16997a3f542fSVladimir Sementsov-Ogievskiy     qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
17001acc3466SVladimir Sementsov-Ogievskiy                              *qiov, *qiov_offset, *bytes,
17017a3f542fSVladimir Sementsov-Ogievskiy                              pad->buf + pad->buf_len - pad->tail, pad->tail);
17027a3f542fSVladimir Sementsov-Ogievskiy     *bytes += pad->head + pad->tail;
17037a3f542fSVladimir Sementsov-Ogievskiy     *offset -= pad->head;
17047a3f542fSVladimir Sementsov-Ogievskiy     *qiov = &pad->local_qiov;
17051acc3466SVladimir Sementsov-Ogievskiy     *qiov_offset = 0;
17067a3f542fSVladimir Sementsov-Ogievskiy 
17077a3f542fSVladimir Sementsov-Ogievskiy     return true;
17087a3f542fSVladimir Sementsov-Ogievskiy }
17097a3f542fSVladimir Sementsov-Ogievskiy 
/* Convenience wrapper: a read with no offset into @qiov; see
 * bdrv_co_preadv_part() for the actual implementation. */
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
}
17161acc3466SVladimir Sementsov-Ogievskiy 
/*
 * Handle a read request in coroutine context: validate it, pad it to
 * request_alignment if needed, and forward it as a tracked request to
 * bdrv_aligned_preadv().
 */
int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
    int64_t offset, unsigned int bytes,
    QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    BdrvRequestPadding pad;
    int ret;

    trace_bdrv_co_preadv(bs, offset, bytes, flags);

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning zero request is nonsense. Even if driver has special meaning
         * of zero-length (like qcow2_co_pwritev_compressed_part), we can't pass
         * it to driver due to request_alignment.
         *
         * Still, no reason to return an error if someone do unaligned
         * zero-length read occasionally.
         */
        return 0;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* May replace qiov/offset/bytes with a padded equivalent; pad owns the
     * bounce buffer and is freed via bdrv_padding_destroy() below. */
    bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
                              bs->bl.request_alignment,
                              qiov, qiov_offset, flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    bdrv_padding_destroy(&pad);

    return ret;
}
176661007b31SStefan Hajnoczi 
/*
 * Write zeroes to [offset, offset + bytes), fragmenting the range so that
 * the bulk of each driver call is aligned to pwrite_zeroes_alignment, and
 * falling back to a zeroed bounce-buffer write when the driver lacks (or
 * rejects with -ENOTSUP) an efficient zero operation. Emulates
 * BDRV_REQ_FUA with a single final flush when the driver cannot honour it.
 */
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    void *buf = NULL;          /* lazily allocated zeroed bounce buffer */
    int ret = 0;
    bool need_flush = false;   /* set when FUA must be emulated by flush */
    int head = 0;              /* unaligned bytes before the first boundary */
    int tail = 0;              /* unaligned bytes after the last boundary */

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* NO_FALLBACK demands driver support for all other requested flags */
    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
        return -ENOTSUP;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            if (buf == NULL) {
                buf = qemu_try_blockalign0(bs, num);
                if (buf == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }
            qemu_iovec_init_buf(&qiov, buf, num);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(buf);
                buf = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(buf);
    return ret;
}
187761007b31SStefan Hajnoczi 
/*
 * Common preparation for every tracked write-like request (write, discard,
 * truncate): check writability, handle request serialisation, verify child
 * permissions, and run the before-write notifiers for write/discard.
 *
 * Returns 0 on success or a negative errno (e.g. -EPERM for a read-only
 * node, or whatever a before-write notifier returns to fail the request).
 */
static inline int coroutine_fn
bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
                          BdrvTrackedRequest *req, int flags)
{
    BlockDriverState *bs = child->bs;
    bool waited;
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);

    if (bs->read_only) {
        return -EPERM;
    }

    assert(!(bs->open_flags & BDRV_O_INACTIVE));
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));

    if (flags & BDRV_REQ_SERIALISING) {
        waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
        /*
         * For a misaligned request we should have already waited earlier,
         * because we come after bdrv_padding_rmw_read which must be called
         * with the request already marked as serialising.
         */
        assert(!waited ||
               (req->offset == req->overlap_offset &&
                req->bytes == req->overlap_bytes));
    } else {
        bdrv_wait_serialising_requests(req);
    }

    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    /* Writing past EOF is only allowed for children with resize permission */
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    switch (req->type) {
    case BDRV_TRACKED_WRITE:
    case BDRV_TRACKED_DISCARD:
        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
        } else {
            assert(child->perm & BLK_PERM_WRITE);
        }
        return notifier_with_return_list_notify(&bs->before_write_notifiers,
                                                req);
    case BDRV_TRACKED_TRUNCATE:
        assert(child->perm & BLK_PERM_RESIZE);
        return 0;
    default:
        abort();
    }
}
192985fe2479SFam Zheng 
/*
 * Common bookkeeping after a tracked write-like request completed with
 * result @ret: bump the write generation, grow total_sectors (and notify
 * parents / truncate dirty bitmaps) if the image was extended, track the
 * highest written offset, and mark touched ranges dirty.
 */
static inline void coroutine_fn
bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
                         BdrvTrackedRequest *req, int ret)
{
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    BlockDriverState *bs = child->bs;

    atomic_inc(&bs->write_gen);

    /*
     * Discard cannot extend the image, but in error handling cases, such as
     * when reverting a qcow2 cluster allocation, the discarded range can pass
     * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD
     * here. Instead, just skip it, since semantically a discard request
     * beyond EOF cannot expand the image anyway.
     */
    if (ret == 0 &&
        (req->type == BDRV_TRACKED_TRUNCATE ||
         end_sector > bs->total_sectors) &&
        req->type != BDRV_TRACKED_DISCARD) {
        bs->total_sectors = end_sector;
        bdrv_parent_cb_resize(bs);
        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
    }
    if (req->bytes) {
        switch (req->type) {
        case BDRV_TRACKED_WRITE:
            stat64_max(&bs->wr_highest_offset, offset + bytes);
            /* fall through, to set dirty bits */
        case BDRV_TRACKED_DISCARD:
            bdrv_set_dirty(bs, offset, bytes);
            break;
        default:
            break;
        }
    }
}
196785fe2479SFam Zheng 
196861007b31SStefan Hajnoczi /*
196904ed95f4SEric Blake  * Forwards an already correctly aligned write request to the BlockDriver,
197004ed95f4SEric Blake  * after possibly fragmenting it.
197161007b31SStefan Hajnoczi  */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    int ret;

    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    /* Caller guarantees an already aligned request */
    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || qiov_offset + bytes <= qiov->size);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);

    /* Optionally detect all-zero payloads and turn them into a zero write,
     * possibly unmapping, depending on the detect_zeroes setting */
    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
                                             qiov, qiov_offset);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
    } else {
        /* Request exceeds max_transfer: fragment it into driver-sized
         * chunks */
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, qiov, bytes - bytes_remaining,
                                      local_flags);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret >= 0) {
        ret = 0;
    }
    /* Runs even on error: write generation and dirty tracking must be
     * updated regardless */
    bdrv_co_write_req_finish(child, offset, bytes, req, ret);

    return ret;
}
205261007b31SStefan Hajnoczi 
/*
 * Handle a zero write (BDRV_REQ_ZERO_WRITE) that may be misaligned:
 * unaligned head/tail parts are written via read-modify-write (padding
 * buffers zeroed in the middle by bdrv_padding_rmw_read), while the
 * aligned middle is passed down as a real zero write with qiov == NULL.
 */
static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    BlockDriverState *bs = child->bs;
    QEMUIOVector local_qiov;
    uint64_t align = bs->bl.request_alignment;
    int ret = 0;
    bool padding;
    BdrvRequestPadding pad;

    padding = bdrv_init_padding(bs, offset, bytes, &pad);
    if (padding) {
        /* RMW of the padding requires the request to be serialising */
        bdrv_mark_request_serialising(req, align);

        bdrv_padding_rmw_read(child, req, &pad, true);

        if (pad.head || pad.merge_reads) {
            int64_t aligned_offset = offset & ~(align - 1);
            int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;

            /* Write back the padding buffer (already zeroed in the middle),
             * without BDRV_REQ_ZERO_WRITE since it carries real data */
            qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
            ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
                                       align, &local_qiov, 0,
                                       flags & ~BDRV_REQ_ZERO_WRITE);
            if (ret < 0 || pad.merge_reads) {
                /* Error or all work is done */
                goto out;
            }
            offset += write_bytes - pad.head;
            bytes -= write_bytes - pad.head;
        }
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, 0, flags);
        if (ret < 0) {
            goto out;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == pad.tail + bytes);

        /* Write back the tail padding buffer, again as real data */
        qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, 0,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
    }

out:
    bdrv_padding_destroy(&pad);

    return ret;
}
21179eeb6dd1SFam Zheng 
/*
 * Handle a write request in coroutine context.
 *
 * Convenience wrapper: a write with no offset into @qiov; see
 * bdrv_co_pwritev_part() for the actual implementation.
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
}
21271acc3466SVladimir Sementsov-Ogievskiy 
/*
 * Write @bytes bytes from @qiov (starting at @qiov_offset) to @child at
 * @offset.  Handles alignment padding (read-modify-write), request
 * tracking, and zero-write dispatch.  Returns 0 on success or a
 * negative errno.
 */
int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    BdrvRequestPadding pad;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    /* No medium inserted (or driver detached): nothing to write to. */
    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    /* If the request is misaligned then we can't make it efficient */
    if ((flags & BDRV_REQ_NO_FALLBACK) &&
        !QEMU_IS_ALIGNED(offset | bytes, align))
    {
        return -ENOTSUP;
    }

    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
        /*
         * Aligning zero request is nonsense. Even if driver has special meaning
         * of zero-length (like qcow2_co_pwritev_compressed_part), we can't pass
         * it to driver due to request_alignment.
         *
         * Still, no reason to return an error if someone do unaligned
         * zero-length write occasionally.
         */
        return 0;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        /*
         * Zero writes take a separate path; note this skips the padding
         * setup/teardown below, which is only needed for data writes.
         */
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    /*
     * bdrv_pad_request() rewrites offset/bytes/qiov/qiov_offset in place
     * when padding is needed; a true return means an RMW read is required.
     */
    if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) {
        /* Serialise against overlapping requests during the RMW cycle. */
        bdrv_mark_request_serialising(&req, align);
        bdrv_padding_rmw_read(child, &req, &pad, false);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               qiov, qiov_offset, flags);

    /* Safe even if no padding was set up (pad is zeroed in that case). */
    bdrv_padding_destroy(&pad);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}
219761007b31SStefan Hajnoczi 
2198a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
2199f5a5ca79SManos Pitsidianakis                                        int bytes, BdrvRequestFlags flags)
220061007b31SStefan Hajnoczi {
2201f5a5ca79SManos Pitsidianakis     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
220261007b31SStefan Hajnoczi 
2203a03ef88fSKevin Wolf     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
220461007b31SStefan Hajnoczi         flags &= ~BDRV_REQ_MAY_UNMAP;
220561007b31SStefan Hajnoczi     }
220661007b31SStefan Hajnoczi 
2207f5a5ca79SManos Pitsidianakis     return bdrv_co_pwritev(child, offset, bytes, NULL,
220861007b31SStefan Hajnoczi                            BDRV_REQ_ZERO_WRITE | flags);
220961007b31SStefan Hajnoczi }
221061007b31SStefan Hajnoczi 
22114085f5c7SJohn Snow /*
22124085f5c7SJohn Snow  * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
22134085f5c7SJohn Snow  */
22144085f5c7SJohn Snow int bdrv_flush_all(void)
22154085f5c7SJohn Snow {
22164085f5c7SJohn Snow     BdrvNextIterator it;
22174085f5c7SJohn Snow     BlockDriverState *bs = NULL;
22184085f5c7SJohn Snow     int result = 0;
22194085f5c7SJohn Snow 
2220c8aa7895SPavel Dovgalyuk     /*
2221c8aa7895SPavel Dovgalyuk      * bdrv queue is managed by record/replay,
2222c8aa7895SPavel Dovgalyuk      * creating new flush request for stopping
2223c8aa7895SPavel Dovgalyuk      * the VM may break the determinism
2224c8aa7895SPavel Dovgalyuk      */
2225c8aa7895SPavel Dovgalyuk     if (replay_events_enabled()) {
2226c8aa7895SPavel Dovgalyuk         return result;
2227c8aa7895SPavel Dovgalyuk     }
2228c8aa7895SPavel Dovgalyuk 
22294085f5c7SJohn Snow     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
22304085f5c7SJohn Snow         AioContext *aio_context = bdrv_get_aio_context(bs);
22314085f5c7SJohn Snow         int ret;
22324085f5c7SJohn Snow 
22334085f5c7SJohn Snow         aio_context_acquire(aio_context);
22344085f5c7SJohn Snow         ret = bdrv_flush(bs);
22354085f5c7SJohn Snow         if (ret < 0 && !result) {
22364085f5c7SJohn Snow             result = ret;
22374085f5c7SJohn Snow         }
22384085f5c7SJohn Snow         aio_context_release(aio_context);
22394085f5c7SJohn Snow     }
22404085f5c7SJohn Snow 
22414085f5c7SJohn Snow     return result;
22424085f5c7SJohn Snow }
22434085f5c7SJohn Snow 
22444085f5c7SJohn Snow 
/* Argument bundle for the coroutine entry of the block-status query. */
typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;       /* node whose status is queried */
    BlockDriverState *base;     /* backing-chain walk stops at this node */
    bool want_zero;             /* caller wants accurate zero/mapping info */
    int64_t offset;             /* start of the queried byte range */
    int64_t bytes;              /* length of the queried byte range */
    int64_t *pnum;              /* out: #bytes sharing the same status */
    int64_t *map;               /* out: host offset (if OFFSET_VALID) */
    BlockDriverState **file;    /* out: node the mapping refers to */
} BdrvCoBlockStatusData;
225561007b31SStefan Hajnoczi 
22563e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
22573e4d0e72SEric Blake                                                 bool want_zero,
22583e4d0e72SEric Blake                                                 int64_t offset,
22593e4d0e72SEric Blake                                                 int64_t bytes,
22603e4d0e72SEric Blake                                                 int64_t *pnum,
22613e4d0e72SEric Blake                                                 int64_t *map,
2262f7cc69b3SManos Pitsidianakis                                                 BlockDriverState **file)
2263f7cc69b3SManos Pitsidianakis {
2264f7cc69b3SManos Pitsidianakis     assert(bs->file && bs->file->bs);
22653e4d0e72SEric Blake     *pnum = bytes;
22663e4d0e72SEric Blake     *map = offset;
2267f7cc69b3SManos Pitsidianakis     *file = bs->file->bs;
22683e4d0e72SEric Blake     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2269f7cc69b3SManos Pitsidianakis }
2270f7cc69b3SManos Pitsidianakis 
22713e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
22723e4d0e72SEric Blake                                                    bool want_zero,
22733e4d0e72SEric Blake                                                    int64_t offset,
22743e4d0e72SEric Blake                                                    int64_t bytes,
22753e4d0e72SEric Blake                                                    int64_t *pnum,
22763e4d0e72SEric Blake                                                    int64_t *map,
2277f7cc69b3SManos Pitsidianakis                                                    BlockDriverState **file)
2278f7cc69b3SManos Pitsidianakis {
2279f7cc69b3SManos Pitsidianakis     assert(bs->backing && bs->backing->bs);
22803e4d0e72SEric Blake     *pnum = bytes;
22813e4d0e72SEric Blake     *map = offset;
2282f7cc69b3SManos Pitsidianakis     *file = bs->backing->bs;
22833e4d0e72SEric Blake     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2284f7cc69b3SManos Pitsidianakis }
2285f7cc69b3SManos Pitsidianakis 
/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    /* Past EOF: report 0 bytes with the EOF marker. */
    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    /* Clamp the request to the remaining image size. */
    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_block_status) {
        /*
         * Driver cannot report status: treat the whole range as allocated
         * data.  Protocol drivers additionally map guest offsets 1:1.
         */
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
    if (ret < 0) {
        *pnum = 0;
        goto out;
    }

    /*
     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
     */
    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
           align > offset - aligned_offset);
    if (ret & BDRV_BLOCK_RECURSE) {
        /* RECURSE is only valid together with DATA+OFFSET_VALID, sans ZERO. */
        assert(ret & BDRV_BLOCK_DATA);
        assert(ret & BDRV_BLOCK_OFFSET_VALID);
        assert(!(ret & BDRV_BLOCK_ZERO));
    }

    /* Translate the aligned driver answer back to the caller's range. */
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        local_map += offset - aligned_offset;
    }

    if (ret & BDRV_BLOCK_RAW) {
        /* RAW means "ask the node we mapped to": recurse into local_file. */
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else if (want_zero && bs->drv->supports_backing) {
        /*
         * Unallocated in this layer: reads come from the backing file (zero
         * past its end), or read as zero when there is no backing file.
         */
        if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t size2 = bdrv_getlength(bs2);

            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        } else {
            ret |= BDRV_BLOCK_ZERO;
        }
    }

    if (want_zero && ret & BDRV_BLOCK_RECURSE &&
        local_file && local_file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int64_t file_pnum;
        int ret2;

        /* Refine the answer by consulting the node the data maps to. */
        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

out:
    bdrv_dec_in_flight(bs);
    if (ret >= 0 && offset + *pnum == total_size) {
        ret |= BDRV_BLOCK_EOF;
    }
early_out:
    if (file) {
        *file = local_file;
    }
    if (map) {
        *map = local_map;
    }
    return ret;
}
246561007b31SStefan Hajnoczi 
/*
 * Walk the backing chain from @bs down to (but excluding) @base, returning
 * the status of the first layer that has data or zeroes at @offset.  Each
 * deeper layer is only asked about the prefix the shallower layers reported
 * as unallocated.  See bdrv_co_block_status() for the meaning of the
 * parameters and return value.
 */
static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
                                                   BlockDriverState *base,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    BlockDriverState *p;
    int ret = 0;
    bool first = true;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                   file);
        if (ret < 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
            *pnum = bytes;
        }
        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
            /* This layer supplies the data (or zeroes): we are done. */
            break;
        }
        /* [offset, pnum] unallocated on this layer, which could be only
         * the first part of [offset, bytes].  */
        bytes = MIN(bytes, *pnum);
        first = false;
    }
    return ret;
}
2505ba3f0e25SFam Zheng 
250631826642SEric Blake /* Coroutine wrapper for bdrv_block_status_above() */
25077d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
250861007b31SStefan Hajnoczi {
25094bcd936eSEric Blake     BdrvCoBlockStatusData *data = opaque;
251061007b31SStefan Hajnoczi 
25117d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_block_status_above(data->bs, data->base,
2512c9ce8c4dSEric Blake                                       data->want_zero,
25135b648c67SEric Blake                                       data->offset, data->bytes,
25145b648c67SEric Blake                                       data->pnum, data->map, data->file);
251561007b31SStefan Hajnoczi }
251661007b31SStefan Hajnoczi 
251761007b31SStefan Hajnoczi /*
25185b648c67SEric Blake  * Synchronous wrapper around bdrv_co_block_status_above().
251961007b31SStefan Hajnoczi  *
25205b648c67SEric Blake  * See bdrv_co_block_status_above() for details.
252161007b31SStefan Hajnoczi  */
25227ddb99b9SEric Blake static int bdrv_common_block_status_above(BlockDriverState *bs,
2523ba3f0e25SFam Zheng                                           BlockDriverState *base,
25247ddb99b9SEric Blake                                           bool want_zero, int64_t offset,
25257ddb99b9SEric Blake                                           int64_t bytes, int64_t *pnum,
25267ddb99b9SEric Blake                                           int64_t *map,
252767a0fd2aSFam Zheng                                           BlockDriverState **file)
252861007b31SStefan Hajnoczi {
25294bcd936eSEric Blake     BdrvCoBlockStatusData data = {
253061007b31SStefan Hajnoczi         .bs = bs,
2531ba3f0e25SFam Zheng         .base = base,
2532c9ce8c4dSEric Blake         .want_zero = want_zero,
25337ddb99b9SEric Blake         .offset = offset,
25347ddb99b9SEric Blake         .bytes = bytes,
25357ddb99b9SEric Blake         .pnum = pnum,
25367ddb99b9SEric Blake         .map = map,
2537c9ce8c4dSEric Blake         .file = file,
253861007b31SStefan Hajnoczi     };
253961007b31SStefan Hajnoczi 
25407d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data);
254161007b31SStefan Hajnoczi }
254261007b31SStefan Hajnoczi 
254331826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
254431826642SEric Blake                             int64_t offset, int64_t bytes, int64_t *pnum,
254531826642SEric Blake                             int64_t *map, BlockDriverState **file)
2546c9ce8c4dSEric Blake {
254731826642SEric Blake     return bdrv_common_block_status_above(bs, base, true, offset, bytes,
254831826642SEric Blake                                           pnum, map, file);
2549c9ce8c4dSEric Blake }
2550c9ce8c4dSEric Blake 
2551237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2552237d78f8SEric Blake                       int64_t *pnum, int64_t *map, BlockDriverState **file)
2553ba3f0e25SFam Zheng {
255431826642SEric Blake     return bdrv_block_status_above(bs, backing_bs(bs),
255531826642SEric Blake                                    offset, bytes, pnum, map, file);
2556ba3f0e25SFam Zheng }
2557ba3f0e25SFam Zheng 
2558d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2559d6a644bbSEric Blake                                    int64_t bytes, int64_t *pnum)
256061007b31SStefan Hajnoczi {
25617ddb99b9SEric Blake     int ret;
25627ddb99b9SEric Blake     int64_t dummy;
2563d6a644bbSEric Blake 
25647ddb99b9SEric Blake     ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
25657ddb99b9SEric Blake                                          bytes, pnum ? pnum : &dummy, NULL,
2566298a1665SEric Blake                                          NULL);
256761007b31SStefan Hajnoczi     if (ret < 0) {
256861007b31SStefan Hajnoczi         return ret;
256961007b31SStefan Hajnoczi     }
257061007b31SStefan Hajnoczi     return !!(ret & BDRV_BLOCK_ALLOCATED);
257161007b31SStefan Hajnoczi }
257261007b31SStefan Hajnoczi 
/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return 1 if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (BASE is only included if include_base is set).
 * BASE can be NULL to check if the given offset is allocated in any
 * image of the chain.  Return 0 otherwise, or negative errno on
 * failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 *
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            bool include_base, int64_t offset,
                            int64_t bytes, int64_t *pnum)
{
    BlockDriverState *intermediate;
    int ret;
    int64_t n = bytes; /* running lower bound on the unallocated prefix */

    /* include_base requires an actual base node to include. */
    assert(base || !include_base);

    intermediate = top;
    while (include_base || intermediate != base) {
        int64_t pnum_inter;
        int64_t size_inter;

        assert(intermediate);
        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            /* Allocated in this layer: report its extent and stop. */
            *pnum = pnum_inter;
            return 1;
        }

        size_inter = bdrv_getlength(intermediate);
        if (size_inter < 0) {
            return size_inter;
        }
        /*
         * Narrow 'n' to this layer's unallocated extent, except when the
         * extent merely ends at this layer's EOF (deeper, larger layers
         * would still read through it; the top layer always counts).
         */
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
        }

        /* When include_base is set, stop after having queried base itself. */
        if (intermediate == base) {
            break;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}
263561007b31SStefan Hajnoczi 
/* Argument bundle for the vmstate read/write coroutine entry point. */
typedef struct BdrvVmstateCo {
    BlockDriverState    *bs;        /* node holding the vmstate data */
    QEMUIOVector        *qiov;      /* buffer(s) to read into / write from */
    int64_t             pos;        /* byte position within the vmstate */
    bool                is_read;    /* true: load vmstate, false: save it */
} BdrvVmstateCo;
26421a8ae822SKevin Wolf 
26431a8ae822SKevin Wolf static int coroutine_fn
26441a8ae822SKevin Wolf bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
26451a8ae822SKevin Wolf                    bool is_read)
26461a8ae822SKevin Wolf {
26471a8ae822SKevin Wolf     BlockDriver *drv = bs->drv;
2648dc88a467SStefan Hajnoczi     int ret = -ENOTSUP;
2649dc88a467SStefan Hajnoczi 
2650dc88a467SStefan Hajnoczi     bdrv_inc_in_flight(bs);
26511a8ae822SKevin Wolf 
26521a8ae822SKevin Wolf     if (!drv) {
2653dc88a467SStefan Hajnoczi         ret = -ENOMEDIUM;
26541a8ae822SKevin Wolf     } else if (drv->bdrv_load_vmstate) {
2655dc88a467SStefan Hajnoczi         if (is_read) {
2656dc88a467SStefan Hajnoczi             ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2657dc88a467SStefan Hajnoczi         } else {
2658dc88a467SStefan Hajnoczi             ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2659dc88a467SStefan Hajnoczi         }
26601a8ae822SKevin Wolf     } else if (bs->file) {
2661dc88a467SStefan Hajnoczi         ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
26621a8ae822SKevin Wolf     }
26631a8ae822SKevin Wolf 
2664dc88a467SStefan Hajnoczi     bdrv_dec_in_flight(bs);
2665dc88a467SStefan Hajnoczi     return ret;
26661a8ae822SKevin Wolf }
26671a8ae822SKevin Wolf 
26687d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
26691a8ae822SKevin Wolf {
26701a8ae822SKevin Wolf     BdrvVmstateCo *co = opaque;
26717d2410ceSVladimir Sementsov-Ogievskiy 
26727d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
26731a8ae822SKevin Wolf }
26741a8ae822SKevin Wolf 
26751a8ae822SKevin Wolf static inline int
26761a8ae822SKevin Wolf bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
26771a8ae822SKevin Wolf                 bool is_read)
26781a8ae822SKevin Wolf {
26791a8ae822SKevin Wolf     BdrvVmstateCo data = {
26801a8ae822SKevin Wolf         .bs         = bs,
26811a8ae822SKevin Wolf         .qiov       = qiov,
26821a8ae822SKevin Wolf         .pos        = pos,
26831a8ae822SKevin Wolf         .is_read    = is_read,
26841a8ae822SKevin Wolf     };
26851a8ae822SKevin Wolf 
26867d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data);
26871a8ae822SKevin Wolf }
26881a8ae822SKevin Wolf 
268961007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
269061007b31SStefan Hajnoczi                       int64_t pos, int size)
269161007b31SStefan Hajnoczi {
26920d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2693b433d942SKevin Wolf     int ret;
269461007b31SStefan Hajnoczi 
2695b433d942SKevin Wolf     ret = bdrv_writev_vmstate(bs, &qiov, pos);
2696b433d942SKevin Wolf     if (ret < 0) {
2697b433d942SKevin Wolf         return ret;
2698b433d942SKevin Wolf     }
2699b433d942SKevin Wolf 
2700b433d942SKevin Wolf     return size;
270161007b31SStefan Hajnoczi }
270261007b31SStefan Hajnoczi 
270361007b31SStefan Hajnoczi int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
270461007b31SStefan Hajnoczi {
27051a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, false);
270661007b31SStefan Hajnoczi }
270761007b31SStefan Hajnoczi 
270861007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
270961007b31SStefan Hajnoczi                       int64_t pos, int size)
271061007b31SStefan Hajnoczi {
27110d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2712b433d942SKevin Wolf     int ret;
27135ddda0b8SKevin Wolf 
2714b433d942SKevin Wolf     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2715b433d942SKevin Wolf     if (ret < 0) {
2716b433d942SKevin Wolf         return ret;
2717b433d942SKevin Wolf     }
2718b433d942SKevin Wolf 
2719b433d942SKevin Wolf     return size;
27205ddda0b8SKevin Wolf }
27215ddda0b8SKevin Wolf 
27225ddda0b8SKevin Wolf int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
27235ddda0b8SKevin Wolf {
27241a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, true);
272561007b31SStefan Hajnoczi }
272661007b31SStefan Hajnoczi 
272761007b31SStefan Hajnoczi /**************************************************************/
272861007b31SStefan Hajnoczi /* async I/Os */
272961007b31SStefan Hajnoczi 
/*
 * Synchronously cancel an in-flight request.
 *
 * Takes an extra reference on @acb so it stays valid while we poll,
 * requests asynchronous cancellation, then runs the request's
 * AioContext event loop until ours is the last reference, i.e. until
 * the request has completed (or been cancelled) and its completion
 * callback has run.
 */
void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    /* Wait until only the reference taken above remains. */
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            /* No AioContext to poll: we cannot make progress. */
            abort();
        }
    }
    qemu_aio_unref(acb);
}
275061007b31SStefan Hajnoczi 
275161007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements
275261007b31SStefan Hajnoczi  * cancel_async, otherwise we do nothing and let the request normally complete.
275361007b31SStefan Hajnoczi  * In either case the completion callback must be called. */
275461007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb)
275561007b31SStefan Hajnoczi {
275661007b31SStefan Hajnoczi     if (acb->aiocb_info->cancel_async) {
275761007b31SStefan Hajnoczi         acb->aiocb_info->cancel_async(acb);
275861007b31SStefan Hajnoczi     }
275961007b31SStefan Hajnoczi }
276061007b31SStefan Hajnoczi 
276161007b31SStefan Hajnoczi /**************************************************************/
276261007b31SStefan Hajnoczi /* Coroutine block device emulation */
276361007b31SStefan Hajnoczi 
27647d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_flush_co_entry(void *opaque)
276561007b31SStefan Hajnoczi {
27667d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_flush(opaque);
276761007b31SStefan Hajnoczi }
276861007b31SStefan Hajnoczi 
/*
 * Flush @bs to stable storage.
 *
 * Concurrent flushes are serialised through bs->active_flush_req and
 * bs->flush_queue (protected by bs->reqs_lock); the disk flush is
 * skipped when no write has happened since the last completed flush
 * (bs->flushed_gen already equals the current write generation).
 * Returns 0 on success or a negative errno.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    /* Nothing to flush for an empty, read-only or SCSI-generic node. */
    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    /* Snapshot the write generation this flush will cover. */
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    /* NOTE(review): qemu_co_queue_wait() above yields; it looks like
     * bs->drv could in principle become NULL while waiting — confirm
     * callers prevent ejection during an active flush. */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        /* AIO-only driver: submit and yield until the completion
         * callback wakes us with the result in co.ret. */
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}
288061007b31SStefan Hajnoczi 
288161007b31SStefan Hajnoczi int bdrv_flush(BlockDriverState *bs)
288261007b31SStefan Hajnoczi {
28837d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(bs, bdrv_flush_co_entry, bs);
288461007b31SStefan Hajnoczi }
288561007b31SStefan Hajnoczi 
/* Argument bundle passed from bdrv_pdiscard() to bdrv_pdiscard_co_entry(). */
typedef struct DiscardCo {
    BdrvChild *child;
    int64_t offset;   /* start of the region to discard, in bytes */
    int64_t bytes;    /* length of the region to discard, in bytes */
} DiscardCo;
28917d2410ceSVladimir Sementsov-Ogievskiy 
28927d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
289361007b31SStefan Hajnoczi {
289461007b31SStefan Hajnoczi     DiscardCo *rwco = opaque;
289561007b31SStefan Hajnoczi 
28967d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
289761007b31SStefan Hajnoczi }
289861007b31SStefan Hajnoczi 
/*
 * Discard (unmap) @bytes bytes starting at @offset on @child's node.
 *
 * The request is fragmented to the larger of the driver's pdiscard and
 * request alignments; unaligned head/tail pieces are still passed down
 * because discard is advisory and some devices coalesce them.  Returns
 * 0 on success — including when discard is disabled (!BDRV_O_UNMAP) or
 * the driver has no discard callback, since discard is only a hint —
 * or a negative errno.
 */
int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
                                  int64_t bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;
    BlockDriverState *bs = child->bs;

    if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    /* Discarding would lose data still referenced by read-only bitmaps. */
    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    /* Reject negative ranges and offset+bytes overflow. */
    if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) {
        return -EIO;
    }

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    /* Discards count as writes for serialisation/permission purposes. */
    ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int64_t num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        /* The driver may have been ejected while we yielded. */
        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            /* AIO-only driver: submit and yield until completion. */
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        /* -ENOTSUP is not an error for an advisory operation. */
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
301261007b31SStefan Hajnoczi 
3013d93e5726SVladimir Sementsov-Ogievskiy int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
301461007b31SStefan Hajnoczi {
301561007b31SStefan Hajnoczi     DiscardCo rwco = {
30160b9fd3f4SFam Zheng         .child = child,
30170c51a893SEric Blake         .offset = offset,
3018f5a5ca79SManos Pitsidianakis         .bytes = bytes,
301961007b31SStefan Hajnoczi     };
302061007b31SStefan Hajnoczi 
30217d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(child->bs, bdrv_pdiscard_co_entry, &rwco);
302261007b31SStefan Hajnoczi }
302361007b31SStefan Hajnoczi 
/*
 * Issue ioctl @req with argument @buf to the driver of @bs.
 *
 * Prefers the coroutine callback (bdrv_co_ioctl) and falls back to the
 * AIO callback (bdrv_aio_ioctl).  Returns the driver's result, or
 * -ENOTSUP if neither callback exists or AIO submission fails.
 */
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        /* bdrv_co_io_em_complete() reenters us and fills in co.ret. */
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}
30525c5ae76aSFam Zheng 
305361007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size)
305461007b31SStefan Hajnoczi {
305561007b31SStefan Hajnoczi     return qemu_memalign(bdrv_opt_mem_align(bs), size);
305661007b31SStefan Hajnoczi }
305761007b31SStefan Hajnoczi 
305861007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size)
305961007b31SStefan Hajnoczi {
306061007b31SStefan Hajnoczi     return memset(qemu_blockalign(bs, size), 0, size);
306161007b31SStefan Hajnoczi }
306261007b31SStefan Hajnoczi 
306361007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
306461007b31SStefan Hajnoczi {
306561007b31SStefan Hajnoczi     size_t align = bdrv_opt_mem_align(bs);
306661007b31SStefan Hajnoczi 
306761007b31SStefan Hajnoczi     /* Ensure that NULL is never returned on success */
306861007b31SStefan Hajnoczi     assert(align > 0);
306961007b31SStefan Hajnoczi     if (size == 0) {
307061007b31SStefan Hajnoczi         size = align;
307161007b31SStefan Hajnoczi     }
307261007b31SStefan Hajnoczi 
307361007b31SStefan Hajnoczi     return qemu_try_memalign(align, size);
307461007b31SStefan Hajnoczi }
307561007b31SStefan Hajnoczi 
307661007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
307761007b31SStefan Hajnoczi {
307861007b31SStefan Hajnoczi     void *mem = qemu_try_blockalign(bs, size);
307961007b31SStefan Hajnoczi 
308061007b31SStefan Hajnoczi     if (mem) {
308161007b31SStefan Hajnoczi         memset(mem, 0, size);
308261007b31SStefan Hajnoczi     }
308361007b31SStefan Hajnoczi 
308461007b31SStefan Hajnoczi     return mem;
308561007b31SStefan Hajnoczi }
308661007b31SStefan Hajnoczi 
308761007b31SStefan Hajnoczi /*
308861007b31SStefan Hajnoczi  * Check if all memory in this vector is sector aligned.
308961007b31SStefan Hajnoczi  */
309061007b31SStefan Hajnoczi bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
309161007b31SStefan Hajnoczi {
309261007b31SStefan Hajnoczi     int i;
30934196d2f0SDenis V. Lunev     size_t alignment = bdrv_min_mem_align(bs);
309461007b31SStefan Hajnoczi 
309561007b31SStefan Hajnoczi     for (i = 0; i < qiov->niov; i++) {
309661007b31SStefan Hajnoczi         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
309761007b31SStefan Hajnoczi             return false;
309861007b31SStefan Hajnoczi         }
309961007b31SStefan Hajnoczi         if (qiov->iov[i].iov_len % alignment) {
310061007b31SStefan Hajnoczi             return false;
310161007b31SStefan Hajnoczi         }
310261007b31SStefan Hajnoczi     }
310361007b31SStefan Hajnoczi 
310461007b31SStefan Hajnoczi     return true;
310561007b31SStefan Hajnoczi }
310661007b31SStefan Hajnoczi 
310761007b31SStefan Hajnoczi void bdrv_add_before_write_notifier(BlockDriverState *bs,
310861007b31SStefan Hajnoczi                                     NotifierWithReturn *notifier)
310961007b31SStefan Hajnoczi {
311061007b31SStefan Hajnoczi     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
311161007b31SStefan Hajnoczi }
311261007b31SStefan Hajnoczi 
311361007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs)
311461007b31SStefan Hajnoczi {
31156b98bd64SPaolo Bonzini     BdrvChild *child;
31166b98bd64SPaolo Bonzini 
31176b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
31186b98bd64SPaolo Bonzini         bdrv_io_plug(child->bs);
31196b98bd64SPaolo Bonzini     }
31206b98bd64SPaolo Bonzini 
3121850d54a2SPaolo Bonzini     if (atomic_fetch_inc(&bs->io_plugged) == 0) {
312261007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
312361007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_plug) {
312461007b31SStefan Hajnoczi             drv->bdrv_io_plug(bs);
31256b98bd64SPaolo Bonzini         }
312661007b31SStefan Hajnoczi     }
312761007b31SStefan Hajnoczi }
312861007b31SStefan Hajnoczi 
312961007b31SStefan Hajnoczi void bdrv_io_unplug(BlockDriverState *bs)
313061007b31SStefan Hajnoczi {
31316b98bd64SPaolo Bonzini     BdrvChild *child;
31326b98bd64SPaolo Bonzini 
31336b98bd64SPaolo Bonzini     assert(bs->io_plugged);
3134850d54a2SPaolo Bonzini     if (atomic_fetch_dec(&bs->io_plugged) == 1) {
313561007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
313661007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_unplug) {
313761007b31SStefan Hajnoczi             drv->bdrv_io_unplug(bs);
313861007b31SStefan Hajnoczi         }
313961007b31SStefan Hajnoczi     }
314061007b31SStefan Hajnoczi 
31416b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
31426b98bd64SPaolo Bonzini         bdrv_io_unplug(child->bs);
31436b98bd64SPaolo Bonzini     }
31446b98bd64SPaolo Bonzini }
314523d0ba93SFam Zheng 
314623d0ba93SFam Zheng void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
314723d0ba93SFam Zheng {
314823d0ba93SFam Zheng     BdrvChild *child;
314923d0ba93SFam Zheng 
315023d0ba93SFam Zheng     if (bs->drv && bs->drv->bdrv_register_buf) {
315123d0ba93SFam Zheng         bs->drv->bdrv_register_buf(bs, host, size);
315223d0ba93SFam Zheng     }
315323d0ba93SFam Zheng     QLIST_FOREACH(child, &bs->children, next) {
315423d0ba93SFam Zheng         bdrv_register_buf(child->bs, host, size);
315523d0ba93SFam Zheng     }
315623d0ba93SFam Zheng }
315723d0ba93SFam Zheng 
315823d0ba93SFam Zheng void bdrv_unregister_buf(BlockDriverState *bs, void *host)
315923d0ba93SFam Zheng {
316023d0ba93SFam Zheng     BdrvChild *child;
316123d0ba93SFam Zheng 
316223d0ba93SFam Zheng     if (bs->drv && bs->drv->bdrv_unregister_buf) {
316323d0ba93SFam Zheng         bs->drv->bdrv_unregister_buf(bs, host);
316423d0ba93SFam Zheng     }
316523d0ba93SFam Zheng     QLIST_FOREACH(child, &bs->children, next) {
316623d0ba93SFam Zheng         bdrv_unregister_buf(child->bs, host);
316723d0ba93SFam Zheng     }
316823d0ba93SFam Zheng }
3169fcc67678SFam Zheng 
/*
 * Shared implementation of bdrv_co_copy_range_from/to().
 *
 * Offloads a copy of @bytes bytes from @src at @src_offset to @dst at
 * @dst_offset.  @recurse_src selects which side drives the copy: the
 * source driver's bdrv_co_copy_range_from (request tracked as a read on
 * @src) or the destination driver's bdrv_co_copy_range_to (tracked as a
 * write on @dst).  Returns 0 on success or a negative errno; -ENOTSUP
 * when either driver lacks copy offload or an endpoint is encrypted.
 */
static int coroutine_fn bdrv_co_copy_range_internal(
        BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
        uint64_t dst_offset, uint64_t bytes,
        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
        bool recurse_src)
{
    BdrvTrackedRequest req;
    int ret;

    /* TODO We can support BDRV_REQ_NO_FALLBACK here */
    assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
    assert(!(write_flags & BDRV_REQ_NO_FALLBACK));

    if (!dst || !dst->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
    if (ret) {
        return ret;
    }
    /* A zero write needs no data from the source at all. */
    if (write_flags & BDRV_REQ_ZERO_WRITE) {
        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
    }

    if (!src || !src->bs) {
        return -ENOMEDIUM;
    }
    ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
    if (ret) {
        return ret;
    }

    if (!src->bs->drv->bdrv_co_copy_range_from
        || !dst->bs->drv->bdrv_co_copy_range_to
        || src->bs->encrypted || dst->bs->encrypted) {
        return -ENOTSUP;
    }

    if (recurse_src) {
        /* Source-driven: track a read request on the source node. */
        bdrv_inc_in_flight(src->bs);
        tracked_request_begin(&req, src->bs, src_offset, bytes,
                              BDRV_TRACKED_READ);

        /* BDRV_REQ_SERIALISING is only for write operation */
        assert(!(read_flags & BDRV_REQ_SERIALISING));
        bdrv_wait_serialising_requests(&req);

        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
                                                    src, src_offset,
                                                    dst, dst_offset,
                                                    bytes,
                                                    read_flags, write_flags);

        tracked_request_end(&req);
        bdrv_dec_in_flight(src->bs);
    } else {
        /* Destination-driven: track a write request on the destination,
         * with the full prepare/finish bracket around the driver call. */
        bdrv_inc_in_flight(dst->bs);
        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
                              BDRV_TRACKED_WRITE);
        ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
                                        write_flags);
        if (!ret) {
            ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
                                                      src, src_offset,
                                                      dst, dst_offset,
                                                      bytes,
                                                      read_flags, write_flags);
        }
        bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
        tracked_request_end(&req);
        bdrv_dec_in_flight(dst->bs);
    }

    return ret;
}
3245fcc67678SFam Zheng 
3246fcc67678SFam Zheng /* Copy range from @src to @dst.
3247fcc67678SFam Zheng  *
3248fcc67678SFam Zheng  * See the comment of bdrv_co_copy_range for the parameter and return value
3249fcc67678SFam Zheng  * semantics. */
3250fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3251fcc67678SFam Zheng                                          BdrvChild *dst, uint64_t dst_offset,
325267b51fb9SVladimir Sementsov-Ogievskiy                                          uint64_t bytes,
325367b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags read_flags,
325467b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags write_flags)
3255fcc67678SFam Zheng {
3256ecc983a5SFam Zheng     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3257ecc983a5SFam Zheng                                   read_flags, write_flags);
3258fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
325967b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, true);
3260fcc67678SFam Zheng }
3261fcc67678SFam Zheng 
3262fcc67678SFam Zheng /* Copy range from @src to @dst.
3263fcc67678SFam Zheng  *
3264fcc67678SFam Zheng  * See the comment of bdrv_co_copy_range for the parameter and return value
3265fcc67678SFam Zheng  * semantics. */
3266fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3267fcc67678SFam Zheng                                        BdrvChild *dst, uint64_t dst_offset,
326867b51fb9SVladimir Sementsov-Ogievskiy                                        uint64_t bytes,
326967b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags read_flags,
327067b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags write_flags)
3271fcc67678SFam Zheng {
3272ecc983a5SFam Zheng     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3273ecc983a5SFam Zheng                                 read_flags, write_flags);
3274fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
327567b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, false);
3276fcc67678SFam Zheng }
3277fcc67678SFam Zheng 
3278fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3279fcc67678SFam Zheng                                     BdrvChild *dst, uint64_t dst_offset,
328067b51fb9SVladimir Sementsov-Ogievskiy                                     uint64_t bytes, BdrvRequestFlags read_flags,
328167b51fb9SVladimir Sementsov-Ogievskiy                                     BdrvRequestFlags write_flags)
3282fcc67678SFam Zheng {
328337aec7d7SFam Zheng     return bdrv_co_copy_range_from(src, src_offset,
3284fcc67678SFam Zheng                                    dst, dst_offset,
328567b51fb9SVladimir Sementsov-Ogievskiy                                    bytes, read_flags, write_flags);
3286fcc67678SFam Zheng }
32873d9f2d2aSKevin Wolf 
32883d9f2d2aSKevin Wolf static void bdrv_parent_cb_resize(BlockDriverState *bs)
32893d9f2d2aSKevin Wolf {
32903d9f2d2aSKevin Wolf     BdrvChild *c;
32913d9f2d2aSKevin Wolf     QLIST_FOREACH(c, &bs->parents, next_parent) {
3292bd86fb99SMax Reitz         if (c->klass->resize) {
3293bd86fb99SMax Reitz             c->klass->resize(c);
32943d9f2d2aSKevin Wolf         }
32953d9f2d2aSKevin Wolf     }
32963d9f2d2aSKevin Wolf }
32973d9f2d2aSKevin Wolf 
/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 *
 * If 'exact' is true, the file must be resized to exactly the given
 * 'offset'.  Otherwise, it is sufficient for the node to be at least
 * 'offset' bytes in length.
 *
 * The request is tracked as a BDRV_TRACKED_TRUNCATE request covering the
 * newly added area (empty when shrinking).  Returns 0 on success or a
 * negative errno; on failure *errp is set.
 */
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
                                  PreallocMode prealloc, BdrvRequestFlags flags,
                                  Error **errp)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int64_t old_size, new_bytes;
    int ret;


    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
    if (!drv) {
        error_setg(errp, "No medium inserted");
        return -ENOMEDIUM;
    }
    if (offset < 0) {
        error_setg(errp, "Image size cannot be negative");
        return -EINVAL;
    }

    /* bdrv_getlength() returns the size in bytes or a negative errno */
    old_size = bdrv_getlength(bs);
    if (old_size < 0) {
        error_setg_errno(errp, -old_size, "Failed to get old image size");
        return old_size;
    }

    /* new_bytes is the size of the area appended by growing; 0 on shrink */
    if (offset > old_size) {
        new_bytes = offset - old_size;
    } else {
        new_bytes = 0;
    }

    /* Track the request over [offset - new_bytes, offset), i.e. the grown
     * area, so it can interact with concurrent requests below. */
    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
                          BDRV_TRACKED_TRUNCATE);

    /* If we are growing the image and potentially using preallocation for the
     * new area, we need to make sure that no write requests are made to it
     * concurrently or they might be overwritten by preallocation. */
    if (new_bytes) {
        bdrv_mark_request_serialising(&req, 1);
    }
    if (bs->read_only) {
        error_setg(errp, "Image is read-only");
        ret = -EACCES;
        goto out;
    }
    /* Standard pre-write bookkeeping (paired with bdrv_co_write_req_finish
     * below); must happen before the driver is invoked. */
    ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
                                    0);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Failed to prepare request for truncation");
        goto out;
    }

    /*
     * If the image has a backing file that is large enough that it would
     * provide data for the new area, we cannot leave it unallocated because
     * then the backing file content would become visible. Instead, zero-fill
     * the new area.
     *
     * Note that if the image has a backing file, but was opened without the
     * backing file, taking care of keeping things consistent with that backing
     * file is the user's responsibility.
     */
    if (new_bytes && bs->backing) {
        int64_t backing_len;

        backing_len = bdrv_getlength(backing_bs(bs));
        if (backing_len < 0) {
            ret = backing_len;
            error_setg_errno(errp, -ret, "Could not get backing file size");
            goto out;
        }

        if (backing_len > old_size) {
            flags |= BDRV_REQ_ZERO_WRITE;
        }
    }

    /* Dispatch: use the driver's own truncate if present; otherwise, for
     * filter drivers, recurse into the underlying file node. */
    if (drv->bdrv_co_truncate) {
        if (flags & ~bs->supported_truncate_flags) {
            error_setg(errp, "Block driver does not support requested flags");
            ret = -ENOTSUP;
            goto out;
        }
        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
    } else if (bs->file && drv->is_filter) {
        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
    } else {
        error_setg(errp, "Image format driver does not support resize");
        ret = -ENOTSUP;
        goto out;
    }
    if (ret < 0) {
        goto out;
    }

    /* Refresh bs->total_sectors from the driver; on success, re-derive the
     * byte size from it (the driver may have rounded the new size). */
    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not refresh total sector count");
    } else {
        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
    }
    /* It's possible that truncation succeeded but refresh_total_sectors
     * failed, but the latter doesn't affect how we should finish the request.
     * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
    bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);

out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    return ret;
}
34213d9f2d2aSKevin Wolf 
/* Argument bundle passed from bdrv_truncate() to its coroutine entry point
 * bdrv_truncate_co_entry(); fields mirror bdrv_co_truncate()'s parameters. */
typedef struct TruncateCo {
    BdrvChild *child;        /* node to resize */
    int64_t offset;          /* requested new size in bytes */
    bool exact;              /* true: resize to exactly @offset */
    PreallocMode prealloc;   /* preallocation mode for grown area */
    BdrvRequestFlags flags;  /* request flags forwarded to the driver */
    Error **errp;            /* error destination */
} TruncateCo;
34303d9f2d2aSKevin Wolf 
34317d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_truncate_co_entry(void *opaque)
34323d9f2d2aSKevin Wolf {
34333d9f2d2aSKevin Wolf     TruncateCo *tco = opaque;
34347d2410ceSVladimir Sementsov-Ogievskiy 
34357d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_truncate(tco->child, tco->offset, tco->exact,
34367b8e4857SKevin Wolf                             tco->prealloc, tco->flags, tco->errp);
34373d9f2d2aSKevin Wolf }
34383d9f2d2aSKevin Wolf 
3439c80d8b06SMax Reitz int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
34407b8e4857SKevin Wolf                   PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
34413d9f2d2aSKevin Wolf {
34423d9f2d2aSKevin Wolf     TruncateCo tco = {
34433d9f2d2aSKevin Wolf         .child      = child,
34443d9f2d2aSKevin Wolf         .offset     = offset,
3445c80d8b06SMax Reitz         .exact      = exact,
34463d9f2d2aSKevin Wolf         .prealloc   = prealloc,
34477b8e4857SKevin Wolf         .flags      = flags,
34483d9f2d2aSKevin Wolf         .errp       = errp,
34493d9f2d2aSKevin Wolf     };
34503d9f2d2aSKevin Wolf 
34517d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco);
34523d9f2d2aSKevin Wolf }
3453