xref: /qemu/block/io.c (revision 23b93525a2f30915f6c8418eb23db6912a3b5811)
161007b31SStefan Hajnoczi /*
261007b31SStefan Hajnoczi  * Block layer I/O functions
361007b31SStefan Hajnoczi  *
461007b31SStefan Hajnoczi  * Copyright (c) 2003 Fabrice Bellard
561007b31SStefan Hajnoczi  *
661007b31SStefan Hajnoczi  * Permission is hereby granted, free of charge, to any person obtaining a copy
761007b31SStefan Hajnoczi  * of this software and associated documentation files (the "Software"), to deal
861007b31SStefan Hajnoczi  * in the Software without restriction, including without limitation the rights
961007b31SStefan Hajnoczi  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1061007b31SStefan Hajnoczi  * copies of the Software, and to permit persons to whom the Software is
1161007b31SStefan Hajnoczi  * furnished to do so, subject to the following conditions:
1261007b31SStefan Hajnoczi  *
1361007b31SStefan Hajnoczi  * The above copyright notice and this permission notice shall be included in
1461007b31SStefan Hajnoczi  * all copies or substantial portions of the Software.
1561007b31SStefan Hajnoczi  *
1661007b31SStefan Hajnoczi  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1761007b31SStefan Hajnoczi  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1861007b31SStefan Hajnoczi  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
1961007b31SStefan Hajnoczi  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2061007b31SStefan Hajnoczi  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2161007b31SStefan Hajnoczi  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2261007b31SStefan Hajnoczi  * THE SOFTWARE.
2361007b31SStefan Hajnoczi  */
2461007b31SStefan Hajnoczi 
2580c71a24SPeter Maydell #include "qemu/osdep.h"
2661007b31SStefan Hajnoczi #include "trace.h"
277f0e9da6SMax Reitz #include "sysemu/block-backend.h"
287719f3c9SStefan Hajnoczi #include "block/aio-wait.h"
2961007b31SStefan Hajnoczi #include "block/blockjob.h"
30f321dcb5SPaolo Bonzini #include "block/blockjob_int.h"
3161007b31SStefan Hajnoczi #include "block/block_int.h"
32f348b6d1SVeronia Bahaa #include "qemu/cutils.h"
33da34e65cSMarkus Armbruster #include "qapi/error.h"
34d49b6836SMarkus Armbruster #include "qemu/error-report.h"
35db725815SMarkus Armbruster #include "qemu/main-loop.h"
36c8aa7895SPavel Dovgalyuk #include "sysemu/replay.h"
3761007b31SStefan Hajnoczi 
38cb2e2878SEric Blake /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes (32768 sectors, i.e. 16 MiB) */
39cb2e2878SEric Blake #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
40cb2e2878SEric Blake 
417f8f03efSFam Zheng static void bdrv_parent_cb_resize(BlockDriverState *bs);
42d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
43f5a5ca79SManos Pitsidianakis     int64_t offset, int bytes, BdrvRequestFlags flags);
4461007b31SStefan Hajnoczi 
45f4c8a43bSMax Reitz static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
466cd5c9d7SKevin Wolf                                       bool ignore_bds_parents)
4761007b31SStefan Hajnoczi {
4802d21300SKevin Wolf     BdrvChild *c, *next;
4927ccdd52SKevin Wolf 
5002d21300SKevin Wolf     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
51bd86fb99SMax Reitz         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
520152bf40SKevin Wolf             continue;
530152bf40SKevin Wolf         }
544be6a6d1SKevin Wolf         bdrv_parent_drained_begin_single(c, false);
55ce0f1412SPaolo Bonzini     }
56ce0f1412SPaolo Bonzini }
57ce0f1412SPaolo Bonzini 
58e037c09cSMax Reitz static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
59e037c09cSMax Reitz                                                    int *drained_end_counter)
60804db8eaSMax Reitz {
61804db8eaSMax Reitz     assert(c->parent_quiesce_counter > 0);
62804db8eaSMax Reitz     c->parent_quiesce_counter--;
63bd86fb99SMax Reitz     if (c->klass->drained_end) {
64bd86fb99SMax Reitz         c->klass->drained_end(c, drained_end_counter);
65804db8eaSMax Reitz     }
66804db8eaSMax Reitz }
67804db8eaSMax Reitz 
68e037c09cSMax Reitz void bdrv_parent_drained_end_single(BdrvChild *c)
69e037c09cSMax Reitz {
70e037c09cSMax Reitz     int drained_end_counter = 0;
71e037c09cSMax Reitz     bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
72e037c09cSMax Reitz     BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
73e037c09cSMax Reitz }
74e037c09cSMax Reitz 
75f4c8a43bSMax Reitz static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
76e037c09cSMax Reitz                                     bool ignore_bds_parents,
77e037c09cSMax Reitz                                     int *drained_end_counter)
78ce0f1412SPaolo Bonzini {
7961ad631cSMax Reitz     BdrvChild *c;
8027ccdd52SKevin Wolf 
8161ad631cSMax Reitz     QLIST_FOREACH(c, &bs->parents, next_parent) {
82bd86fb99SMax Reitz         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
830152bf40SKevin Wolf             continue;
840152bf40SKevin Wolf         }
85e037c09cSMax Reitz         bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
86c2066af0SKevin Wolf     }
8761007b31SStefan Hajnoczi }
8861007b31SStefan Hajnoczi 
894be6a6d1SKevin Wolf static bool bdrv_parent_drained_poll_single(BdrvChild *c)
904be6a6d1SKevin Wolf {
91bd86fb99SMax Reitz     if (c->klass->drained_poll) {
92bd86fb99SMax Reitz         return c->klass->drained_poll(c);
934be6a6d1SKevin Wolf     }
944be6a6d1SKevin Wolf     return false;
954be6a6d1SKevin Wolf }
964be6a6d1SKevin Wolf 
976cd5c9d7SKevin Wolf static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
986cd5c9d7SKevin Wolf                                      bool ignore_bds_parents)
9989bd0305SKevin Wolf {
10089bd0305SKevin Wolf     BdrvChild *c, *next;
10189bd0305SKevin Wolf     bool busy = false;
10289bd0305SKevin Wolf 
10389bd0305SKevin Wolf     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
104bd86fb99SMax Reitz         if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
10589bd0305SKevin Wolf             continue;
10689bd0305SKevin Wolf         }
1074be6a6d1SKevin Wolf         busy |= bdrv_parent_drained_poll_single(c);
10889bd0305SKevin Wolf     }
10989bd0305SKevin Wolf 
11089bd0305SKevin Wolf     return busy;
11189bd0305SKevin Wolf }
11289bd0305SKevin Wolf 
1134be6a6d1SKevin Wolf void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
1144be6a6d1SKevin Wolf {
115804db8eaSMax Reitz     c->parent_quiesce_counter++;
116bd86fb99SMax Reitz     if (c->klass->drained_begin) {
117bd86fb99SMax Reitz         c->klass->drained_begin(c);
1184be6a6d1SKevin Wolf     }
1194be6a6d1SKevin Wolf     if (poll) {
1204be6a6d1SKevin Wolf         BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
1214be6a6d1SKevin Wolf     }
1224be6a6d1SKevin Wolf }
1234be6a6d1SKevin Wolf 
124d9e0dfa2SEric Blake static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
125d9e0dfa2SEric Blake {
126d9e0dfa2SEric Blake     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
127d9e0dfa2SEric Blake     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
128d9e0dfa2SEric Blake     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
129d9e0dfa2SEric Blake                                  src->opt_mem_alignment);
130d9e0dfa2SEric Blake     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
131d9e0dfa2SEric Blake                                  src->min_mem_alignment);
132d9e0dfa2SEric Blake     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
133d9e0dfa2SEric Blake }
134d9e0dfa2SEric Blake 
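/*
 * Editor's note (illustrative, not part of the original file): MIN_NON_ZERO()
 * treats 0 as "no limit", so merging an unset dst->max_transfer of 0 with a
 * src->max_transfer of 65536 yields 65536, while merging 131072 with 65536
 * keeps the stricter 65536.
 */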
13561007b31SStefan Hajnoczi void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
13661007b31SStefan Hajnoczi {
13761007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
13861007b31SStefan Hajnoczi     Error *local_err = NULL;
13961007b31SStefan Hajnoczi 
14061007b31SStefan Hajnoczi     memset(&bs->bl, 0, sizeof(bs->bl));
14161007b31SStefan Hajnoczi 
14261007b31SStefan Hajnoczi     if (!drv) {
14361007b31SStefan Hajnoczi         return;
14461007b31SStefan Hajnoczi     }
14561007b31SStefan Hajnoczi 
14679ba8c98SEric Blake     /* Default alignment based on whether driver has byte interface */
147e31f6864SEric Blake     bs->bl.request_alignment = (drv->bdrv_co_preadv ||
148ac850bf0SVladimir Sementsov-Ogievskiy                                 drv->bdrv_aio_preadv ||
149ac850bf0SVladimir Sementsov-Ogievskiy                                 drv->bdrv_co_preadv_part) ? 1 : 512;
15079ba8c98SEric Blake 
15161007b31SStefan Hajnoczi     /* Take some limits from the children as a default */
15261007b31SStefan Hajnoczi     if (bs->file) {
1539a4f4c31SKevin Wolf         bdrv_refresh_limits(bs->file->bs, &local_err);
15461007b31SStefan Hajnoczi         if (local_err) {
15561007b31SStefan Hajnoczi             error_propagate(errp, local_err);
15661007b31SStefan Hajnoczi             return;
15761007b31SStefan Hajnoczi         }
158d9e0dfa2SEric Blake         bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
15961007b31SStefan Hajnoczi     } else {
1604196d2f0SDenis V. Lunev         bs->bl.min_mem_alignment = 512;
161038adc2fSWei Yang         bs->bl.opt_mem_alignment = qemu_real_host_page_size;
162bd44feb7SStefan Hajnoczi 
163bd44feb7SStefan Hajnoczi         /* Safe default since most protocols use readv()/writev()/etc */
164bd44feb7SStefan Hajnoczi         bs->bl.max_iov = IOV_MAX;
16561007b31SStefan Hajnoczi     }
16661007b31SStefan Hajnoczi 
167760e0063SKevin Wolf     if (bs->backing) {
168760e0063SKevin Wolf         bdrv_refresh_limits(bs->backing->bs, &local_err);
16961007b31SStefan Hajnoczi         if (local_err) {
17061007b31SStefan Hajnoczi             error_propagate(errp, local_err);
17161007b31SStefan Hajnoczi             return;
17261007b31SStefan Hajnoczi         }
173d9e0dfa2SEric Blake         bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
17461007b31SStefan Hajnoczi     }
17561007b31SStefan Hajnoczi 
17661007b31SStefan Hajnoczi     /* Then let the driver override it */
17761007b31SStefan Hajnoczi     if (drv->bdrv_refresh_limits) {
17861007b31SStefan Hajnoczi         drv->bdrv_refresh_limits(bs, errp);
17961007b31SStefan Hajnoczi     }
18061007b31SStefan Hajnoczi }
18161007b31SStefan Hajnoczi 
18261007b31SStefan Hajnoczi /**
18361007b31SStefan Hajnoczi  * The copy-on-read flag is actually a reference count, so multiple users may
18461007b31SStefan Hajnoczi  * use the feature without worrying about clobbering its previous state.
18561007b31SStefan Hajnoczi  * Copy-on-read stays enabled until every user has disabled it again.
18661007b31SStefan Hajnoczi  */
18761007b31SStefan Hajnoczi void bdrv_enable_copy_on_read(BlockDriverState *bs)
18861007b31SStefan Hajnoczi {
189d3faa13eSPaolo Bonzini     atomic_inc(&bs->copy_on_read);
19061007b31SStefan Hajnoczi }
19161007b31SStefan Hajnoczi 
19261007b31SStefan Hajnoczi void bdrv_disable_copy_on_read(BlockDriverState *bs)
19361007b31SStefan Hajnoczi {
194d3faa13eSPaolo Bonzini     int old = atomic_fetch_dec(&bs->copy_on_read);
195d3faa13eSPaolo Bonzini     assert(old >= 1);
19661007b31SStefan Hajnoczi }
19761007b31SStefan Hajnoczi 
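/*
 * Editor's sketch (hypothetical caller, not part of io.c): because the flag
 * is a reference count, independent users can nest freely. Assuming the
 * counter starts at zero:
 *
 *     bdrv_enable_copy_on_read(bs);    // user A, counter 0 -> 1
 *     bdrv_enable_copy_on_read(bs);    // user B, counter 1 -> 2
 *     bdrv_disable_copy_on_read(bs);   // user A done, still enabled
 *     bdrv_disable_copy_on_read(bs);   // user B done, now disabled
 */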
19861124f03SPaolo Bonzini typedef struct {
19961124f03SPaolo Bonzini     Coroutine *co;
20061124f03SPaolo Bonzini     BlockDriverState *bs;
20161124f03SPaolo Bonzini     bool done;
202481cad48SManos Pitsidianakis     bool begin;
203b0165585SKevin Wolf     bool recursive;
204fe4f0614SKevin Wolf     bool poll;
2050152bf40SKevin Wolf     BdrvChild *parent;
2066cd5c9d7SKevin Wolf     bool ignore_bds_parents;
2078e1da77eSMax Reitz     int *drained_end_counter;
20861124f03SPaolo Bonzini } BdrvCoDrainData;
20961124f03SPaolo Bonzini 
21061124f03SPaolo Bonzini static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
21161124f03SPaolo Bonzini {
21261124f03SPaolo Bonzini     BdrvCoDrainData *data = opaque;
21361124f03SPaolo Bonzini     BlockDriverState *bs = data->bs;
21461124f03SPaolo Bonzini 
215481cad48SManos Pitsidianakis     if (data->begin) {
216f8ea8dacSManos Pitsidianakis         bs->drv->bdrv_co_drain_begin(bs);
217481cad48SManos Pitsidianakis     } else {
218481cad48SManos Pitsidianakis         bs->drv->bdrv_co_drain_end(bs);
219481cad48SManos Pitsidianakis     }
22061124f03SPaolo Bonzini 
22165181d63SMax Reitz     /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
22261124f03SPaolo Bonzini     atomic_mb_set(&data->done, true);
223e037c09cSMax Reitz     if (!data->begin) {
2248e1da77eSMax Reitz         atomic_dec(data->drained_end_counter);
2258e1da77eSMax Reitz     }
22665181d63SMax Reitz     bdrv_dec_in_flight(bs);
2278e1da77eSMax Reitz 
2280109e7e6SKevin Wolf     g_free(data);
2290109e7e6SKevin Wolf }
23061124f03SPaolo Bonzini 
231db0289b9SKevin Wolf /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
2328e1da77eSMax Reitz static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
2338e1da77eSMax Reitz                               int *drained_end_counter)
23461124f03SPaolo Bonzini {
2350109e7e6SKevin Wolf     BdrvCoDrainData *data;
23661124f03SPaolo Bonzini 
237f8ea8dacSManos Pitsidianakis     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
238481cad48SManos Pitsidianakis             (!begin && !bs->drv->bdrv_co_drain_end)) {
23961124f03SPaolo Bonzini         return;
24061124f03SPaolo Bonzini     }
24161124f03SPaolo Bonzini 
2420109e7e6SKevin Wolf     data = g_new(BdrvCoDrainData, 1);
2430109e7e6SKevin Wolf     *data = (BdrvCoDrainData) {
2440109e7e6SKevin Wolf         .bs = bs,
2450109e7e6SKevin Wolf         .done = false,
2468e1da77eSMax Reitz         .begin = begin,
2478e1da77eSMax Reitz         .drained_end_counter = drained_end_counter,
2480109e7e6SKevin Wolf     };
2490109e7e6SKevin Wolf 
250e037c09cSMax Reitz     if (!begin) {
2518e1da77eSMax Reitz         atomic_inc(drained_end_counter);
2528e1da77eSMax Reitz     }
2538e1da77eSMax Reitz 
2540109e7e6SKevin Wolf     /* Make sure the driver callback completes during the polling phase for
2550109e7e6SKevin Wolf      * drain_begin. */
2560109e7e6SKevin Wolf     bdrv_inc_in_flight(bs);
2570109e7e6SKevin Wolf     data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
2580109e7e6SKevin Wolf     aio_co_schedule(bdrv_get_aio_context(bs), data->co);
25961124f03SPaolo Bonzini }
26061124f03SPaolo Bonzini 
2611cc8e54aSKevin Wolf /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
262fe4f0614SKevin Wolf bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
2636cd5c9d7SKevin Wolf                      BdrvChild *ignore_parent, bool ignore_bds_parents)
26489bd0305SKevin Wolf {
265fe4f0614SKevin Wolf     BdrvChild *child, *next;
266fe4f0614SKevin Wolf 
2676cd5c9d7SKevin Wolf     if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
26889bd0305SKevin Wolf         return true;
26989bd0305SKevin Wolf     }
27089bd0305SKevin Wolf 
271fe4f0614SKevin Wolf     if (atomic_read(&bs->in_flight)) {
272fe4f0614SKevin Wolf         return true;
27389bd0305SKevin Wolf     }
27489bd0305SKevin Wolf 
275fe4f0614SKevin Wolf     if (recursive) {
2766cd5c9d7SKevin Wolf         assert(!ignore_bds_parents);
277fe4f0614SKevin Wolf         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
2786cd5c9d7SKevin Wolf             if (bdrv_drain_poll(child->bs, recursive, child, false)) {
279fe4f0614SKevin Wolf                 return true;
280fe4f0614SKevin Wolf             }
281fe4f0614SKevin Wolf         }
282fe4f0614SKevin Wolf     }
283fe4f0614SKevin Wolf 
284fe4f0614SKevin Wolf     return false;
285fe4f0614SKevin Wolf }
286fe4f0614SKevin Wolf 
287fe4f0614SKevin Wolf static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
28889bd0305SKevin Wolf                                       BdrvChild *ignore_parent)
2891cc8e54aSKevin Wolf {
2906cd5c9d7SKevin Wolf     return bdrv_drain_poll(bs, recursive, ignore_parent, false);
2911cc8e54aSKevin Wolf }
2921cc8e54aSKevin Wolf 
293b0165585SKevin Wolf static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
2946cd5c9d7SKevin Wolf                                   BdrvChild *parent, bool ignore_bds_parents,
2956cd5c9d7SKevin Wolf                                   bool poll);
296b0165585SKevin Wolf static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
2978e1da77eSMax Reitz                                 BdrvChild *parent, bool ignore_bds_parents,
2988e1da77eSMax Reitz                                 int *drained_end_counter);
2990152bf40SKevin Wolf 
300a77fd4bbSFam Zheng static void bdrv_co_drain_bh_cb(void *opaque)
301a77fd4bbSFam Zheng {
302a77fd4bbSFam Zheng     BdrvCoDrainData *data = opaque;
303a77fd4bbSFam Zheng     Coroutine *co = data->co;
30499723548SPaolo Bonzini     BlockDriverState *bs = data->bs;
305a77fd4bbSFam Zheng 
306c8ca33d0SKevin Wolf     if (bs) {
307aa1361d5SKevin Wolf         AioContext *ctx = bdrv_get_aio_context(bs);
308aa1361d5SKevin Wolf         AioContext *co_ctx = qemu_coroutine_get_aio_context(co);
309aa1361d5SKevin Wolf 
310aa1361d5SKevin Wolf         /*
311aa1361d5SKevin Wolf          * When the coroutine yielded, the lock for its home context was
312aa1361d5SKevin Wolf          * released, so we need to re-acquire it here. If it explicitly
313aa1361d5SKevin Wolf          * acquired a different context, the lock is still held and we don't
314aa1361d5SKevin Wolf          * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
315aa1361d5SKevin Wolf          */
316aa1361d5SKevin Wolf         if (ctx == co_ctx) {
317aa1361d5SKevin Wolf             aio_context_acquire(ctx);
318aa1361d5SKevin Wolf         }
31999723548SPaolo Bonzini         bdrv_dec_in_flight(bs);
320481cad48SManos Pitsidianakis         if (data->begin) {
321e037c09cSMax Reitz             assert(!data->drained_end_counter);
3226cd5c9d7SKevin Wolf             bdrv_do_drained_begin(bs, data->recursive, data->parent,
3236cd5c9d7SKevin Wolf                                   data->ignore_bds_parents, data->poll);
324481cad48SManos Pitsidianakis         } else {
325e037c09cSMax Reitz             assert(!data->poll);
3266cd5c9d7SKevin Wolf             bdrv_do_drained_end(bs, data->recursive, data->parent,
3278e1da77eSMax Reitz                                 data->ignore_bds_parents,
3288e1da77eSMax Reitz                                 data->drained_end_counter);
329481cad48SManos Pitsidianakis         }
330aa1361d5SKevin Wolf         if (ctx == co_ctx) {
331aa1361d5SKevin Wolf             aio_context_release(ctx);
332aa1361d5SKevin Wolf         }
333c8ca33d0SKevin Wolf     } else {
334c8ca33d0SKevin Wolf         assert(data->begin);
335c8ca33d0SKevin Wolf         bdrv_drain_all_begin();
336c8ca33d0SKevin Wolf     }
337481cad48SManos Pitsidianakis 
338a77fd4bbSFam Zheng     data->done = true;
3391919631eSPaolo Bonzini     aio_co_wake(co);
340a77fd4bbSFam Zheng }
341a77fd4bbSFam Zheng 
342481cad48SManos Pitsidianakis static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
343b0165585SKevin Wolf                                                 bool begin, bool recursive,
3446cd5c9d7SKevin Wolf                                                 BdrvChild *parent,
3456cd5c9d7SKevin Wolf                                                 bool ignore_bds_parents,
3468e1da77eSMax Reitz                                                 bool poll,
3478e1da77eSMax Reitz                                                 int *drained_end_counter)
348a77fd4bbSFam Zheng {
349a77fd4bbSFam Zheng     BdrvCoDrainData data;
350a77fd4bbSFam Zheng 
351a77fd4bbSFam Zheng     /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
352c40a2545SStefan Hajnoczi      * other coroutines run if they were queued by aio_co_enter(). */
353a77fd4bbSFam Zheng 
354a77fd4bbSFam Zheng     assert(qemu_in_coroutine());
355a77fd4bbSFam Zheng     data = (BdrvCoDrainData) {
356a77fd4bbSFam Zheng         .co = qemu_coroutine_self(),
357a77fd4bbSFam Zheng         .bs = bs,
358a77fd4bbSFam Zheng         .done = false,
359481cad48SManos Pitsidianakis         .begin = begin,
360b0165585SKevin Wolf         .recursive = recursive,
3610152bf40SKevin Wolf         .parent = parent,
3626cd5c9d7SKevin Wolf         .ignore_bds_parents = ignore_bds_parents,
363fe4f0614SKevin Wolf         .poll = poll,
3648e1da77eSMax Reitz         .drained_end_counter = drained_end_counter,
365a77fd4bbSFam Zheng     };
3668e1da77eSMax Reitz 
367c8ca33d0SKevin Wolf     if (bs) {
36899723548SPaolo Bonzini         bdrv_inc_in_flight(bs);
369c8ca33d0SKevin Wolf     }
370e4ec5ad4SPavel Dovgalyuk     replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs),
371fffb6e12SPaolo Bonzini                                      bdrv_co_drain_bh_cb, &data);
372a77fd4bbSFam Zheng 
373a77fd4bbSFam Zheng     qemu_coroutine_yield();
374a77fd4bbSFam Zheng     /* If we are resumed from some other event (such as an aio completion or a
375a77fd4bbSFam Zheng      * timer callback), it is a bug in the caller that should be fixed. */
376a77fd4bbSFam Zheng     assert(data.done);
377a77fd4bbSFam Zheng }
378a77fd4bbSFam Zheng 
379dcf94a23SKevin Wolf void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
3806cd5c9d7SKevin Wolf                                    BdrvChild *parent, bool ignore_bds_parents)
381dcf94a23SKevin Wolf {
382dcf94a23SKevin Wolf     assert(!qemu_in_coroutine());
383dcf94a23SKevin Wolf 
384dcf94a23SKevin Wolf     /* Stop things in parent-to-child order */
385dcf94a23SKevin Wolf     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
386dcf94a23SKevin Wolf         aio_disable_external(bdrv_get_aio_context(bs));
387dcf94a23SKevin Wolf     }
388dcf94a23SKevin Wolf 
3896cd5c9d7SKevin Wolf     bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
3908e1da77eSMax Reitz     bdrv_drain_invoke(bs, true, NULL);
391dcf94a23SKevin Wolf }
392dcf94a23SKevin Wolf 
393dcf94a23SKevin Wolf static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
3946cd5c9d7SKevin Wolf                                   BdrvChild *parent, bool ignore_bds_parents,
3956cd5c9d7SKevin Wolf                                   bool poll)
3966820643fSKevin Wolf {
397b0165585SKevin Wolf     BdrvChild *child, *next;
398b0165585SKevin Wolf 
399d42cf288SPaolo Bonzini     if (qemu_in_coroutine()) {
4006cd5c9d7SKevin Wolf         bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
4018e1da77eSMax Reitz                                poll, NULL);
402d42cf288SPaolo Bonzini         return;
403d42cf288SPaolo Bonzini     }
404d42cf288SPaolo Bonzini 
4056cd5c9d7SKevin Wolf     bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
406d30b8e64SKevin Wolf 
407b0165585SKevin Wolf     if (recursive) {
4086cd5c9d7SKevin Wolf         assert(!ignore_bds_parents);
409d736f119SKevin Wolf         bs->recursive_quiesce_counter++;
410b0165585SKevin Wolf         QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
4116cd5c9d7SKevin Wolf             bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
4126cd5c9d7SKevin Wolf                                   false);
413b0165585SKevin Wolf         }
414b0165585SKevin Wolf     }
415fe4f0614SKevin Wolf 
416fe4f0614SKevin Wolf     /*
417fe4f0614SKevin Wolf      * Wait for drained requests to finish.
418fe4f0614SKevin Wolf      *
419fe4f0614SKevin Wolf      * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
420fe4f0614SKevin Wolf      * call is needed so things in this AioContext can make progress even
421fe4f0614SKevin Wolf      * though we don't return to the main AioContext loop - this automatically
422fe4f0614SKevin Wolf      * includes other nodes in the same AioContext and therefore all child
423fe4f0614SKevin Wolf      * nodes.
424fe4f0614SKevin Wolf      */
425fe4f0614SKevin Wolf     if (poll) {
4266cd5c9d7SKevin Wolf         assert(!ignore_bds_parents);
427fe4f0614SKevin Wolf         BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
428fe4f0614SKevin Wolf     }
4296820643fSKevin Wolf }
4306820643fSKevin Wolf 
4310152bf40SKevin Wolf void bdrv_drained_begin(BlockDriverState *bs)
4320152bf40SKevin Wolf {
4336cd5c9d7SKevin Wolf     bdrv_do_drained_begin(bs, false, NULL, false, true);
4340152bf40SKevin Wolf }
4350152bf40SKevin Wolf 
436b0165585SKevin Wolf void bdrv_subtree_drained_begin(BlockDriverState *bs)
4376820643fSKevin Wolf {
4386cd5c9d7SKevin Wolf     bdrv_do_drained_begin(bs, true, NULL, false, true);
439b0165585SKevin Wolf }
440b0165585SKevin Wolf 
441e037c09cSMax Reitz /**
442e037c09cSMax Reitz  * This function does not poll, nor must any of its recursively called
443e037c09cSMax Reitz  * functions.  The *drained_end_counter pointee will be incremented
444e037c09cSMax Reitz  * once for every background operation scheduled, and decremented once
445e037c09cSMax Reitz  * the operation settles.  Therefore, the pointer must remain valid
446e037c09cSMax Reitz  * until the pointee reaches 0.  That implies that whoever sets up the
447e037c09cSMax Reitz  * pointee has to poll until it is 0.
448e037c09cSMax Reitz  *
449e037c09cSMax Reitz  * We use atomic operations to access *drained_end_counter, because
450e037c09cSMax Reitz  * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
451e037c09cSMax Reitz  *     @bs may contain nodes in different AioContexts,
452e037c09cSMax Reitz  * (2) bdrv_drain_all_end() uses the same counter for all nodes,
453e037c09cSMax Reitz  *     regardless of which AioContext they are in.
454e037c09cSMax Reitz  */
4556cd5c9d7SKevin Wolf static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
4568e1da77eSMax Reitz                                 BdrvChild *parent, bool ignore_bds_parents,
4578e1da77eSMax Reitz                                 int *drained_end_counter)
458b0165585SKevin Wolf {
45961ad631cSMax Reitz     BdrvChild *child;
4600f115168SKevin Wolf     int old_quiesce_counter;
4610f115168SKevin Wolf 
462e037c09cSMax Reitz     assert(drained_end_counter != NULL);
463e037c09cSMax Reitz 
464481cad48SManos Pitsidianakis     if (qemu_in_coroutine()) {
4656cd5c9d7SKevin Wolf         bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
4668e1da77eSMax Reitz                                false, drained_end_counter);
467481cad48SManos Pitsidianakis         return;
468481cad48SManos Pitsidianakis     }
4696820643fSKevin Wolf     assert(bs->quiesce_counter > 0);
4706820643fSKevin Wolf 
47160369b86SKevin Wolf     /* Re-enable things in child-to-parent order */
4728e1da77eSMax Reitz     bdrv_drain_invoke(bs, false, drained_end_counter);
473e037c09cSMax Reitz     bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
474e037c09cSMax Reitz                             drained_end_counter);
4755cb2737eSMax Reitz 
4765cb2737eSMax Reitz     old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
4770f115168SKevin Wolf     if (old_quiesce_counter == 1) {
4786820643fSKevin Wolf         aio_enable_external(bdrv_get_aio_context(bs));
4796820643fSKevin Wolf     }
480b0165585SKevin Wolf 
481b0165585SKevin Wolf     if (recursive) {
4826cd5c9d7SKevin Wolf         assert(!ignore_bds_parents);
483d736f119SKevin Wolf         bs->recursive_quiesce_counter--;
48461ad631cSMax Reitz         QLIST_FOREACH(child, &bs->children, next) {
4858e1da77eSMax Reitz             bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
4868e1da77eSMax Reitz                                 drained_end_counter);
487b0165585SKevin Wolf         }
488b0165585SKevin Wolf     }
4890f115168SKevin Wolf }
4906820643fSKevin Wolf 
4910152bf40SKevin Wolf void bdrv_drained_end(BlockDriverState *bs)
4920152bf40SKevin Wolf {
493e037c09cSMax Reitz     int drained_end_counter = 0;
494e037c09cSMax Reitz     bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
495e037c09cSMax Reitz     BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
496e037c09cSMax Reitz }
497e037c09cSMax Reitz 
498e037c09cSMax Reitz void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
499e037c09cSMax Reitz {
500e037c09cSMax Reitz     bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
501b0165585SKevin Wolf }
502b0165585SKevin Wolf 
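/*
 * Editor's sketch (hypothetical caller, not part of io.c): per the contract
 * documented above bdrv_do_drained_end(), whoever passes a counter to
 * bdrv_drained_end_no_poll() must keep it alive and poll it down to zero,
 * just as bdrv_drained_end() does internally:
 *
 *     int drained_end_counter = 0;
 *
 *     bdrv_drained_end_no_poll(bs, &drained_end_counter);
 *     ...possibly end more drained sections with the same counter...
 *     BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
 */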
503b0165585SKevin Wolf void bdrv_subtree_drained_end(BlockDriverState *bs)
504b0165585SKevin Wolf {
505e037c09cSMax Reitz     int drained_end_counter = 0;
506e037c09cSMax Reitz     bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
507e037c09cSMax Reitz     BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
5080152bf40SKevin Wolf }
5090152bf40SKevin Wolf 
510d736f119SKevin Wolf void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
511d736f119SKevin Wolf {
512d736f119SKevin Wolf     int i;
513d736f119SKevin Wolf 
514d736f119SKevin Wolf     for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
5156cd5c9d7SKevin Wolf         bdrv_do_drained_begin(child->bs, true, child, false, true);
516d736f119SKevin Wolf     }
517d736f119SKevin Wolf }
518d736f119SKevin Wolf 
519d736f119SKevin Wolf void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
520d736f119SKevin Wolf {
521e037c09cSMax Reitz     int drained_end_counter = 0;
522d736f119SKevin Wolf     int i;
523d736f119SKevin Wolf 
524d736f119SKevin Wolf     for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
525e037c09cSMax Reitz         bdrv_do_drained_end(child->bs, true, child, false,
526e037c09cSMax Reitz                             &drained_end_counter);
527d736f119SKevin Wolf     }
528e037c09cSMax Reitz 
529e037c09cSMax Reitz     BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
530d736f119SKevin Wolf }
531d736f119SKevin Wolf 
53261007b31SStefan Hajnoczi /*
53367da1dc5SFam Zheng  * Wait for pending requests to complete on a single BlockDriverState subtree,
53467da1dc5SFam Zheng  * and suspend the block driver's internal I/O until the next request arrives.
53561007b31SStefan Hajnoczi  *
53661007b31SStefan Hajnoczi  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
53761007b31SStefan Hajnoczi  * AioContext.
53861007b31SStefan Hajnoczi  */
539b6e84c97SPaolo Bonzini void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
540b6e84c97SPaolo Bonzini {
5416820643fSKevin Wolf     assert(qemu_in_coroutine());
5426820643fSKevin Wolf     bdrv_drained_begin(bs);
5436820643fSKevin Wolf     bdrv_drained_end(bs);
544b6e84c97SPaolo Bonzini }
545b6e84c97SPaolo Bonzini 
54661007b31SStefan Hajnoczi void bdrv_drain(BlockDriverState *bs)
54761007b31SStefan Hajnoczi {
5486820643fSKevin Wolf     bdrv_drained_begin(bs);
5496820643fSKevin Wolf     bdrv_drained_end(bs);
55061007b31SStefan Hajnoczi }
55161007b31SStefan Hajnoczi 
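/*
 * Editor's sketch (hypothetical caller, not part of io.c): since the caller
 * must hold the node's AioContext, code running outside that context would
 * typically bracket the call like the drain_all loops below do:
 *
 *     AioContext *ctx = bdrv_get_aio_context(bs);
 *
 *     aio_context_acquire(ctx);
 *     bdrv_drain(bs);
 *     aio_context_release(ctx);
 */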
552c13ad59fSKevin Wolf static void bdrv_drain_assert_idle(BlockDriverState *bs)
553c13ad59fSKevin Wolf {
554c13ad59fSKevin Wolf     BdrvChild *child, *next;
555c13ad59fSKevin Wolf 
556c13ad59fSKevin Wolf     assert(atomic_read(&bs->in_flight) == 0);
557c13ad59fSKevin Wolf     QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
558c13ad59fSKevin Wolf         bdrv_drain_assert_idle(child->bs);
559c13ad59fSKevin Wolf     }
560c13ad59fSKevin Wolf }
561c13ad59fSKevin Wolf 
5620f12264eSKevin Wolf unsigned int bdrv_drain_all_count = 0;
5630f12264eSKevin Wolf 
5640f12264eSKevin Wolf static bool bdrv_drain_all_poll(void)
5650f12264eSKevin Wolf {
5660f12264eSKevin Wolf     BlockDriverState *bs = NULL;
5670f12264eSKevin Wolf     bool result = false;
5680f12264eSKevin Wolf 
5690f12264eSKevin Wolf     /* bdrv_drain_poll() can't make changes to the graph and we are holding the
5700f12264eSKevin Wolf      * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
5710f12264eSKevin Wolf     while ((bs = bdrv_next_all_states(bs))) {
5720f12264eSKevin Wolf         AioContext *aio_context = bdrv_get_aio_context(bs);
5730f12264eSKevin Wolf         aio_context_acquire(aio_context);
5740f12264eSKevin Wolf         result |= bdrv_drain_poll(bs, false, NULL, true);
5750f12264eSKevin Wolf         aio_context_release(aio_context);
5760f12264eSKevin Wolf     }
5770f12264eSKevin Wolf 
5780f12264eSKevin Wolf     return result;
5790f12264eSKevin Wolf }
5800f12264eSKevin Wolf 
58161007b31SStefan Hajnoczi /*
58261007b31SStefan Hajnoczi  * Wait for pending requests to complete across all BlockDriverStates
58361007b31SStefan Hajnoczi  *
58461007b31SStefan Hajnoczi  * This function does not flush data to disk, use bdrv_flush_all() for that
58561007b31SStefan Hajnoczi  * after calling this function.
586c0778f66SAlberto Garcia  *
587c0778f66SAlberto Garcia  * This pauses all block jobs and disables external clients. It must
588c0778f66SAlberto Garcia  * be paired with bdrv_drain_all_end().
589c0778f66SAlberto Garcia  *
590c0778f66SAlberto Garcia  * NOTE: no new block jobs or BlockDriverStates can be created between
591c0778f66SAlberto Garcia  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
59261007b31SStefan Hajnoczi  */
593c0778f66SAlberto Garcia void bdrv_drain_all_begin(void)
59461007b31SStefan Hajnoczi {
5950f12264eSKevin Wolf     BlockDriverState *bs = NULL;
59661007b31SStefan Hajnoczi 
597c8ca33d0SKevin Wolf     if (qemu_in_coroutine()) {
5988e1da77eSMax Reitz         bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
599c8ca33d0SKevin Wolf         return;
600c8ca33d0SKevin Wolf     }
601c8ca33d0SKevin Wolf 
602c8aa7895SPavel Dovgalyuk     /*
603c8aa7895SPavel Dovgalyuk      * The bdrv queue is managed by record/replay;
604c8aa7895SPavel Dovgalyuk      * waiting for the pending I/O requests to finish
605c8aa7895SPavel Dovgalyuk      * could block forever
606c8aa7895SPavel Dovgalyuk      */
607c8aa7895SPavel Dovgalyuk     if (replay_events_enabled()) {
608c8aa7895SPavel Dovgalyuk         return;
609c8aa7895SPavel Dovgalyuk     }
610c8aa7895SPavel Dovgalyuk 
6110f12264eSKevin Wolf     /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
6120f12264eSKevin Wolf      * loop AioContext, so make sure we're in the main context. */
6139a7e86c8SKevin Wolf     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
6140f12264eSKevin Wolf     assert(bdrv_drain_all_count < INT_MAX);
6150f12264eSKevin Wolf     bdrv_drain_all_count++;
6169a7e86c8SKevin Wolf 
6170f12264eSKevin Wolf     /* Quiesce all nodes, without polling in-flight requests yet. The graph
6180f12264eSKevin Wolf      * cannot change during this loop. */
6190f12264eSKevin Wolf     while ((bs = bdrv_next_all_states(bs))) {
62061007b31SStefan Hajnoczi         AioContext *aio_context = bdrv_get_aio_context(bs);
62161007b31SStefan Hajnoczi 
62261007b31SStefan Hajnoczi         aio_context_acquire(aio_context);
6230f12264eSKevin Wolf         bdrv_do_drained_begin(bs, false, NULL, true, false);
62461007b31SStefan Hajnoczi         aio_context_release(aio_context);
62561007b31SStefan Hajnoczi     }
62661007b31SStefan Hajnoczi 
6270f12264eSKevin Wolf     /* Now poll the in-flight requests */
628cfe29d82SKevin Wolf     AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
6290f12264eSKevin Wolf 
6300f12264eSKevin Wolf     while ((bs = bdrv_next_all_states(bs))) {
631c13ad59fSKevin Wolf         bdrv_drain_assert_idle(bs);
632f406c03cSAlexander Yarygin     }
633f406c03cSAlexander Yarygin }
634c0778f66SAlberto Garcia 
635c0778f66SAlberto Garcia void bdrv_drain_all_end(void)
636c0778f66SAlberto Garcia {
6370f12264eSKevin Wolf     BlockDriverState *bs = NULL;
638e037c09cSMax Reitz     int drained_end_counter = 0;
639c0778f66SAlberto Garcia 
640c8aa7895SPavel Dovgalyuk     /*
641c8aa7895SPavel Dovgalyuk      * The bdrv queue is managed by record/replay;
642c8aa7895SPavel Dovgalyuk      * waiting for the pending I/O requests to finish
643c8aa7895SPavel Dovgalyuk      * could block forever
644c8aa7895SPavel Dovgalyuk      */
645c8aa7895SPavel Dovgalyuk     if (replay_events_enabled()) {
646c8aa7895SPavel Dovgalyuk         return;
647c8aa7895SPavel Dovgalyuk     }
648c8aa7895SPavel Dovgalyuk 
6490f12264eSKevin Wolf     while ((bs = bdrv_next_all_states(bs))) {
65061007b31SStefan Hajnoczi         AioContext *aio_context = bdrv_get_aio_context(bs);
65161007b31SStefan Hajnoczi 
65261007b31SStefan Hajnoczi         aio_context_acquire(aio_context);
653e037c09cSMax Reitz         bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
65461007b31SStefan Hajnoczi         aio_context_release(aio_context);
65561007b31SStefan Hajnoczi     }
6560f12264eSKevin Wolf 
657e037c09cSMax Reitz     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
658e037c09cSMax Reitz     AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);
659e037c09cSMax Reitz 
6600f12264eSKevin Wolf     assert(bdrv_drain_all_count > 0);
6610f12264eSKevin Wolf     bdrv_drain_all_count--;
66261007b31SStefan Hajnoczi }
66361007b31SStefan Hajnoczi 
664c0778f66SAlberto Garcia void bdrv_drain_all(void)
665c0778f66SAlberto Garcia {
666c0778f66SAlberto Garcia     bdrv_drain_all_begin();
667c0778f66SAlberto Garcia     bdrv_drain_all_end();
668c0778f66SAlberto Garcia }
669c0778f66SAlberto Garcia 
67061007b31SStefan Hajnoczi /**
67161007b31SStefan Hajnoczi  * Remove an active request from the tracked requests list
67261007b31SStefan Hajnoczi  *
67361007b31SStefan Hajnoczi  * This function should be called when a tracked request is completing.
67461007b31SStefan Hajnoczi  */
67561007b31SStefan Hajnoczi static void tracked_request_end(BdrvTrackedRequest *req)
67661007b31SStefan Hajnoczi {
67761007b31SStefan Hajnoczi     if (req->serialising) {
67820fc71b2SPaolo Bonzini         atomic_dec(&req->bs->serialising_in_flight);
67961007b31SStefan Hajnoczi     }
68061007b31SStefan Hajnoczi 
6813783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&req->bs->reqs_lock);
68261007b31SStefan Hajnoczi     QLIST_REMOVE(req, list);
68361007b31SStefan Hajnoczi     qemu_co_queue_restart_all(&req->wait_queue);
6843783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&req->bs->reqs_lock);
68561007b31SStefan Hajnoczi }
68661007b31SStefan Hajnoczi 
68761007b31SStefan Hajnoczi /**
68861007b31SStefan Hajnoczi  * Add an active request to the tracked requests list
68961007b31SStefan Hajnoczi  */
69061007b31SStefan Hajnoczi static void tracked_request_begin(BdrvTrackedRequest *req,
69161007b31SStefan Hajnoczi                                   BlockDriverState *bs,
69261007b31SStefan Hajnoczi                                   int64_t offset,
69322931a15SFam Zheng                                   uint64_t bytes,
694ebde595cSFam Zheng                                   enum BdrvTrackedRequestType type)
69561007b31SStefan Hajnoczi {
69622931a15SFam Zheng     assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);
69722931a15SFam Zheng 
69861007b31SStefan Hajnoczi     *req = (BdrvTrackedRequest){
69961007b31SStefan Hajnoczi         .bs = bs,
70061007b31SStefan Hajnoczi         .offset         = offset,
70161007b31SStefan Hajnoczi         .bytes          = bytes,
702ebde595cSFam Zheng         .type           = type,
70361007b31SStefan Hajnoczi         .co             = qemu_coroutine_self(),
70461007b31SStefan Hajnoczi         .serialising    = false,
70561007b31SStefan Hajnoczi         .overlap_offset = offset,
70661007b31SStefan Hajnoczi         .overlap_bytes  = bytes,
70761007b31SStefan Hajnoczi     };
70861007b31SStefan Hajnoczi 
70961007b31SStefan Hajnoczi     qemu_co_queue_init(&req->wait_queue);
71061007b31SStefan Hajnoczi 
7113783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
71261007b31SStefan Hajnoczi     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
7133783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
71461007b31SStefan Hajnoczi }
71561007b31SStefan Hajnoczi 
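/*
 * Editor's sketch (not part of io.c): the usual pairing inside a request
 * coroutine, modelled on the read/write paths later in this file:
 *
 *     BdrvTrackedRequest req;
 *
 *     bdrv_inc_in_flight(bs);
 *     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
 *     ...perform the actual I/O...
 *     tracked_request_end(&req);
 *     bdrv_dec_in_flight(bs);
 */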
7163ba0e1a0SPaolo Bonzini static bool tracked_request_overlaps(BdrvTrackedRequest *req,
7173ba0e1a0SPaolo Bonzini                                      int64_t offset, uint64_t bytes)
7183ba0e1a0SPaolo Bonzini {
7193ba0e1a0SPaolo Bonzini     /*        aaaa   bbbb */
7203ba0e1a0SPaolo Bonzini     if (offset >= req->overlap_offset + req->overlap_bytes) {
7213ba0e1a0SPaolo Bonzini         return false;
7223ba0e1a0SPaolo Bonzini     }
7233ba0e1a0SPaolo Bonzini     /* bbbb   aaaa        */
7243ba0e1a0SPaolo Bonzini     if (req->overlap_offset >= offset + bytes) {
7253ba0e1a0SPaolo Bonzini         return false;
7263ba0e1a0SPaolo Bonzini     }
7273ba0e1a0SPaolo Bonzini     return true;
7283ba0e1a0SPaolo Bonzini }
7293ba0e1a0SPaolo Bonzini 
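/*
 * Editor's note (illustrative, not part of io.c): with req covering
 * [4096, 8192), a request starting at offset 8192 trips the first test and
 * does not overlap; one covering [0, 4096) trips the second test and does
 * not overlap; one covering [6144, 10240) passes neither test and therefore
 * overlaps.
 */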
7303ba0e1a0SPaolo Bonzini static bool coroutine_fn
7313ba0e1a0SPaolo Bonzini bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
7323ba0e1a0SPaolo Bonzini                                       BdrvTrackedRequest *self)
7333ba0e1a0SPaolo Bonzini {
7343ba0e1a0SPaolo Bonzini     BdrvTrackedRequest *req;
7353ba0e1a0SPaolo Bonzini     bool retry;
7363ba0e1a0SPaolo Bonzini     bool waited = false;
7373ba0e1a0SPaolo Bonzini 
7383ba0e1a0SPaolo Bonzini     do {
7393ba0e1a0SPaolo Bonzini         retry = false;
7403ba0e1a0SPaolo Bonzini         QLIST_FOREACH(req, &bs->tracked_requests, list) {
7413ba0e1a0SPaolo Bonzini             if (req == self || (!req->serialising && !self->serialising)) {
7423ba0e1a0SPaolo Bonzini                 continue;
7433ba0e1a0SPaolo Bonzini             }
7443ba0e1a0SPaolo Bonzini             if (tracked_request_overlaps(req, self->overlap_offset,
7453ba0e1a0SPaolo Bonzini                                          self->overlap_bytes))
7463ba0e1a0SPaolo Bonzini             {
7473ba0e1a0SPaolo Bonzini                 /* Hitting this means there was a reentrant request, for
7483ba0e1a0SPaolo Bonzini                  * example, a block driver issuing nested requests.  This must
7493ba0e1a0SPaolo Bonzini                  * never happen since it means deadlock.
7503ba0e1a0SPaolo Bonzini                  */
7513ba0e1a0SPaolo Bonzini                 assert(qemu_coroutine_self() != req->co);
7523ba0e1a0SPaolo Bonzini 
7533ba0e1a0SPaolo Bonzini                 /* If the request is already (indirectly) waiting for us, or
7543ba0e1a0SPaolo Bonzini                  * will wait for us as soon as it wakes up, then just go on
7553ba0e1a0SPaolo Bonzini                  * (instead of producing a deadlock in the former case). */
7563ba0e1a0SPaolo Bonzini                 if (!req->waiting_for) {
7573ba0e1a0SPaolo Bonzini                     self->waiting_for = req;
7583ba0e1a0SPaolo Bonzini                     qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
7593ba0e1a0SPaolo Bonzini                     self->waiting_for = NULL;
7603ba0e1a0SPaolo Bonzini                     retry = true;
7613ba0e1a0SPaolo Bonzini                     waited = true;
7623ba0e1a0SPaolo Bonzini                     break;
7633ba0e1a0SPaolo Bonzini                 }
7643ba0e1a0SPaolo Bonzini             }
7653ba0e1a0SPaolo Bonzini         }
7663ba0e1a0SPaolo Bonzini     } while (retry);
7673ba0e1a0SPaolo Bonzini     return waited;
7683ba0e1a0SPaolo Bonzini }
7693ba0e1a0SPaolo Bonzini 
77018fbd0deSPaolo Bonzini bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
77161007b31SStefan Hajnoczi {
7723ba0e1a0SPaolo Bonzini     BlockDriverState *bs = req->bs;
77361007b31SStefan Hajnoczi     int64_t overlap_offset = req->offset & ~(align - 1);
77422931a15SFam Zheng     uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
77561007b31SStefan Hajnoczi                                - overlap_offset;
7763ba0e1a0SPaolo Bonzini     bool waited;
77761007b31SStefan Hajnoczi 
7783ba0e1a0SPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
77961007b31SStefan Hajnoczi     if (!req->serialising) {
78020fc71b2SPaolo Bonzini         atomic_inc(&req->bs->serialising_in_flight);
78161007b31SStefan Hajnoczi         req->serialising = true;
78261007b31SStefan Hajnoczi     }
78361007b31SStefan Hajnoczi 
78461007b31SStefan Hajnoczi     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
78561007b31SStefan Hajnoczi     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
7863ba0e1a0SPaolo Bonzini     waited = bdrv_wait_serialising_requests_locked(bs, req);
7873ba0e1a0SPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
7883ba0e1a0SPaolo Bonzini     return waited;
78909d2f948SVladimir Sementsov-Ogievskiy }
79009d2f948SVladimir Sementsov-Ogievskiy 
79161007b31SStefan Hajnoczi /**
792c28107e9SMax Reitz  * Return the tracked request on @bs for the current coroutine, or
793c28107e9SMax Reitz  * NULL if there is none.
794c28107e9SMax Reitz  */
795c28107e9SMax Reitz BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
796c28107e9SMax Reitz {
797c28107e9SMax Reitz     BdrvTrackedRequest *req;
798c28107e9SMax Reitz     Coroutine *self = qemu_coroutine_self();
799c28107e9SMax Reitz 
800c28107e9SMax Reitz     QLIST_FOREACH(req, &bs->tracked_requests, list) {
801c28107e9SMax Reitz         if (req->co == self) {
802c28107e9SMax Reitz             return req;
803c28107e9SMax Reitz         }
804c28107e9SMax Reitz     }
805c28107e9SMax Reitz 
806c28107e9SMax Reitz     return NULL;
807c28107e9SMax Reitz }
808c28107e9SMax Reitz 
809c28107e9SMax Reitz /**
810244483e6SKevin Wolf  * Round a region to cluster boundaries
811244483e6SKevin Wolf  */
812244483e6SKevin Wolf void bdrv_round_to_clusters(BlockDriverState *bs,
8137cfd5275SEric Blake                             int64_t offset, int64_t bytes,
814244483e6SKevin Wolf                             int64_t *cluster_offset,
8157cfd5275SEric Blake                             int64_t *cluster_bytes)
816244483e6SKevin Wolf {
817244483e6SKevin Wolf     BlockDriverInfo bdi;
818244483e6SKevin Wolf 
819244483e6SKevin Wolf     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
820244483e6SKevin Wolf         *cluster_offset = offset;
821244483e6SKevin Wolf         *cluster_bytes = bytes;
822244483e6SKevin Wolf     } else {
823244483e6SKevin Wolf         int64_t c = bdi.cluster_size;
824244483e6SKevin Wolf         *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
825244483e6SKevin Wolf         *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
826244483e6SKevin Wolf     }
827244483e6SKevin Wolf }
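/*
 * Editor's note (illustrative, not part of io.c): with bdi.cluster_size =
 * 65536, a request at offset = 70000 with bytes = 1000 is widened to
 * *cluster_offset = 65536 and *cluster_bytes = 65536, i.e. the aligned
 * range [65536, 131072) containing the original [70000, 71000).
 */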
828244483e6SKevin Wolf 
82961007b31SStefan Hajnoczi static int bdrv_get_cluster_size(BlockDriverState *bs)
83061007b31SStefan Hajnoczi {
83161007b31SStefan Hajnoczi     BlockDriverInfo bdi;
83261007b31SStefan Hajnoczi     int ret;
83361007b31SStefan Hajnoczi 
83461007b31SStefan Hajnoczi     ret = bdrv_get_info(bs, &bdi);
83561007b31SStefan Hajnoczi     if (ret < 0 || bdi.cluster_size == 0) {
836a5b8dd2cSEric Blake         return bs->bl.request_alignment;
83761007b31SStefan Hajnoczi     } else {
83861007b31SStefan Hajnoczi         return bdi.cluster_size;
83961007b31SStefan Hajnoczi     }
84061007b31SStefan Hajnoczi }
84161007b31SStefan Hajnoczi 
84299723548SPaolo Bonzini void bdrv_inc_in_flight(BlockDriverState *bs)
84399723548SPaolo Bonzini {
84499723548SPaolo Bonzini     atomic_inc(&bs->in_flight);
84599723548SPaolo Bonzini }
84699723548SPaolo Bonzini 
847c9d1a561SPaolo Bonzini void bdrv_wakeup(BlockDriverState *bs)
848c9d1a561SPaolo Bonzini {
849cfe29d82SKevin Wolf     aio_wait_kick();
850c9d1a561SPaolo Bonzini }
851c9d1a561SPaolo Bonzini 
85299723548SPaolo Bonzini void bdrv_dec_in_flight(BlockDriverState *bs)
85399723548SPaolo Bonzini {
85499723548SPaolo Bonzini     atomic_dec(&bs->in_flight);
855c9d1a561SPaolo Bonzini     bdrv_wakeup(bs);
85699723548SPaolo Bonzini }
85799723548SPaolo Bonzini 
85818fbd0deSPaolo Bonzini static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
85961007b31SStefan Hajnoczi {
86061007b31SStefan Hajnoczi     BlockDriverState *bs = self->bs;
86161007b31SStefan Hajnoczi     bool waited = false;
86261007b31SStefan Hajnoczi 
86320fc71b2SPaolo Bonzini     if (!atomic_read(&bs->serialising_in_flight)) {
86461007b31SStefan Hajnoczi         return false;
86561007b31SStefan Hajnoczi     }
86661007b31SStefan Hajnoczi 
8673783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
8683ba0e1a0SPaolo Bonzini     waited = bdrv_wait_serialising_requests_locked(bs, self);
8693783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
87061007b31SStefan Hajnoczi 
87161007b31SStefan Hajnoczi     return waited;
87261007b31SStefan Hajnoczi }
87361007b31SStefan Hajnoczi 
87461007b31SStefan Hajnoczi static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
87561007b31SStefan Hajnoczi                                    size_t size)
87661007b31SStefan Hajnoczi {
87741ae31e3SAlberto Garcia     if (size > BDRV_REQUEST_MAX_BYTES) {
87861007b31SStefan Hajnoczi         return -EIO;
87961007b31SStefan Hajnoczi     }
88061007b31SStefan Hajnoczi 
88161007b31SStefan Hajnoczi     if (!bdrv_is_inserted(bs)) {
88261007b31SStefan Hajnoczi         return -ENOMEDIUM;
88361007b31SStefan Hajnoczi     }
88461007b31SStefan Hajnoczi 
88561007b31SStefan Hajnoczi     if (offset < 0) {
88661007b31SStefan Hajnoczi         return -EIO;
88761007b31SStefan Hajnoczi     }
88861007b31SStefan Hajnoczi 
88961007b31SStefan Hajnoczi     return 0;
89061007b31SStefan Hajnoczi }
89161007b31SStefan Hajnoczi 
8927d2410ceSVladimir Sementsov-Ogievskiy typedef int coroutine_fn BdrvRequestEntry(void *opaque);
8937d2410ceSVladimir Sementsov-Ogievskiy typedef struct BdrvRunCo {
8947d2410ceSVladimir Sementsov-Ogievskiy     BdrvRequestEntry *entry;
8957d2410ceSVladimir Sementsov-Ogievskiy     void *opaque;
8967d2410ceSVladimir Sementsov-Ogievskiy     int ret;
8977d2410ceSVladimir Sementsov-Ogievskiy     bool done;
8987d2410ceSVladimir Sementsov-Ogievskiy     Coroutine *co; /* Coroutine, running bdrv_run_co_entry, for debugging */
8997d2410ceSVladimir Sementsov-Ogievskiy } BdrvRunCo;
9007d2410ceSVladimir Sementsov-Ogievskiy 
9017d2410ceSVladimir Sementsov-Ogievskiy static void coroutine_fn bdrv_run_co_entry(void *opaque)
9027d2410ceSVladimir Sementsov-Ogievskiy {
9037d2410ceSVladimir Sementsov-Ogievskiy     BdrvRunCo *arg = opaque;
9047d2410ceSVladimir Sementsov-Ogievskiy 
9057d2410ceSVladimir Sementsov-Ogievskiy     arg->ret = arg->entry(arg->opaque);
9067d2410ceSVladimir Sementsov-Ogievskiy     arg->done = true;
9077d2410ceSVladimir Sementsov-Ogievskiy     aio_wait_kick();
9087d2410ceSVladimir Sementsov-Ogievskiy }
9097d2410ceSVladimir Sementsov-Ogievskiy 
9107d2410ceSVladimir Sementsov-Ogievskiy static int bdrv_run_co(BlockDriverState *bs, BdrvRequestEntry *entry,
9117d2410ceSVladimir Sementsov-Ogievskiy                        void *opaque)
9127d2410ceSVladimir Sementsov-Ogievskiy {
9137d2410ceSVladimir Sementsov-Ogievskiy     if (qemu_in_coroutine()) {
9147d2410ceSVladimir Sementsov-Ogievskiy         /* Fast-path if already in coroutine context */
9157d2410ceSVladimir Sementsov-Ogievskiy         return entry(opaque);
9167d2410ceSVladimir Sementsov-Ogievskiy     } else {
9177d2410ceSVladimir Sementsov-Ogievskiy         BdrvRunCo s = { .entry = entry, .opaque = opaque };
9187d2410ceSVladimir Sementsov-Ogievskiy 
9197d2410ceSVladimir Sementsov-Ogievskiy         s.co = qemu_coroutine_create(bdrv_run_co_entry, &s);
9207d2410ceSVladimir Sementsov-Ogievskiy         bdrv_coroutine_enter(bs, s.co);
9217d2410ceSVladimir Sementsov-Ogievskiy 
9227d2410ceSVladimir Sementsov-Ogievskiy         BDRV_POLL_WHILE(bs, !s.done);
9237d2410ceSVladimir Sementsov-Ogievskiy 
9247d2410ceSVladimir Sementsov-Ogievskiy         return s.ret;
9257d2410ceSVladimir Sementsov-Ogievskiy     }
9267d2410ceSVladimir Sementsov-Ogievskiy }
9277d2410ceSVladimir Sementsov-Ogievskiy 
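/*
 * Editor's sketch (hypothetical wrapper, not part of io.c): how a
 * synchronous helper can reuse bdrv_run_co(), mirroring bdrv_prwv_co()
 * below:
 *
 *     static int coroutine_fn example_flush_entry(void *opaque)
 *     {
 *         BlockDriverState *bs = opaque;
 *
 *         return bdrv_co_flush(bs);
 *     }
 *
 *     static int example_flush(BlockDriverState *bs)
 *     {
 *         return bdrv_run_co(bs, example_flush_entry, bs);
 *     }
 */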
92861007b31SStefan Hajnoczi typedef struct RwCo {
929e293b7a3SKevin Wolf     BdrvChild *child;
93061007b31SStefan Hajnoczi     int64_t offset;
93161007b31SStefan Hajnoczi     QEMUIOVector *qiov;
93261007b31SStefan Hajnoczi     bool is_write;
93361007b31SStefan Hajnoczi     BdrvRequestFlags flags;
93461007b31SStefan Hajnoczi } RwCo;
93561007b31SStefan Hajnoczi 
9367d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_rw_co_entry(void *opaque)
93761007b31SStefan Hajnoczi {
93861007b31SStefan Hajnoczi     RwCo *rwco = opaque;
93961007b31SStefan Hajnoczi 
94061007b31SStefan Hajnoczi     if (!rwco->is_write) {
9417d2410ceSVladimir Sementsov-Ogievskiy         return bdrv_co_preadv(rwco->child, rwco->offset,
94261007b31SStefan Hajnoczi                               rwco->qiov->size, rwco->qiov,
94361007b31SStefan Hajnoczi                               rwco->flags);
94461007b31SStefan Hajnoczi     } else {
9457d2410ceSVladimir Sementsov-Ogievskiy         return bdrv_co_pwritev(rwco->child, rwco->offset,
94661007b31SStefan Hajnoczi                                rwco->qiov->size, rwco->qiov,
94761007b31SStefan Hajnoczi                                rwco->flags);
94861007b31SStefan Hajnoczi     }
94961007b31SStefan Hajnoczi }
95061007b31SStefan Hajnoczi 
95161007b31SStefan Hajnoczi /*
95261007b31SStefan Hajnoczi  * Process a vectored synchronous request using coroutines
95361007b31SStefan Hajnoczi  */
954e293b7a3SKevin Wolf static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
95561007b31SStefan Hajnoczi                         QEMUIOVector *qiov, bool is_write,
95661007b31SStefan Hajnoczi                         BdrvRequestFlags flags)
95761007b31SStefan Hajnoczi {
95861007b31SStefan Hajnoczi     RwCo rwco = {
959e293b7a3SKevin Wolf         .child = child,
96061007b31SStefan Hajnoczi         .offset = offset,
96161007b31SStefan Hajnoczi         .qiov = qiov,
96261007b31SStefan Hajnoczi         .is_write = is_write,
96361007b31SStefan Hajnoczi         .flags = flags,
96461007b31SStefan Hajnoczi     };
96561007b31SStefan Hajnoczi 
9667d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco);
96761007b31SStefan Hajnoczi }
96861007b31SStefan Hajnoczi 
969720ff280SKevin Wolf int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
970f5a5ca79SManos Pitsidianakis                        int bytes, BdrvRequestFlags flags)
97161007b31SStefan Hajnoczi {
9720d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
97374021bc4SEric Blake 
974e293b7a3SKevin Wolf     return bdrv_prwv_co(child, offset, &qiov, true,
97561007b31SStefan Hajnoczi                         BDRV_REQ_ZERO_WRITE | flags);
97661007b31SStefan Hajnoczi }
97761007b31SStefan Hajnoczi 
97861007b31SStefan Hajnoczi /*
97974021bc4SEric Blake  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
98061007b31SStefan Hajnoczi  * The operation is sped up by checking the block status and only writing
98161007b31SStefan Hajnoczi  * zeroes to ranges that do not already read back as zeroes. Optional
98274021bc4SEric Blake  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
983465fe887SEric Blake  * BDRV_REQ_FUA).
98461007b31SStefan Hajnoczi  *
985f4649069SEric Blake  * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
98661007b31SStefan Hajnoczi  */
987720ff280SKevin Wolf int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
98861007b31SStefan Hajnoczi {
989237d78f8SEric Blake     int ret;
990237d78f8SEric Blake     int64_t target_size, bytes, offset = 0;
991720ff280SKevin Wolf     BlockDriverState *bs = child->bs;
99261007b31SStefan Hajnoczi 
9937286d610SEric Blake     target_size = bdrv_getlength(bs);
9947286d610SEric Blake     if (target_size < 0) {
9957286d610SEric Blake         return target_size;
99661007b31SStefan Hajnoczi     }
99761007b31SStefan Hajnoczi 
99861007b31SStefan Hajnoczi     for (;;) {
9997286d610SEric Blake         bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
10007286d610SEric Blake         if (bytes <= 0) {
100161007b31SStefan Hajnoczi             return 0;
100261007b31SStefan Hajnoczi         }
1003237d78f8SEric Blake         ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
100461007b31SStefan Hajnoczi         if (ret < 0) {
100561007b31SStefan Hajnoczi             return ret;
100661007b31SStefan Hajnoczi         }
100761007b31SStefan Hajnoczi         if (ret & BDRV_BLOCK_ZERO) {
1008237d78f8SEric Blake             offset += bytes;
100961007b31SStefan Hajnoczi             continue;
101061007b31SStefan Hajnoczi         }
1011237d78f8SEric Blake         ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
101261007b31SStefan Hajnoczi         if (ret < 0) {
101361007b31SStefan Hajnoczi             return ret;
101461007b31SStefan Hajnoczi         }
1015237d78f8SEric Blake         offset += bytes;
101661007b31SStefan Hajnoczi     }
101761007b31SStefan Hajnoczi }
101861007b31SStefan Hajnoczi 
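/*
 * Editor's sketch (hypothetical caller, not part of io.c):
 *
 *     int ret = bdrv_make_zero(child, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         error_report("failed to zero device: %s", strerror(-ret));
 *     }
 */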
1019f4649069SEric Blake /* return < 0 if error. See bdrv_pwrite() for the return codes */
1020cf2ab8fcSKevin Wolf int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
1021f1e84741SKevin Wolf {
1022f1e84741SKevin Wolf     int ret;
1023f1e84741SKevin Wolf 
1024e293b7a3SKevin Wolf     ret = bdrv_prwv_co(child, offset, qiov, false, 0);
1025f1e84741SKevin Wolf     if (ret < 0) {
1026f1e84741SKevin Wolf         return ret;
1027f1e84741SKevin Wolf     }
1028f1e84741SKevin Wolf 
1029f1e84741SKevin Wolf     return qiov->size;
1030f1e84741SKevin Wolf }
1031f1e84741SKevin Wolf 
10322e11d756SAlberto Garcia /* See bdrv_pwrite() for the return codes */
1033cf2ab8fcSKevin Wolf int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
103461007b31SStefan Hajnoczi {
10350d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
103661007b31SStefan Hajnoczi 
103761007b31SStefan Hajnoczi     if (bytes < 0) {
103861007b31SStefan Hajnoczi         return -EINVAL;
103961007b31SStefan Hajnoczi     }
104061007b31SStefan Hajnoczi 
1041cf2ab8fcSKevin Wolf     return bdrv_preadv(child, offset, &qiov);
104261007b31SStefan Hajnoczi }
104361007b31SStefan Hajnoczi 
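/*
 * Illustrative sketch (hypothetical helper, not in the original file):
 * read a fixed-size header from the start of the image.  On success
 * bdrv_pread() returns the number of bytes read (here 512); a negative
 * value is an -errno.
 */
static int example_read_header(BdrvChild *child, uint8_t header[512])
{
    int ret = bdrv_pread(child, 0, header, 512);

    return ret < 0 ? ret : 0;
}
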
1044d9ca2ea2SKevin Wolf int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
104561007b31SStefan Hajnoczi {
104661007b31SStefan Hajnoczi     int ret;
104761007b31SStefan Hajnoczi 
1048e293b7a3SKevin Wolf     ret = bdrv_prwv_co(child, offset, qiov, true, 0);
104961007b31SStefan Hajnoczi     if (ret < 0) {
105061007b31SStefan Hajnoczi         return ret;
105161007b31SStefan Hajnoczi     }
105261007b31SStefan Hajnoczi 
105361007b31SStefan Hajnoczi     return qiov->size;
105461007b31SStefan Hajnoczi }
105561007b31SStefan Hajnoczi 
10562e11d756SAlberto Garcia /* Return the number of bytes on success or < 0 on error. Important errors are:
10572e11d756SAlberto Garcia   -EIO         Generic I/O error (may happen for all errors)
10582e11d756SAlberto Garcia   -ENOMEDIUM   No media inserted
10592e11d756SAlberto Garcia   -EINVAL      Invalid offset or number of bytes
10602e11d756SAlberto Garcia   -EACCES      Trying to write to a read-only device
10612e11d756SAlberto Garcia */
1062d9ca2ea2SKevin Wolf int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
106361007b31SStefan Hajnoczi {
10640d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
106561007b31SStefan Hajnoczi 
106661007b31SStefan Hajnoczi     if (bytes < 0) {
106761007b31SStefan Hajnoczi         return -EINVAL;
106861007b31SStefan Hajnoczi     }
106961007b31SStefan Hajnoczi 
1070d9ca2ea2SKevin Wolf     return bdrv_pwritev(child, offset, &qiov);
107161007b31SStefan Hajnoczi }
107261007b31SStefan Hajnoczi 
107361007b31SStefan Hajnoczi /*
107461007b31SStefan Hajnoczi  * Writes to the file and ensures that no writes are reordered across this
107561007b31SStefan Hajnoczi  * request (acts as a barrier)
107661007b31SStefan Hajnoczi  *
107761007b31SStefan Hajnoczi  * Returns 0 on success, -errno in error cases.
107861007b31SStefan Hajnoczi  */
1079d9ca2ea2SKevin Wolf int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
108061007b31SStefan Hajnoczi                      const void *buf, int count)
108161007b31SStefan Hajnoczi {
108261007b31SStefan Hajnoczi     int ret;
108361007b31SStefan Hajnoczi 
1084d9ca2ea2SKevin Wolf     ret = bdrv_pwrite(child, offset, buf, count);
108561007b31SStefan Hajnoczi     if (ret < 0) {
108661007b31SStefan Hajnoczi         return ret;
108761007b31SStefan Hajnoczi     }
108861007b31SStefan Hajnoczi 
1089d9ca2ea2SKevin Wolf     ret = bdrv_flush(child->bs);
1090855a6a93SKevin Wolf     if (ret < 0) {
1091855a6a93SKevin Wolf         return ret;
109261007b31SStefan Hajnoczi     }
109361007b31SStefan Hajnoczi 
109461007b31SStefan Hajnoczi     return 0;
109561007b31SStefan Hajnoczi }
109661007b31SStefan Hajnoczi 
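/*
 * Illustrative sketch (hypothetical caller): persist a small piece of
 * metadata with barrier semantics.  Because bdrv_pwrite_sync() above
 * flushes before returning, a later update cannot overtake this one
 * in the host's caches.
 */
static int example_bump_generation(BdrvChild *child, uint64_t gen)
{
    return bdrv_pwrite_sync(child, 0, &gen, sizeof(gen));
}
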
109708844473SKevin Wolf typedef struct CoroutineIOCompletion {
109808844473SKevin Wolf     Coroutine *coroutine;
109908844473SKevin Wolf     int ret;
110008844473SKevin Wolf } CoroutineIOCompletion;
110108844473SKevin Wolf 
110208844473SKevin Wolf static void bdrv_co_io_em_complete(void *opaque, int ret)
110308844473SKevin Wolf {
110408844473SKevin Wolf     CoroutineIOCompletion *co = opaque;
110508844473SKevin Wolf 
110608844473SKevin Wolf     co->ret = ret;
1107b9e413ddSPaolo Bonzini     aio_co_wake(co->coroutine);
110808844473SKevin Wolf }
110908844473SKevin Wolf 
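/*
 * The pair above bridges callback-based AIO to coroutines: the
 * coroutine registers bdrv_co_io_em_complete() as the completion
 * callback and yields; the callback records the result and re-enters
 * the coroutine via aio_co_wake().  A minimal sketch of the calling
 * side, mirroring its use in bdrv_driver_preadv() below:
 *
 *     CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self() };
 *
 *     acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
 *                                bdrv_co_io_em_complete, &co);
 *     if (acb) {
 *         qemu_coroutine_yield();    (re-entered with co.ret set)
 *         ret = co.ret;
 *     }
 */
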
1110166fe960SKevin Wolf static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
1111166fe960SKevin Wolf                                            uint64_t offset, uint64_t bytes,
1112ac850bf0SVladimir Sementsov-Ogievskiy                                            QEMUIOVector *qiov,
1113ac850bf0SVladimir Sementsov-Ogievskiy                                            size_t qiov_offset, int flags)
1114166fe960SKevin Wolf {
1115166fe960SKevin Wolf     BlockDriver *drv = bs->drv;
11163fb06697SKevin Wolf     int64_t sector_num;
11173fb06697SKevin Wolf     unsigned int nb_sectors;
1118ac850bf0SVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
1119ac850bf0SVladimir Sementsov-Ogievskiy     int ret;
11203fb06697SKevin Wolf 
1121fa166538SEric Blake     assert(!(flags & ~BDRV_REQ_MASK));
1122fe0480d6SKevin Wolf     assert(!(flags & BDRV_REQ_NO_FALLBACK));
1123fa166538SEric Blake 
1124d470ad42SMax Reitz     if (!drv) {
1125d470ad42SMax Reitz         return -ENOMEDIUM;
1126d470ad42SMax Reitz     }
1127d470ad42SMax Reitz 
1128ac850bf0SVladimir Sementsov-Ogievskiy     if (drv->bdrv_co_preadv_part) {
1129ac850bf0SVladimir Sementsov-Ogievskiy         return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
1130ac850bf0SVladimir Sementsov-Ogievskiy                                         flags);
1131ac850bf0SVladimir Sementsov-Ogievskiy     }
1132ac850bf0SVladimir Sementsov-Ogievskiy 
1133ac850bf0SVladimir Sementsov-Ogievskiy     if (qiov_offset > 0 || bytes != qiov->size) {
1134ac850bf0SVladimir Sementsov-Ogievskiy         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1135ac850bf0SVladimir Sementsov-Ogievskiy         qiov = &local_qiov;
1136ac850bf0SVladimir Sementsov-Ogievskiy     }
1137ac850bf0SVladimir Sementsov-Ogievskiy 
11383fb06697SKevin Wolf     if (drv->bdrv_co_preadv) {
1139ac850bf0SVladimir Sementsov-Ogievskiy         ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
1140ac850bf0SVladimir Sementsov-Ogievskiy         goto out;
11413fb06697SKevin Wolf     }
11423fb06697SKevin Wolf 
1143edfab6a0SEric Blake     if (drv->bdrv_aio_preadv) {
114408844473SKevin Wolf         BlockAIOCB *acb;
114508844473SKevin Wolf         CoroutineIOCompletion co = {
114608844473SKevin Wolf             .coroutine = qemu_coroutine_self(),
114708844473SKevin Wolf         };
114808844473SKevin Wolf 
1149e31f6864SEric Blake         acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
115008844473SKevin Wolf                                    bdrv_co_io_em_complete, &co);
115108844473SKevin Wolf         if (acb == NULL) {
1152ac850bf0SVladimir Sementsov-Ogievskiy             ret = -EIO;
1153ac850bf0SVladimir Sementsov-Ogievskiy             goto out;
115408844473SKevin Wolf         } else {
115508844473SKevin Wolf             qemu_coroutine_yield();
1156ac850bf0SVladimir Sementsov-Ogievskiy             ret = co.ret;
1157ac850bf0SVladimir Sementsov-Ogievskiy             goto out;
115808844473SKevin Wolf         }
115908844473SKevin Wolf     }
1160edfab6a0SEric Blake 
1161edfab6a0SEric Blake     sector_num = offset >> BDRV_SECTOR_BITS;
1162edfab6a0SEric Blake     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1163edfab6a0SEric Blake 
11641bbbf32dSNir Soffer     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
11651bbbf32dSNir Soffer     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
116641ae31e3SAlberto Garcia     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1167edfab6a0SEric Blake     assert(drv->bdrv_co_readv);
1168edfab6a0SEric Blake 
1169ac850bf0SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1170ac850bf0SVladimir Sementsov-Ogievskiy 
1171ac850bf0SVladimir Sementsov-Ogievskiy out:
1172ac850bf0SVladimir Sementsov-Ogievskiy     if (qiov == &local_qiov) {
1173ac850bf0SVladimir Sementsov-Ogievskiy         qemu_iovec_destroy(&local_qiov);
1174ac850bf0SVladimir Sementsov-Ogievskiy     }
1175ac850bf0SVladimir Sementsov-Ogievskiy 
1176ac850bf0SVladimir Sementsov-Ogievskiy     return ret;
1177166fe960SKevin Wolf }
1178166fe960SKevin Wolf 
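/*
 * Note on bdrv_driver_preadv() above: byte-based driver callbacks are
 * preferred in the order .bdrv_co_preadv_part, .bdrv_co_preadv,
 * .bdrv_aio_preadv; the sector-based .bdrv_co_readv is only a last
 * resort, which is why the request must be sector-aligned at that
 * point.  Illustrative arithmetic: offset = 65536 and bytes = 4096
 * become sector_num = 128 and nb_sectors = 8 with 512-byte sectors.
 */
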
117978a07294SKevin Wolf static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
118078a07294SKevin Wolf                                             uint64_t offset, uint64_t bytes,
1181ac850bf0SVladimir Sementsov-Ogievskiy                                             QEMUIOVector *qiov,
1182ac850bf0SVladimir Sementsov-Ogievskiy                                             size_t qiov_offset, int flags)
118378a07294SKevin Wolf {
118478a07294SKevin Wolf     BlockDriver *drv = bs->drv;
11853fb06697SKevin Wolf     int64_t sector_num;
11863fb06697SKevin Wolf     unsigned int nb_sectors;
1187ac850bf0SVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
118878a07294SKevin Wolf     int ret;
118978a07294SKevin Wolf 
1190fa166538SEric Blake     assert(!(flags & ~BDRV_REQ_MASK));
1191fe0480d6SKevin Wolf     assert(!(flags & BDRV_REQ_NO_FALLBACK));
1192fa166538SEric Blake 
1193d470ad42SMax Reitz     if (!drv) {
1194d470ad42SMax Reitz         return -ENOMEDIUM;
1195d470ad42SMax Reitz     }
1196d470ad42SMax Reitz 
1197ac850bf0SVladimir Sementsov-Ogievskiy     if (drv->bdrv_co_pwritev_part) {
1198ac850bf0SVladimir Sementsov-Ogievskiy         ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
1199ac850bf0SVladimir Sementsov-Ogievskiy                                         flags & bs->supported_write_flags);
1200ac850bf0SVladimir Sementsov-Ogievskiy         flags &= ~bs->supported_write_flags;
1201ac850bf0SVladimir Sementsov-Ogievskiy         goto emulate_flags;
1202ac850bf0SVladimir Sementsov-Ogievskiy     }
1203ac850bf0SVladimir Sementsov-Ogievskiy 
1204ac850bf0SVladimir Sementsov-Ogievskiy     if (qiov_offset > 0 || bytes != qiov->size) {
1205ac850bf0SVladimir Sementsov-Ogievskiy         qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1206ac850bf0SVladimir Sementsov-Ogievskiy         qiov = &local_qiov;
1207ac850bf0SVladimir Sementsov-Ogievskiy     }
1208ac850bf0SVladimir Sementsov-Ogievskiy 
12093fb06697SKevin Wolf     if (drv->bdrv_co_pwritev) {
1210515c2f43SKevin Wolf         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
1211515c2f43SKevin Wolf                                    flags & bs->supported_write_flags);
1212515c2f43SKevin Wolf         flags &= ~bs->supported_write_flags;
12133fb06697SKevin Wolf         goto emulate_flags;
12143fb06697SKevin Wolf     }
12153fb06697SKevin Wolf 
1216edfab6a0SEric Blake     if (drv->bdrv_aio_pwritev) {
121708844473SKevin Wolf         BlockAIOCB *acb;
121808844473SKevin Wolf         CoroutineIOCompletion co = {
121908844473SKevin Wolf             .coroutine = qemu_coroutine_self(),
122008844473SKevin Wolf         };
122108844473SKevin Wolf 
1222e31f6864SEric Blake         acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
1223e31f6864SEric Blake                                     flags & bs->supported_write_flags,
122408844473SKevin Wolf                                     bdrv_co_io_em_complete, &co);
1225e31f6864SEric Blake         flags &= ~bs->supported_write_flags;
122608844473SKevin Wolf         if (acb == NULL) {
12273fb06697SKevin Wolf             ret = -EIO;
122808844473SKevin Wolf         } else {
122908844473SKevin Wolf             qemu_coroutine_yield();
12303fb06697SKevin Wolf             ret = co.ret;
123108844473SKevin Wolf         }
1232edfab6a0SEric Blake         goto emulate_flags;
1233edfab6a0SEric Blake     }
1234edfab6a0SEric Blake 
1235edfab6a0SEric Blake     sector_num = offset >> BDRV_SECTOR_BITS;
1236edfab6a0SEric Blake     nb_sectors = bytes >> BDRV_SECTOR_BITS;
1237edfab6a0SEric Blake 
12381bbbf32dSNir Soffer     assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
12391bbbf32dSNir Soffer     assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
124041ae31e3SAlberto Garcia     assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1241edfab6a0SEric Blake 
1242e18a58b4SEric Blake     assert(drv->bdrv_co_writev);
1243e18a58b4SEric Blake     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
1244edfab6a0SEric Blake                               flags & bs->supported_write_flags);
1245edfab6a0SEric Blake     flags &= ~bs->supported_write_flags;
124678a07294SKevin Wolf 
12473fb06697SKevin Wolf emulate_flags:
12484df863f3SEric Blake     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
124978a07294SKevin Wolf         ret = bdrv_co_flush(bs);
125078a07294SKevin Wolf     }
125178a07294SKevin Wolf 
1252ac850bf0SVladimir Sementsov-Ogievskiy     if (qiov == &local_qiov) {
1253ac850bf0SVladimir Sementsov-Ogievskiy         qemu_iovec_destroy(&local_qiov);
1254ac850bf0SVladimir Sementsov-Ogievskiy     }
1255ac850bf0SVladimir Sementsov-Ogievskiy 
125678a07294SKevin Wolf     return ret;
125778a07294SKevin Wolf }
125878a07294SKevin Wolf 
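/*
 * Flag handling in bdrv_driver_pwritev() above: flags advertised in
 * bs->supported_write_flags are passed to the driver and then cleared,
 * so whatever is still set at the emulate_flags label has to be
 * emulated generically.  Currently that is only BDRV_REQ_FUA, emulated
 * by a full bdrv_co_flush() after a successful write; e.g. a driver
 * with supported_write_flags == 0 sees a plain write followed by one
 * flush.
 */
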
125929a298afSPavel Butsykin static int coroutine_fn
126029a298afSPavel Butsykin bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1261ac850bf0SVladimir Sementsov-Ogievskiy                                uint64_t bytes, QEMUIOVector *qiov,
1262ac850bf0SVladimir Sementsov-Ogievskiy                                size_t qiov_offset)
126329a298afSPavel Butsykin {
126429a298afSPavel Butsykin     BlockDriver *drv = bs->drv;
1265ac850bf0SVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
1266ac850bf0SVladimir Sementsov-Ogievskiy     int ret;
126729a298afSPavel Butsykin 
1268d470ad42SMax Reitz     if (!drv) {
1269d470ad42SMax Reitz         return -ENOMEDIUM;
1270d470ad42SMax Reitz     }
1271d470ad42SMax Reitz 
1272ac850bf0SVladimir Sementsov-Ogievskiy     if (!block_driver_can_compress(drv)) {
127329a298afSPavel Butsykin         return -ENOTSUP;
127429a298afSPavel Butsykin     }
127529a298afSPavel Butsykin 
1276ac850bf0SVladimir Sementsov-Ogievskiy     if (drv->bdrv_co_pwritev_compressed_part) {
1277ac850bf0SVladimir Sementsov-Ogievskiy         return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
1278ac850bf0SVladimir Sementsov-Ogievskiy                                                     qiov, qiov_offset);
1279ac850bf0SVladimir Sementsov-Ogievskiy     }
1280ac850bf0SVladimir Sementsov-Ogievskiy 
1281ac850bf0SVladimir Sementsov-Ogievskiy     if (qiov_offset == 0) {
128229a298afSPavel Butsykin         return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
128329a298afSPavel Butsykin     }
128429a298afSPavel Butsykin 
1285ac850bf0SVladimir Sementsov-Ogievskiy     qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1286ac850bf0SVladimir Sementsov-Ogievskiy     ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
1287ac850bf0SVladimir Sementsov-Ogievskiy     qemu_iovec_destroy(&local_qiov);
1288ac850bf0SVladimir Sementsov-Ogievskiy 
1289ac850bf0SVladimir Sementsov-Ogievskiy     return ret;
1290ac850bf0SVladimir Sementsov-Ogievskiy }
1291ac850bf0SVladimir Sementsov-Ogievskiy 
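/*
 * Fallback note for bdrv_driver_pwritev_compressed() above: when the
 * driver only implements the non-"part" callback and qiov_offset is
 * non-zero, a temporary slice is built with qemu_iovec_init_slice()
 * so the driver still sees a qiov that starts at byte 0 of the data.
 */
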
129285c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
12933299e5ecSVladimir Sementsov-Ogievskiy         int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
12941143ec5eSVladimir Sementsov-Ogievskiy         size_t qiov_offset, int flags)
129561007b31SStefan Hajnoczi {
129685c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
129785c97ca7SKevin Wolf 
129861007b31SStefan Hajnoczi     /* Perform I/O through a temporary buffer so that users who scribble over
129961007b31SStefan Hajnoczi      * their read buffer while the operation is in progress do not end up
130061007b31SStefan Hajnoczi      * modifying the image file.  This is critical for zero-copy guest I/O
130161007b31SStefan Hajnoczi      * where anything might happen inside guest memory.
130261007b31SStefan Hajnoczi      */
13032275cc90SVladimir Sementsov-Ogievskiy     void *bounce_buffer = NULL;
130461007b31SStefan Hajnoczi 
130561007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
1306244483e6SKevin Wolf     int64_t cluster_offset;
13077cfd5275SEric Blake     int64_t cluster_bytes;
130861007b31SStefan Hajnoczi     size_t skip_bytes;
130961007b31SStefan Hajnoczi     int ret;
1310cb2e2878SEric Blake     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1311cb2e2878SEric Blake                                     BDRV_REQUEST_MAX_BYTES);
1312cb2e2878SEric Blake     unsigned int progress = 0;
13138644476eSMax Reitz     bool skip_write;
131461007b31SStefan Hajnoczi 
1315d470ad42SMax Reitz     if (!drv) {
1316d470ad42SMax Reitz         return -ENOMEDIUM;
1317d470ad42SMax Reitz     }
1318d470ad42SMax Reitz 
13198644476eSMax Reitz     /*
13208644476eSMax Reitz      * Do not write anything when the BDS is inactive.  That is not
13218644476eSMax Reitz      * allowed, and it would not help.
13228644476eSMax Reitz      */
13238644476eSMax Reitz     skip_write = (bs->open_flags & BDRV_O_INACTIVE);
13248644476eSMax Reitz 
13251bf03e66SKevin Wolf     /* FIXME We cannot require callers to have write permissions when all they
13261bf03e66SKevin Wolf      * are doing is a read request. If we did things right, write permissions
13271bf03e66SKevin Wolf      * would be obtained anyway, but internally by the copy-on-read code.
1328765d9df9SEric Blake      * However, as long as it is implemented here rather than in a separate
13291bf03e66SKevin Wolf      * filter driver, the copy-on-read code doesn't have its own BdrvChild for
13301bf03e66SKevin Wolf      * which it could request permissions. Therefore we have to bypass the permission
13311bf03e66SKevin Wolf      * system for the moment. */
13321bf03e66SKevin Wolf     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1333afa4b293SKevin Wolf 
133461007b31SStefan Hajnoczi     /* Cover the entire cluster so no additional backing file I/O is required
1335cb2e2878SEric Blake      * when allocating a cluster in the image file.  Note that this value may exceed
1336cb2e2878SEric Blake      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1337cb2e2878SEric Blake      * is one reason we loop rather than doing it all at once.
133861007b31SStefan Hajnoczi      */
1339244483e6SKevin Wolf     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1340cb2e2878SEric Blake     skip_bytes = offset - cluster_offset;
134161007b31SStefan Hajnoczi 
1342244483e6SKevin Wolf     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1343244483e6SKevin Wolf                                    cluster_offset, cluster_bytes);
134461007b31SStefan Hajnoczi 
1345cb2e2878SEric Blake     while (cluster_bytes) {
1346cb2e2878SEric Blake         int64_t pnum;
134761007b31SStefan Hajnoczi 
13488644476eSMax Reitz         if (skip_write) {
13498644476eSMax Reitz             ret = 1; /* "already allocated", so nothing will be copied */
13508644476eSMax Reitz             pnum = MIN(cluster_bytes, max_transfer);
13518644476eSMax Reitz         } else {
1352cb2e2878SEric Blake             ret = bdrv_is_allocated(bs, cluster_offset,
1353cb2e2878SEric Blake                                     MIN(cluster_bytes, max_transfer), &pnum);
1354cb2e2878SEric Blake             if (ret < 0) {
13558644476eSMax Reitz                 /*
13568644476eSMax Reitz                  * Safe to treat errors in querying allocation as if
1357cb2e2878SEric Blake                  * unallocated; we'll probably fail again soon on the
1358cb2e2878SEric Blake                  * read, but at least that will set a decent errno.
1359cb2e2878SEric Blake                  */
1360cb2e2878SEric Blake                 pnum = MIN(cluster_bytes, max_transfer);
1361cb2e2878SEric Blake             }
1362cb2e2878SEric Blake 
1363b0ddcbbbSKevin Wolf             /* Stop at EOF if the image ends in the middle of the cluster */
1364b0ddcbbbSKevin Wolf             if (ret == 0 && pnum == 0) {
1365b0ddcbbbSKevin Wolf                 assert(progress >= bytes);
1366b0ddcbbbSKevin Wolf                 break;
1367b0ddcbbbSKevin Wolf             }
1368b0ddcbbbSKevin Wolf 
1369cb2e2878SEric Blake             assert(skip_bytes < pnum);
13708644476eSMax Reitz         }
1371cb2e2878SEric Blake 
1372cb2e2878SEric Blake         if (ret <= 0) {
13731143ec5eSVladimir Sementsov-Ogievskiy             QEMUIOVector local_qiov;
13741143ec5eSVladimir Sementsov-Ogievskiy 
1375cb2e2878SEric Blake             /* Must copy-on-read; use the bounce buffer */
13760d93ed08SVladimir Sementsov-Ogievskiy             pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
13772275cc90SVladimir Sementsov-Ogievskiy             if (!bounce_buffer) {
13782275cc90SVladimir Sementsov-Ogievskiy                 int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
13792275cc90SVladimir Sementsov-Ogievskiy                 int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
13802275cc90SVladimir Sementsov-Ogievskiy                 int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
13812275cc90SVladimir Sementsov-Ogievskiy 
13822275cc90SVladimir Sementsov-Ogievskiy                 bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
13832275cc90SVladimir Sementsov-Ogievskiy                 if (!bounce_buffer) {
13842275cc90SVladimir Sementsov-Ogievskiy                     ret = -ENOMEM;
13852275cc90SVladimir Sementsov-Ogievskiy                     goto err;
13862275cc90SVladimir Sementsov-Ogievskiy                 }
13872275cc90SVladimir Sementsov-Ogievskiy             }
13880d93ed08SVladimir Sementsov-Ogievskiy             qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
1389cb2e2878SEric Blake 
1390cb2e2878SEric Blake             ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1391ac850bf0SVladimir Sementsov-Ogievskiy                                      &local_qiov, 0, 0);
139261007b31SStefan Hajnoczi             if (ret < 0) {
139361007b31SStefan Hajnoczi                 goto err;
139461007b31SStefan Hajnoczi             }
139561007b31SStefan Hajnoczi 
1396d855ebcdSEric Blake             bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1397c1499a5eSEric Blake             if (drv->bdrv_co_pwrite_zeroes &&
1398cb2e2878SEric Blake                 buffer_is_zero(bounce_buffer, pnum)) {
1399a604fa2bSEric Blake                 /* FIXME: Should we (perhaps conditionally) be setting
1400a604fa2bSEric Blake                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1401a604fa2bSEric Blake                  * that still correctly reads as zero? */
14027adcf59fSMax Reitz                 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
14037adcf59fSMax Reitz                                                BDRV_REQ_WRITE_UNCHANGED);
140461007b31SStefan Hajnoczi             } else {
1405cb2e2878SEric Blake                 /* This does not change the data on the disk, so it is not
1406cb2e2878SEric Blake                  * necessary to flush even in cache=writethrough mode.
140761007b31SStefan Hajnoczi                  */
1408cb2e2878SEric Blake                 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1409ac850bf0SVladimir Sementsov-Ogievskiy                                           &local_qiov, 0,
14107adcf59fSMax Reitz                                           BDRV_REQ_WRITE_UNCHANGED);
141161007b31SStefan Hajnoczi             }
141261007b31SStefan Hajnoczi 
141361007b31SStefan Hajnoczi             if (ret < 0) {
1414cb2e2878SEric Blake                 /* It might be okay to ignore write errors for guest
1415cb2e2878SEric Blake                  * requests.  If this is a deliberate copy-on-read
1416cb2e2878SEric Blake                  * then we don't want to ignore the error.  Simply
1417cb2e2878SEric Blake                  * report it in all cases.
141861007b31SStefan Hajnoczi                  */
141961007b31SStefan Hajnoczi                 goto err;
142061007b31SStefan Hajnoczi             }
142161007b31SStefan Hajnoczi 
14223299e5ecSVladimir Sementsov-Ogievskiy             if (!(flags & BDRV_REQ_PREFETCH)) {
14231143ec5eSVladimir Sementsov-Ogievskiy                 qemu_iovec_from_buf(qiov, qiov_offset + progress,
14241143ec5eSVladimir Sementsov-Ogievskiy                                     bounce_buffer + skip_bytes,
14254ab78b19SVladimir Sementsov-Ogievskiy                                     MIN(pnum - skip_bytes, bytes - progress));
14263299e5ecSVladimir Sementsov-Ogievskiy             }
14273299e5ecSVladimir Sementsov-Ogievskiy         } else if (!(flags & BDRV_REQ_PREFETCH)) {
1428cb2e2878SEric Blake             /* Read directly into the destination */
14291143ec5eSVladimir Sementsov-Ogievskiy             ret = bdrv_driver_preadv(bs, offset + progress,
14301143ec5eSVladimir Sementsov-Ogievskiy                                      MIN(pnum - skip_bytes, bytes - progress),
14311143ec5eSVladimir Sementsov-Ogievskiy                                      qiov, qiov_offset + progress, 0);
1432cb2e2878SEric Blake             if (ret < 0) {
1433cb2e2878SEric Blake                 goto err;
1434cb2e2878SEric Blake             }
1435cb2e2878SEric Blake         }
1436cb2e2878SEric Blake 
1437cb2e2878SEric Blake         cluster_offset += pnum;
1438cb2e2878SEric Blake         cluster_bytes -= pnum;
1439cb2e2878SEric Blake         progress += pnum - skip_bytes;
1440cb2e2878SEric Blake         skip_bytes = 0;
1441cb2e2878SEric Blake     }
1442cb2e2878SEric Blake     ret = 0;
144361007b31SStefan Hajnoczi 
144461007b31SStefan Hajnoczi err:
144561007b31SStefan Hajnoczi     qemu_vfree(bounce_buffer);
144661007b31SStefan Hajnoczi     return ret;
144761007b31SStefan Hajnoczi }
144861007b31SStefan Hajnoczi 
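/*
 * Worked example for the copy-on-read path above (illustrative
 * numbers): with a 64 KiB cluster size, a guest read at
 * offset = 66560 for bytes = 512 is rounded out to
 * cluster_offset = 65536 and cluster_bytes = 65536 with
 * skip_bytes = 1024; the whole cluster goes through the bounce buffer
 * and only the 512 requested bytes are copied into the guest's qiov.
 */
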
144961007b31SStefan Hajnoczi /*
145061007b31SStefan Hajnoczi  * Forwards an already correctly aligned request to the BlockDriver. This
14511a62d0acSEric Blake  * handles copy on read, zeroing after EOF, and fragmentation of large
14521a62d0acSEric Blake  * reads; any other features must be implemented by the caller.
145361007b31SStefan Hajnoczi  */
145485c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
145561007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
145665cd4424SVladimir Sementsov-Ogievskiy     int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
145761007b31SStefan Hajnoczi {
145885c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
1459c9d20029SKevin Wolf     int64_t total_bytes, max_bytes;
14601a62d0acSEric Blake     int ret = 0;
14611a62d0acSEric Blake     uint64_t bytes_remaining = bytes;
14621a62d0acSEric Blake     int max_transfer;
146361007b31SStefan Hajnoczi 
146449c07526SKevin Wolf     assert(is_power_of_2(align));
146549c07526SKevin Wolf     assert((offset & (align - 1)) == 0);
146649c07526SKevin Wolf     assert((bytes & (align - 1)) == 0);
1467abb06c5aSDaniel P. Berrange     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
14681a62d0acSEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
14691a62d0acSEric Blake                                    align);
1470a604fa2bSEric Blake 
1471a604fa2bSEric Blake     /* TODO: We would need a per-BDS .supported_read_flags and
1472a604fa2bSEric Blake      * potential fallback support, if we ever implement any read flags
1473a604fa2bSEric Blake      * to pass through to drivers.  For now, there aren't any
1474a604fa2bSEric Blake      * passthrough flags.  */
1475c53cb427SPaolo Bonzini     assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));
147661007b31SStefan Hajnoczi 
147761007b31SStefan Hajnoczi     /* Handle Copy on Read and associated serialisation */
147861007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
147961007b31SStefan Hajnoczi         /* If we touch the same cluster it counts as an overlap.  This
148061007b31SStefan Hajnoczi          * guarantees that allocating writes will be serialized and not race
148161007b31SStefan Hajnoczi          * with each other for the same cluster.  For example, in copy-on-read
148261007b31SStefan Hajnoczi          * it ensures that the CoR read and write operations are atomic and
148361007b31SStefan Hajnoczi          * guest writes cannot interleave between them. */
1484304d9d7fSMax Reitz         bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
148518fbd0deSPaolo Bonzini     } else {
1486304d9d7fSMax Reitz         bdrv_wait_serialising_requests(req);
148718fbd0deSPaolo Bonzini     }
148861007b31SStefan Hajnoczi 
148961007b31SStefan Hajnoczi     if (flags & BDRV_REQ_COPY_ON_READ) {
1490d6a644bbSEric Blake         int64_t pnum;
149161007b31SStefan Hajnoczi 
149288e63df2SEric Blake         ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
149361007b31SStefan Hajnoczi         if (ret < 0) {
149461007b31SStefan Hajnoczi             goto out;
149561007b31SStefan Hajnoczi         }
149661007b31SStefan Hajnoczi 
149788e63df2SEric Blake         if (!ret || pnum != bytes) {
149865cd4424SVladimir Sementsov-Ogievskiy             ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
149965cd4424SVladimir Sementsov-Ogievskiy                                            qiov, qiov_offset, flags);
15003299e5ecSVladimir Sementsov-Ogievskiy             goto out;
15013299e5ecSVladimir Sementsov-Ogievskiy         } else if (flags & BDRV_REQ_PREFETCH) {
150261007b31SStefan Hajnoczi             goto out;
150361007b31SStefan Hajnoczi         }
150461007b31SStefan Hajnoczi     }
150561007b31SStefan Hajnoczi 
15061a62d0acSEric Blake     /* Forward the request to the BlockDriver, possibly fragmenting it */
150749c07526SKevin Wolf     total_bytes = bdrv_getlength(bs);
150849c07526SKevin Wolf     if (total_bytes < 0) {
150949c07526SKevin Wolf         ret = total_bytes;
151061007b31SStefan Hajnoczi         goto out;
151161007b31SStefan Hajnoczi     }
151261007b31SStefan Hajnoczi 
151349c07526SKevin Wolf     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
15141a62d0acSEric Blake     if (bytes <= max_bytes && bytes <= max_transfer) {
151565cd4424SVladimir Sementsov-Ogievskiy         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
15161a62d0acSEric Blake         goto out;
151761007b31SStefan Hajnoczi     }
151861007b31SStefan Hajnoczi 
15191a62d0acSEric Blake     while (bytes_remaining) {
15201a62d0acSEric Blake         int num;
15211a62d0acSEric Blake 
15221a62d0acSEric Blake         if (max_bytes) {
15231a62d0acSEric Blake             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
15241a62d0acSEric Blake             assert(num);
15251a62d0acSEric Blake 
15261a62d0acSEric Blake             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1527134b7decSMax Reitz                                      num, qiov,
1528134b7decSMax Reitz                                      qiov_offset + bytes - bytes_remaining, 0);
15291a62d0acSEric Blake             max_bytes -= num;
15301a62d0acSEric Blake         } else {
15311a62d0acSEric Blake             num = bytes_remaining;
1532134b7decSMax Reitz             ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
1533134b7decSMax Reitz                                     0, bytes_remaining);
15341a62d0acSEric Blake         }
15351a62d0acSEric Blake         if (ret < 0) {
15361a62d0acSEric Blake             goto out;
15371a62d0acSEric Blake         }
15381a62d0acSEric Blake         bytes_remaining -= num;
153961007b31SStefan Hajnoczi     }
154061007b31SStefan Hajnoczi 
154161007b31SStefan Hajnoczi out:
15421a62d0acSEric Blake     return ret < 0 ? ret : 0;
154361007b31SStefan Hajnoczi }
154461007b31SStefan Hajnoczi 
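/*
 * Fragmentation example for bdrv_aligned_preadv() above (assumed
 * limit): with bs->bl.max_transfer = 1 MiB, an aligned 3 MiB read
 * well inside EOF is issued as three 1 MiB driver requests; any tail
 * of the request past max_bytes (EOF rounded up to the alignment) is
 * not read at all but zeroed directly in the qiov via
 * qemu_iovec_memset().
 */
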
154561007b31SStefan Hajnoczi /*
15467a3f542fSVladimir Sementsov-Ogievskiy  * Request padding
15477a3f542fSVladimir Sementsov-Ogievskiy  *
15487a3f542fSVladimir Sementsov-Ogievskiy  *  |<---- align ----->|                     |<----- align ---->|
15497a3f542fSVladimir Sementsov-Ogievskiy  *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
15507a3f542fSVladimir Sementsov-Ogievskiy  *  |          |       |                     |     |            |
15517a3f542fSVladimir Sementsov-Ogievskiy  * -*----------$-------*-------- ... --------*-----$------------*---
15527a3f542fSVladimir Sementsov-Ogievskiy  *  |          |       |                     |     |            |
15537a3f542fSVladimir Sementsov-Ogievskiy  *  |          offset  |                     |     end          |
15547a3f542fSVladimir Sementsov-Ogievskiy  *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
15557a3f542fSVladimir Sementsov-Ogievskiy  *  [buf   ... )                             [tail_buf          )
15567a3f542fSVladimir Sementsov-Ogievskiy  *
15577a3f542fSVladimir Sementsov-Ogievskiy  * @buf is an aligned allocation needed to store @head and @tail paddings. @head
15587a3f542fSVladimir Sementsov-Ogievskiy  * is placed at the beginning of @buf and @tail at the end.
15597a3f542fSVladimir Sementsov-Ogievskiy  *
15607a3f542fSVladimir Sementsov-Ogievskiy  * @tail_buf is a pointer to the sub-buffer corresponding to the align-sized
15617a3f542fSVladimir Sementsov-Ogievskiy  * chunk around the tail, if a tail exists.
15627a3f542fSVladimir Sementsov-Ogievskiy  *
15637a3f542fSVladimir Sementsov-Ogievskiy  * @merge_reads is true for small requests, i.e. when
15647a3f542fSVladimir Sementsov-Ogievskiy  * @buf_len == @head + bytes + @tail. In this case it is possible that both
15657a3f542fSVladimir Sementsov-Ogievskiy  * head and tail exist but @buf_len == align and @tail_buf == @buf.
156661007b31SStefan Hajnoczi  */
15677a3f542fSVladimir Sementsov-Ogievskiy typedef struct BdrvRequestPadding {
15687a3f542fSVladimir Sementsov-Ogievskiy     uint8_t *buf;
15697a3f542fSVladimir Sementsov-Ogievskiy     size_t buf_len;
15707a3f542fSVladimir Sementsov-Ogievskiy     uint8_t *tail_buf;
15717a3f542fSVladimir Sementsov-Ogievskiy     size_t head;
15727a3f542fSVladimir Sementsov-Ogievskiy     size_t tail;
15737a3f542fSVladimir Sementsov-Ogievskiy     bool merge_reads;
15747a3f542fSVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
15757a3f542fSVladimir Sementsov-Ogievskiy } BdrvRequestPadding;
15767a3f542fSVladimir Sementsov-Ogievskiy 
15777a3f542fSVladimir Sementsov-Ogievskiy static bool bdrv_init_padding(BlockDriverState *bs,
15787a3f542fSVladimir Sementsov-Ogievskiy                               int64_t offset, int64_t bytes,
15797a3f542fSVladimir Sementsov-Ogievskiy                               BdrvRequestPadding *pad)
15807a3f542fSVladimir Sementsov-Ogievskiy {
15817a3f542fSVladimir Sementsov-Ogievskiy     uint64_t align = bs->bl.request_alignment;
15827a3f542fSVladimir Sementsov-Ogievskiy     size_t sum;
15837a3f542fSVladimir Sementsov-Ogievskiy 
15847a3f542fSVladimir Sementsov-Ogievskiy     memset(pad, 0, sizeof(*pad));
15857a3f542fSVladimir Sementsov-Ogievskiy 
15867a3f542fSVladimir Sementsov-Ogievskiy     pad->head = offset & (align - 1);
15877a3f542fSVladimir Sementsov-Ogievskiy     pad->tail = ((offset + bytes) & (align - 1));
15887a3f542fSVladimir Sementsov-Ogievskiy     if (pad->tail) {
15897a3f542fSVladimir Sementsov-Ogievskiy         pad->tail = align - pad->tail;
15907a3f542fSVladimir Sementsov-Ogievskiy     }
15917a3f542fSVladimir Sementsov-Ogievskiy 
1592ac9d00bfSVladimir Sementsov-Ogievskiy     if (!pad->head && !pad->tail) {
15937a3f542fSVladimir Sementsov-Ogievskiy         return false;
15947a3f542fSVladimir Sementsov-Ogievskiy     }
15957a3f542fSVladimir Sementsov-Ogievskiy 
1596ac9d00bfSVladimir Sementsov-Ogievskiy     assert(bytes); /* Nothing good in aligning zero-length requests */
1597ac9d00bfSVladimir Sementsov-Ogievskiy 
15987a3f542fSVladimir Sementsov-Ogievskiy     sum = pad->head + bytes + pad->tail;
15997a3f542fSVladimir Sementsov-Ogievskiy     pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
16007a3f542fSVladimir Sementsov-Ogievskiy     pad->buf = qemu_blockalign(bs, pad->buf_len);
16017a3f542fSVladimir Sementsov-Ogievskiy     pad->merge_reads = sum == pad->buf_len;
16027a3f542fSVladimir Sementsov-Ogievskiy     if (pad->tail) {
16037a3f542fSVladimir Sementsov-Ogievskiy         pad->tail_buf = pad->buf + pad->buf_len - align;
16047a3f542fSVladimir Sementsov-Ogievskiy     }
16057a3f542fSVladimir Sementsov-Ogievskiy 
16067a3f542fSVladimir Sementsov-Ogievskiy     return true;
16077a3f542fSVladimir Sementsov-Ogievskiy }
16087a3f542fSVladimir Sementsov-Ogievskiy 
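/*
 * Worked example for bdrv_init_padding() above (illustrative numbers):
 * align = 512, offset = 1000, bytes = 2000 give
 * head = 1000 % 512 = 488 and tail = 512 - (3000 % 512) = 72, so
 * sum = 488 + 2000 + 72 = 2560.  Since sum > align and both paddings
 * exist, buf_len = 2 * 512 = 1024 and merge_reads is false.
 */
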
16097a3f542fSVladimir Sementsov-Ogievskiy static int bdrv_padding_rmw_read(BdrvChild *child,
16107a3f542fSVladimir Sementsov-Ogievskiy                                  BdrvTrackedRequest *req,
16117a3f542fSVladimir Sementsov-Ogievskiy                                  BdrvRequestPadding *pad,
16127a3f542fSVladimir Sementsov-Ogievskiy                                  bool zero_middle)
16137a3f542fSVladimir Sementsov-Ogievskiy {
16147a3f542fSVladimir Sementsov-Ogievskiy     QEMUIOVector local_qiov;
16157a3f542fSVladimir Sementsov-Ogievskiy     BlockDriverState *bs = child->bs;
16167a3f542fSVladimir Sementsov-Ogievskiy     uint64_t align = bs->bl.request_alignment;
16177a3f542fSVladimir Sementsov-Ogievskiy     int ret;
16187a3f542fSVladimir Sementsov-Ogievskiy 
16197a3f542fSVladimir Sementsov-Ogievskiy     assert(req->serialising && pad->buf);
16207a3f542fSVladimir Sementsov-Ogievskiy 
16217a3f542fSVladimir Sementsov-Ogievskiy     if (pad->head || pad->merge_reads) {
16227a3f542fSVladimir Sementsov-Ogievskiy         uint64_t bytes = pad->merge_reads ? pad->buf_len : align;
16237a3f542fSVladimir Sementsov-Ogievskiy 
16247a3f542fSVladimir Sementsov-Ogievskiy         qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
16257a3f542fSVladimir Sementsov-Ogievskiy 
16267a3f542fSVladimir Sementsov-Ogievskiy         if (pad->head) {
16277a3f542fSVladimir Sementsov-Ogievskiy             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
16287a3f542fSVladimir Sementsov-Ogievskiy         }
16297a3f542fSVladimir Sementsov-Ogievskiy         if (pad->merge_reads && pad->tail) {
16307a3f542fSVladimir Sementsov-Ogievskiy             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
16317a3f542fSVladimir Sementsov-Ogievskiy         }
16327a3f542fSVladimir Sementsov-Ogievskiy         ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
163365cd4424SVladimir Sementsov-Ogievskiy                                   align, &local_qiov, 0, 0);
16347a3f542fSVladimir Sementsov-Ogievskiy         if (ret < 0) {
16357a3f542fSVladimir Sementsov-Ogievskiy             return ret;
16367a3f542fSVladimir Sementsov-Ogievskiy         }
16377a3f542fSVladimir Sementsov-Ogievskiy         if (pad->head) {
16387a3f542fSVladimir Sementsov-Ogievskiy             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
16397a3f542fSVladimir Sementsov-Ogievskiy         }
16407a3f542fSVladimir Sementsov-Ogievskiy         if (pad->merge_reads && pad->tail) {
16417a3f542fSVladimir Sementsov-Ogievskiy             bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
16427a3f542fSVladimir Sementsov-Ogievskiy         }
16437a3f542fSVladimir Sementsov-Ogievskiy 
16447a3f542fSVladimir Sementsov-Ogievskiy         if (pad->merge_reads) {
16457a3f542fSVladimir Sementsov-Ogievskiy             goto zero_mem;
16467a3f542fSVladimir Sementsov-Ogievskiy         }
16477a3f542fSVladimir Sementsov-Ogievskiy     }
16487a3f542fSVladimir Sementsov-Ogievskiy 
16497a3f542fSVladimir Sementsov-Ogievskiy     if (pad->tail) {
16507a3f542fSVladimir Sementsov-Ogievskiy         qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
16517a3f542fSVladimir Sementsov-Ogievskiy 
16527a3f542fSVladimir Sementsov-Ogievskiy         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
16537a3f542fSVladimir Sementsov-Ogievskiy         ret = bdrv_aligned_preadv(
16547a3f542fSVladimir Sementsov-Ogievskiy                 child, req,
16557a3f542fSVladimir Sementsov-Ogievskiy                 req->overlap_offset + req->overlap_bytes - align,
165665cd4424SVladimir Sementsov-Ogievskiy                 align, align, &local_qiov, 0, 0);
16577a3f542fSVladimir Sementsov-Ogievskiy         if (ret < 0) {
16587a3f542fSVladimir Sementsov-Ogievskiy             return ret;
16597a3f542fSVladimir Sementsov-Ogievskiy         }
16607a3f542fSVladimir Sementsov-Ogievskiy         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
16617a3f542fSVladimir Sementsov-Ogievskiy     }
16627a3f542fSVladimir Sementsov-Ogievskiy 
16637a3f542fSVladimir Sementsov-Ogievskiy zero_mem:
16647a3f542fSVladimir Sementsov-Ogievskiy     if (zero_middle) {
16657a3f542fSVladimir Sementsov-Ogievskiy         memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
16667a3f542fSVladimir Sementsov-Ogievskiy     }
16677a3f542fSVladimir Sementsov-Ogievskiy 
16687a3f542fSVladimir Sementsov-Ogievskiy     return 0;
16697a3f542fSVladimir Sementsov-Ogievskiy }
16707a3f542fSVladimir Sementsov-Ogievskiy 
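/*
 * Continuing the example numbers: with align = 512, head = 488 and
 * tail = 72, bdrv_padding_rmw_read() above reads the first aligned
 * block into pad->buf and the last aligned block into pad->tail_buf,
 * so the bytes the caller does not touch are later written back
 * unchanged around the caller's data.
 */
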
16717a3f542fSVladimir Sementsov-Ogievskiy static void bdrv_padding_destroy(BdrvRequestPadding *pad)
16727a3f542fSVladimir Sementsov-Ogievskiy {
16737a3f542fSVladimir Sementsov-Ogievskiy     if (pad->buf) {
16747a3f542fSVladimir Sementsov-Ogievskiy         qemu_vfree(pad->buf);
16757a3f542fSVladimir Sementsov-Ogievskiy         qemu_iovec_destroy(&pad->local_qiov);
16767a3f542fSVladimir Sementsov-Ogievskiy     }
16777a3f542fSVladimir Sementsov-Ogievskiy }
16787a3f542fSVladimir Sementsov-Ogievskiy 
16797a3f542fSVladimir Sementsov-Ogievskiy /*
16807a3f542fSVladimir Sementsov-Ogievskiy  * bdrv_pad_request
16817a3f542fSVladimir Sementsov-Ogievskiy  *
16827a3f542fSVladimir Sementsov-Ogievskiy  * Exchange request parameters with a padded request if needed. This does not
16837a3f542fSVladimir Sementsov-Ogievskiy  * include the RMW read of padding; bdrv_padding_rmw_read() should be called
16847a3f542fSVladimir Sementsov-Ogievskiy  * separately if needed.
16857a3f542fSVladimir Sementsov-Ogievskiy  *
16867a3f542fSVladimir Sementsov-Ogievskiy  * All parameters except @bs are in-out: they describe the original request on
16877a3f542fSVladimir Sementsov-Ogievskiy  * entry and the padded request (if padding is needed) on return.
16887a3f542fSVladimir Sementsov-Ogievskiy  *
16897a3f542fSVladimir Sementsov-Ogievskiy  * Function always succeeds.
16907a3f542fSVladimir Sementsov-Ogievskiy  */
16911acc3466SVladimir Sementsov-Ogievskiy static bool bdrv_pad_request(BlockDriverState *bs,
16921acc3466SVladimir Sementsov-Ogievskiy                              QEMUIOVector **qiov, size_t *qiov_offset,
16937a3f542fSVladimir Sementsov-Ogievskiy                              int64_t *offset, unsigned int *bytes,
16947a3f542fSVladimir Sementsov-Ogievskiy                              BdrvRequestPadding *pad)
16957a3f542fSVladimir Sementsov-Ogievskiy {
16967a3f542fSVladimir Sementsov-Ogievskiy     if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
16977a3f542fSVladimir Sementsov-Ogievskiy         return false;
16987a3f542fSVladimir Sementsov-Ogievskiy     }
16997a3f542fSVladimir Sementsov-Ogievskiy 
17007a3f542fSVladimir Sementsov-Ogievskiy     qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
17011acc3466SVladimir Sementsov-Ogievskiy                              *qiov, *qiov_offset, *bytes,
17027a3f542fSVladimir Sementsov-Ogievskiy                              pad->buf + pad->buf_len - pad->tail, pad->tail);
17037a3f542fSVladimir Sementsov-Ogievskiy     *bytes += pad->head + pad->tail;
17047a3f542fSVladimir Sementsov-Ogievskiy     *offset -= pad->head;
17057a3f542fSVladimir Sementsov-Ogievskiy     *qiov = &pad->local_qiov;
17061acc3466SVladimir Sementsov-Ogievskiy     *qiov_offset = 0;
17077a3f542fSVladimir Sementsov-Ogievskiy 
17087a3f542fSVladimir Sementsov-Ogievskiy     return true;
17097a3f542fSVladimir Sementsov-Ogievskiy }
17107a3f542fSVladimir Sementsov-Ogievskiy 
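/*
 * With the same example numbers, bdrv_pad_request() above rewrites the
 * request to offset = 512 (1000 - 488) and bytes = 2560
 * (488 + 2000 + 72), pointing *qiov at pad->local_qiov, which chains
 * the head padding, the caller's buffers and the tail padding.
 */
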
1711a03ef88fSKevin Wolf int coroutine_fn bdrv_co_preadv(BdrvChild *child,
171261007b31SStefan Hajnoczi     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
171361007b31SStefan Hajnoczi     BdrvRequestFlags flags)
171461007b31SStefan Hajnoczi {
17151acc3466SVladimir Sementsov-Ogievskiy     return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
17161acc3466SVladimir Sementsov-Ogievskiy }
17171acc3466SVladimir Sementsov-Ogievskiy 
17181acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
17191acc3466SVladimir Sementsov-Ogievskiy     int64_t offset, unsigned int bytes,
17201acc3466SVladimir Sementsov-Ogievskiy     QEMUIOVector *qiov, size_t qiov_offset,
17211acc3466SVladimir Sementsov-Ogievskiy     BdrvRequestFlags flags)
17221acc3466SVladimir Sementsov-Ogievskiy {
1723a03ef88fSKevin Wolf     BlockDriverState *bs = child->bs;
172461007b31SStefan Hajnoczi     BdrvTrackedRequest req;
17257a3f542fSVladimir Sementsov-Ogievskiy     BdrvRequestPadding pad;
172661007b31SStefan Hajnoczi     int ret;
172761007b31SStefan Hajnoczi 
17287a3f542fSVladimir Sementsov-Ogievskiy     trace_bdrv_co_preadv(bs, offset, bytes, flags);
172961007b31SStefan Hajnoczi 
173061007b31SStefan Hajnoczi     ret = bdrv_check_byte_request(bs, offset, bytes);
173161007b31SStefan Hajnoczi     if (ret < 0) {
173261007b31SStefan Hajnoczi         return ret;
173361007b31SStefan Hajnoczi     }
173461007b31SStefan Hajnoczi 
1735ac9d00bfSVladimir Sementsov-Ogievskiy     if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
1736ac9d00bfSVladimir Sementsov-Ogievskiy         /*
1737ac9d00bfSVladimir Sementsov-Ogievskiy          * Aligning a zero-length request is pointless. Even if a driver has a
1738ac9d00bfSVladimir Sementsov-Ogievskiy          * special meaning for zero-length requests (like
1739ac9d00bfSVladimir Sementsov-Ogievskiy          * qcow2_co_pwritev_compressed_part), we can't pass one to the
1740ac9d00bfSVladimir Sementsov-Ogievskiy          * driver due to request_alignment.
1741ac9d00bfSVladimir Sementsov-Ogievskiy          *
1742ac9d00bfSVladimir Sementsov-Ogievskiy          * Still, there is no reason to return an error if someone does an
         * unaligned zero-length read occasionally.
1743ac9d00bfSVladimir Sementsov-Ogievskiy          */
1744ac9d00bfSVladimir Sementsov-Ogievskiy         return 0;
1745ac9d00bfSVladimir Sementsov-Ogievskiy     }
1746ac9d00bfSVladimir Sementsov-Ogievskiy 
174799723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
174899723548SPaolo Bonzini 
17499568b511SWen Congyang     /* Don't do copy-on-read if we are reading data before a write operation */
1750c53cb427SPaolo Bonzini     if (atomic_read(&bs->copy_on_read)) {
175161007b31SStefan Hajnoczi         flags |= BDRV_REQ_COPY_ON_READ;
175261007b31SStefan Hajnoczi     }
175361007b31SStefan Hajnoczi 
17541acc3466SVladimir Sementsov-Ogievskiy     bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);
175561007b31SStefan Hajnoczi 
1756ebde595cSFam Zheng     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
17577a3f542fSVladimir Sementsov-Ogievskiy     ret = bdrv_aligned_preadv(child, &req, offset, bytes,
17587a3f542fSVladimir Sementsov-Ogievskiy                               bs->bl.request_alignment,
17591acc3466SVladimir Sementsov-Ogievskiy                               qiov, qiov_offset, flags);
176061007b31SStefan Hajnoczi     tracked_request_end(&req);
176199723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
176261007b31SStefan Hajnoczi 
17637a3f542fSVladimir Sementsov-Ogievskiy     bdrv_padding_destroy(&pad);
176461007b31SStefan Hajnoczi 
176561007b31SStefan Hajnoczi     return ret;
176661007b31SStefan Hajnoczi }
176761007b31SStefan Hajnoczi 
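/*
 * Illustrative sketch (hypothetical coroutine, not in the original
 * file): a simple buffer read through a child.  Alignment, padding and
 * copy-on-read are all handled inside bdrv_co_preadv_part().
 */
static int coroutine_fn example_co_read(BdrvChild *child, int64_t offset,
                                        unsigned int bytes, void *buf)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    return bdrv_co_preadv(child, offset, bytes, &qiov, 0);
}
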
1768d05aa8bbSEric Blake static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1769f5a5ca79SManos Pitsidianakis     int64_t offset, int bytes, BdrvRequestFlags flags)
177061007b31SStefan Hajnoczi {
177161007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
177261007b31SStefan Hajnoczi     QEMUIOVector qiov;
17730d93ed08SVladimir Sementsov-Ogievskiy     void *buf = NULL;
177461007b31SStefan Hajnoczi     int ret = 0;
1775465fe887SEric Blake     bool need_flush = false;
1776443668caSDenis V. Lunev     int head = 0;
1777443668caSDenis V. Lunev     int tail = 0;
177861007b31SStefan Hajnoczi 
1779cf081fcaSEric Blake     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1780a5b8dd2cSEric Blake     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1781a5b8dd2cSEric Blake                         bs->bl.request_alignment);
1782cb2e2878SEric Blake     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1783cf081fcaSEric Blake 
1784d470ad42SMax Reitz     if (!drv) {
1785d470ad42SMax Reitz         return -ENOMEDIUM;
1786d470ad42SMax Reitz     }
1787d470ad42SMax Reitz 
1788fe0480d6SKevin Wolf     if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1789fe0480d6SKevin Wolf         return -ENOTSUP;
1790fe0480d6SKevin Wolf     }
1791fe0480d6SKevin Wolf 
1792b8d0a980SEric Blake     assert(alignment % bs->bl.request_alignment == 0);
1793b8d0a980SEric Blake     head = offset % alignment;
1794f5a5ca79SManos Pitsidianakis     tail = (offset + bytes) % alignment;
1795b8d0a980SEric Blake     max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1796b8d0a980SEric Blake     assert(max_write_zeroes >= bs->bl.request_alignment);
179761007b31SStefan Hajnoczi 
1798f5a5ca79SManos Pitsidianakis     while (bytes > 0 && !ret) {
1799f5a5ca79SManos Pitsidianakis         int num = bytes;
180061007b31SStefan Hajnoczi 
180161007b31SStefan Hajnoczi         /* Align the request.  Block drivers can expect the "bulk" of the request
1802443668caSDenis V. Lunev          * to be aligned, and unaligned requests not to cross cluster
1803443668caSDenis V. Lunev          * boundaries.
180461007b31SStefan Hajnoczi          */
1805443668caSDenis V. Lunev         if (head) {
1806b2f95feeSEric Blake             /* Make a small request up to the first aligned sector. For
1807b2f95feeSEric Blake              * convenience, limit this request to max_transfer even if
1808b2f95feeSEric Blake              * we don't need to fall back to writes.  */
1809f5a5ca79SManos Pitsidianakis             num = MIN(MIN(bytes, max_transfer), alignment - head);
1810b2f95feeSEric Blake             head = (head + num) % alignment;
1811b2f95feeSEric Blake             assert(num < max_write_zeroes);
1812d05aa8bbSEric Blake         } else if (tail && num > alignment) {
1813443668caSDenis V. Lunev             /* Shorten the request to the last aligned sector.  */
1814443668caSDenis V. Lunev             num -= tail;
181561007b31SStefan Hajnoczi         }
181661007b31SStefan Hajnoczi 
181761007b31SStefan Hajnoczi         /* limit request size */
181861007b31SStefan Hajnoczi         if (num > max_write_zeroes) {
181961007b31SStefan Hajnoczi             num = max_write_zeroes;
182061007b31SStefan Hajnoczi         }
182161007b31SStefan Hajnoczi 
182261007b31SStefan Hajnoczi         ret = -ENOTSUP;
182361007b31SStefan Hajnoczi         /* First try the efficient write zeroes operation */
1824d05aa8bbSEric Blake         if (drv->bdrv_co_pwrite_zeroes) {
1825d05aa8bbSEric Blake             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1826d05aa8bbSEric Blake                                              flags & bs->supported_zero_flags);
1827d05aa8bbSEric Blake             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1828d05aa8bbSEric Blake                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1829d05aa8bbSEric Blake                 need_flush = true;
1830d05aa8bbSEric Blake             }
1831465fe887SEric Blake         } else {
1832465fe887SEric Blake             assert(!bs->supported_zero_flags);
183361007b31SStefan Hajnoczi         }
183461007b31SStefan Hajnoczi 
1835294682ccSAndrey Shinkevich         if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
183661007b31SStefan Hajnoczi             /* Fall back to bounce buffer if write zeroes is unsupported */
1837465fe887SEric Blake             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1838465fe887SEric Blake 
1839465fe887SEric Blake             if ((flags & BDRV_REQ_FUA) &&
1840465fe887SEric Blake                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1841465fe887SEric Blake                 /* No need for bdrv_driver_pwritev() to do a fallback
1842465fe887SEric Blake                  * flush on each chunk; use just one at the end */
1843465fe887SEric Blake                 write_flags &= ~BDRV_REQ_FUA;
1844465fe887SEric Blake                 need_flush = true;
1845465fe887SEric Blake             }
18465def6b80SEric Blake             num = MIN(num, max_transfer);
18470d93ed08SVladimir Sementsov-Ogievskiy             if (buf == NULL) {
18480d93ed08SVladimir Sementsov-Ogievskiy                 buf = qemu_try_blockalign0(bs, num);
18490d93ed08SVladimir Sementsov-Ogievskiy                 if (buf == NULL) {
185061007b31SStefan Hajnoczi                     ret = -ENOMEM;
185161007b31SStefan Hajnoczi                     goto fail;
185261007b31SStefan Hajnoczi                 }
185361007b31SStefan Hajnoczi             }
18540d93ed08SVladimir Sementsov-Ogievskiy             qemu_iovec_init_buf(&qiov, buf, num);
185561007b31SStefan Hajnoczi 
1856ac850bf0SVladimir Sementsov-Ogievskiy             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
185761007b31SStefan Hajnoczi 
185861007b31SStefan Hajnoczi             /* Keep the bounce buffer around if it is big enough for
185961007b31SStefan Hajnoczi              * all future requests.
186061007b31SStefan Hajnoczi              */
18615def6b80SEric Blake             if (num < max_transfer) {
18620d93ed08SVladimir Sementsov-Ogievskiy                 qemu_vfree(buf);
18630d93ed08SVladimir Sementsov-Ogievskiy                 buf = NULL;
186461007b31SStefan Hajnoczi             }
186561007b31SStefan Hajnoczi         }
186661007b31SStefan Hajnoczi 
1867d05aa8bbSEric Blake         offset += num;
1868f5a5ca79SManos Pitsidianakis         bytes -= num;
186961007b31SStefan Hajnoczi     }
187061007b31SStefan Hajnoczi 
187161007b31SStefan Hajnoczi fail:
1872465fe887SEric Blake     if (ret == 0 && need_flush) {
1873465fe887SEric Blake         ret = bdrv_co_flush(bs);
1874465fe887SEric Blake     }
18750d93ed08SVladimir Sementsov-Ogievskiy     qemu_vfree(buf);
187661007b31SStefan Hajnoczi     return ret;
187761007b31SStefan Hajnoczi }
187861007b31SStefan Hajnoczi 
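/*
 * Alignment example for bdrv_co_do_pwrite_zeroes() above (illustrative
 * numbers): with alignment = 4096, a request of offset = 2048,
 * bytes = 11264 is issued as a 2048-byte head fragment up to the first
 * aligned boundary, an aligned 8192-byte middle chunk, and a 1024-byte
 * tail fragment, so no fragment straddles an alignment boundary.
 */
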
187985fe2479SFam Zheng static inline int coroutine_fn
188085fe2479SFam Zheng bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
188185fe2479SFam Zheng                           BdrvTrackedRequest *req, int flags)
188285fe2479SFam Zheng {
188385fe2479SFam Zheng     BlockDriverState *bs = child->bs;
188485fe2479SFam Zheng     bool waited;
188585fe2479SFam Zheng     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
188685fe2479SFam Zheng 
188785fe2479SFam Zheng     if (bs->read_only) {
188885fe2479SFam Zheng         return -EPERM;
188985fe2479SFam Zheng     }
189085fe2479SFam Zheng 
189185fe2479SFam Zheng     assert(!(bs->open_flags & BDRV_O_INACTIVE));
189285fe2479SFam Zheng     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
189385fe2479SFam Zheng     assert(!(flags & ~BDRV_REQ_MASK));
189485fe2479SFam Zheng 
189585fe2479SFam Zheng     if (flags & BDRV_REQ_SERIALISING) {
189618fbd0deSPaolo Bonzini         waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
189718fbd0deSPaolo Bonzini         /*
189818fbd0deSPaolo Bonzini          * For a misaligned request we should have already waited earlier,
189918fbd0deSPaolo Bonzini          * because we come after bdrv_padding_rmw_read which must be called
190018fbd0deSPaolo Bonzini          * with the request already marked as serialising.
190118fbd0deSPaolo Bonzini          */
190218fbd0deSPaolo Bonzini         assert(!waited ||
190318fbd0deSPaolo Bonzini                (req->offset == req->overlap_offset &&
190418fbd0deSPaolo Bonzini                 req->bytes == req->overlap_bytes));
190518fbd0deSPaolo Bonzini     } else {
190618fbd0deSPaolo Bonzini         bdrv_wait_serialising_requests(req);
190785fe2479SFam Zheng     }
190885fe2479SFam Zheng 
190985fe2479SFam Zheng     assert(req->overlap_offset <= offset);
191085fe2479SFam Zheng     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1911cd47d792SFam Zheng     assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
191285fe2479SFam Zheng 
1913cd47d792SFam Zheng     switch (req->type) {
1914cd47d792SFam Zheng     case BDRV_TRACKED_WRITE:
1915cd47d792SFam Zheng     case BDRV_TRACKED_DISCARD:
191685fe2479SFam Zheng         if (flags & BDRV_REQ_WRITE_UNCHANGED) {
191785fe2479SFam Zheng             assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
191885fe2479SFam Zheng         } else {
191985fe2479SFam Zheng             assert(child->perm & BLK_PERM_WRITE);
192085fe2479SFam Zheng         }
1921cd47d792SFam Zheng         return notifier_with_return_list_notify(&bs->before_write_notifiers,
1922cd47d792SFam Zheng                                                 req);
1923cd47d792SFam Zheng     case BDRV_TRACKED_TRUNCATE:
1924cd47d792SFam Zheng         assert(child->perm & BLK_PERM_RESIZE);
1925cd47d792SFam Zheng         return 0;
1926cd47d792SFam Zheng     default:
1927cd47d792SFam Zheng         abort();
1928cd47d792SFam Zheng     }
192985fe2479SFam Zheng }
193085fe2479SFam Zheng 
193185fe2479SFam Zheng static inline void coroutine_fn
193285fe2479SFam Zheng bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
193385fe2479SFam Zheng                          BdrvTrackedRequest *req, int ret)
193485fe2479SFam Zheng {
193585fe2479SFam Zheng     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
193685fe2479SFam Zheng     BlockDriverState *bs = child->bs;
193785fe2479SFam Zheng 
193885fe2479SFam Zheng     atomic_inc(&bs->write_gen);
193985fe2479SFam Zheng 
194000695c27SFam Zheng     /*
194100695c27SFam Zheng      * Discard cannot extend the image, but in error handling cases, such as
194200695c27SFam Zheng      * when reverting a qcow2 cluster allocation, the discarded range can pass
194300695c27SFam Zheng      * the end of the image file, so we cannot assert about BDRV_TRACKED_DISCARD
194400695c27SFam Zheng      * here. Instead, just skip it, since semantically a discard request
194500695c27SFam Zheng      * beyond EOF cannot expand the image anyway.
194600695c27SFam Zheng      */
19477f8f03efSFam Zheng     if (ret == 0 &&
1948cd47d792SFam Zheng         (req->type == BDRV_TRACKED_TRUNCATE ||
1949cd47d792SFam Zheng          end_sector > bs->total_sectors) &&
195000695c27SFam Zheng         req->type != BDRV_TRACKED_DISCARD) {
19517f8f03efSFam Zheng         bs->total_sectors = end_sector;
19527f8f03efSFam Zheng         bdrv_parent_cb_resize(bs);
19537f8f03efSFam Zheng         bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
195485fe2479SFam Zheng     }
195500695c27SFam Zheng     if (req->bytes) {
195600695c27SFam Zheng         switch (req->type) {
195700695c27SFam Zheng         case BDRV_TRACKED_WRITE:
195800695c27SFam Zheng             stat64_max(&bs->wr_highest_offset, offset + bytes);
195900695c27SFam Zheng             /* fall through, to set dirty bits */
196000695c27SFam Zheng         case BDRV_TRACKED_DISCARD:
19617f8f03efSFam Zheng             bdrv_set_dirty(bs, offset, bytes);
196200695c27SFam Zheng             break;
196300695c27SFam Zheng         default:
196400695c27SFam Zheng             break;
196500695c27SFam Zheng         }
196600695c27SFam Zheng     }
196785fe2479SFam Zheng }
196885fe2479SFam Zheng 
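/*
 * Illustrative call pattern for the two helpers above (a sketch only; the
 * real user is bdrv_aligned_pwritev() below).  Note that _finish runs
 * unconditionally, even when _prepare or the driver write failed, so the
 * write generation, total_sectors and dirty bitmaps stay consistent:
 *
 *     ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
 *     if (ret >= 0) {
 *         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, 0, flags);
 *     }
 *     bdrv_co_write_req_finish(child, offset, bytes, req, ret);
 */
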
196961007b31SStefan Hajnoczi /*
197004ed95f4SEric Blake  * Forwards an already correctly aligned write request to the BlockDriver,
197104ed95f4SEric Blake  * after possibly fragmenting it.
197261007b31SStefan Hajnoczi  */
197385c97ca7SKevin Wolf static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
197461007b31SStefan Hajnoczi     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
197528c4da28SVladimir Sementsov-Ogievskiy     int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
197661007b31SStefan Hajnoczi {
197785c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
197861007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
197961007b31SStefan Hajnoczi     int ret;
198061007b31SStefan Hajnoczi 
198104ed95f4SEric Blake     uint64_t bytes_remaining = bytes;
198204ed95f4SEric Blake     int max_transfer;
198361007b31SStefan Hajnoczi 
1984d470ad42SMax Reitz     if (!drv) {
1985d470ad42SMax Reitz         return -ENOMEDIUM;
1986d470ad42SMax Reitz     }
1987d470ad42SMax Reitz 
1988d6883bc9SVladimir Sementsov-Ogievskiy     if (bdrv_has_readonly_bitmaps(bs)) {
1989d6883bc9SVladimir Sementsov-Ogievskiy         return -EPERM;
1990d6883bc9SVladimir Sementsov-Ogievskiy     }
1991d6883bc9SVladimir Sementsov-Ogievskiy 
1992cff86b38SEric Blake     assert(is_power_of_2(align));
1993cff86b38SEric Blake     assert((offset & (align - 1)) == 0);
1994cff86b38SEric Blake     assert((bytes & (align - 1)) == 0);
199528c4da28SVladimir Sementsov-Ogievskiy     assert(!qiov || qiov_offset + bytes <= qiov->size);
199604ed95f4SEric Blake     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
199704ed95f4SEric Blake                                    align);
199861007b31SStefan Hajnoczi 
199985fe2479SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
200061007b31SStefan Hajnoczi 
200161007b31SStefan Hajnoczi     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
2002c1499a5eSEric Blake         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
200328c4da28SVladimir Sementsov-Ogievskiy         qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
200461007b31SStefan Hajnoczi         flags |= BDRV_REQ_ZERO_WRITE;
200561007b31SStefan Hajnoczi         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
200661007b31SStefan Hajnoczi             flags |= BDRV_REQ_MAY_UNMAP;
200761007b31SStefan Hajnoczi         }
200861007b31SStefan Hajnoczi     }
200961007b31SStefan Hajnoczi 
201061007b31SStefan Hajnoczi     if (ret < 0) {
201161007b31SStefan Hajnoczi         /* Do nothing, write notifier decided to fail this request */
201261007b31SStefan Hajnoczi     } else if (flags & BDRV_REQ_ZERO_WRITE) {
20139a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
20149896c876SKevin Wolf         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
20153ea1a091SPavel Butsykin     } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
201628c4da28SVladimir Sementsov-Ogievskiy         ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
201728c4da28SVladimir Sementsov-Ogievskiy                                              qiov, qiov_offset);
201804ed95f4SEric Blake     } else if (bytes <= max_transfer) {
20199a4f4c31SKevin Wolf         bdrv_debug_event(bs, BLKDBG_PWRITEV);
202028c4da28SVladimir Sementsov-Ogievskiy         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
202104ed95f4SEric Blake     } else {
202204ed95f4SEric Blake         bdrv_debug_event(bs, BLKDBG_PWRITEV);
202304ed95f4SEric Blake         while (bytes_remaining) {
202404ed95f4SEric Blake             int num = MIN(bytes_remaining, max_transfer);
202504ed95f4SEric Blake             int local_flags = flags;
202604ed95f4SEric Blake 
202704ed95f4SEric Blake             assert(num);
202804ed95f4SEric Blake             if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
202904ed95f4SEric Blake                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
203004ed95f4SEric Blake                 /* If FUA is going to be emulated by flush, we only
203104ed95f4SEric Blake                  * need to flush on the last iteration */
203204ed95f4SEric Blake                 local_flags &= ~BDRV_REQ_FUA;
203304ed95f4SEric Blake             }
203404ed95f4SEric Blake 
203504ed95f4SEric Blake             ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
2036134b7decSMax Reitz                                       num, qiov,
2037134b7decSMax Reitz                                       qiov_offset + bytes - bytes_remaining,
203828c4da28SVladimir Sementsov-Ogievskiy                                       local_flags);
203904ed95f4SEric Blake             if (ret < 0) {
204004ed95f4SEric Blake                 break;
204104ed95f4SEric Blake             }
204204ed95f4SEric Blake             bytes_remaining -= num;
204304ed95f4SEric Blake         }
204461007b31SStefan Hajnoczi     }
20459a4f4c31SKevin Wolf     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
204661007b31SStefan Hajnoczi 
204761007b31SStefan Hajnoczi     if (ret >= 0) {
204804ed95f4SEric Blake         ret = 0;
204961007b31SStefan Hajnoczi     }
205085fe2479SFam Zheng     bdrv_co_write_req_finish(child, offset, bytes, req, ret);
205161007b31SStefan Hajnoczi 
205261007b31SStefan Hajnoczi     return ret;
205361007b31SStefan Hajnoczi }
205461007b31SStefan Hajnoczi 
205585c97ca7SKevin Wolf static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
20569eeb6dd1SFam Zheng                                                 int64_t offset,
20579eeb6dd1SFam Zheng                                                 unsigned int bytes,
20589eeb6dd1SFam Zheng                                                 BdrvRequestFlags flags,
20599eeb6dd1SFam Zheng                                                 BdrvTrackedRequest *req)
20609eeb6dd1SFam Zheng {
206185c97ca7SKevin Wolf     BlockDriverState *bs = child->bs;
20629eeb6dd1SFam Zheng     QEMUIOVector local_qiov;
2063a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
20649eeb6dd1SFam Zheng     int ret = 0;
20657a3f542fSVladimir Sementsov-Ogievskiy     bool padding;
20667a3f542fSVladimir Sementsov-Ogievskiy     BdrvRequestPadding pad;
20679eeb6dd1SFam Zheng 
20687a3f542fSVladimir Sementsov-Ogievskiy     padding = bdrv_init_padding(bs, offset, bytes, &pad);
20697a3f542fSVladimir Sementsov-Ogievskiy     if (padding) {
2070304d9d7fSMax Reitz         bdrv_mark_request_serialising(req, align);
20719eeb6dd1SFam Zheng 
20727a3f542fSVladimir Sementsov-Ogievskiy         bdrv_padding_rmw_read(child, req, &pad, true);
20737a3f542fSVladimir Sementsov-Ogievskiy 
20747a3f542fSVladimir Sementsov-Ogievskiy         if (pad.head || pad.merge_reads) {
20757a3f542fSVladimir Sementsov-Ogievskiy             int64_t aligned_offset = offset & ~(align - 1);
20767a3f542fSVladimir Sementsov-Ogievskiy             int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
20777a3f542fSVladimir Sementsov-Ogievskiy 
20787a3f542fSVladimir Sementsov-Ogievskiy             qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
20797a3f542fSVladimir Sementsov-Ogievskiy             ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
208028c4da28SVladimir Sementsov-Ogievskiy                                        align, &local_qiov, 0,
20819eeb6dd1SFam Zheng                                        flags & ~BDRV_REQ_ZERO_WRITE);
20827a3f542fSVladimir Sementsov-Ogievskiy             if (ret < 0 || pad.merge_reads) {
20837a3f542fSVladimir Sementsov-Ogievskiy                 /* Error or all work is done */
20847a3f542fSVladimir Sementsov-Ogievskiy                 goto out;
20859eeb6dd1SFam Zheng             }
20867a3f542fSVladimir Sementsov-Ogievskiy             offset += write_bytes - pad.head;
20877a3f542fSVladimir Sementsov-Ogievskiy             bytes -= write_bytes - pad.head;
20887a3f542fSVladimir Sementsov-Ogievskiy         }
20899eeb6dd1SFam Zheng     }
20909eeb6dd1SFam Zheng 
20919eeb6dd1SFam Zheng     assert(!bytes || (offset & (align - 1)) == 0);
20929eeb6dd1SFam Zheng     if (bytes >= align) {
20939eeb6dd1SFam Zheng         /* Write the aligned part in the middle. */
20949eeb6dd1SFam Zheng         uint64_t aligned_bytes = bytes & ~(align - 1);
209585c97ca7SKevin Wolf         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
209628c4da28SVladimir Sementsov-Ogievskiy                                    NULL, 0, flags);
20979eeb6dd1SFam Zheng         if (ret < 0) {
20987a3f542fSVladimir Sementsov-Ogievskiy             goto out;
20999eeb6dd1SFam Zheng         }
21009eeb6dd1SFam Zheng         bytes -= aligned_bytes;
21019eeb6dd1SFam Zheng         offset += aligned_bytes;
21029eeb6dd1SFam Zheng     }
21039eeb6dd1SFam Zheng 
21049eeb6dd1SFam Zheng     assert(!bytes || (offset & (align - 1)) == 0);
21059eeb6dd1SFam Zheng     if (bytes) {
21067a3f542fSVladimir Sementsov-Ogievskiy         assert(align == pad.tail + bytes);
21079eeb6dd1SFam Zheng 
21087a3f542fSVladimir Sementsov-Ogievskiy         qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
210985c97ca7SKevin Wolf         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
211028c4da28SVladimir Sementsov-Ogievskiy                                    &local_qiov, 0,
211128c4da28SVladimir Sementsov-Ogievskiy                                    flags & ~BDRV_REQ_ZERO_WRITE);
21129eeb6dd1SFam Zheng     }
21139eeb6dd1SFam Zheng 
21147a3f542fSVladimir Sementsov-Ogievskiy out:
21157a3f542fSVladimir Sementsov-Ogievskiy     bdrv_padding_destroy(&pad);
21167a3f542fSVladimir Sementsov-Ogievskiy 
21177a3f542fSVladimir Sementsov-Ogievskiy     return ret;
21189eeb6dd1SFam Zheng }
21199eeb6dd1SFam Zheng 
212061007b31SStefan Hajnoczi /*
212161007b31SStefan Hajnoczi  * Handle a write request in coroutine context
212261007b31SStefan Hajnoczi  */
2123a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
212461007b31SStefan Hajnoczi     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
212561007b31SStefan Hajnoczi     BdrvRequestFlags flags)
212661007b31SStefan Hajnoczi {
21271acc3466SVladimir Sementsov-Ogievskiy     return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
21281acc3466SVladimir Sementsov-Ogievskiy }
21291acc3466SVladimir Sementsov-Ogievskiy 
21301acc3466SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
21311acc3466SVladimir Sementsov-Ogievskiy     int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
21321acc3466SVladimir Sementsov-Ogievskiy     BdrvRequestFlags flags)
21331acc3466SVladimir Sementsov-Ogievskiy {
2134a03ef88fSKevin Wolf     BlockDriverState *bs = child->bs;
213561007b31SStefan Hajnoczi     BdrvTrackedRequest req;
2136a5b8dd2cSEric Blake     uint64_t align = bs->bl.request_alignment;
21377a3f542fSVladimir Sementsov-Ogievskiy     BdrvRequestPadding pad;
213861007b31SStefan Hajnoczi     int ret;
213961007b31SStefan Hajnoczi 
2140f42cf447SDaniel P. Berrange     trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
2141f42cf447SDaniel P. Berrange 
214261007b31SStefan Hajnoczi     if (!bs->drv) {
214361007b31SStefan Hajnoczi         return -ENOMEDIUM;
214461007b31SStefan Hajnoczi     }
214561007b31SStefan Hajnoczi 
214661007b31SStefan Hajnoczi     ret = bdrv_check_byte_request(bs, offset, bytes);
214761007b31SStefan Hajnoczi     if (ret < 0) {
214861007b31SStefan Hajnoczi         return ret;
214961007b31SStefan Hajnoczi     }
215061007b31SStefan Hajnoczi 
2151f2208fdcSAlberto Garcia     /* If the request is misaligned then we can't make it efficient */
2152f2208fdcSAlberto Garcia     if ((flags & BDRV_REQ_NO_FALLBACK) &&
2153f2208fdcSAlberto Garcia         !QEMU_IS_ALIGNED(offset | bytes, align))
2154f2208fdcSAlberto Garcia     {
2155f2208fdcSAlberto Garcia         return -ENOTSUP;
2156f2208fdcSAlberto Garcia     }
2157f2208fdcSAlberto Garcia 
2158ac9d00bfSVladimir Sementsov-Ogievskiy     if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
2159ac9d00bfSVladimir Sementsov-Ogievskiy         /*
2160ac9d00bfSVladimir Sementsov-Ogievskiy          * Aligning a zero-length request is nonsense. Even if the driver gives
2161ac9d00bfSVladimir Sementsov-Ogievskiy          * zero-length requests a special meaning (as qcow2_co_pwritev_compressed_part
2162ac9d00bfSVladimir Sementsov-Ogievskiy          * does), we can't pass them to the driver because of request_alignment.
2163ac9d00bfSVladimir Sementsov-Ogievskiy          *
2164ac9d00bfSVladimir Sementsov-Ogievskiy          * Still, there is no reason to return an error if someone does an
2165ac9d00bfSVladimir Sementsov-Ogievskiy          * unaligned zero-length write occasionally.
2166ac9d00bfSVladimir Sementsov-Ogievskiy          */
2167ac9d00bfSVladimir Sementsov-Ogievskiy         return 0;
2168ac9d00bfSVladimir Sementsov-Ogievskiy     }
2169ac9d00bfSVladimir Sementsov-Ogievskiy 
217099723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
217161007b31SStefan Hajnoczi     /*
217261007b31SStefan Hajnoczi      * Align write if necessary by performing a read-modify-write cycle.
217361007b31SStefan Hajnoczi      * Pad qiov with the read parts and be sure to have a tracked request not
217461007b31SStefan Hajnoczi      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
217561007b31SStefan Hajnoczi      */
2176ebde595cSFam Zheng     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
217761007b31SStefan Hajnoczi 
217818a59f03SAnton Nefedov     if (flags & BDRV_REQ_ZERO_WRITE) {
217985c97ca7SKevin Wolf         ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
21809eeb6dd1SFam Zheng         goto out;
21819eeb6dd1SFam Zheng     }
21829eeb6dd1SFam Zheng 
21831acc3466SVladimir Sementsov-Ogievskiy     if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) {
2184304d9d7fSMax Reitz         bdrv_mark_request_serialising(&req, align);
21857a3f542fSVladimir Sementsov-Ogievskiy         bdrv_padding_rmw_read(child, &req, &pad, false);
218661007b31SStefan Hajnoczi     }
218761007b31SStefan Hajnoczi 
218885c97ca7SKevin Wolf     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
21891acc3466SVladimir Sementsov-Ogievskiy                                qiov, qiov_offset, flags);
219061007b31SStefan Hajnoczi 
21917a3f542fSVladimir Sementsov-Ogievskiy     bdrv_padding_destroy(&pad);
219261007b31SStefan Hajnoczi 
21939eeb6dd1SFam Zheng out:
21949eeb6dd1SFam Zheng     tracked_request_end(&req);
219599723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
21967a3f542fSVladimir Sementsov-Ogievskiy 
219761007b31SStefan Hajnoczi     return ret;
219861007b31SStefan Hajnoczi }
219961007b31SStefan Hajnoczi 
2200a03ef88fSKevin Wolf int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
2201f5a5ca79SManos Pitsidianakis                                        int bytes, BdrvRequestFlags flags)
220261007b31SStefan Hajnoczi {
2203f5a5ca79SManos Pitsidianakis     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
220461007b31SStefan Hajnoczi 
2205a03ef88fSKevin Wolf     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
220661007b31SStefan Hajnoczi         flags &= ~BDRV_REQ_MAY_UNMAP;
220761007b31SStefan Hajnoczi     }
220861007b31SStefan Hajnoczi 
2209f5a5ca79SManos Pitsidianakis     return bdrv_co_pwritev(child, offset, bytes, NULL,
221061007b31SStefan Hajnoczi                            BDRV_REQ_ZERO_WRITE | flags);
221161007b31SStefan Hajnoczi }
221261007b31SStefan Hajnoczi 
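/*
 * Usage sketch (hypothetical caller, coroutine context required):
 * zero the first 64 KiB of an attached child, letting the driver unmap:
 *
 *     int coroutine_fn zero_head(BdrvChild *child)
 *     {
 *         return bdrv_co_pwrite_zeroes(child, 0, 64 * 1024,
 *                                      BDRV_REQ_MAY_UNMAP);
 *     }
 *
 * As the code above shows, BDRV_REQ_MAY_UNMAP is dropped silently when
 * the node was not opened with BDRV_O_UNMAP, so callers need not check
 * for that themselves.
 */
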
22134085f5c7SJohn Snow /*
22134085f5c7SJohn Snow  * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
22154085f5c7SJohn Snow  */
22164085f5c7SJohn Snow int bdrv_flush_all(void)
22174085f5c7SJohn Snow {
22184085f5c7SJohn Snow     BdrvNextIterator it;
22194085f5c7SJohn Snow     BlockDriverState *bs = NULL;
22204085f5c7SJohn Snow     int result = 0;
22214085f5c7SJohn Snow 
2222c8aa7895SPavel Dovgalyuk     /*
2223c8aa7895SPavel Dovgalyuk      * The bdrv queue is managed by record/replay;
2224c8aa7895SPavel Dovgalyuk      * creating a new flush request for stopping
2225c8aa7895SPavel Dovgalyuk      * the VM may break determinism.
2226c8aa7895SPavel Dovgalyuk      */
2227c8aa7895SPavel Dovgalyuk     if (replay_events_enabled()) {
2228c8aa7895SPavel Dovgalyuk         return result;
2229c8aa7895SPavel Dovgalyuk     }
2230c8aa7895SPavel Dovgalyuk 
22314085f5c7SJohn Snow     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
22324085f5c7SJohn Snow         AioContext *aio_context = bdrv_get_aio_context(bs);
22334085f5c7SJohn Snow         int ret;
22344085f5c7SJohn Snow 
22354085f5c7SJohn Snow         aio_context_acquire(aio_context);
22364085f5c7SJohn Snow         ret = bdrv_flush(bs);
22374085f5c7SJohn Snow         if (ret < 0 && !result) {
22384085f5c7SJohn Snow             result = ret;
22394085f5c7SJohn Snow         }
22404085f5c7SJohn Snow         aio_context_release(aio_context);
22414085f5c7SJohn Snow     }
22424085f5c7SJohn Snow 
22434085f5c7SJohn Snow     return result;
22444085f5c7SJohn Snow }
22454085f5c7SJohn Snow 
22464085f5c7SJohn Snow 
22474bcd936eSEric Blake typedef struct BdrvCoBlockStatusData {
224861007b31SStefan Hajnoczi     BlockDriverState *bs;
224961007b31SStefan Hajnoczi     BlockDriverState *base;
2250c9ce8c4dSEric Blake     bool want_zero;
22514bcd936eSEric Blake     int64_t offset;
22524bcd936eSEric Blake     int64_t bytes;
22534bcd936eSEric Blake     int64_t *pnum;
22544bcd936eSEric Blake     int64_t *map;
2255c9ce8c4dSEric Blake     BlockDriverState **file;
22564bcd936eSEric Blake } BdrvCoBlockStatusData;
225761007b31SStefan Hajnoczi 
22583e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
22593e4d0e72SEric Blake                                                 bool want_zero,
22603e4d0e72SEric Blake                                                 int64_t offset,
22613e4d0e72SEric Blake                                                 int64_t bytes,
22623e4d0e72SEric Blake                                                 int64_t *pnum,
22633e4d0e72SEric Blake                                                 int64_t *map,
2264f7cc69b3SManos Pitsidianakis                                                 BlockDriverState **file)
2265f7cc69b3SManos Pitsidianakis {
2266f7cc69b3SManos Pitsidianakis     assert(bs->file && bs->file->bs);
22673e4d0e72SEric Blake     *pnum = bytes;
22683e4d0e72SEric Blake     *map = offset;
2269f7cc69b3SManos Pitsidianakis     *file = bs->file->bs;
22703e4d0e72SEric Blake     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2271f7cc69b3SManos Pitsidianakis }
2272f7cc69b3SManos Pitsidianakis 
22733e4d0e72SEric Blake int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
22743e4d0e72SEric Blake                                                    bool want_zero,
22753e4d0e72SEric Blake                                                    int64_t offset,
22763e4d0e72SEric Blake                                                    int64_t bytes,
22773e4d0e72SEric Blake                                                    int64_t *pnum,
22783e4d0e72SEric Blake                                                    int64_t *map,
2279f7cc69b3SManos Pitsidianakis                                                    BlockDriverState **file)
2280f7cc69b3SManos Pitsidianakis {
2281f7cc69b3SManos Pitsidianakis     assert(bs->backing && bs->backing->bs);
22823e4d0e72SEric Blake     *pnum = bytes;
22833e4d0e72SEric Blake     *map = offset;
2284f7cc69b3SManos Pitsidianakis     *file = bs->backing->bs;
22853e4d0e72SEric Blake     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2286f7cc69b3SManos Pitsidianakis }
2287f7cc69b3SManos Pitsidianakis 
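/*
 * The two pass-through helpers above exist so that filter drivers can plug
 * them straight into their BlockDriver definition, e.g. (sketch with a
 * hypothetical driver name):
 *
 *     static BlockDriver bdrv_myfilter = {
 *         .format_name          = "myfilter",
 *         .bdrv_co_block_status = bdrv_co_block_status_from_file,
 *     };
 */
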
228861007b31SStefan Hajnoczi /*
228961007b31SStefan Hajnoczi  * Returns the allocation status of the specified sectors.
229061007b31SStefan Hajnoczi  * Drivers not implementing the functionality are assumed to not support
229161007b31SStefan Hajnoczi  * backing files, hence all their sectors are reported as allocated.
229261007b31SStefan Hajnoczi  *
229386a3d5c6SEric Blake  * If 'want_zero' is true, the caller is querying for mapping
229486a3d5c6SEric Blake  * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
229586a3d5c6SEric Blake  * _ZERO where possible; otherwise, the result favors larger 'pnum',
229686a3d5c6SEric Blake  * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2297c9ce8c4dSEric Blake  *
22982e8bc787SEric Blake  * If 'offset' is beyond the end of the disk image the return value is
2299fb0d8654SEric Blake  * BDRV_BLOCK_EOF and 'pnum' is set to 0.
230061007b31SStefan Hajnoczi  *
23012e8bc787SEric Blake  * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2302fb0d8654SEric Blake  * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2303fb0d8654SEric Blake  * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
230467a0fd2aSFam Zheng  *
23052e8bc787SEric Blake  * 'pnum' is set to the number of bytes (including and immediately
23062e8bc787SEric Blake  * following the specified offset) that are easily known to be in the
23072e8bc787SEric Blake  * same allocated/unallocated state.  Note that a second call starting
23082e8bc787SEric Blake  * at the original offset plus returned pnum may have the same status.
23092e8bc787SEric Blake  * The returned value is non-zero on success except at end-of-file.
23102e8bc787SEric Blake  *
23112e8bc787SEric Blake  * Returns negative errno on failure.  Otherwise, if the
23122e8bc787SEric Blake  * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
23132e8bc787SEric Blake  * set to the host mapping and BDS corresponding to the guest offset.
231461007b31SStefan Hajnoczi  */
23152e8bc787SEric Blake static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
2316c9ce8c4dSEric Blake                                              bool want_zero,
23172e8bc787SEric Blake                                              int64_t offset, int64_t bytes,
23182e8bc787SEric Blake                                              int64_t *pnum, int64_t *map,
231967a0fd2aSFam Zheng                                              BlockDriverState **file)
232061007b31SStefan Hajnoczi {
23212e8bc787SEric Blake     int64_t total_size;
23222e8bc787SEric Blake     int64_t n; /* bytes */
2323efa6e2edSEric Blake     int ret;
23242e8bc787SEric Blake     int64_t local_map = 0;
2325298a1665SEric Blake     BlockDriverState *local_file = NULL;
2326efa6e2edSEric Blake     int64_t aligned_offset, aligned_bytes;
2327efa6e2edSEric Blake     uint32_t align;
232861007b31SStefan Hajnoczi 
2329298a1665SEric Blake     assert(pnum);
2330298a1665SEric Blake     *pnum = 0;
23312e8bc787SEric Blake     total_size = bdrv_getlength(bs);
23322e8bc787SEric Blake     if (total_size < 0) {
23332e8bc787SEric Blake         ret = total_size;
2334298a1665SEric Blake         goto early_out;
233561007b31SStefan Hajnoczi     }
233661007b31SStefan Hajnoczi 
23372e8bc787SEric Blake     if (offset >= total_size) {
2338298a1665SEric Blake         ret = BDRV_BLOCK_EOF;
2339298a1665SEric Blake         goto early_out;
234061007b31SStefan Hajnoczi     }
23412e8bc787SEric Blake     if (!bytes) {
2342298a1665SEric Blake         ret = 0;
2343298a1665SEric Blake         goto early_out;
23449cdcfd9fSEric Blake     }
234561007b31SStefan Hajnoczi 
23462e8bc787SEric Blake     n = total_size - offset;
23472e8bc787SEric Blake     if (n < bytes) {
23482e8bc787SEric Blake         bytes = n;
234961007b31SStefan Hajnoczi     }
235061007b31SStefan Hajnoczi 
2351d470ad42SMax Reitz     /* Must be non-NULL or bdrv_getlength() would have failed */
2352d470ad42SMax Reitz     assert(bs->drv);
2353636cb512SEric Blake     if (!bs->drv->bdrv_co_block_status) {
23542e8bc787SEric Blake         *pnum = bytes;
235561007b31SStefan Hajnoczi         ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
23562e8bc787SEric Blake         if (offset + bytes == total_size) {
2357fb0d8654SEric Blake             ret |= BDRV_BLOCK_EOF;
2358fb0d8654SEric Blake         }
235961007b31SStefan Hajnoczi         if (bs->drv->protocol_name) {
23602e8bc787SEric Blake             ret |= BDRV_BLOCK_OFFSET_VALID;
23612e8bc787SEric Blake             local_map = offset;
2362298a1665SEric Blake             local_file = bs;
236361007b31SStefan Hajnoczi         }
2364298a1665SEric Blake         goto early_out;
236561007b31SStefan Hajnoczi     }
236661007b31SStefan Hajnoczi 
236799723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
2368efa6e2edSEric Blake 
2369efa6e2edSEric Blake     /* Round out to request_alignment boundaries */
237086a3d5c6SEric Blake     align = bs->bl.request_alignment;
2371efa6e2edSEric Blake     aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2372efa6e2edSEric Blake     aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2373efa6e2edSEric Blake 
237486a3d5c6SEric Blake     ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
237586a3d5c6SEric Blake                                         aligned_bytes, pnum, &local_map,
237686a3d5c6SEric Blake                                         &local_file);
237786a3d5c6SEric Blake     if (ret < 0) {
237886a3d5c6SEric Blake         *pnum = 0;
237986a3d5c6SEric Blake         goto out;
238086a3d5c6SEric Blake     }
2381efa6e2edSEric Blake 
2382efa6e2edSEric Blake     /*
2383636cb512SEric Blake      * The driver's result must be a non-zero multiple of request_alignment.
2384efa6e2edSEric Blake      * Clamp pnum and adjust map to original request.
2385efa6e2edSEric Blake      */
2386636cb512SEric Blake     assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2387636cb512SEric Blake            align > offset - aligned_offset);
238869f47505SVladimir Sementsov-Ogievskiy     if (ret & BDRV_BLOCK_RECURSE) {
238969f47505SVladimir Sementsov-Ogievskiy         assert(ret & BDRV_BLOCK_DATA);
239069f47505SVladimir Sementsov-Ogievskiy         assert(ret & BDRV_BLOCK_OFFSET_VALID);
239169f47505SVladimir Sementsov-Ogievskiy         assert(!(ret & BDRV_BLOCK_ZERO));
239269f47505SVladimir Sementsov-Ogievskiy     }
239369f47505SVladimir Sementsov-Ogievskiy 
2394efa6e2edSEric Blake     *pnum -= offset - aligned_offset;
2395efa6e2edSEric Blake     if (*pnum > bytes) {
2396efa6e2edSEric Blake         *pnum = bytes;
2397efa6e2edSEric Blake     }
2398efa6e2edSEric Blake     if (ret & BDRV_BLOCK_OFFSET_VALID) {
2399efa6e2edSEric Blake         local_map += offset - aligned_offset;
2400efa6e2edSEric Blake     }
240161007b31SStefan Hajnoczi 
240261007b31SStefan Hajnoczi     if (ret & BDRV_BLOCK_RAW) {
2403298a1665SEric Blake         assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
24042e8bc787SEric Blake         ret = bdrv_co_block_status(local_file, want_zero, local_map,
24052e8bc787SEric Blake                                    *pnum, pnum, &local_map, &local_file);
240699723548SPaolo Bonzini         goto out;
240761007b31SStefan Hajnoczi     }
240861007b31SStefan Hajnoczi 
240961007b31SStefan Hajnoczi     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
241061007b31SStefan Hajnoczi         ret |= BDRV_BLOCK_ALLOCATED;
2411a2adbbf6SVladimir Sementsov-Ogievskiy     } else if (want_zero && bs->drv->supports_backing) {
2412cb850315SMax Reitz         BlockDriverState *cow_bs = bdrv_cow_bs(bs);
2413cb850315SMax Reitz 
2414cb850315SMax Reitz         if (cow_bs) {
2415cb850315SMax Reitz             int64_t size2 = bdrv_getlength(cow_bs);
2416c9ce8c4dSEric Blake 
24172e8bc787SEric Blake             if (size2 >= 0 && offset >= size2) {
241861007b31SStefan Hajnoczi                 ret |= BDRV_BLOCK_ZERO;
241961007b31SStefan Hajnoczi             }
24207b1efe99SVladimir Sementsov-Ogievskiy         } else {
24217b1efe99SVladimir Sementsov-Ogievskiy             ret |= BDRV_BLOCK_ZERO;
24227b1efe99SVladimir Sementsov-Ogievskiy         }
242361007b31SStefan Hajnoczi     }
242461007b31SStefan Hajnoczi 
242569f47505SVladimir Sementsov-Ogievskiy     if (want_zero && ret & BDRV_BLOCK_RECURSE &&
242669f47505SVladimir Sementsov-Ogievskiy         local_file && local_file != bs &&
242761007b31SStefan Hajnoczi         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
242861007b31SStefan Hajnoczi         (ret & BDRV_BLOCK_OFFSET_VALID)) {
24292e8bc787SEric Blake         int64_t file_pnum;
24302e8bc787SEric Blake         int ret2;
243161007b31SStefan Hajnoczi 
24322e8bc787SEric Blake         ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
24332e8bc787SEric Blake                                     *pnum, &file_pnum, NULL, NULL);
243461007b31SStefan Hajnoczi         if (ret2 >= 0) {
243561007b31SStefan Hajnoczi             /* Ignore errors.  This is just providing extra information; it
243661007b31SStefan Hajnoczi              * is useful but not necessary.
243761007b31SStefan Hajnoczi              */
2438c61e684eSEric Blake             if (ret2 & BDRV_BLOCK_EOF &&
2439c61e684eSEric Blake                 (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2440c61e684eSEric Blake                 /*
2441c61e684eSEric Blake                  * It is valid for the format block driver to read
2442c61e684eSEric Blake                  * beyond the end of the underlying file's current
2443c61e684eSEric Blake                  * size; such areas read as zero.
2444c61e684eSEric Blake                  */
244561007b31SStefan Hajnoczi                 ret |= BDRV_BLOCK_ZERO;
244661007b31SStefan Hajnoczi             } else {
244761007b31SStefan Hajnoczi                 /* Limit request to the range reported by the protocol driver */
244861007b31SStefan Hajnoczi                 *pnum = file_pnum;
244961007b31SStefan Hajnoczi                 ret |= (ret2 & BDRV_BLOCK_ZERO);
245061007b31SStefan Hajnoczi             }
245161007b31SStefan Hajnoczi         }
245261007b31SStefan Hajnoczi     }
245361007b31SStefan Hajnoczi 
245499723548SPaolo Bonzini out:
245599723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
24562e8bc787SEric Blake     if (ret >= 0 && offset + *pnum == total_size) {
2457fb0d8654SEric Blake         ret |= BDRV_BLOCK_EOF;
2458fb0d8654SEric Blake     }
2459298a1665SEric Blake early_out:
2460298a1665SEric Blake     if (file) {
2461298a1665SEric Blake         *file = local_file;
2462298a1665SEric Blake     }
24632e8bc787SEric Blake     if (map) {
24642e8bc787SEric Blake         *map = local_map;
24652e8bc787SEric Blake     }
246661007b31SStefan Hajnoczi     return ret;
246761007b31SStefan Hajnoczi }
246861007b31SStefan Hajnoczi 
24695b648c67SEric Blake static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2470ba3f0e25SFam Zheng                                                    BlockDriverState *base,
2471c9ce8c4dSEric Blake                                                    bool want_zero,
24725b648c67SEric Blake                                                    int64_t offset,
24735b648c67SEric Blake                                                    int64_t bytes,
24745b648c67SEric Blake                                                    int64_t *pnum,
24755b648c67SEric Blake                                                    int64_t *map,
247667a0fd2aSFam Zheng                                                    BlockDriverState **file)
2477ba3f0e25SFam Zheng {
2478ba3f0e25SFam Zheng     BlockDriverState *p;
24795b648c67SEric Blake     int ret = 0;
2480c61e684eSEric Blake     bool first = true;
2481ba3f0e25SFam Zheng 
2482ba3f0e25SFam Zheng     assert(bs != base);
2483cb850315SMax Reitz     for (p = bs; p != base; p = bdrv_filter_or_cow_bs(p)) {
24845b648c67SEric Blake         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
24855b648c67SEric Blake                                    file);
2486c61e684eSEric Blake         if (ret < 0) {
2487c61e684eSEric Blake             break;
2488c61e684eSEric Blake         }
2489c61e684eSEric Blake         if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2490c61e684eSEric Blake             /*
2491c61e684eSEric Blake              * Reading beyond the end of the file continues to read
2492c61e684eSEric Blake              * zeroes, but we can only widen the result to the
2493c61e684eSEric Blake              * unallocated length we learned from an earlier
2494c61e684eSEric Blake              * iteration.
2495c61e684eSEric Blake              */
24965b648c67SEric Blake             *pnum = bytes;
2497c61e684eSEric Blake         }
2498c61e684eSEric Blake         if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2499ba3f0e25SFam Zheng             break;
2500ba3f0e25SFam Zheng         }
25015b648c67SEric Blake         /* [offset, offset + *pnum) is unallocated on this layer, which could
25025b648c67SEric Blake          * be only the first part of [offset, offset + bytes).  */
25035b648c67SEric Blake         bytes = MIN(bytes, *pnum);
2504c61e684eSEric Blake         first = false;
2505ba3f0e25SFam Zheng     }
2506ba3f0e25SFam Zheng     return ret;
2507ba3f0e25SFam Zheng }
2508ba3f0e25SFam Zheng 
250931826642SEric Blake /* Coroutine wrapper for bdrv_block_status_above() */
25107d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
251161007b31SStefan Hajnoczi {
25124bcd936eSEric Blake     BdrvCoBlockStatusData *data = opaque;
251361007b31SStefan Hajnoczi 
25147d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_block_status_above(data->bs, data->base,
2515c9ce8c4dSEric Blake                                       data->want_zero,
25165b648c67SEric Blake                                       data->offset, data->bytes,
25175b648c67SEric Blake                                       data->pnum, data->map, data->file);
251861007b31SStefan Hajnoczi }
251961007b31SStefan Hajnoczi 
252061007b31SStefan Hajnoczi /*
25215b648c67SEric Blake  * Synchronous wrapper around bdrv_co_block_status_above().
252261007b31SStefan Hajnoczi  *
25235b648c67SEric Blake  * See bdrv_co_block_status_above() for details.
252461007b31SStefan Hajnoczi  */
25257ddb99b9SEric Blake static int bdrv_common_block_status_above(BlockDriverState *bs,
2526ba3f0e25SFam Zheng                                           BlockDriverState *base,
25277ddb99b9SEric Blake                                           bool want_zero, int64_t offset,
25287ddb99b9SEric Blake                                           int64_t bytes, int64_t *pnum,
25297ddb99b9SEric Blake                                           int64_t *map,
253067a0fd2aSFam Zheng                                           BlockDriverState **file)
253161007b31SStefan Hajnoczi {
25324bcd936eSEric Blake     BdrvCoBlockStatusData data = {
253361007b31SStefan Hajnoczi         .bs = bs,
2534ba3f0e25SFam Zheng         .base = base,
2535c9ce8c4dSEric Blake         .want_zero = want_zero,
25367ddb99b9SEric Blake         .offset = offset,
25377ddb99b9SEric Blake         .bytes = bytes,
25387ddb99b9SEric Blake         .pnum = pnum,
25397ddb99b9SEric Blake         .map = map,
2540c9ce8c4dSEric Blake         .file = file,
254161007b31SStefan Hajnoczi     };
254261007b31SStefan Hajnoczi 
25437d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data);
254461007b31SStefan Hajnoczi }
254561007b31SStefan Hajnoczi 
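/*
 * bdrv_common_block_status_above() above follows the usual pattern for
 * exposing a coroutine_fn synchronously: pack the arguments into a struct,
 * unpack them in a coroutine entry point, and let bdrv_run_co() enter the
 * coroutine and poll until it finishes.  A new wrapper would look like
 * this (sketch; FooData, bdrv_foo and bdrv_co_foo are hypothetical):
 *
 *     static int coroutine_fn bdrv_foo_co_entry(void *opaque)
 *     {
 *         FooData *data = opaque;
 *         return bdrv_co_foo(data->bs, data->arg);
 *     }
 *
 *     int bdrv_foo(BlockDriverState *bs, int arg)
 *     {
 *         FooData data = { .bs = bs, .arg = arg };
 *         return bdrv_run_co(bs, bdrv_foo_co_entry, &data);
 *     }
 */
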
254631826642SEric Blake int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
254731826642SEric Blake                             int64_t offset, int64_t bytes, int64_t *pnum,
254831826642SEric Blake                             int64_t *map, BlockDriverState **file)
2549c9ce8c4dSEric Blake {
255031826642SEric Blake     return bdrv_common_block_status_above(bs, base, true, offset, bytes,
255131826642SEric Blake                                           pnum, map, file);
2552c9ce8c4dSEric Blake }
2553c9ce8c4dSEric Blake 
2554237d78f8SEric Blake int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2555237d78f8SEric Blake                       int64_t *pnum, int64_t *map, BlockDriverState **file)
2556ba3f0e25SFam Zheng {
2557cb850315SMax Reitz     return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
255831826642SEric Blake                                    offset, bytes, pnum, map, file);
2559ba3f0e25SFam Zheng }
2560ba3f0e25SFam Zheng 
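/*
 * Usage sketch (hypothetical): enumerate the allocation map of an image
 * one extent at a time, conceptually what qemu-img map does:
 *
 *     int64_t total = bdrv_getlength(bs);
 *     int64_t offset = 0, pnum, map;
 *     BlockDriverState *file;
 *
 *     while (total >= 0 && offset < total) {
 *         int ret = bdrv_block_status(bs, offset, total - offset,
 *                                     &pnum, &map, &file);
 *         if (ret < 0 || !pnum) {
 *             break;
 *         }
 *         offset += pnum;        (ret carries the BDRV_BLOCK_* flags)
 *     }
 *
 * pnum is non-zero on success except at end-of-file, so the loop is
 * guaranteed to make progress.
 */
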
2561d6a644bbSEric Blake int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2562d6a644bbSEric Blake                                    int64_t bytes, int64_t *pnum)
256361007b31SStefan Hajnoczi {
25647ddb99b9SEric Blake     int ret;
25657ddb99b9SEric Blake     int64_t dummy;
2566d6a644bbSEric Blake 
2567cb850315SMax Reitz     ret = bdrv_common_block_status_above(bs, bdrv_filter_or_cow_bs(bs), false,
2568cb850315SMax Reitz                                          offset, bytes, pnum ? pnum : &dummy,
2569cb850315SMax Reitz                                          NULL, NULL);
257061007b31SStefan Hajnoczi     if (ret < 0) {
257161007b31SStefan Hajnoczi         return ret;
257261007b31SStefan Hajnoczi     }
257361007b31SStefan Hajnoczi     return !!(ret & BDRV_BLOCK_ALLOCATED);
257461007b31SStefan Hajnoczi }
257561007b31SStefan Hajnoczi 
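/*
 * Usage sketch (hypothetical): test whether the start of a region is
 * backed by this layer rather than by its backing chain:
 *
 *     int64_t pnum;
 *     int ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
 *
 * ret > 0 means at least the first pnum bytes are allocated in bs itself;
 * ret == 0 means they come from the backing chain or read as zero;
 * ret < 0 is an error.
 */
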
257661007b31SStefan Hajnoczi /*
257761007b31SStefan Hajnoczi  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
257861007b31SStefan Hajnoczi  *
2579170d3bd3SAndrey Shinkevich  * Return 1 if (a prefix of) the given range is allocated in any image
2580170d3bd3SAndrey Shinkevich  * between BASE and TOP (BASE is only included if include_base is set).
2581170d3bd3SAndrey Shinkevich  * BASE can be NULL to check if the given offset is allocated in any
2582170d3bd3SAndrey Shinkevich  * image of the chain.  Return 0 otherwise, or negative errno on
2583170d3bd3SAndrey Shinkevich  * failure.
258461007b31SStefan Hajnoczi  *
258551b0a488SEric Blake  * 'pnum' is set to the number of bytes (including and immediately
258651b0a488SEric Blake  * following the specified offset) that are known to be in the same
258751b0a488SEric Blake  * allocated/unallocated state.  Note that a subsequent call starting
258851b0a488SEric Blake  * at 'offset + *pnum' may return the same allocation status (in other
258951b0a488SEric Blake  * words, the result is not necessarily the maximum possible range);
259051b0a488SEric Blake  * but 'pnum' will only be 0 when end of file is reached.
259161007b31SStefan Hajnoczi  *
259261007b31SStefan Hajnoczi  */
259361007b31SStefan Hajnoczi int bdrv_is_allocated_above(BlockDriverState *top,
259461007b31SStefan Hajnoczi                             BlockDriverState *base,
2595170d3bd3SAndrey Shinkevich                             bool include_base, int64_t offset,
2596170d3bd3SAndrey Shinkevich                             int64_t bytes, int64_t *pnum)
259761007b31SStefan Hajnoczi {
259861007b31SStefan Hajnoczi     BlockDriverState *intermediate;
259951b0a488SEric Blake     int ret;
260051b0a488SEric Blake     int64_t n = bytes;
260161007b31SStefan Hajnoczi 
2602170d3bd3SAndrey Shinkevich     assert(base || !include_base);
2603170d3bd3SAndrey Shinkevich 
260461007b31SStefan Hajnoczi     intermediate = top;
2605170d3bd3SAndrey Shinkevich     while (include_base || intermediate != base) {
2606d6a644bbSEric Blake         int64_t pnum_inter;
2607c00716beSEric Blake         int64_t size_inter;
2608d6a644bbSEric Blake 
2609170d3bd3SAndrey Shinkevich         assert(intermediate);
261051b0a488SEric Blake         ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
261161007b31SStefan Hajnoczi         if (ret < 0) {
261261007b31SStefan Hajnoczi             return ret;
2613d6a644bbSEric Blake         }
2614d6a644bbSEric Blake         if (ret) {
261551b0a488SEric Blake             *pnum = pnum_inter;
261661007b31SStefan Hajnoczi             return 1;
261761007b31SStefan Hajnoczi         }
261861007b31SStefan Hajnoczi 
261951b0a488SEric Blake         size_inter = bdrv_getlength(intermediate);
2620c00716beSEric Blake         if (size_inter < 0) {
2621c00716beSEric Blake             return size_inter;
2622c00716beSEric Blake         }
262351b0a488SEric Blake         if (n > pnum_inter &&
262451b0a488SEric Blake             (intermediate == top || offset + pnum_inter < size_inter)) {
262551b0a488SEric Blake             n = pnum_inter;
262661007b31SStefan Hajnoczi         }
262761007b31SStefan Hajnoczi 
2628170d3bd3SAndrey Shinkevich         if (intermediate == base) {
2629170d3bd3SAndrey Shinkevich             break;
2630170d3bd3SAndrey Shinkevich         }
2631170d3bd3SAndrey Shinkevich 
2632cb850315SMax Reitz         intermediate = bdrv_filter_or_cow_bs(intermediate);
263361007b31SStefan Hajnoczi     }
263461007b31SStefan Hajnoczi 
263561007b31SStefan Hajnoczi     *pnum = n;
263661007b31SStefan Hajnoczi     return 0;
263761007b31SStefan Hajnoczi }
263861007b31SStefan Hajnoczi 
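/*
 * Usage sketch (hypothetical): decide whether a block job needs to copy
 * the next chunk, i.e. whether [offset, offset + bytes) starts with data
 * stored above 'base' in the chain:
 *
 *     int64_t pnum;
 *     int ret = bdrv_is_allocated_above(top, base, false, offset, bytes,
 *                                       &pnum);
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     if (ret) {
 *         ...copy the first pnum bytes...
 *     } else {
 *         ...skip them; they are taken from base or read as zero...
 *     }
 */
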
26391a8ae822SKevin Wolf typedef struct BdrvVmstateCo {
26401a8ae822SKevin Wolf     BlockDriverState    *bs;
26411a8ae822SKevin Wolf     QEMUIOVector        *qiov;
26421a8ae822SKevin Wolf     int64_t             pos;
26431a8ae822SKevin Wolf     bool                is_read;
26441a8ae822SKevin Wolf } BdrvVmstateCo;
26451a8ae822SKevin Wolf 
26461a8ae822SKevin Wolf static int coroutine_fn
26471a8ae822SKevin Wolf bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
26481a8ae822SKevin Wolf                    bool is_read)
26491a8ae822SKevin Wolf {
26501a8ae822SKevin Wolf     BlockDriver *drv = bs->drv;
2651dc88a467SStefan Hajnoczi     int ret = -ENOTSUP;
2652dc88a467SStefan Hajnoczi 
2653dc88a467SStefan Hajnoczi     bdrv_inc_in_flight(bs);
26541a8ae822SKevin Wolf 
26551a8ae822SKevin Wolf     if (!drv) {
2656dc88a467SStefan Hajnoczi         ret = -ENOMEDIUM;
26571a8ae822SKevin Wolf     } else if (drv->bdrv_load_vmstate) {
2658dc88a467SStefan Hajnoczi         if (is_read) {
2659dc88a467SStefan Hajnoczi             ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2660dc88a467SStefan Hajnoczi         } else {
2661dc88a467SStefan Hajnoczi             ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2662dc88a467SStefan Hajnoczi         }
26631a8ae822SKevin Wolf     } else if (bs->file) {
2664dc88a467SStefan Hajnoczi         ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
26651a8ae822SKevin Wolf     }
26661a8ae822SKevin Wolf 
2667dc88a467SStefan Hajnoczi     bdrv_dec_in_flight(bs);
2668dc88a467SStefan Hajnoczi     return ret;
26691a8ae822SKevin Wolf }
26701a8ae822SKevin Wolf 
26717d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
26721a8ae822SKevin Wolf {
26731a8ae822SKevin Wolf     BdrvVmstateCo *co = opaque;
26747d2410ceSVladimir Sementsov-Ogievskiy 
26757d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
26761a8ae822SKevin Wolf }
26771a8ae822SKevin Wolf 
26781a8ae822SKevin Wolf static inline int
26791a8ae822SKevin Wolf bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
26801a8ae822SKevin Wolf                 bool is_read)
26811a8ae822SKevin Wolf {
26821a8ae822SKevin Wolf     BdrvVmstateCo data = {
26831a8ae822SKevin Wolf         .bs         = bs,
26841a8ae822SKevin Wolf         .qiov       = qiov,
26851a8ae822SKevin Wolf         .pos        = pos,
26861a8ae822SKevin Wolf         .is_read    = is_read,
26871a8ae822SKevin Wolf     };
26881a8ae822SKevin Wolf 
26897d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data);
26901a8ae822SKevin Wolf }
26911a8ae822SKevin Wolf 
269261007b31SStefan Hajnoczi int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
269361007b31SStefan Hajnoczi                       int64_t pos, int size)
269461007b31SStefan Hajnoczi {
26950d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2696b433d942SKevin Wolf     int ret;
269761007b31SStefan Hajnoczi 
2698b433d942SKevin Wolf     ret = bdrv_writev_vmstate(bs, &qiov, pos);
2699b433d942SKevin Wolf     if (ret < 0) {
2700b433d942SKevin Wolf         return ret;
2701b433d942SKevin Wolf     }
2702b433d942SKevin Wolf 
2703b433d942SKevin Wolf     return size;
270461007b31SStefan Hajnoczi }
270561007b31SStefan Hajnoczi 
270661007b31SStefan Hajnoczi int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
270761007b31SStefan Hajnoczi {
27081a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, false);
270961007b31SStefan Hajnoczi }
271061007b31SStefan Hajnoczi 
271161007b31SStefan Hajnoczi int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
271261007b31SStefan Hajnoczi                       int64_t pos, int size)
271361007b31SStefan Hajnoczi {
27140d93ed08SVladimir Sementsov-Ogievskiy     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2715b433d942SKevin Wolf     int ret;
27165ddda0b8SKevin Wolf 
2717b433d942SKevin Wolf     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2718b433d942SKevin Wolf     if (ret < 0) {
2719b433d942SKevin Wolf         return ret;
2720b433d942SKevin Wolf     }
2721b433d942SKevin Wolf 
2722b433d942SKevin Wolf     return size;
27235ddda0b8SKevin Wolf }
27245ddda0b8SKevin Wolf 
27255ddda0b8SKevin Wolf int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
27265ddda0b8SKevin Wolf {
27271a8ae822SKevin Wolf     return bdrv_rw_vmstate(bs, qiov, pos, true);
272861007b31SStefan Hajnoczi }
272961007b31SStefan Hajnoczi 
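/*
 * Usage sketch (hypothetical): round-trip a small blob through the
 * vmstate area of a snapshot-capable node:
 *
 *     uint8_t buf[512];
 *     int ret = bdrv_save_vmstate(bs, buf, 0, sizeof(buf));
 *     if (ret == sizeof(buf)) {
 *         ret = bdrv_load_vmstate(bs, buf, 0, sizeof(buf));
 *     }
 *
 * On success both helpers return the full byte count that was requested;
 * a negative errno indicates failure.
 */
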
273061007b31SStefan Hajnoczi /**************************************************************/
273161007b31SStefan Hajnoczi /* async I/Os */
273261007b31SStefan Hajnoczi 
273361007b31SStefan Hajnoczi void bdrv_aio_cancel(BlockAIOCB *acb)
273461007b31SStefan Hajnoczi {
273561007b31SStefan Hajnoczi     qemu_aio_ref(acb);
273661007b31SStefan Hajnoczi     bdrv_aio_cancel_async(acb);
273761007b31SStefan Hajnoczi     while (acb->refcnt > 1) {
273861007b31SStefan Hajnoczi         if (acb->aiocb_info->get_aio_context) {
273961007b31SStefan Hajnoczi             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
274061007b31SStefan Hajnoczi         } else if (acb->bs) {
27412f47da5fSPaolo Bonzini             /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
27422f47da5fSPaolo Bonzini              * assert that we're not using an I/O thread.  Thread-safe
27432f47da5fSPaolo Bonzini              * code should use bdrv_aio_cancel_async exclusively.
27442f47da5fSPaolo Bonzini              */
27452f47da5fSPaolo Bonzini             assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
274661007b31SStefan Hajnoczi             aio_poll(bdrv_get_aio_context(acb->bs), true);
274761007b31SStefan Hajnoczi         } else {
274861007b31SStefan Hajnoczi             abort();
274961007b31SStefan Hajnoczi         }
275061007b31SStefan Hajnoczi     }
275161007b31SStefan Hajnoczi     qemu_aio_unref(acb);
275261007b31SStefan Hajnoczi }
275361007b31SStefan Hajnoczi 
275461007b31SStefan Hajnoczi /* Async version of aio cancel. The caller is not blocked if the acb implements
275561007b31SStefan Hajnoczi  * cancel_async; otherwise we do nothing and let the request complete normally.
275661007b31SStefan Hajnoczi  * In either case the completion callback must be called. */
275761007b31SStefan Hajnoczi void bdrv_aio_cancel_async(BlockAIOCB *acb)
275861007b31SStefan Hajnoczi {
275961007b31SStefan Hajnoczi     if (acb->aiocb_info->cancel_async) {
276061007b31SStefan Hajnoczi         acb->aiocb_info->cancel_async(acb);
276161007b31SStefan Hajnoczi     }
276261007b31SStefan Hajnoczi }
276361007b31SStefan Hajnoczi 
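/*
 * Usage sketch (hypothetical): abandon an in-flight request without
 * blocking the current thread.  The completion callback is still invoked;
 * if the driver does not implement cancel_async, the request simply
 * completes normally:
 *
 *     bdrv_aio_cancel_async(acb);
 *
 * bdrv_aio_cancel(), by contrast, polls until the request has completed;
 * as noted above, thread-safe code should stick to the async variant.
 */
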
276461007b31SStefan Hajnoczi /**************************************************************/
276561007b31SStefan Hajnoczi /* Coroutine block device emulation */
276661007b31SStefan Hajnoczi 
27677d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_flush_co_entry(void *opaque)
276861007b31SStefan Hajnoczi {
27697d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_flush(opaque);
277061007b31SStefan Hajnoczi }
277161007b31SStefan Hajnoczi 
277261007b31SStefan Hajnoczi int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
277361007b31SStefan Hajnoczi {
277449ca6259SFam Zheng     int current_gen;
277549ca6259SFam Zheng     int ret = 0;
277661007b31SStefan Hajnoczi 
277799723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
2778c32b82afSPavel Dovgalyuk 
2779e914404eSFam Zheng     if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
278049ca6259SFam Zheng         bdrv_is_sg(bs)) {
278149ca6259SFam Zheng         goto early_exit;
278249ca6259SFam Zheng     }
278349ca6259SFam Zheng 
27843783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
278547fec599SPaolo Bonzini     current_gen = atomic_read(&bs->write_gen);
27863ff2f67aSEvgeny Yakovlev 
27873ff2f67aSEvgeny Yakovlev     /* Wait until any previous flushes are completed */
278899723548SPaolo Bonzini     while (bs->active_flush_req) {
27893783fa3dSPaolo Bonzini         qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
27903ff2f67aSEvgeny Yakovlev     }
27913ff2f67aSEvgeny Yakovlev 
27923783fa3dSPaolo Bonzini     /* Flushes reach this point in nondecreasing current_gen order.  */
279399723548SPaolo Bonzini     bs->active_flush_req = true;
27943783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
27953ff2f67aSEvgeny Yakovlev 
2796c32b82afSPavel Dovgalyuk     /* Write back all layers by calling one driver function */
2797c32b82afSPavel Dovgalyuk     if (bs->drv->bdrv_co_flush) {
2798c32b82afSPavel Dovgalyuk         ret = bs->drv->bdrv_co_flush(bs);
2799c32b82afSPavel Dovgalyuk         goto out;
2800c32b82afSPavel Dovgalyuk     }
2801c32b82afSPavel Dovgalyuk 
280261007b31SStefan Hajnoczi     /* Write back cached data to the OS even with cache=unsafe */
280361007b31SStefan Hajnoczi     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
280461007b31SStefan Hajnoczi     if (bs->drv->bdrv_co_flush_to_os) {
280561007b31SStefan Hajnoczi         ret = bs->drv->bdrv_co_flush_to_os(bs);
280661007b31SStefan Hajnoczi         if (ret < 0) {
2807cdb5e315SFam Zheng             goto out;
280861007b31SStefan Hajnoczi         }
280961007b31SStefan Hajnoczi     }
281061007b31SStefan Hajnoczi 
281161007b31SStefan Hajnoczi     /* But don't actually force it to the disk with cache=unsafe */
281261007b31SStefan Hajnoczi     if (bs->open_flags & BDRV_O_NO_FLUSH) {
281361007b31SStefan Hajnoczi         goto flush_parent;
281461007b31SStefan Hajnoczi     }
281561007b31SStefan Hajnoczi 
28163ff2f67aSEvgeny Yakovlev     /* Check if we really need to flush anything */
28173ff2f67aSEvgeny Yakovlev     if (bs->flushed_gen == current_gen) {
28183ff2f67aSEvgeny Yakovlev         goto flush_parent;
28193ff2f67aSEvgeny Yakovlev     }
28203ff2f67aSEvgeny Yakovlev 
282161007b31SStefan Hajnoczi     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2822d470ad42SMax Reitz     if (!bs->drv) {
2823d470ad42SMax Reitz         /* bs->drv->bdrv_co_flush() might have ejected the BDS
2824d470ad42SMax Reitz          * (even in case of apparent success) */
2825d470ad42SMax Reitz         ret = -ENOMEDIUM;
2826d470ad42SMax Reitz         goto out;
2827d470ad42SMax Reitz     }
282861007b31SStefan Hajnoczi     if (bs->drv->bdrv_co_flush_to_disk) {
282961007b31SStefan Hajnoczi         ret = bs->drv->bdrv_co_flush_to_disk(bs);
283061007b31SStefan Hajnoczi     } else if (bs->drv->bdrv_aio_flush) {
283161007b31SStefan Hajnoczi         BlockAIOCB *acb;
283261007b31SStefan Hajnoczi         CoroutineIOCompletion co = {
283361007b31SStefan Hajnoczi             .coroutine = qemu_coroutine_self(),
283461007b31SStefan Hajnoczi         };
283561007b31SStefan Hajnoczi 
283661007b31SStefan Hajnoczi         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
283761007b31SStefan Hajnoczi         if (acb == NULL) {
283861007b31SStefan Hajnoczi             ret = -EIO;
283961007b31SStefan Hajnoczi         } else {
284061007b31SStefan Hajnoczi             qemu_coroutine_yield();
284161007b31SStefan Hajnoczi             ret = co.ret;
284261007b31SStefan Hajnoczi         }
284361007b31SStefan Hajnoczi     } else {
284461007b31SStefan Hajnoczi         /*
284561007b31SStefan Hajnoczi          * Some block drivers always operate in either writethrough or unsafe
284661007b31SStefan Hajnoczi          * mode and therefore don't support bdrv_flush. Usually qemu doesn't
284761007b31SStefan Hajnoczi          * know how the server works (because the behaviour is hardcoded or
284861007b31SStefan Hajnoczi          * depends on server-side configuration), so we can't ensure that
284961007b31SStefan Hajnoczi          * everything is safe on disk. Returning an error doesn't work because
285061007b31SStefan Hajnoczi          * that would break guests even if the server operates in writethrough
285161007b31SStefan Hajnoczi          * mode.
285261007b31SStefan Hajnoczi          *
285361007b31SStefan Hajnoczi          * Let's hope the user knows what he's doing.
285461007b31SStefan Hajnoczi          */
285561007b31SStefan Hajnoczi         ret = 0;
285661007b31SStefan Hajnoczi     }
28573ff2f67aSEvgeny Yakovlev 
285861007b31SStefan Hajnoczi     if (ret < 0) {
2859cdb5e315SFam Zheng         goto out;
286061007b31SStefan Hajnoczi     }
286161007b31SStefan Hajnoczi 
286261007b31SStefan Hajnoczi     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
286361007b31SStefan Hajnoczi      * in the case of cache=unsafe, so there are no useless flushes.
286461007b31SStefan Hajnoczi      */
286561007b31SStefan Hajnoczi flush_parent:
2866cdb5e315SFam Zheng     ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2867cdb5e315SFam Zheng out:
28683ff2f67aSEvgeny Yakovlev     /* Notify any pending flushes that we have completed */
2869e6af1e08SKevin Wolf     if (ret == 0) {
28703ff2f67aSEvgeny Yakovlev         bs->flushed_gen = current_gen;
2871e6af1e08SKevin Wolf     }
28723783fa3dSPaolo Bonzini 
28733783fa3dSPaolo Bonzini     qemu_co_mutex_lock(&bs->reqs_lock);
287499723548SPaolo Bonzini     bs->active_flush_req = false;
2875156af3acSDenis V. Lunev     /* Return value is ignored - it's ok if wait queue is empty */
2876156af3acSDenis V. Lunev     qemu_co_queue_next(&bs->flush_queue);
28773783fa3dSPaolo Bonzini     qemu_co_mutex_unlock(&bs->reqs_lock);
28783ff2f67aSEvgeny Yakovlev 
287949ca6259SFam Zheng early_exit:
288099723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
2881cdb5e315SFam Zheng     return ret;
288261007b31SStefan Hajnoczi }
288361007b31SStefan Hajnoczi 
288461007b31SStefan Hajnoczi int bdrv_flush(BlockDriverState *bs)
288561007b31SStefan Hajnoczi {
28867d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(bs, bdrv_flush_co_entry, bs);
288761007b31SStefan Hajnoczi }
288861007b31SStefan Hajnoczi 
288961007b31SStefan Hajnoczi typedef struct DiscardCo {
28900b9fd3f4SFam Zheng     BdrvChild *child;
28910c51a893SEric Blake     int64_t offset;
2892d93e5726SVladimir Sementsov-Ogievskiy     int64_t bytes;
289361007b31SStefan Hajnoczi } DiscardCo;
28947d2410ceSVladimir Sementsov-Ogievskiy 
28957d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
289661007b31SStefan Hajnoczi {
289761007b31SStefan Hajnoczi     DiscardCo *rwco = opaque;
289861007b31SStefan Hajnoczi 
28997d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
290061007b31SStefan Hajnoczi }
290161007b31SStefan Hajnoczi 
2902d93e5726SVladimir Sementsov-Ogievskiy int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
2903d93e5726SVladimir Sementsov-Ogievskiy                                   int64_t bytes)
290461007b31SStefan Hajnoczi {
2905b1066c87SFam Zheng     BdrvTrackedRequest req;
29069f1963b3SEric Blake     int max_pdiscard, ret;
29073482b9bcSEric Blake     int head, tail, align;
29080b9fd3f4SFam Zheng     BlockDriverState *bs = child->bs;
290961007b31SStefan Hajnoczi 
2910d93e5726SVladimir Sementsov-Ogievskiy     if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
291161007b31SStefan Hajnoczi         return -ENOMEDIUM;
291261007b31SStefan Hajnoczi     }
291361007b31SStefan Hajnoczi 
2914d6883bc9SVladimir Sementsov-Ogievskiy     if (bdrv_has_readonly_bitmaps(bs)) {
2915d6883bc9SVladimir Sementsov-Ogievskiy         return -EPERM;
2916d6883bc9SVladimir Sementsov-Ogievskiy     }
2917d6883bc9SVladimir Sementsov-Ogievskiy 
2918d93e5726SVladimir Sementsov-Ogievskiy     if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) {
2919d93e5726SVladimir Sementsov-Ogievskiy         return -EIO;
292061007b31SStefan Hajnoczi     }
292161007b31SStefan Hajnoczi 
292261007b31SStefan Hajnoczi     /* Do nothing if disabled.  */
292361007b31SStefan Hajnoczi     if (!(bs->open_flags & BDRV_O_UNMAP)) {
292461007b31SStefan Hajnoczi         return 0;
292561007b31SStefan Hajnoczi     }
292661007b31SStefan Hajnoczi 
292702aefe43SEric Blake     if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
292861007b31SStefan Hajnoczi         return 0;
292961007b31SStefan Hajnoczi     }
293061007b31SStefan Hajnoczi 
29313482b9bcSEric Blake     /* Discard is advisory, but some devices track and coalesce
29323482b9bcSEric Blake      * unaligned requests, so we must pass everything down rather than
29333482b9bcSEric Blake      * round here.  Still, most devices will simply ignore
29343482b9bcSEric Blake      * unaligned requests (returning -ENOTSUP), so we must fragment
29353482b9bcSEric Blake      * the request accordingly.  */
293602aefe43SEric Blake     align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2937b8d0a980SEric Blake     assert(align % bs->bl.request_alignment == 0);
2938b8d0a980SEric Blake     head = offset % align;
2939f5a5ca79SManos Pitsidianakis     tail = (offset + bytes) % align;
29409f1963b3SEric Blake 
294199723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
2942f5a5ca79SManos Pitsidianakis     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
294350824995SFam Zheng 
294400695c27SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
2945ec050f77SDenis V. Lunev     if (ret < 0) {
2946ec050f77SDenis V. Lunev         goto out;
2947ec050f77SDenis V. Lunev     }
2948ec050f77SDenis V. Lunev 
29499f1963b3SEric Blake     max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
29509f1963b3SEric Blake                                    align);
29513482b9bcSEric Blake     assert(max_pdiscard >= bs->bl.request_alignment);
29529f1963b3SEric Blake 
2953f5a5ca79SManos Pitsidianakis     while (bytes > 0) {
2954d93e5726SVladimir Sementsov-Ogievskiy         int64_t num = bytes;
29553482b9bcSEric Blake 
29563482b9bcSEric Blake         if (head) {
29573482b9bcSEric Blake             /* Make small requests to get to alignment boundaries. */
2958f5a5ca79SManos Pitsidianakis             num = MIN(bytes, align - head);
29593482b9bcSEric Blake             if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
29603482b9bcSEric Blake                 num %= bs->bl.request_alignment;
29613482b9bcSEric Blake             }
29623482b9bcSEric Blake             head = (head + num) % align;
29633482b9bcSEric Blake             assert(num < max_pdiscard);
29643482b9bcSEric Blake         } else if (tail) {
29653482b9bcSEric Blake             if (num > align) {
29663482b9bcSEric Blake                 /* Shorten the request to the last aligned cluster.  */
29673482b9bcSEric Blake                 num -= tail;
29683482b9bcSEric Blake             } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
29693482b9bcSEric Blake                        tail > bs->bl.request_alignment) {
29703482b9bcSEric Blake                 tail %= bs->bl.request_alignment;
29713482b9bcSEric Blake                 num -= tail;
29723482b9bcSEric Blake             }
29733482b9bcSEric Blake         }
29743482b9bcSEric Blake         /* limit request size */
29753482b9bcSEric Blake         if (num > max_pdiscard) {
29763482b9bcSEric Blake             num = max_pdiscard;
29773482b9bcSEric Blake         }
297861007b31SStefan Hajnoczi 
2979d470ad42SMax Reitz         if (!bs->drv) {
2980d470ad42SMax Reitz             ret = -ENOMEDIUM;
2981d470ad42SMax Reitz             goto out;
2982d470ad42SMax Reitz         }
298347a5486dSEric Blake         if (bs->drv->bdrv_co_pdiscard) {
298447a5486dSEric Blake             ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
298561007b31SStefan Hajnoczi         } else {
298661007b31SStefan Hajnoczi             BlockAIOCB *acb;
298761007b31SStefan Hajnoczi             CoroutineIOCompletion co = {
298861007b31SStefan Hajnoczi                 .coroutine = qemu_coroutine_self(),
298961007b31SStefan Hajnoczi             };
299061007b31SStefan Hajnoczi 
29914da444a0SEric Blake             acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
299261007b31SStefan Hajnoczi                                              bdrv_co_io_em_complete, &co);
299361007b31SStefan Hajnoczi             if (acb == NULL) {
2994b1066c87SFam Zheng                 ret = -EIO;
2995b1066c87SFam Zheng                 goto out;
299661007b31SStefan Hajnoczi             } else {
299761007b31SStefan Hajnoczi                 qemu_coroutine_yield();
299861007b31SStefan Hajnoczi                 ret = co.ret;
299961007b31SStefan Hajnoczi             }
300061007b31SStefan Hajnoczi         }
300161007b31SStefan Hajnoczi         if (ret && ret != -ENOTSUP) {
3002b1066c87SFam Zheng             goto out;
300361007b31SStefan Hajnoczi         }
300461007b31SStefan Hajnoczi 
30059f1963b3SEric Blake         offset += num;
3006f5a5ca79SManos Pitsidianakis         bytes -= num;
300761007b31SStefan Hajnoczi     }
3008b1066c87SFam Zheng     ret = 0;
3009b1066c87SFam Zheng out:
301000695c27SFam Zheng     bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
3011b1066c87SFam Zheng     tracked_request_end(&req);
301299723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
3013b1066c87SFam Zheng     return ret;
301461007b31SStefan Hajnoczi }
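/*
 * Worked example, for illustration only: with request_alignment = 512 and
 * pdiscard_alignment = 4096, a call with offset = 1024 and bytes = 10240
 * gives align = 4096, head = 1024 and tail = 3072.  The loop then issues
 * three driver requests: 3072 bytes at 1024 (up to the first 4096 boundary),
 * 4096 bytes at 4096 (aligned middle), and 3072 bytes at 8192 (tail).
 */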
301561007b31SStefan Hajnoczi 
3016d93e5726SVladimir Sementsov-Ogievskiy int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
301761007b31SStefan Hajnoczi {
301861007b31SStefan Hajnoczi     DiscardCo rwco = {
30190b9fd3f4SFam Zheng         .child = child,
30200c51a893SEric Blake         .offset = offset,
3021f5a5ca79SManos Pitsidianakis         .bytes = bytes,
302261007b31SStefan Hajnoczi     };
302361007b31SStefan Hajnoczi 
30247d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(child->bs, bdrv_pdiscard_co_entry, &rwco);
302561007b31SStefan Hajnoczi }
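/*
 * Note, an illustrative sketch rather than original documentation: the
 * DiscardCo/entry-function pattern above is how the synchronous wrappers in
 * this file drive a coroutine_fn.  Arguments are packed into a struct and
 * handed to bdrv_run_co(), which runs the entry function in coroutine
 * context (entering a new coroutine and polling if the caller is not in one
 * already).  A hypothetical wrapper MyCo/my_co_entry follows the same shape:
 *
 *     MyCo myco = { .child = child, .offset = offset };
 *
 *     return bdrv_run_co(child->bs, my_co_entry, &myco);
 */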
302661007b31SStefan Hajnoczi 
302748af776aSKevin Wolf int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
302861007b31SStefan Hajnoczi {
302961007b31SStefan Hajnoczi     BlockDriver *drv = bs->drv;
30305c5ae76aSFam Zheng     CoroutineIOCompletion co = {
30315c5ae76aSFam Zheng         .coroutine = qemu_coroutine_self(),
30325c5ae76aSFam Zheng     };
30335c5ae76aSFam Zheng     BlockAIOCB *acb;
303461007b31SStefan Hajnoczi 
303599723548SPaolo Bonzini     bdrv_inc_in_flight(bs);
303616a389dcSKevin Wolf     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
30375c5ae76aSFam Zheng         co.ret = -ENOTSUP;
30385c5ae76aSFam Zheng         goto out;
30395c5ae76aSFam Zheng     }
30405c5ae76aSFam Zheng 
304116a389dcSKevin Wolf     if (drv->bdrv_co_ioctl) {
304216a389dcSKevin Wolf         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
304316a389dcSKevin Wolf     } else {
30445c5ae76aSFam Zheng         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
30455c5ae76aSFam Zheng         if (!acb) {
3046c8a9fd80SFam Zheng             co.ret = -ENOTSUP;
3047c8a9fd80SFam Zheng             goto out;
30485c5ae76aSFam Zheng         }
30495c5ae76aSFam Zheng         qemu_coroutine_yield();
305016a389dcSKevin Wolf     }
30515c5ae76aSFam Zheng out:
305299723548SPaolo Bonzini     bdrv_dec_in_flight(bs);
30535c5ae76aSFam Zheng     return co.ret;
30545c5ae76aSFam Zheng }
30555c5ae76aSFam Zheng 
305661007b31SStefan Hajnoczi void *qemu_blockalign(BlockDriverState *bs, size_t size)
305761007b31SStefan Hajnoczi {
305861007b31SStefan Hajnoczi     return qemu_memalign(bdrv_opt_mem_align(bs), size);
305961007b31SStefan Hajnoczi }
306061007b31SStefan Hajnoczi 
306161007b31SStefan Hajnoczi void *qemu_blockalign0(BlockDriverState *bs, size_t size)
306261007b31SStefan Hajnoczi {
306361007b31SStefan Hajnoczi     return memset(qemu_blockalign(bs, size), 0, size);
306461007b31SStefan Hajnoczi }
306561007b31SStefan Hajnoczi 
306661007b31SStefan Hajnoczi void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
306761007b31SStefan Hajnoczi {
306861007b31SStefan Hajnoczi     size_t align = bdrv_opt_mem_align(bs);
306961007b31SStefan Hajnoczi 
307061007b31SStefan Hajnoczi     /* Ensure that NULL is never returned on success */
307161007b31SStefan Hajnoczi     assert(align > 0);
307261007b31SStefan Hajnoczi     if (size == 0) {
307361007b31SStefan Hajnoczi         size = align;
307461007b31SStefan Hajnoczi     }
307561007b31SStefan Hajnoczi 
307661007b31SStefan Hajnoczi     return qemu_try_memalign(align, size);
307761007b31SStefan Hajnoczi }
307861007b31SStefan Hajnoczi 
307961007b31SStefan Hajnoczi void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
308061007b31SStefan Hajnoczi {
308161007b31SStefan Hajnoczi     void *mem = qemu_try_blockalign(bs, size);
308261007b31SStefan Hajnoczi 
308361007b31SStefan Hajnoczi     if (mem) {
308461007b31SStefan Hajnoczi         memset(mem, 0, size);
308561007b31SStefan Hajnoczi     }
308661007b31SStefan Hajnoczi 
308761007b31SStefan Hajnoczi     return mem;
308861007b31SStefan Hajnoczi }
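/*
 * Usage sketch (illustrative): bounce buffers for O_DIRECT-style backends
 * are allocated with these helpers so they meet the driver's memory
 * alignment, and freed with qemu_vfree():
 *
 *     void *buf = qemu_try_blockalign(bs, len);
 *     if (buf == NULL) {
 *         return -ENOMEM;
 *     }
 *     ...
 *     qemu_vfree(buf);
 */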
308961007b31SStefan Hajnoczi 
309061007b31SStefan Hajnoczi /*
309161007b31SStefan Hajnoczi  * Check if all memory in this vector satisfies bdrv_min_mem_align(bs).
309261007b31SStefan Hajnoczi  */
309361007b31SStefan Hajnoczi bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
309461007b31SStefan Hajnoczi {
309561007b31SStefan Hajnoczi     int i;
30964196d2f0SDenis V. Lunev     size_t alignment = bdrv_min_mem_align(bs);
309761007b31SStefan Hajnoczi 
309861007b31SStefan Hajnoczi     for (i = 0; i < qiov->niov; i++) {
309961007b31SStefan Hajnoczi         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
310061007b31SStefan Hajnoczi             return false;
310161007b31SStefan Hajnoczi         }
310261007b31SStefan Hajnoczi         if (qiov->iov[i].iov_len % alignment) {
310361007b31SStefan Hajnoczi             return false;
310461007b31SStefan Hajnoczi         }
310561007b31SStefan Hajnoczi     }
310661007b31SStefan Hajnoczi 
310761007b31SStefan Hajnoczi     return true;
310861007b31SStefan Hajnoczi }
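/*
 * Illustrative use (an assumption, not taken from this file): callers can
 * test a vector before submitting it and fall back to a bounce buffer when
 * the guest-supplied memory is not sufficiently aligned:
 *
 *     if (!bdrv_qiov_is_aligned(bs, qiov)) {
 *         buf = qemu_try_blockalign(bs, qiov->size);
 *         ...
 *     }
 */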
310961007b31SStefan Hajnoczi 
311061007b31SStefan Hajnoczi void bdrv_add_before_write_notifier(BlockDriverState *bs,
311161007b31SStefan Hajnoczi                                     NotifierWithReturn *notifier)
311261007b31SStefan Hajnoczi {
311361007b31SStefan Hajnoczi     notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
311461007b31SStefan Hajnoczi }
311561007b31SStefan Hajnoczi 
311661007b31SStefan Hajnoczi void bdrv_io_plug(BlockDriverState *bs)
311761007b31SStefan Hajnoczi {
31186b98bd64SPaolo Bonzini     BdrvChild *child;
31196b98bd64SPaolo Bonzini 
31206b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
31216b98bd64SPaolo Bonzini         bdrv_io_plug(child->bs);
31226b98bd64SPaolo Bonzini     }
31236b98bd64SPaolo Bonzini 
3124850d54a2SPaolo Bonzini     if (atomic_fetch_inc(&bs->io_plugged) == 0) {
312561007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
312661007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_plug) {
312761007b31SStefan Hajnoczi             drv->bdrv_io_plug(bs);
31286b98bd64SPaolo Bonzini         }
312961007b31SStefan Hajnoczi     }
313061007b31SStefan Hajnoczi }
313161007b31SStefan Hajnoczi 
313261007b31SStefan Hajnoczi void bdrv_io_unplug(BlockDriverState *bs)
313361007b31SStefan Hajnoczi {
31346b98bd64SPaolo Bonzini     BdrvChild *child;
31356b98bd64SPaolo Bonzini 
31366b98bd64SPaolo Bonzini     assert(bs->io_plugged);
3137850d54a2SPaolo Bonzini     if (atomic_fetch_dec(&bs->io_plugged) == 1) {
313861007b31SStefan Hajnoczi         BlockDriver *drv = bs->drv;
313961007b31SStefan Hajnoczi         if (drv && drv->bdrv_io_unplug) {
314061007b31SStefan Hajnoczi             drv->bdrv_io_unplug(bs);
314161007b31SStefan Hajnoczi         }
314261007b31SStefan Hajnoczi     }
314361007b31SStefan Hajnoczi 
31446b98bd64SPaolo Bonzini     QLIST_FOREACH(child, &bs->children, next) {
31456b98bd64SPaolo Bonzini         bdrv_io_unplug(child->bs);
31466b98bd64SPaolo Bonzini     }
31476b98bd64SPaolo Bonzini }
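/*
 * Usage sketch (illustrative): plug/unplug brackets a batch of submissions
 * so drivers that support it can queue the requests and submit them in one
 * go when the outermost section ends:
 *
 *     bdrv_io_plug(bs);
 *     ... submit several requests ...
 *     bdrv_io_unplug(bs);
 *
 * The atomic io_plugged counter makes nesting safe: only the first plug and
 * the last unplug reach the driver callbacks.
 */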
314823d0ba93SFam Zheng 
314923d0ba93SFam Zheng void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
315023d0ba93SFam Zheng {
315123d0ba93SFam Zheng     BdrvChild *child;
315223d0ba93SFam Zheng 
315323d0ba93SFam Zheng     if (bs->drv && bs->drv->bdrv_register_buf) {
315423d0ba93SFam Zheng         bs->drv->bdrv_register_buf(bs, host, size);
315523d0ba93SFam Zheng     }
315623d0ba93SFam Zheng     QLIST_FOREACH(child, &bs->children, next) {
315723d0ba93SFam Zheng         bdrv_register_buf(child->bs, host, size);
315823d0ba93SFam Zheng     }
315923d0ba93SFam Zheng }
316023d0ba93SFam Zheng 
316123d0ba93SFam Zheng void bdrv_unregister_buf(BlockDriverState *bs, void *host)
316223d0ba93SFam Zheng {
316323d0ba93SFam Zheng     BdrvChild *child;
316423d0ba93SFam Zheng 
316523d0ba93SFam Zheng     if (bs->drv && bs->drv->bdrv_unregister_buf) {
316623d0ba93SFam Zheng         bs->drv->bdrv_unregister_buf(bs, host);
316723d0ba93SFam Zheng     }
316823d0ba93SFam Zheng     QLIST_FOREACH(child, &bs->children, next) {
316923d0ba93SFam Zheng         bdrv_unregister_buf(child->bs, host);
317023d0ba93SFam Zheng     }
317123d0ba93SFam Zheng }
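/*
 * Note (illustrative): the two walkers above broadcast buffer
 * (un)registration to every node in the graph, so a driver that pre-maps
 * host memory for zero-copy I/O only needs to implement the hooks:
 *
 *     .bdrv_register_buf   = mydrv_register_buf,
 *     .bdrv_unregister_buf = mydrv_unregister_buf,
 *
 * where mydrv_* are hypothetical driver callbacks.
 */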
3172fcc67678SFam Zheng 
317367b51fb9SVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_co_copy_range_internal(
317467b51fb9SVladimir Sementsov-Ogievskiy         BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
317567b51fb9SVladimir Sementsov-Ogievskiy         uint64_t dst_offset, uint64_t bytes,
317667b51fb9SVladimir Sementsov-Ogievskiy         BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
3177fcc67678SFam Zheng         bool recurse_src)
3178fcc67678SFam Zheng {
3179999658a0SVladimir Sementsov-Ogievskiy     BdrvTrackedRequest req;
3180fcc67678SFam Zheng     int ret;
3181fcc67678SFam Zheng 
3182fe0480d6SKevin Wolf     /* TODO We can support BDRV_REQ_NO_FALLBACK here */
3183fe0480d6SKevin Wolf     assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
3184fe0480d6SKevin Wolf     assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
3185fe0480d6SKevin Wolf 
3186d4d3e5a0SFam Zheng     if (!dst || !dst->bs) {
3187fcc67678SFam Zheng         return -ENOMEDIUM;
3188fcc67678SFam Zheng     }
3189fcc67678SFam Zheng     ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
3190fcc67678SFam Zheng     if (ret) {
3191fcc67678SFam Zheng         return ret;
3192fcc67678SFam Zheng     }
319367b51fb9SVladimir Sementsov-Ogievskiy     if (write_flags & BDRV_REQ_ZERO_WRITE) {
319467b51fb9SVladimir Sementsov-Ogievskiy         return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3195fcc67678SFam Zheng     }
3196fcc67678SFam Zheng 
3197d4d3e5a0SFam Zheng     if (!src || !src->bs) {
3198d4d3e5a0SFam Zheng         return -ENOMEDIUM;
3199d4d3e5a0SFam Zheng     }
3200d4d3e5a0SFam Zheng     ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
3201d4d3e5a0SFam Zheng     if (ret) {
3202d4d3e5a0SFam Zheng         return ret;
3203d4d3e5a0SFam Zheng     }
3204d4d3e5a0SFam Zheng 
3205fcc67678SFam Zheng     if (!src->bs->drv->bdrv_co_copy_range_from
3206fcc67678SFam Zheng         || !dst->bs->drv->bdrv_co_copy_range_to
3207fcc67678SFam Zheng         || src->bs->encrypted || dst->bs->encrypted) {
3208fcc67678SFam Zheng         return -ENOTSUP;
3209fcc67678SFam Zheng     }
3210999658a0SVladimir Sementsov-Ogievskiy 
3211999658a0SVladimir Sementsov-Ogievskiy     if (recurse_src) {
3212d4d3e5a0SFam Zheng         bdrv_inc_in_flight(src->bs);
3213999658a0SVladimir Sementsov-Ogievskiy         tracked_request_begin(&req, src->bs, src_offset, bytes,
3214999658a0SVladimir Sementsov-Ogievskiy                               BDRV_TRACKED_READ);
321537aec7d7SFam Zheng 
321609d2f948SVladimir Sementsov-Ogievskiy         /* BDRV_REQ_SERIALISING is only for write operations */
321709d2f948SVladimir Sementsov-Ogievskiy         assert(!(read_flags & BDRV_REQ_SERIALISING));
3218304d9d7fSMax Reitz         bdrv_wait_serialising_requests(&req);
3219999658a0SVladimir Sementsov-Ogievskiy 
322037aec7d7SFam Zheng         ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3221fcc67678SFam Zheng                                                     src, src_offset,
3222fcc67678SFam Zheng                                                     dst, dst_offset,
322367b51fb9SVladimir Sementsov-Ogievskiy                                                     bytes,
322467b51fb9SVladimir Sementsov-Ogievskiy                                                     read_flags, write_flags);
3225999658a0SVladimir Sementsov-Ogievskiy 
3226999658a0SVladimir Sementsov-Ogievskiy         tracked_request_end(&req);
3227999658a0SVladimir Sementsov-Ogievskiy         bdrv_dec_in_flight(src->bs);
3228fcc67678SFam Zheng     } else {
3229999658a0SVladimir Sementsov-Ogievskiy         bdrv_inc_in_flight(dst->bs);
3230999658a0SVladimir Sementsov-Ogievskiy         tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3231999658a0SVladimir Sementsov-Ogievskiy                               BDRV_TRACKED_WRITE);
32320eb1e891SFam Zheng         ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
32330eb1e891SFam Zheng                                         write_flags);
32340eb1e891SFam Zheng         if (!ret) {
323537aec7d7SFam Zheng             ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3236fcc67678SFam Zheng                                                       src, src_offset,
3237fcc67678SFam Zheng                                                       dst, dst_offset,
323867b51fb9SVladimir Sementsov-Ogievskiy                                                       bytes,
323967b51fb9SVladimir Sementsov-Ogievskiy                                                       read_flags, write_flags);
32400eb1e891SFam Zheng         }
32410eb1e891SFam Zheng         bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3242999658a0SVladimir Sementsov-Ogievskiy         tracked_request_end(&req);
3243d4d3e5a0SFam Zheng         bdrv_dec_in_flight(dst->bs);
3244999658a0SVladimir Sementsov-Ogievskiy     }
3245999658a0SVladimir Sementsov-Ogievskiy 
324637aec7d7SFam Zheng     return ret;
3247fcc67678SFam Zheng }
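/*
 * Illustrative call flow (an assumption based on the code above, not
 * original documentation): a copy offload typically traverses both sides,
 * with the source driver eventually re-entering the destination side:
 *
 *     bdrv_co_copy_range()
 *         bdrv_co_copy_range_from(src, ...)            read-side tracking
 *             src drv->bdrv_co_copy_range_from()
 *                 bdrv_co_copy_range_to(dst, ...)      write-side tracking
 *                     dst drv->bdrv_co_copy_range_to()
 */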
3248fcc67678SFam Zheng 
3249fcc67678SFam Zheng /* Copy range from @src to @dst.
3250fcc67678SFam Zheng  *
3251fcc67678SFam Zheng  * See the comment of bdrv_co_copy_range for the parameter and return value
3252fcc67678SFam Zheng  * semantics. */
3253fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3254fcc67678SFam Zheng                                          BdrvChild *dst, uint64_t dst_offset,
325567b51fb9SVladimir Sementsov-Ogievskiy                                          uint64_t bytes,
325667b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags read_flags,
325767b51fb9SVladimir Sementsov-Ogievskiy                                          BdrvRequestFlags write_flags)
3258fcc67678SFam Zheng {
3259ecc983a5SFam Zheng     trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3260ecc983a5SFam Zheng                                   read_flags, write_flags);
3261fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
326267b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, true);
3263fcc67678SFam Zheng }
3264fcc67678SFam Zheng 
3265fcc67678SFam Zheng /* Copy range from @src to @dst.
3266fcc67678SFam Zheng  *
3267fcc67678SFam Zheng  * See the comment of bdrv_co_copy_range for the parameter and return value
3268fcc67678SFam Zheng  * semantics. */
3269fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3270fcc67678SFam Zheng                                        BdrvChild *dst, uint64_t dst_offset,
327167b51fb9SVladimir Sementsov-Ogievskiy                                        uint64_t bytes,
327267b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags read_flags,
327367b51fb9SVladimir Sementsov-Ogievskiy                                        BdrvRequestFlags write_flags)
3274fcc67678SFam Zheng {
3275ecc983a5SFam Zheng     trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3276ecc983a5SFam Zheng                                 read_flags, write_flags);
3277fcc67678SFam Zheng     return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
327867b51fb9SVladimir Sementsov-Ogievskiy                                        bytes, read_flags, write_flags, false);
3279fcc67678SFam Zheng }
3280fcc67678SFam Zheng 
3281fcc67678SFam Zheng int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3282fcc67678SFam Zheng                                     BdrvChild *dst, uint64_t dst_offset,
328367b51fb9SVladimir Sementsov-Ogievskiy                                     uint64_t bytes, BdrvRequestFlags read_flags,
328467b51fb9SVladimir Sementsov-Ogievskiy                                     BdrvRequestFlags write_flags)
3285fcc67678SFam Zheng {
328637aec7d7SFam Zheng     return bdrv_co_copy_range_from(src, src_offset,
3287fcc67678SFam Zheng                                    dst, dst_offset,
328867b51fb9SVladimir Sementsov-Ogievskiy                                    bytes, read_flags, write_flags);
3289fcc67678SFam Zheng }
32903d9f2d2aSKevin Wolf 
32913d9f2d2aSKevin Wolf static void bdrv_parent_cb_resize(BlockDriverState *bs)
32923d9f2d2aSKevin Wolf {
32933d9f2d2aSKevin Wolf     BdrvChild *c;
32943d9f2d2aSKevin Wolf     QLIST_FOREACH(c, &bs->parents, next_parent) {
3295bd86fb99SMax Reitz         if (c->klass->resize) {
3296bd86fb99SMax Reitz             c->klass->resize(c);
32973d9f2d2aSKevin Wolf         }
32983d9f2d2aSKevin Wolf     }
32993d9f2d2aSKevin Wolf }
33003d9f2d2aSKevin Wolf 
33013d9f2d2aSKevin Wolf /**
33023d9f2d2aSKevin Wolf  * Truncate file to 'offset' bytes (needed only for file protocols)
3303c80d8b06SMax Reitz  *
3304c80d8b06SMax Reitz  * If 'exact' is true, the file must be resized to exactly the given
3305c80d8b06SMax Reitz  * 'offset'.  Otherwise, it is sufficient for the node to be at least
3306c80d8b06SMax Reitz  * 'offset' bytes in length.
33073d9f2d2aSKevin Wolf  */
3308c80d8b06SMax Reitz int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
33097b8e4857SKevin Wolf                                   PreallocMode prealloc, BdrvRequestFlags flags,
33107b8e4857SKevin Wolf                                   Error **errp)
33113d9f2d2aSKevin Wolf {
33123d9f2d2aSKevin Wolf     BlockDriverState *bs = child->bs;
3313*23b93525SMax Reitz     BdrvChild *filtered, *backing;
33143d9f2d2aSKevin Wolf     BlockDriver *drv = bs->drv;
33151bc5f09fSKevin Wolf     BdrvTrackedRequest req;
33161bc5f09fSKevin Wolf     int64_t old_size, new_bytes;
33173d9f2d2aSKevin Wolf     int ret;
33183d9f2d2aSKevin Wolf 
33193d9f2d2aSKevin Wolf 
33203d9f2d2aSKevin Wolf     /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
33213d9f2d2aSKevin Wolf     if (!drv) {
33223d9f2d2aSKevin Wolf         error_setg(errp, "No medium inserted");
33233d9f2d2aSKevin Wolf         return -ENOMEDIUM;
33243d9f2d2aSKevin Wolf     }
33253d9f2d2aSKevin Wolf     if (offset < 0) {
33263d9f2d2aSKevin Wolf         error_setg(errp, "Image size cannot be negative");
33273d9f2d2aSKevin Wolf         return -EINVAL;
33283d9f2d2aSKevin Wolf     }
33293d9f2d2aSKevin Wolf 
33301bc5f09fSKevin Wolf     old_size = bdrv_getlength(bs);
33311bc5f09fSKevin Wolf     if (old_size < 0) {
33321bc5f09fSKevin Wolf         error_setg_errno(errp, -old_size, "Failed to get old image size");
33331bc5f09fSKevin Wolf         return old_size;
33341bc5f09fSKevin Wolf     }
33351bc5f09fSKevin Wolf 
33361bc5f09fSKevin Wolf     if (offset > old_size) {
33371bc5f09fSKevin Wolf         new_bytes = offset - old_size;
33381bc5f09fSKevin Wolf     } else {
33391bc5f09fSKevin Wolf         new_bytes = 0;
33401bc5f09fSKevin Wolf     }
33411bc5f09fSKevin Wolf 
33423d9f2d2aSKevin Wolf     bdrv_inc_in_flight(bs);
33435416a11eSFam Zheng     tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
33445416a11eSFam Zheng                           BDRV_TRACKED_TRUNCATE);
33451bc5f09fSKevin Wolf 
33461bc5f09fSKevin Wolf     /* If we are growing the image and potentially using preallocation for the
33471bc5f09fSKevin Wolf      * new area, we need to make sure that no write requests are made to it
33481bc5f09fSKevin Wolf      * concurrently or they might be overwritten by preallocation. */
33491bc5f09fSKevin Wolf     if (new_bytes) {
3350304d9d7fSMax Reitz         bdrv_mark_request_serialising(&req, 1);
3351cd47d792SFam Zheng     }
3352cd47d792SFam Zheng     if (bs->read_only) {
3353cd47d792SFam Zheng         error_setg(errp, "Image is read-only");
3354cd47d792SFam Zheng         ret = -EACCES;
3355cd47d792SFam Zheng         goto out;
3356cd47d792SFam Zheng     }
3357cd47d792SFam Zheng     ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3358cd47d792SFam Zheng                                     0);
3359cd47d792SFam Zheng     if (ret < 0) {
3360cd47d792SFam Zheng         error_setg_errno(errp, -ret,
3361cd47d792SFam Zheng                          "Failed to prepare request for truncation");
3362cd47d792SFam Zheng         goto out;
33631bc5f09fSKevin Wolf     }
33643d9f2d2aSKevin Wolf 
336593393e69SMax Reitz     filtered = bdrv_filter_child(bs);
3366*23b93525SMax Reitz     backing = bdrv_cow_child(bs);
336793393e69SMax Reitz 
3368955c7d66SKevin Wolf     /*
3369955c7d66SKevin Wolf      * If the image has a backing file that is large enough that it would
3370955c7d66SKevin Wolf      * provide data for the new area, we cannot leave it unallocated because
3371955c7d66SKevin Wolf      * then the backing file content would become visible. Instead, zero-fill
3372955c7d66SKevin Wolf      * the new area.
3373955c7d66SKevin Wolf      *
3374955c7d66SKevin Wolf      * Note that if the image has a backing file, but was opened without the
3375955c7d66SKevin Wolf      * backing file, taking care of keeping things consistent with that backing
3376955c7d66SKevin Wolf      * file is the user's responsibility.
3377955c7d66SKevin Wolf      */
3378*23b93525SMax Reitz     if (new_bytes && backing) {
3379955c7d66SKevin Wolf         int64_t backing_len;
3380955c7d66SKevin Wolf 
3381*23b93525SMax Reitz         backing_len = bdrv_getlength(backing->bs);
3382955c7d66SKevin Wolf         if (backing_len < 0) {
3383955c7d66SKevin Wolf             ret = backing_len;
3384955c7d66SKevin Wolf             error_setg_errno(errp, -ret, "Could not get backing file size");
3385955c7d66SKevin Wolf             goto out;
3386955c7d66SKevin Wolf         }
3387955c7d66SKevin Wolf 
3388955c7d66SKevin Wolf         if (backing_len > old_size) {
3389955c7d66SKevin Wolf             flags |= BDRV_REQ_ZERO_WRITE;
3390955c7d66SKevin Wolf         }
3391955c7d66SKevin Wolf     }
3392955c7d66SKevin Wolf 
33936b7e8f8bSMax Reitz     if (drv->bdrv_co_truncate) {
339492b92799SKevin Wolf         if (flags & ~bs->supported_truncate_flags) {
339592b92799SKevin Wolf             error_setg(errp, "Block driver does not support requested flags");
339692b92799SKevin Wolf             ret = -ENOTSUP;
339792b92799SKevin Wolf             goto out;
339892b92799SKevin Wolf         }
339992b92799SKevin Wolf         ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
340093393e69SMax Reitz     } else if (filtered) {
340193393e69SMax Reitz         ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
34026b7e8f8bSMax Reitz     } else {
34033d9f2d2aSKevin Wolf         error_setg(errp, "Image format driver does not support resize");
34043d9f2d2aSKevin Wolf         ret = -ENOTSUP;
34053d9f2d2aSKevin Wolf         goto out;
34063d9f2d2aSKevin Wolf     }
34073d9f2d2aSKevin Wolf     if (ret < 0) {
34083d9f2d2aSKevin Wolf         goto out;
34093d9f2d2aSKevin Wolf     }
34106b7e8f8bSMax Reitz 
34113d9f2d2aSKevin Wolf     ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
34123d9f2d2aSKevin Wolf     if (ret < 0) {
34133d9f2d2aSKevin Wolf         error_setg_errno(errp, -ret, "Could not refresh total sector count");
34143d9f2d2aSKevin Wolf     } else {
34153d9f2d2aSKevin Wolf         offset = bs->total_sectors * BDRV_SECTOR_SIZE;
34163d9f2d2aSKevin Wolf     }
3417cd47d792SFam Zheng     /* It's possible that truncation succeeded while refresh_total_sectors
3418cd47d792SFam Zheng      * failed; the latter doesn't affect how we should finish the request.
3419cd47d792SFam Zheng      * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
3420cd47d792SFam Zheng     bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
34213d9f2d2aSKevin Wolf 
34223d9f2d2aSKevin Wolf out:
34231bc5f09fSKevin Wolf     tracked_request_end(&req);
34243d9f2d2aSKevin Wolf     bdrv_dec_in_flight(bs);
34251bc5f09fSKevin Wolf 
34263d9f2d2aSKevin Wolf     return ret;
34273d9f2d2aSKevin Wolf }
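/*
 * Worked example (illustrative): growing a 1 GiB overlay to 2 GiB while its
 * backing file is 1.5 GiB long.  Here backing_len (1.5 GiB) exceeds old_size
 * (1 GiB), so BDRV_REQ_ZERO_WRITE is added and the new area is zeroed;
 * without it, reads between 1 GiB and 1.5 GiB would show stale backing-file
 * data instead of zeroes.
 */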
34283d9f2d2aSKevin Wolf 
34293d9f2d2aSKevin Wolf typedef struct TruncateCo {
34303d9f2d2aSKevin Wolf     BdrvChild *child;
34313d9f2d2aSKevin Wolf     int64_t offset;
3432c80d8b06SMax Reitz     bool exact;
34333d9f2d2aSKevin Wolf     PreallocMode prealloc;
34347b8e4857SKevin Wolf     BdrvRequestFlags flags;
34353d9f2d2aSKevin Wolf     Error **errp;
34363d9f2d2aSKevin Wolf } TruncateCo;
34373d9f2d2aSKevin Wolf 
34387d2410ceSVladimir Sementsov-Ogievskiy static int coroutine_fn bdrv_truncate_co_entry(void *opaque)
34393d9f2d2aSKevin Wolf {
34403d9f2d2aSKevin Wolf     TruncateCo *tco = opaque;
34417d2410ceSVladimir Sementsov-Ogievskiy 
34427d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_co_truncate(tco->child, tco->offset, tco->exact,
34437b8e4857SKevin Wolf                             tco->prealloc, tco->flags, tco->errp);
34443d9f2d2aSKevin Wolf }
34453d9f2d2aSKevin Wolf 
3446c80d8b06SMax Reitz int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
34477b8e4857SKevin Wolf                   PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
34483d9f2d2aSKevin Wolf {
34493d9f2d2aSKevin Wolf     TruncateCo tco = {
34503d9f2d2aSKevin Wolf         .child      = child,
34513d9f2d2aSKevin Wolf         .offset     = offset,
3452c80d8b06SMax Reitz         .exact      = exact,
34533d9f2d2aSKevin Wolf         .prealloc   = prealloc,
34547b8e4857SKevin Wolf         .flags      = flags,
34553d9f2d2aSKevin Wolf         .errp       = errp,
34563d9f2d2aSKevin Wolf     };
34573d9f2d2aSKevin Wolf 
34587d2410ceSVladimir Sementsov-Ogievskiy     return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco);
34593d9f2d2aSKevin Wolf }
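/*
 * Usage sketch (illustrative): a synchronous caller resizes a node with
 *
 *     ret = bdrv_truncate(child, new_size, true, PREALLOC_MODE_OFF, 0,
 *                         &local_err);
 *
 * where new_size and local_err are the caller's variables.
 */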
3460