xref: /qemu/block/copy-before-write.c (revision 91e11db7bcc486db2dc2bdab94ac5de62c02ce9a)
1  /*
2   * copy-before-write filter driver
3   *
4   * The driver performs Copy-Before-Write (CBW) operation: it is injected above
5   * some node, and before each write it copies _old_ data to the target node.
6   *
7   * Copyright (c) 2018-2021 Virtuozzo International GmbH.
8   *
9   * Author:
10   *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
11   *
12   * This program is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU General Public License as published by
14   * the Free Software Foundation; either version 2 of the License, or
15   * (at your option) any later version.
16   *
17   * This program is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU General Public License for more details.
21   *
22   * You should have received a copy of the GNU General Public License
23   * along with this program. If not, see <http://www.gnu.org/licenses/>.
24   */
25  
26  #include "qemu/osdep.h"
27  #include "qapi/qmp/qjson.h"
28  
29  #include "sysemu/block-backend.h"
30  #include "qemu/cutils.h"
31  #include "qapi/error.h"
32  #include "block/block_int.h"
33  #include "block/qdict.h"
34  #include "block/block-copy.h"
35  #include "block/dirty-bitmap.h"
36  
37  #include "block/copy-before-write.h"
38  #include "block/reqlist.h"
39  
40  #include "qapi/qapi-visit-block-core.h"
41  
42  typedef struct BDRVCopyBeforeWriteState {
43      BlockCopyState *bcs;
44      BdrvChild *target;
45      OnCbwError on_cbw_error;
46      uint32_t cbw_timeout_ns;
47  
48      /*
49       * @lock: protects access to @access_bitmap, @done_bitmap and
50       * @frozen_read_reqs
51       */
52      CoMutex lock;
53  
54      /*
55       * @access_bitmap: represents areas allowed for reading by fleecing user.
56       * Reading from non-dirty areas leads to -EACCES.
57       */
58      BdrvDirtyBitmap *access_bitmap;
59  
60      /*
61       * @done_bitmap: represents areas that was successfully copied to @target by
62       * copy-before-write operations.
63       */
64      BdrvDirtyBitmap *done_bitmap;
65  
66      /*
67       * @frozen_read_reqs: current read requests for fleecing user in bs->file
68       * node. These areas must not be rewritten by guest.
69       */
70      BlockReqList frozen_read_reqs;
71  
72      /*
73       * @snapshot_error is normally zero. But on first copy-before-write failure
74       * when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT, @snapshot_error takes
75       * value of this error (<0). After that all in-flight and further
76       * snapshot-API requests will fail with that error.
77       */
78      int snapshot_error;
79  } BDRVCopyBeforeWriteState;
80  
81  static int coroutine_fn GRAPH_RDLOCK
82  cbw_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
83                QEMUIOVector *qiov, BdrvRequestFlags flags)
84  {
85      return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
86  }
87  
88  static void block_copy_cb(void *opaque)
89  {
90      BlockDriverState *bs = opaque;
91  
92      bdrv_dec_in_flight(bs);
93  }
94  
95  /*
96   * Do copy-before-write operation.
97   *
98   * On failure guest request must be failed too.
99   *
100   * On success, we also wait for all in-flight fleecing read requests in source
101   * node, and it's guaranteed that after cbw_do_copy_before_write() successful
102   * return there are no such requests and they will never appear.
103   */
104  static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs,
105          uint64_t offset, uint64_t bytes, BdrvRequestFlags flags)
106  {
107      BDRVCopyBeforeWriteState *s = bs->opaque;
108      int ret;
109      uint64_t off, end;
110      int64_t cluster_size = block_copy_cluster_size(s->bcs);
111  
112      if (flags & BDRV_REQ_WRITE_UNCHANGED) {
113          return 0;
114      }
115  
116      if (s->snapshot_error) {
117          return 0;
118      }
119  
120      off = QEMU_ALIGN_DOWN(offset, cluster_size);
121      end = QEMU_ALIGN_UP(offset + bytes, cluster_size);
122  
123      /*
124       * Increase in_flight, so that in case of timed-out block-copy, the
125       * remaining background block_copy() request (which can't be immediately
126       * cancelled by timeout) is presented in bs->in_flight. This way we are
127       * sure that on bs close() we'll previously wait for all timed-out but yet
128       * running block_copy calls.
129       */
130      bdrv_inc_in_flight(bs);
131      ret = block_copy(s->bcs, off, end - off, true, s->cbw_timeout_ns,
132                       block_copy_cb, bs);
133      if (ret < 0 && s->on_cbw_error == ON_CBW_ERROR_BREAK_GUEST_WRITE) {
134          return ret;
135      }
136  
137      WITH_QEMU_LOCK_GUARD(&s->lock) {
138          if (ret < 0) {
139              assert(s->on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT);
140              if (!s->snapshot_error) {
141                  s->snapshot_error = ret;
142              }
143          } else {
144              bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off);
145          }
146          reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock);
147      }
148  
149      return 0;
150  }
151  
152  static int coroutine_fn GRAPH_RDLOCK
153  cbw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
154  {
155      int ret = cbw_do_copy_before_write(bs, offset, bytes, 0);
156      if (ret < 0) {
157          return ret;
158      }
159  
160      return bdrv_co_pdiscard(bs->file, offset, bytes);
161  }
162  
163  static int coroutine_fn GRAPH_RDLOCK
164  cbw_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
165                       BdrvRequestFlags flags)
166  {
167      int ret = cbw_do_copy_before_write(bs, offset, bytes, flags);
168      if (ret < 0) {
169          return ret;
170      }
171  
172      return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
173  }
174  
175  static coroutine_fn GRAPH_RDLOCK
176  int cbw_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
177                     QEMUIOVector *qiov, BdrvRequestFlags flags)
178  {
179      int ret = cbw_do_copy_before_write(bs, offset, bytes, flags);
180      if (ret < 0) {
181          return ret;
182      }
183  
184      return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
185  }
186  
187  static int coroutine_fn GRAPH_RDLOCK cbw_co_flush(BlockDriverState *bs)
188  {
189      if (!bs->file) {
190          return 0;
191      }
192  
193      return bdrv_co_flush(bs->file->bs);
194  }
195  
196  /*
197   * If @offset not accessible - return NULL.
198   *
199   * Otherwise, set @pnum to some bytes that accessible from @file (@file is set
200   * to bs->file or to s->target). Return newly allocated BlockReq object that
201   * should be than passed to cbw_snapshot_read_unlock().
202   *
203   * It's guaranteed that guest writes will not interact in the region until
204   * cbw_snapshot_read_unlock() called.
205   */
206  static coroutine_fn BlockReq *
207  cbw_snapshot_read_lock(BlockDriverState *bs, int64_t offset, int64_t bytes,
208                         int64_t *pnum, BdrvChild **file)
209  {
210      BDRVCopyBeforeWriteState *s = bs->opaque;
211      BlockReq *req = g_new(BlockReq, 1);
212      bool done;
213  
214      QEMU_LOCK_GUARD(&s->lock);
215  
216      if (s->snapshot_error) {
217          g_free(req);
218          return NULL;
219      }
220  
221      if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) {
222          g_free(req);
223          return NULL;
224      }
225  
226      done = bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, pnum);
227      if (done) {
228          /*
229           * Special invalid BlockReq, that is handled in
230           * cbw_snapshot_read_unlock(). We don't need to lock something to read
231           * from s->target.
232           */
233          *req = (BlockReq) {.offset = -1, .bytes = -1};
234          *file = s->target;
235      } else {
236          reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes);
237          *file = bs->file;
238      }
239  
240      return req;
241  }
242  
243  static coroutine_fn void
244  cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req)
245  {
246      BDRVCopyBeforeWriteState *s = bs->opaque;
247  
248      if (req->offset == -1 && req->bytes == -1) {
249          g_free(req);
250          return;
251      }
252  
253      QEMU_LOCK_GUARD(&s->lock);
254  
255      reqlist_remove_req(req);
256      g_free(req);
257  }
258  
259  static int coroutine_fn GRAPH_RDLOCK
260  cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes,
261                         QEMUIOVector *qiov, size_t qiov_offset)
262  {
263      BlockReq *req;
264      BdrvChild *file;
265      int ret;
266  
267      /* TODO: upgrade to async loop using AioTask */
268      while (bytes) {
269          int64_t cur_bytes;
270  
271          req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &file);
272          if (!req) {
273              return -EACCES;
274          }
275  
276          ret = bdrv_co_preadv_part(file, offset, cur_bytes,
277                                    qiov, qiov_offset, 0);
278          cbw_snapshot_read_unlock(bs, req);
279          if (ret < 0) {
280              return ret;
281          }
282  
283          bytes -= cur_bytes;
284          offset += cur_bytes;
285          qiov_offset += cur_bytes;
286      }
287  
288      return 0;
289  }
290  
291  static int coroutine_fn GRAPH_RDLOCK
292  cbw_co_snapshot_block_status(BlockDriverState *bs,
293                               bool want_zero, int64_t offset, int64_t bytes,
294                               int64_t *pnum, int64_t *map,
295                               BlockDriverState **file)
296  {
297      BDRVCopyBeforeWriteState *s = bs->opaque;
298      BlockReq *req;
299      int ret;
300      int64_t cur_bytes;
301      BdrvChild *child;
302  
303      req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &child);
304      if (!req) {
305          return -EACCES;
306      }
307  
308      ret = bdrv_block_status(child->bs, offset, cur_bytes, pnum, map, file);
309      if (child == s->target) {
310          /*
311           * We refer to s->target only for areas that we've written to it.
312           * And we can not report unallocated blocks in s->target: this will
313           * break generic block-status-above logic, that will go to
314           * copy-before-write filtered child in this case.
315           */
316          assert(ret & BDRV_BLOCK_ALLOCATED);
317      }
318  
319      cbw_snapshot_read_unlock(bs, req);
320  
321      return ret;
322  }
323  
324  static int coroutine_fn GRAPH_RDLOCK
325  cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
326  {
327      BDRVCopyBeforeWriteState *s = bs->opaque;
328  
329      WITH_QEMU_LOCK_GUARD(&s->lock) {
330          bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
331      }
332  
333      block_copy_reset(s->bcs, offset, bytes);
334  
335      return bdrv_co_pdiscard(s->target, offset, bytes);
336  }
337  
338  static void cbw_refresh_filename(BlockDriverState *bs)
339  {
340      pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
341              bs->file->bs->filename);
342  }
343  
344  static void GRAPH_RDLOCK
345  cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
346                 BlockReopenQueue *reopen_queue,
347                 uint64_t perm, uint64_t shared,
348                 uint64_t *nperm, uint64_t *nshared)
349  {
350      if (!(role & BDRV_CHILD_FILTERED)) {
351          /*
352           * Target child
353           *
354           * Share write to target (child_file), to not interfere
355           * with guest writes to its disk which may be in target backing chain.
356           * Can't resize during a backup block job because we check the size
357           * only upfront.
358           */
359          *nshared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
360          *nperm = BLK_PERM_WRITE;
361      } else {
362          /* Source child */
363          bdrv_default_perms(bs, c, role, reopen_queue,
364                             perm, shared, nperm, nshared);
365  
366          if (!QLIST_EMPTY(&bs->parents)) {
367              if (perm & BLK_PERM_WRITE) {
368                  *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
369              }
370              *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
371          }
372      }
373  }
374  
375  static BlockdevOptions *cbw_parse_options(QDict *options, Error **errp)
376  {
377      BlockdevOptions *opts = NULL;
378      Visitor *v = NULL;
379  
380      qdict_put_str(options, "driver", "copy-before-write");
381  
382      v = qobject_input_visitor_new_flat_confused(options, errp);
383      if (!v) {
384          goto out;
385      }
386  
387      visit_type_BlockdevOptions(v, NULL, &opts, errp);
388      if (!opts) {
389          goto out;
390      }
391  
392      /*
393       * Delete options which we are going to parse through BlockdevOptions
394       * object for original options.
395       */
396      qdict_extract_subqdict(options, NULL, "bitmap");
397      qdict_del(options, "on-cbw-error");
398      qdict_del(options, "cbw-timeout");
399  
400  out:
401      visit_free(v);
402      qdict_del(options, "driver");
403  
404      return opts;
405  }
406  
407  static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
408                      Error **errp)
409  {
410      BDRVCopyBeforeWriteState *s = bs->opaque;
411      BdrvDirtyBitmap *bitmap = NULL;
412      int64_t cluster_size;
413      g_autoptr(BlockdevOptions) full_opts = NULL;
414      BlockdevOptionsCbw *opts;
415      AioContext *ctx;
416      int ret;
417  
418      full_opts = cbw_parse_options(options, errp);
419      if (!full_opts) {
420          return -EINVAL;
421      }
422      assert(full_opts->driver == BLOCKDEV_DRIVER_COPY_BEFORE_WRITE);
423      opts = &full_opts->u.copy_before_write;
424  
425      ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
426      if (ret < 0) {
427          return ret;
428      }
429  
430      s->target = bdrv_open_child(NULL, options, "target", bs, &child_of_bds,
431                                  BDRV_CHILD_DATA, false, errp);
432      if (!s->target) {
433          return -EINVAL;
434      }
435  
436      ctx = bdrv_get_aio_context(bs);
437      aio_context_acquire(ctx);
438  
439      if (opts->bitmap) {
440          bitmap = block_dirty_bitmap_lookup(opts->bitmap->node,
441                                             opts->bitmap->name, NULL, errp);
442          if (!bitmap) {
443              ret = -EINVAL;
444              goto out;
445          }
446      }
447      s->on_cbw_error = opts->has_on_cbw_error ? opts->on_cbw_error :
448              ON_CBW_ERROR_BREAK_GUEST_WRITE;
449      s->cbw_timeout_ns = opts->has_cbw_timeout ?
450          opts->cbw_timeout * NANOSECONDS_PER_SECOND : 0;
451  
452      bs->total_sectors = bs->file->bs->total_sectors;
453      bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
454              (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
455      bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
456              ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
457               bs->file->bs->supported_zero_flags);
458  
459      s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
460      if (!s->bcs) {
461          error_prepend(errp, "Cannot create block-copy-state: ");
462          ret = -EINVAL;
463          goto out;
464      }
465  
466      cluster_size = block_copy_cluster_size(s->bcs);
467  
468      s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
469      if (!s->done_bitmap) {
470          ret = -EINVAL;
471          goto out;
472      }
473      bdrv_disable_dirty_bitmap(s->done_bitmap);
474  
475      /* s->access_bitmap starts equal to bcs bitmap */
476      s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
477      if (!s->access_bitmap) {
478          ret = -EINVAL;
479          goto out;
480      }
481      bdrv_disable_dirty_bitmap(s->access_bitmap);
482      bdrv_dirty_bitmap_merge_internal(s->access_bitmap,
483                                       block_copy_dirty_bitmap(s->bcs), NULL,
484                                       true);
485  
486      qemu_co_mutex_init(&s->lock);
487      QLIST_INIT(&s->frozen_read_reqs);
488  
489      ret = 0;
490  out:
491      aio_context_release(ctx);
492      return ret;
493  }
494  
495  static void cbw_close(BlockDriverState *bs)
496  {
497      BDRVCopyBeforeWriteState *s = bs->opaque;
498  
499      bdrv_release_dirty_bitmap(s->access_bitmap);
500      bdrv_release_dirty_bitmap(s->done_bitmap);
501  
502      block_copy_state_free(s->bcs);
503      s->bcs = NULL;
504  }
505  
506  static BlockDriver bdrv_cbw_filter = {
507      .format_name = "copy-before-write",
508      .instance_size = sizeof(BDRVCopyBeforeWriteState),
509  
510      .bdrv_open                  = cbw_open,
511      .bdrv_close                 = cbw_close,
512  
513      .bdrv_co_preadv             = cbw_co_preadv,
514      .bdrv_co_pwritev            = cbw_co_pwritev,
515      .bdrv_co_pwrite_zeroes      = cbw_co_pwrite_zeroes,
516      .bdrv_co_pdiscard           = cbw_co_pdiscard,
517      .bdrv_co_flush              = cbw_co_flush,
518  
519      .bdrv_co_preadv_snapshot       = cbw_co_preadv_snapshot,
520      .bdrv_co_pdiscard_snapshot     = cbw_co_pdiscard_snapshot,
521      .bdrv_co_snapshot_block_status = cbw_co_snapshot_block_status,
522  
523      .bdrv_refresh_filename      = cbw_refresh_filename,
524  
525      .bdrv_child_perm            = cbw_child_perm,
526  
527      .is_filter = true,
528  };
529  
530  BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
531                                    BlockDriverState *target,
532                                    const char *filter_node_name,
533                                    BlockCopyState **bcs,
534                                    Error **errp)
535  {
536      BDRVCopyBeforeWriteState *state;
537      BlockDriverState *top;
538      QDict *opts;
539  
540      assert(source->total_sectors == target->total_sectors);
541      GLOBAL_STATE_CODE();
542  
543      opts = qdict_new();
544      qdict_put_str(opts, "driver", "copy-before-write");
545      if (filter_node_name) {
546          qdict_put_str(opts, "node-name", filter_node_name);
547      }
548      qdict_put_str(opts, "file", bdrv_get_node_name(source));
549      qdict_put_str(opts, "target", bdrv_get_node_name(target));
550  
551      top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
552      if (!top) {
553          return NULL;
554      }
555  
556      state = top->opaque;
557      *bcs = state->bcs;
558  
559      return top;
560  }
561  
562  void bdrv_cbw_drop(BlockDriverState *bs)
563  {
564      GLOBAL_STATE_CODE();
565      bdrv_drop_filter(bs, &error_abort);
566      bdrv_unref(bs);
567  }
568  
569  static void cbw_init(void)
570  {
571      bdrv_register(&bdrv_cbw_filter);
572  }
573  
574  block_init(cbw_init);
575