xref: /qemu/block/commit.c (revision 9673d7157ca5613af68d73b022152ef0fa878aff)
1  /*
2   * Live block commit
3   *
4   * Copyright Red Hat, Inc. 2012
5   *
6   * Authors:
7   *  Jeff Cody   <jcody@redhat.com>
8   *  Based on stream.c by Stefan Hajnoczi
9   *
10   * This work is licensed under the terms of the GNU LGPL, version 2 or later.
11   * See the COPYING.LIB file in the top-level directory.
12   *
13   */
14  
15  #include "qemu/osdep.h"
16  #include "qemu/cutils.h"
17  #include "trace.h"
18  #include "block/block_int.h"
19  #include "block/blockjob_int.h"
20  #include "qapi/error.h"
21  #include "qemu/ratelimit.h"
22  #include "qemu/memalign.h"
23  #include "sysemu/block-backend.h"
24  
enum {
    /*
     * Size of data buffer for populating the image file.  This should be large
     * enough to process multiple clusters in a single call, so that populating
     * contiguous regions of the image is efficient.
     */
    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
};
33  
typedef struct CommitBlockJob {
    BlockJob common;
    BlockDriverState *commit_top_bs;    /* filter node inserted above top */
    BlockBackend *top;                  /* BB on top; data is read from here */
    BlockBackend *base;                 /* BB on base; data is written here */
    BlockDriverState *base_bs;          /* the base node itself */
    BlockDriverState *base_overlay;     /* first non-filter overlay of base */
    BlockdevOnError on_error;           /* error policy for I/O failures */
    bool base_read_only;                /* base was r/o before the job started */
    bool chain_frozen;                  /* backing chain is currently frozen */
    char *backing_file_str;             /* new backing file string for top's
                                         * overlay, passed through to
                                         * bdrv_drop_intermediate() */
} CommitBlockJob;
46  
/*
 * Job .prepare callback, run in the main loop on successful completion:
 * unfreeze the chain, drop the job's write reference to base and collapse
 * the intermediate nodes into base.
 */
static int commit_prepare(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);

    bdrv_graph_rdlock_main_loop();
    bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
    s->chain_frozen = false;
    bdrv_graph_rdunlock_main_loop();

    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
     * the normal backing chain can be restored. */
    blk_unref(s->base);
    s->base = NULL;

    /* FIXME: bdrv_drop_intermediate treats total failures and partial failures
     * identically. Further work is needed to disambiguate these cases. */
    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
                                  s->backing_file_str);
}
66  
/*
 * Job .abort callback, run in the main loop when the job failed or was
 * cancelled: unfreeze the chain if still frozen, drop the job's references
 * and remove the commit_top filter node from the graph again.
 */
static void commit_abort(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    BlockDriverState *top_bs = blk_bs(s->top);
    BlockDriverState *commit_top_backing_bs;

    if (s->chain_frozen) {
        bdrv_graph_rdlock_main_loop();
        bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
        bdrv_graph_rdunlock_main_loop();
    }

    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
    bdrv_ref(top_bs);
    bdrv_ref(s->commit_top_bs);

    /* Drop the write reference to base taken in commit_start() */
    if (s->base) {
        blk_unref(s->base);
    }

    /* free the blockers on the intermediate nodes so that bdrv_replace_nodes
     * can succeed */
    block_job_remove_all_bdrv(&s->common);

    /* If bdrv_drop_intermediate() failed (or was not invoked), remove the
     * commit filter driver from the backing chain now. Do this as the final
     * step so that the 'consistent read' permission can be granted.
     *
     * XXX Can (or should) we somehow keep 'consistent read' blocked even
     * after the failed/cancelled commit job is gone? If we already wrote
     * something to base, the intermediate images aren't valid any more. */
    bdrv_graph_rdlock_main_loop();
    commit_top_backing_bs = s->commit_top_bs->backing->bs;
    bdrv_graph_rdunlock_main_loop();

    bdrv_drained_begin(commit_top_backing_bs);
    bdrv_graph_wrlock();
    bdrv_replace_node(s->commit_top_bs, commit_top_backing_bs, &error_abort);
    bdrv_graph_wrunlock();
    bdrv_drained_end(commit_top_backing_bs);

    bdrv_unref(s->commit_top_bs);
    bdrv_unref(top_bs);
}
111  
112  static void commit_clean(Job *job)
113  {
114      CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
115  
116      /* restore base open flags here if appropriate (e.g., change the base back
117       * to r/o). These reopens do not need to be atomic, since we won't abort
118       * even on failure here */
119      if (s->base_read_only) {
120          bdrv_reopen_set_read_only(s->base_bs, true, NULL);
121      }
122  
123      g_free(s->backing_file_str);
124      blk_unref(s->top);
125  }
126  
/*
 * Main job coroutine: copy every region of top that is allocated above
 * base_overlay down into base, honouring the rate limit and the job's
 * on-error policy.  Returns 0 on success (or cancellation) and a negative
 * errno on unrecoverable error.
 */
static int coroutine_fn commit_run(Job *job, Error **errp)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    int64_t offset;
    int ret = 0;
    int64_t n = 0; /* bytes */
    QEMU_AUTO_VFREE void *buf = NULL;
    int64_t len, base_len;

    len = blk_co_getlength(s->top);
    if (len < 0) {
        return len;
    }
    job_progress_set_remaining(&s->common.job, len);

    base_len = blk_co_getlength(s->base);
    if (base_len < 0) {
        return base_len;
    }

    /* Grow base if it is smaller than top, so all of top fits */
    if (base_len < len) {
        ret = blk_co_truncate(s->base, len, false, PREALLOC_MODE_OFF, 0, NULL);
        if (ret) {
            return ret;
        }
    }

    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (offset = 0; offset < len; offset += n) {
        bool copy;
        bool error_in_source = true;

        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
        block_job_ratelimit_sleep(&s->common);
        if (job_is_cancelled(&s->common.job)) {
            break;
        }
        /* Copy if allocated above the base */
        ret = blk_co_is_allocated_above(s->top, s->base_overlay, true,
                                        offset, COMMIT_BUFFER_SIZE, &n);
        copy = (ret > 0);
        trace_commit_one_iteration(s, offset, n, ret);
        if (copy) {
            assert(n < SIZE_MAX);

            ret = blk_co_pread(s->top, offset, n, buf, 0);
            if (ret >= 0) {
                ret = blk_co_pwrite(s->base, offset, n, buf, 0);
                if (ret < 0) {
                    /* The write failed, so the error is in the target */
                    error_in_source = false;
                }
            }
        }
        if (ret < 0) {
            BlockErrorAction action =
                block_job_error_action(&s->common, s->on_error,
                                       error_in_source, -ret);
            if (action == BLOCK_ERROR_ACTION_REPORT) {
                return ret;
            } else {
                /* Retry this chunk: n = 0 keeps offset unchanged */
                n = 0;
                continue;
            }
        }
        /* Publish progress */
        job_progress_update(&s->common.job, n);

        if (copy) {
            block_job_ratelimit_processed_bytes(&s->common, n);
        }
    }

    return 0;
}
204  
/* Job driver: callbacks run in the order run -> prepare/abort -> clean */
static const BlockJobDriver commit_job_driver = {
    .job_driver = {
        .instance_size = sizeof(CommitBlockJob),
        .job_type      = JOB_TYPE_COMMIT,
        .free          = block_job_free,
        .user_resume   = block_job_user_resume,
        .run           = commit_run,
        .prepare       = commit_prepare,
        .abort         = commit_abort,
        .clean         = commit_clean
    },
};
217  
/* Pass reads straight through to the filtered (backing) child */
static int coroutine_fn GRAPH_RDLOCK
bdrv_commit_top_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                       QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
}
224  
/* The filter is transparent: report the backing file's filename as ours */
static GRAPH_RDLOCK void bdrv_commit_top_refresh_filename(BlockDriverState *bs)
{
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            bs->backing->bs->filename);
}
230  
/*
 * The commit_top filter requires no permissions on its child and shares
 * everything, so it never conflicts with the job writing to the chain
 * below it.
 */
static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       BdrvChildRole role,
                                       BlockReopenQueue *reopen_queue,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    *nperm = 0;
    *nshared = BLK_PERM_ALL;
}
240  
/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_commit_top = {
    .format_name                = "commit_top",
    .bdrv_co_preadv             = bdrv_commit_top_preadv,
    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
    .bdrv_child_perm            = bdrv_commit_top_child_perm,

    .is_filter                  = true,
    .filtered_child_is_backing  = true,
};
252  
/*
 * Create and start a live commit job that copies the data of all nodes
 * between @top and @base down into @base and then drops the intermediate
 * nodes from the backing chain of @bs.
 *
 * On error, *errp is set and no job is created; any partial setup
 * (r/w reopen of base, inserted commit_top filter) is rolled back.
 */
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top,
                  int creation_flags, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
                  const char *filter_node_name, Error **errp)
{
    CommitBlockJob *s;
    BlockDriverState *iter;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriverState *filtered_base;
    int64_t base_size, top_size;
    uint64_t base_perms, iter_shared_perms;
    int ret;

    GLOBAL_STATE_CODE();

    assert(top != bs);
    bdrv_graph_rdlock_main_loop();
    if (bdrv_skip_filters(top) == bdrv_skip_filters(base)) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
        bdrv_graph_rdunlock_main_loop();
        return;
    }
    bdrv_graph_rdunlock_main_loop();

    base_size = bdrv_getlength(base);
    if (base_size < 0) {
        error_setg_errno(errp, -base_size, "Could not inquire base image size");
        return;
    }

    top_size = bdrv_getlength(top);
    if (top_size < 0) {
        error_setg_errno(errp, -top_size, "Could not inquire top image size");
        return;
    }

    /* Only request RESIZE on base when the job will actually have to grow it */
    base_perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
    if (base_size < top_size) {
        base_perms |= BLK_PERM_RESIZE;
    }

    s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
                         speed, creation_flags, NULL, NULL, errp);
    if (!s) {
        return;
    }

    /* convert base to r/w, if necessary */
    s->base_read_only = bdrv_is_read_only(base);
    if (s->base_read_only) {
        if (bdrv_reopen_set_read_only(base, false, errp) != 0) {
            goto fail;
        }
    }

    /* Insert commit_top block node above top, so we can block consistent read
     * on the backing chain below it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
                                         errp);
    if (commit_top_bs == NULL) {
        goto fail;
    }
    if (!filter_node_name) {
        commit_top_bs->implicit = true;
    }

    /* So that we can always drop this node */
    commit_top_bs->never_freeze = true;

    commit_top_bs->total_sectors = top->total_sectors;

    ret = bdrv_append(commit_top_bs, top, errp);
    bdrv_unref(commit_top_bs); /* referenced by new parents or failed */
    if (ret < 0) {
        commit_top_bs = NULL;
        goto fail;
    }

    s->commit_top_bs = commit_top_bs;

    /*
     * Block all nodes between top and base, because they will
     * disappear from the chain after this operation.
     * Note that this assumes that the user is fine with removing all
     * nodes (including R/W filters) between top and base.  Assuring
     * this is the responsibility of the interface (i.e. whoever calls
     * commit_start()).
     */
    bdrv_graph_wrlock();
    s->base_overlay = bdrv_find_overlay(top, base);
    assert(s->base_overlay);

    /*
     * The topmost node with
     * bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base)
     */
    filtered_base = bdrv_cow_bs(s->base_overlay);
    assert(bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base));

    /*
     * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
     * at s->base (if writes are blocked for a node, they are also blocked
     * for its backing file). The other options would be a second filter
     * driver above s->base.
     */
    iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE;

    for (iter = top; iter != base; iter = bdrv_filter_or_cow_bs(iter)) {
        if (iter == filtered_base) {
            /*
             * From here on, all nodes are filters on the base.  This
             * allows us to share BLK_PERM_CONSISTENT_READ.
             */
            iter_shared_perms |= BLK_PERM_CONSISTENT_READ;
        }

        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 iter_shared_perms, errp);
        if (ret < 0) {
            bdrv_graph_wrunlock();
            goto fail;
        }
    }

    if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) {
        bdrv_graph_wrunlock();
        goto fail;
    }
    s->chain_frozen = true;

    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
    bdrv_graph_wrunlock();

    if (ret < 0) {
        goto fail;
    }

    s->base = blk_new(s->common.job.aio_context,
                      base_perms,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_WRITE_UNCHANGED);
    ret = blk_insert_bs(s->base, base, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->base, true);
    s->base_bs = base;

    /* Required permissions are already taken with block_job_add_bdrv() */
    s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
    ret = blk_insert_bs(s->top, top, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->top, true);

    s->backing_file_str = g_strdup(backing_file_str);
    s->on_error = on_error;

    trace_commit_start(bs, base, top, s);
    job_start(&s->common.job);
    return;

fail:
    if (s->chain_frozen) {
        bdrv_graph_rdlock_main_loop();
        bdrv_unfreeze_backing_chain(commit_top_bs, base);
        bdrv_graph_rdunlock_main_loop();
    }
    if (s->base) {
        blk_unref(s->base);
    }
    if (s->top) {
        blk_unref(s->top);
    }
    if (s->base_read_only) {
        bdrv_reopen_set_read_only(base, true, NULL);
    }
    job_early_fail(&s->common.job);
    /* commit_top_bs has to be replaced after deleting the block job,
     * otherwise this would fail because of lack of permissions. */
    if (commit_top_bs) {
        bdrv_drained_begin(top);
        bdrv_graph_wrlock();
        bdrv_replace_node(commit_top_bs, top, &error_abort);
        bdrv_graph_wrunlock();
        bdrv_drained_end(top);
    }
}
443  
444  
445  #define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)
446  
447  /* commit COW file into the raw image */
448  int bdrv_commit(BlockDriverState *bs)
449  {
450      BlockBackend *src, *backing;
451      BlockDriverState *backing_file_bs = NULL;
452      BlockDriverState *commit_top_bs = NULL;
453      BlockDriver *drv = bs->drv;
454      AioContext *ctx;
455      int64_t offset, length, backing_length;
456      int ro;
457      int64_t n;
458      int ret = 0;
459      QEMU_AUTO_VFREE uint8_t *buf = NULL;
460      Error *local_err = NULL;
461  
462      GLOBAL_STATE_CODE();
463      GRAPH_RDLOCK_GUARD_MAINLOOP();
464  
465      if (!drv)
466          return -ENOMEDIUM;
467  
468      backing_file_bs = bdrv_cow_bs(bs);
469  
470      if (!backing_file_bs) {
471          return -ENOTSUP;
472      }
473  
474      if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
475          bdrv_op_is_blocked(backing_file_bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL))
476      {
477          return -EBUSY;
478      }
479  
480      ro = bdrv_is_read_only(backing_file_bs);
481  
482      if (ro) {
483          if (bdrv_reopen_set_read_only(backing_file_bs, false, NULL)) {
484              return -EACCES;
485          }
486      }
487  
488      ctx = bdrv_get_aio_context(bs);
489      /* WRITE_UNCHANGED is required for bdrv_make_empty() */
490      src = blk_new(ctx, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED,
491                    BLK_PERM_ALL);
492      backing = blk_new(ctx, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
493  
494      ret = blk_insert_bs(src, bs, &local_err);
495      if (ret < 0) {
496          error_report_err(local_err);
497          goto ro_cleanup;
498      }
499  
500      /* Insert commit_top block node above backing, so we can write to it */
501      commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
502                                           &local_err);
503      if (commit_top_bs == NULL) {
504          error_report_err(local_err);
505          goto ro_cleanup;
506      }
507  
508      bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
509      bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);
510  
511      ret = blk_insert_bs(backing, backing_file_bs, &local_err);
512      if (ret < 0) {
513          error_report_err(local_err);
514          goto ro_cleanup;
515      }
516  
517      length = blk_getlength(src);
518      if (length < 0) {
519          ret = length;
520          goto ro_cleanup;
521      }
522  
523      backing_length = blk_getlength(backing);
524      if (backing_length < 0) {
525          ret = backing_length;
526          goto ro_cleanup;
527      }
528  
529      /* If our top snapshot is larger than the backing file image,
530       * grow the backing file image if possible.  If not possible,
531       * we must return an error */
532      if (length > backing_length) {
533          ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, 0,
534                             &local_err);
535          if (ret < 0) {
536              error_report_err(local_err);
537              goto ro_cleanup;
538          }
539      }
540  
541      /* blk_try_blockalign() for src will choose an alignment that works for
542       * backing as well, so no need to compare the alignment manually. */
543      buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
544      if (buf == NULL) {
545          ret = -ENOMEM;
546          goto ro_cleanup;
547      }
548  
549      for (offset = 0; offset < length; offset += n) {
550          ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
551          if (ret < 0) {
552              goto ro_cleanup;
553          }
554          if (ret) {
555              ret = blk_pread(src, offset, n, buf, 0);
556              if (ret < 0) {
557                  goto ro_cleanup;
558              }
559  
560              ret = blk_pwrite(backing, offset, n, buf, 0);
561              if (ret < 0) {
562                  goto ro_cleanup;
563              }
564          }
565      }
566  
567      ret = blk_make_empty(src, NULL);
568      /* Ignore -ENOTSUP */
569      if (ret < 0 && ret != -ENOTSUP) {
570          goto ro_cleanup;
571      }
572  
573      blk_flush(src);
574  
575      /*
576       * Make sure all data we wrote to the backing device is actually
577       * stable on disk.
578       */
579      blk_flush(backing);
580  
581      ret = 0;
582  ro_cleanup:
583      blk_unref(backing);
584      if (bdrv_cow_bs(bs) != backing_file_bs) {
585          bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
586      }
587      bdrv_unref(commit_top_bs);
588      blk_unref(src);
589  
590      if (ro) {
591          /* ignoring error return here */
592          bdrv_reopen_set_read_only(backing_file_bs, true, NULL);
593      }
594  
595      return ret;
596  }
597