/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "block/block-io.h"
#include "qapi/error.h"
#include "qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"
#include "qemu/cutils.h"
#include "qemu/memalign.h"
#include "trace.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max);

G_GNUC_WARN_UNUSED_RESULT
static int update_refcount(BlockDriverState *bs,
                           int64_t offset, int64_t length, uint64_t addend,
                           bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
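
/*
 * Illustrative note (editorial, not upstream documentation): an image's
 * refcount_order selects the entry width (refcount_bits == 1 << refcount_order),
 * and the two arrays above map that order to the matching accessor pair.
 * After validating the order, qcow2_refcount_init() below dispatches like
 * this:
 *
 *     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
 *     s->get_refcount = get_refcount_funcs[s->refcount_order];
 *     s->set_refcount = set_refcount_funcs[s->refcount_order];
 *
 * For the default refcount_order of 4 this yields the 16-bit big-endian
 * accessors get_refcount_ro4()/set_refcount_ro4().
 */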


/*********************************************************/
/* refcount handling */

static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
    refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_CO_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
                            refcount_table_size2, s->refcount_table, 0);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}
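
/*
 * Worked example for the sub-byte accessors above (illustrative only):
 * with refcount_order = 1 (two bits per entry), entry 5 lives in byte
 * 5 / 4 = 1 at bit offset 2 * (5 % 4) = 2. Setting that entry to 3 in a
 * zeroed array therefore leaves array[1] == 0x0c:
 *
 *     uint8_t array[2] = { 0 };
 *     set_refcount_ro1(array, 5, 3);
 *     assert(get_refcount_ro1(array, 5) == 3);
 *     assert(array[1] == 0x0c);
 */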

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int GRAPH_RDLOCK
load_refcount_block(BlockDriverState *bs, int64_t refcount_block_offset,
                    void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}
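
/*
 * Worked example (illustrative, assuming the common 64 kB clusters with
 * 16-bit refcounts): refcount_block_bits = cluster_bits - (refcount_order
 * - 3) = 16 - 1 = 15, so one refcount block covers 2^15 clusters, i.e.
 * 2^(16 + 15) = 2 GiB of host offsets. Two offsets are then in the same
 * refcount block iff they agree from bit 31 upwards:
 *
 *     in_same_refcount_block(s, 0x00000000, 0x7fffffff)  -> 1
 *     in_same_refcount_block(s, 0x7fffffff, 0x80000000)  -> 0
 */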

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int GRAPH_RDLOCK
alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index,
                     void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* The offset must fit in the offset field of the refcount table entry */
    assert((new_block & REFT_OFFSET_MASK) == new_block);

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
                               refcount_table_index * REFTABLE_ENTRY_SIZE,
            sizeof(data64), &data64, 0);
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
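
/*
 * Note on the -EAGAIN contract above (illustrative): allocating a refcount
 * block may itself consume clusters that the caller had just picked for its
 * own data, so allocation paths built on update_refcount() retry until no
 * new refcount metadata had to be created, as qcow2_alloc_clusters() does
 * further below:
 *
 *     do {
 *         offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
 *         if (offset < 0) {
 *             return offset;
 *         }
 *         ret = update_refcount(bs, offset, size, 1, false,
 *                               QCOW2_DISCARD_NEVER);
 *     } while (ret == -EAGAIN);
 */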

/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50% more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
    table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset,
                           table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           sizeof(data), &data, 0);
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset,
                        old_table_size * REFTABLE_ENTRY_SIZE,
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
            if (r2 < 0) {
                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
                                                           r2);
            }
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
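
/*
 * Worked example (illustrative): with 64 kB clusters, queueing discards for
 * [0x10000, +0x10000) and then [0x20000, +0x10000) extends the first region
 * in place (new_end - new_start == length + d->bytes holds, so no overlap),
 * after which the loop below the "found" label merges any regions that only
 * now became adjacent:
 *
 *     update_refcount_discard(bs, 0x10000, 0x10000);
 *     update_refcount_discard(bs, 0x20000, 0x10000);
 *     // s->discards now holds one region: offset 0x10000, bytes 0x20000
 */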

/* XXX: cache several refcount block clusters? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int GRAPH_RDLOCK
update_refcount(BlockDriverState *bs, int64_t offset, int64_t length,
                uint64_t addend, bool decrease, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            /* If the caller needs to restart the search for free clusters,
             * try the same ones first to see if they're still free. */
            if (ret == -EAGAIN) {
                if (s->free_cluster_index > (start >> s->cluster_bits)) {
                    s->free_cluster_index = (start >> s->cluster_bits);
                }
            }
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0) {
            void *table;

            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                old_table_index = -1;
                qcow2_cache_discard(s->refcount_block_cache, table);
            }

            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
            if (table != NULL) {
                qcow2_cache_discard(s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
                update_refcount_discard(bs, cluster_offset, s->cluster_size);
            }
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases, like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t GRAPH_RDLOCK
alloc_clusters_noref(BlockDriverState *bs, uint64_t size, uint64_t max)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in the requested max */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (max >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}
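
/*
 * Worked example for the retry loop above (illustrative): suppose
 * s->free_cluster_index is 10, nb_clusters is 2, and cluster 11 is already
 * in use. The first pass consumes indices 10 and 11, sees a nonzero
 * refcount for 11 and restarts; the second pass finds clusters 12 and 13
 * free, so the function returns 12 << cluster_bits. The scan thus always
 * yields nb_clusters physically contiguous free clusters.
 */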

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                             int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
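
/*
 * Usage note (illustrative, with hypothetical offsets): consecutive
 * compressed allocations share clusters. With 64 kB clusters,
 * qcow2_alloc_bytes(bs, 0x3000) followed by qcow2_alloc_bytes(bs, 0x4000)
 * can return offsets 0x70000 and 0x73000 in the same cluster; each call
 * increments that cluster's refcount, and s->free_byte_offset remembers
 * the first unused byte of the partially filled cluster for the next
 * allocation.
 */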
1143  
1144  void qcow2_free_clusters(BlockDriverState *bs,
1145                            int64_t offset, int64_t size,
1146                            enum qcow2_discard_type type)
1147  {
1148      int ret;
1149  
1150      BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1151      ret = update_refcount(bs, offset, size, 1, true, type);
1152      if (ret < 0) {
1153          fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1154          /* TODO Remember the clusters to free them later and avoid leaking */
1155      }
1156  }
1157  
1158  /*
1159   * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1160   * normal cluster, compressed cluster, etc.)
1161   */
1162  void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
1163                              enum qcow2_discard_type type)
1164  {
1165      BDRVQcow2State *s = bs->opaque;
1166      QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);
1167  
1168      if (has_data_file(bs)) {
1169          if (s->discard_passthrough[type] &&
1170              (ctype == QCOW2_CLUSTER_NORMAL ||
1171               ctype == QCOW2_CLUSTER_ZERO_ALLOC))
1172          {
1173              bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
1174                            s->cluster_size);
1175          }
1176          return;
1177      }
1178  
1179      switch (ctype) {
1180      case QCOW2_CLUSTER_COMPRESSED:
1181          {
1182              uint64_t coffset;
1183              int csize;
1184  
1185              qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1186              qcow2_free_clusters(bs, coffset, csize, type);
1187          }
1188          break;
1189      case QCOW2_CLUSTER_NORMAL:
1190      case QCOW2_CLUSTER_ZERO_ALLOC:
1191          if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1192              qcow2_signal_corruption(bs, false, -1, -1,
1193                                      "Cannot free unaligned cluster %#llx",
1194                                      l2_entry & L2E_OFFSET_MASK);
1195          } else {
1196              qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1197                                  s->cluster_size, type);
1198          }
1199          break;
1200      case QCOW2_CLUSTER_ZERO_PLAIN:
1201      case QCOW2_CLUSTER_UNALLOCATED:
1202          break;
1203      default:
1204          abort();
1205      }
1206  }
1207  
1208  int qcow2_write_caches(BlockDriverState *bs)
1209  {
1210      BDRVQcow2State *s = bs->opaque;
1211      int ret;
1212  
1213      ret = qcow2_cache_write(bs, s->l2_table_cache);
1214      if (ret < 0) {
1215          return ret;
1216      }
1217  
1218      if (qcow2_need_accurate_refcounts(s)) {
1219          ret = qcow2_cache_write(bs, s->refcount_block_cache);
1220          if (ret < 0) {
1221              return ret;
1222          }
1223      }
1224  
1225      return 0;
1226  }
1227  
1228  int qcow2_flush_caches(BlockDriverState *bs)
1229  {
1230      int ret = qcow2_write_caches(bs);
1231      if (ret < 0) {
1232          return ret;
1233      }
1234  
1235      return bdrv_flush(bs->file->bs);
1236  }
1237  
1238  /*********************************************************/
1239  /* snapshots and image creation */
1240  
1241  
1242  
1243  /* update the refcounts of snapshots and the copied flag */
1244  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1245      int64_t l1_table_offset, int l1_size, int addend)
1246  {
1247      BDRVQcow2State *s = bs->opaque;
1248      uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
1249      bool l1_allocated = false;
1250      int64_t old_entry, old_l2_offset;
1251      unsigned slice, slice_size2, n_slices;
1252      int i, j, l1_modified = 0;
1253      int ret;
1254  
1255      assert(addend >= -1 && addend <= 1);
1256  
1257      l2_slice = NULL;
1258      l1_table = NULL;
1259      l1_size2 = l1_size * L1E_SIZE;
1260      slice_size2 = s->l2_slice_size * l2_entry_size(s);
1261      n_slices = s->cluster_size / slice_size2;
1262  
1263      s->cache_discards = true;
1264  
1265      /* WARNING: qcow2_snapshot_goto relies on this function not using the
1266       * l1_table_offset when it is the current s->l1_table_offset! Be careful
1267       * when changing this! */
1268      if (l1_table_offset != s->l1_table_offset) {
1269          l1_table = g_try_malloc0(l1_size2);
1270          if (l1_size2 && l1_table == NULL) {
1271              ret = -ENOMEM;
1272              goto fail;
1273          }
1274          l1_allocated = true;
1275  
1276          ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
1277          if (ret < 0) {
1278              goto fail;
1279          }
1280  
1281          for (i = 0; i < l1_size; i++) {
1282              be64_to_cpus(&l1_table[i]);
1283          }
1284      } else {
1285          assert(l1_size == s->l1_size);
1286          l1_table = s->l1_table;
1287          l1_allocated = false;
1288      }
1289  
1290      for (i = 0; i < l1_size; i++) {
1291          l2_offset = l1_table[i];
1292          if (l2_offset) {
1293              old_l2_offset = l2_offset;
1294              l2_offset &= L1E_OFFSET_MASK;
1295  
1296              if (offset_into_cluster(s, l2_offset)) {
1297                  qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1298                                          PRIx64 " unaligned (L1 index: %#x)",
1299                                          l2_offset, i);
1300                  ret = -EIO;
1301                  goto fail;
1302              }
1303  
1304              for (slice = 0; slice < n_slices; slice++) {
1305                  ret = qcow2_cache_get(bs, s->l2_table_cache,
1306                                        l2_offset + slice * slice_size2,
1307                                        (void **) &l2_slice);
1308                  if (ret < 0) {
1309                      goto fail;
1310                  }
1311  
1312                  for (j = 0; j < s->l2_slice_size; j++) {
1313                      uint64_t cluster_index;
1314                      uint64_t offset;
1315  
1316                      entry = get_l2_entry(s, l2_slice, j);
1317                      old_entry = entry;
1318                      entry &= ~QCOW_OFLAG_COPIED;
1319                      offset = entry & L2E_OFFSET_MASK;
1320  
1321                      switch (qcow2_get_cluster_type(bs, entry)) {
1322                      case QCOW2_CLUSTER_COMPRESSED:
1323                          if (addend != 0) {
1324                              uint64_t coffset;
1325                              int csize;
1326  
1327                              qcow2_parse_compressed_l2_entry(bs, entry,
1328                                                              &coffset, &csize);
1329                              ret = update_refcount(
1330                                  bs, coffset, csize,
1331                                  abs(addend), addend < 0,
1332                                  QCOW2_DISCARD_SNAPSHOT);
1333                              if (ret < 0) {
1334                                  goto fail;
1335                              }
1336                          }
1337                          /* compressed clusters are never modified */
1338                          refcount = 2;
1339                          break;
1340  
1341                      case QCOW2_CLUSTER_NORMAL:
1342                      case QCOW2_CLUSTER_ZERO_ALLOC:
1343                          if (offset_into_cluster(s, offset)) {
1344                              /* Here l2_index means table (not slice) index */
1345                              int l2_index = slice * s->l2_slice_size + j;
1346                              qcow2_signal_corruption(
1347                                  bs, true, -1, -1, "Cluster "
1348                                  "allocation offset %#" PRIx64
1349                                  " unaligned (L2 offset: %#"
1350                                  PRIx64 ", L2 index: %#x)",
1351                                  offset, l2_offset, l2_index);
1352                              ret = -EIO;
1353                              goto fail;
1354                          }
1355  
1356                          cluster_index = offset >> s->cluster_bits;
1357                          assert(cluster_index);
1358                          if (addend != 0) {
1359                              ret = qcow2_update_cluster_refcount(
1360                                  bs, cluster_index, abs(addend), addend < 0,
1361                                  QCOW2_DISCARD_SNAPSHOT);
1362                              if (ret < 0) {
1363                                  goto fail;
1364                              }
1365                          }
1366  
1367                          ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1368                          if (ret < 0) {
1369                              goto fail;
1370                          }
1371                          break;
1372  
1373                      case QCOW2_CLUSTER_ZERO_PLAIN:
1374                      case QCOW2_CLUSTER_UNALLOCATED:
1375                          refcount = 0;
1376                          break;
1377  
1378                      default:
1379                          abort();
1380                      }
1381  
1382                      if (refcount == 1) {
1383                          entry |= QCOW_OFLAG_COPIED;
1384                      }
1385                      if (entry != old_entry) {
1386                          if (addend > 0) {
1387                              qcow2_cache_set_dependency(bs, s->l2_table_cache,
1388                                                         s->refcount_block_cache);
1389                          }
1390                          set_l2_entry(s, l2_slice, j, entry);
1391                          qcow2_cache_entry_mark_dirty(s->l2_table_cache,
1392                                                       l2_slice);
1393                      }
1394                  }
1395  
1396                  qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1397              }
1398  
1399              if (addend != 0) {
1400                  ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1401                                                          s->cluster_bits,
1402                                                      abs(addend), addend < 0,
1403                                                      QCOW2_DISCARD_SNAPSHOT);
1404                  if (ret < 0) {
1405                      goto fail;
1406                  }
1407              }
1408              ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1409                                       &refcount);
1410              if (ret < 0) {
1411                  goto fail;
1412              } else if (refcount == 1) {
1413                  l2_offset |= QCOW_OFLAG_COPIED;
1414              }
1415              if (l2_offset != old_l2_offset) {
1416                  l1_table[i] = l2_offset;
1417                  l1_modified = 1;
1418              }
1419          }
1420      }
1421  
1422      ret = bdrv_flush(bs);
1423  fail:
1424      if (l2_slice) {
1425          qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1426      }
1427  
1428      s->cache_discards = false;
1429      qcow2_process_discards(bs, ret);
1430  
1431      /* Update L1 only if it isn't deleted anyway (addend = -1) */
1432      if (ret == 0 && addend >= 0 && l1_modified) {
1433          for (i = 0; i < l1_size; i++) {
1434              cpu_to_be64s(&l1_table[i]);
1435          }
1436  
1437          ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
1438                                 0);
1439  
1440          for (i = 0; i < l1_size; i++) {
1441              be64_to_cpus(&l1_table[i]);
1442          }
1443      }
    if (l1_allocated) {
        g_free(l1_table);
    }
1446      return ret;
1447  }
1448  
1449  
1450  
1451  
1452  /*********************************************************/
1453  /* refcount checking functions */
1454  
1455  
1456  static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1457  {
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with the minimum cluster size of
     * 512 = 2^9, and because offsets have to be representable in bytes);
     * since every cluster corresponds to exactly one refcount entry, we are
     * well below that limit */
1462      assert(entries < (UINT64_C(1) << (64 - 9)));
1463  
1464      /* Thanks to the assertion this will not overflow, because
1465       * s->refcount_order < 7.
1466       * (note: x << s->refcount_order == x * s->refcount_bits) */
1467      return DIV_ROUND_UP(entries << s->refcount_order, 8);
1468  }
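
/*
 * Worked example: with refcount_order = 4 (i.e. 16-bit refcounts) and
 * entries = 1000, this computes 1000 << 4 = 16000 bits, and
 * DIV_ROUND_UP(16000, 8) = 2000 bytes.
 */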
1469  
1470  /**
1471   * Reallocates *array so that it can hold new_size entries. *size must contain
1472   * the current number of entries in *array. If the reallocation fails, *array
1473   * and *size will not be modified and -errno will be returned. If the
1474   * reallocation is successful, *array will be set to the new buffer, *size
1475   * will be set to new_size and 0 will be returned. The size of the reallocated
1476   * refcount array buffer will be aligned to a cluster boundary, and the newly
1477   * allocated area will be zeroed.
1478   */
1479  static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1480                                    int64_t *size, int64_t new_size)
1481  {
1482      int64_t old_byte_size, new_byte_size;
1483      void *new_ptr;
1484  
1485      /* Round to clusters so the array can be directly written to disk */
1486      old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1487                      * s->cluster_size;
1488      new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1489                      * s->cluster_size;
1490  
1491      if (new_byte_size == old_byte_size) {
1492          *size = new_size;
1493          return 0;
1494      }
1495  
1496      assert(new_byte_size > 0);
1497  
1498      if (new_byte_size > SIZE_MAX) {
1499          return -ENOMEM;
1500      }
1501  
1502      new_ptr = g_try_realloc(*array, new_byte_size);
1503      if (!new_ptr) {
1504          return -ENOMEM;
1505      }
1506  
1507      if (new_byte_size > old_byte_size) {
1508          memset((char *)new_ptr + old_byte_size, 0,
1509                 new_byte_size - old_byte_size);
1510      }
1511  
1512      *array = new_ptr;
1513      *size  = new_size;
1514  
1515      return 0;
1516  }
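
/*
 * Usage sketch (with hypothetical local variables, mirroring how
 * calculate_refcounts() builds the IMRT from scratch):
 *
 *     void *refcount_table = NULL;
 *     int64_t size = 0;
 *
 *     ret = realloc_refcount_array(s, &refcount_table, &size, nb_clusters);
 *     if (ret < 0) {
 *         ... refcount_table and size are untouched (still NULL and 0) ...
 *     }
 */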
1517  
1518  /*
1519   * Increases the refcount for a range of clusters in a given refcount table.
1520   * This is used to construct a temporary refcount table out of L1 and L2 tables
1521   * which can be compared to the refcount table saved in the image.
1522   *
1523   * Modifies the number of errors in res.
1524   */
1525  int coroutine_fn GRAPH_RDLOCK
1526  qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1527                           void **refcount_table,
1528                           int64_t *refcount_table_size,
1529                           int64_t offset, int64_t size)
1530  {
1531      BDRVQcow2State *s = bs->opaque;
1532      uint64_t start, last, cluster_offset, k, refcount;
1533      int64_t file_len;
1534      int ret;
1535  
1536      if (size <= 0) {
1537          return 0;
1538      }
1539  
1540      file_len = bdrv_co_getlength(bs->file->bs);
1541      if (file_len < 0) {
1542          return file_len;
1543      }
1544  
    /*
     * The last cluster of a qcow2 image may be semi-allocated, so it may be
     * OK to reference some space past the end of the file, but it should be
     * less than one cluster.
     */
1550      if (offset + size - file_len >= s->cluster_size) {
1551          fprintf(stderr, "ERROR: counting reference for region exceeding the "
1552                  "end of the file by one cluster or more: offset 0x%" PRIx64
1553                  " size 0x%" PRIx64 "\n", offset, size);
1554          res->corruptions++;
1555          return 0;
1556      }
1557  
1558      start = start_of_cluster(s, offset);
1559      last = start_of_cluster(s, offset + size - 1);
    for (cluster_offset = start; cluster_offset <= last;
1561          cluster_offset += s->cluster_size) {
1562          k = cluster_offset >> s->cluster_bits;
1563          if (k >= *refcount_table_size) {
1564              ret = realloc_refcount_array(s, refcount_table,
1565                                           refcount_table_size, k + 1);
1566              if (ret < 0) {
1567                  res->check_errors++;
1568                  return ret;
1569              }
1570          }
1571  
1572          refcount = s->get_refcount(*refcount_table, k);
1573          if (refcount == s->refcount_max) {
1574              fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1575                      "\n", cluster_offset);
1576              fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1577                      "width or qemu-img convert to create a clean copy if the "
1578                      "image cannot be opened for writing\n");
1579              res->corruptions++;
1580              continue;
1581          }
1582          s->set_refcount(*refcount_table, k, refcount + 1);
1583      }
1584  
1585      return 0;
1586  }
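
/*
 * For example, calculate_refcounts() below accounts for the image header
 * cluster like this:
 *
 *     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
 *                                    0, s->cluster_size);
 */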
1587  
1588  /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1589  enum {
1590      CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1591  };
1592  
/*
 * Fix an L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or by making all its
 * present subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
 *
 * This function decrements res->corruptions on success, so the caller is
 * responsible for incrementing res->corruptions prior to the call.
 *
 * On failure, the in-memory @l2_table may be modified.
 */
1602  static int coroutine_fn GRAPH_RDLOCK
1603  fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
1604                       uint64_t l2_offset, uint64_t *l2_table,
1605                       int l2_index, bool active,
1606                       bool *metadata_overlap)
1607  {
1608      BDRVQcow2State *s = bs->opaque;
1609      int ret;
1610      int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
1611      uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
1612      int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
1613  
1614      if (has_subclusters(s)) {
1615          uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
1616  
1617          /* Allocated subclusters become zero */
1618          l2_bitmap |= l2_bitmap << 32;
1619          l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
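        /*
         * Worked example: if only subclusters 0 and 1 were allocated
         * (l2_bitmap == 0x0000000000000003), the shift-and-OR yields
         * 0x0000000300000003 and the mask leaves 0x0000000300000000,
         * i.e. both subclusters now read as zeroes and neither is
         * allocated anymore.
         */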
1620  
1621          set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
1622          set_l2_entry(s, l2_table, l2_index, 0);
1623      } else {
1624          set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
1625      }
1626  
1627      ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
1628                                          false);
1629      if (metadata_overlap) {
1630          *metadata_overlap = ret < 0;
1631      }
1632      if (ret < 0) {
1633          fprintf(stderr, "ERROR: Overlap check failed\n");
1634          goto fail;
1635      }
1636  
1637      ret = bdrv_co_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
1638                                &l2_table[idx], 0);
1639      if (ret < 0) {
1640          fprintf(stderr, "ERROR: Failed to overwrite L2 "
1641                  "table entry: %s\n", strerror(-ret));
1642          goto fail;
1643      }
1644  
1645      res->corruptions--;
1646      res->corruptions_fixed++;
1647      return 0;
1648  
1649  fail:
1650      res->check_errors++;
1651      return ret;
1652  }
1653  
1654  /*
 * Increases the refcount in the given refcount table for all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
1657   * entries.
1658   *
1659   * Returns the number of errors found by the checks or -errno if an internal
1660   * error occurred.
1661   */
1662  static int coroutine_fn GRAPH_RDLOCK
1663  check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1664                     void **refcount_table,
1665                     int64_t *refcount_table_size, int64_t l2_offset,
1666                     int flags, BdrvCheckMode fix, bool active)
1667  {
1668      BDRVQcow2State *s = bs->opaque;
1669      uint64_t l2_entry, l2_bitmap;
1670      uint64_t next_contiguous_offset = 0;
1671      int i, ret;
1672      size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
1673      g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
1674      bool metadata_overlap;
1675  
1676      /* Read L2 table from disk */
1677      ret = bdrv_co_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
1678      if (ret < 0) {
1679          fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1680          res->check_errors++;
1681          return ret;
1682      }
1683  
1684      /* Do the actual checks */
1685      for (i = 0; i < s->l2_size; i++) {
1686          uint64_t coffset;
1687          int csize;
1688          QCow2ClusterType type;
1689  
1690          l2_entry = get_l2_entry(s, l2_table, i);
1691          l2_bitmap = get_l2_bitmap(s, l2_table, i);
1692          type = qcow2_get_cluster_type(bs, l2_entry);
1693  
1694          if (type != QCOW2_CLUSTER_COMPRESSED) {
1695              /* Check reserved bits of Standard Cluster Descriptor */
1696              if (l2_entry & L2E_STD_RESERVED_MASK) {
                fprintf(stderr, "ERROR found L2 entry with reserved bits set: "
1698                          "%" PRIx64 "\n", l2_entry);
1699                  res->corruptions++;
1700              }
1701          }
1702  
1703          switch (type) {
1704          case QCOW2_CLUSTER_COMPRESSED:
1705              /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1706              if (l2_entry & QCOW_OFLAG_COPIED) {
1707                  fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1708                      "copied flag must never be set for compressed "
1709                      "clusters\n", l2_entry & s->cluster_offset_mask);
1710                  l2_entry &= ~QCOW_OFLAG_COPIED;
1711                  res->corruptions++;
1712              }
1713  
1714              if (has_data_file(bs)) {
1715                  fprintf(stderr, "ERROR compressed cluster %d with data file, "
1716                          "entry=0x%" PRIx64 "\n", i, l2_entry);
1717                  res->corruptions++;
1718                  break;
1719              }
1720  
1721              if (l2_bitmap) {
1722                  fprintf(stderr, "ERROR compressed cluster %d with non-zero "
1723                          "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
1724                          i, l2_entry);
1725                  res->corruptions++;
1726                  break;
1727              }
1728  
1729              /* Mark cluster as used */
1730              qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1731              ret = qcow2_inc_refcounts_imrt(
1732                  bs, res, refcount_table, refcount_table_size, coffset, csize);
1733              if (ret < 0) {
1734                  return ret;
1735              }
1736  
1737              if (flags & CHECK_FRAG_INFO) {
1738                  res->bfi.allocated_clusters++;
1739                  res->bfi.compressed_clusters++;
1740  
1741                  /*
                 * Compressed clusters are fragmented by nature. Since they
                 * take up sub-sector space but we only have sector-granularity
                 * I/O, we need to re-read the same sectors even for adjacent
                 * compressed clusters.
1746                   */
1747                  res->bfi.fragmented_clusters++;
1748              }
1749              break;
1750  
1751          case QCOW2_CLUSTER_ZERO_ALLOC:
1752          case QCOW2_CLUSTER_NORMAL:
1753          {
1754              uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1755  
1756              if ((l2_bitmap >> 32) & l2_bitmap) {
1757                  res->corruptions++;
1758                  fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
1759                          "cluster has corrupted subcluster allocation bitmap\n",
1760                          offset);
1761              }
1762  
1763              /* Correct offsets are cluster aligned */
1764              if (offset_into_cluster(s, offset)) {
1765                  bool contains_data;
1766                  res->corruptions++;
1767  
1768                  if (has_subclusters(s)) {
1769                      contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
1770                  } else {
1771                      contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
1772                  }
1773  
1774                  if (!contains_data) {
1775                      fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
1776                              "cluster is not properly aligned; L2 entry "
1777                              "corrupted.\n",
1778                              fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1779                              offset);
1780                      if (fix & BDRV_FIX_ERRORS) {
1781                          ret = fix_l2_entry_by_zero(bs, res, l2_offset,
1782                                                     l2_table, i, active,
1783                                                     &metadata_overlap);
1784                          if (metadata_overlap) {
1785                              /*
1786                               * Something is seriously wrong, so abort checking
1787                               * this L2 table.
1788                               */
1789                              return ret;
1790                          }
1791  
1792                          if (ret == 0) {
1793                              /*
1794                               * Skip marking the cluster as used
1795                               * (it is unused now).
1796                               */
1797                              continue;
1798                          }
1799  
1800                          /*
1801                           * Failed to fix.
1802                           * Do not abort, continue checking the rest of this
1803                           * L2 table's entries.
1804                           */
1805                      }
1806                  } else {
1807                      fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1808                          "not properly aligned; L2 entry corrupted.\n", offset);
1809                  }
1810              }
1811  
1812              if (flags & CHECK_FRAG_INFO) {
1813                  res->bfi.allocated_clusters++;
1814                  if (next_contiguous_offset &&
1815                      offset != next_contiguous_offset) {
1816                      res->bfi.fragmented_clusters++;
1817                  }
1818                  next_contiguous_offset = offset + s->cluster_size;
1819              }
1820  
1821              /* Mark cluster as used */
1822              if (!has_data_file(bs)) {
1823                  ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
1824                                                 refcount_table_size,
1825                                                 offset, s->cluster_size);
1826                  if (ret < 0) {
1827                      return ret;
1828                  }
1829              }
1830              break;
1831          }
1832  
1833          case QCOW2_CLUSTER_ZERO_PLAIN:
1834              /* Impossible when image has subclusters */
1835              assert(!l2_bitmap);
1836              break;
1837  
1838          case QCOW2_CLUSTER_UNALLOCATED:
1839              if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
1840                  res->corruptions++;
1841                  fprintf(stderr, "ERROR: Unallocated "
1842                          "cluster has non-zero subcluster allocation map\n");
1843              }
1844              break;
1845  
1846          default:
1847              abort();
1848          }
1849      }
1850  
1851      return 0;
1852  }
1853  
1854  /*
1855   * Increases the refcount for the L1 table, its L2 tables and all referenced
1856   * clusters in the given refcount table. While doing so, performs some checks
1857   * on L1 and L2 entries.
1858   *
1859   * Returns the number of errors found by the checks or -errno if an internal
1860   * error occurred.
1861   */
1862  static int coroutine_fn GRAPH_RDLOCK
1863  check_refcounts_l1(BlockDriverState *bs, BdrvCheckResult *res,
1864                     void **refcount_table, int64_t *refcount_table_size,
1865                     int64_t l1_table_offset, int l1_size,
1866                     int flags, BdrvCheckMode fix, bool active)
1867  {
1868      BDRVQcow2State *s = bs->opaque;
1869      size_t l1_size_bytes = l1_size * L1E_SIZE;
1870      g_autofree uint64_t *l1_table = NULL;
1871      uint64_t l2_offset;
1872      int i, ret;
1873  
1874      if (!l1_size) {
1875          return 0;
1876      }
1877  
1878      /* Mark L1 table as used */
1879      ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1880                                     l1_table_offset, l1_size_bytes);
1881      if (ret < 0) {
1882          return ret;
1883      }
1884  
1885      l1_table = g_try_malloc(l1_size_bytes);
1886      if (l1_table == NULL) {
1887          res->check_errors++;
1888          return -ENOMEM;
1889      }
1890  
1891      /* Read L1 table entries from disk */
1892      ret = bdrv_co_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
1893      if (ret < 0) {
1894          fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1895          res->check_errors++;
1896          return ret;
1897      }
1898  
1899      for (i = 0; i < l1_size; i++) {
1900          be64_to_cpus(&l1_table[i]);
1901      }
1902  
1903      /* Do the actual checks */
1904      for (i = 0; i < l1_size; i++) {
1905          if (!l1_table[i]) {
1906              continue;
1907          }
1908  
1909          if (l1_table[i] & L1E_RESERVED_MASK) {
1910              fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
1911                      "%" PRIx64 "\n", l1_table[i]);
1912              res->corruptions++;
1913          }
1914  
1915          l2_offset = l1_table[i] & L1E_OFFSET_MASK;
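        /*
         * For example, an L1 entry of 0x8000000000070000 has
         * QCOW_OFLAG_COPIED (bit 63) set and yields an L2 table offset
         * of 0x70000 after masking with L1E_OFFSET_MASK.
         */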
1916  
1917          /* Mark L2 table as used */
1918          ret = qcow2_inc_refcounts_imrt(bs, res,
1919                                         refcount_table, refcount_table_size,
1920                                         l2_offset, s->cluster_size);
1921          if (ret < 0) {
1922              return ret;
1923          }
1924  
1925          /* L2 tables are cluster aligned */
1926          if (offset_into_cluster(s, l2_offset)) {
1927              fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1928                  "cluster aligned; L1 entry corrupted\n", l2_offset);
1929              res->corruptions++;
1930          }
1931  
1932          /* Process and check L2 entries */
1933          ret = check_refcounts_l2(bs, res, refcount_table,
1934                                   refcount_table_size, l2_offset, flags,
1935                                   fix, active);
1936          if (ret < 0) {
1937              return ret;
1938          }
1939      }
1940  
1941      return 0;
1942  }
1943  
1944  /*
1945   * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1946   *
 * This function does not print an error message nor does it increment
 * check_errors if qcow2_get_refcount fails (because such an error will
 * already have been detected and sufficiently signaled by the calling
 * function (qcow2_check_refcounts) by the time this function is called).
1951   */
1952  static int coroutine_fn GRAPH_RDLOCK
1953  check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
1954  {
1955      BDRVQcow2State *s = bs->opaque;
1956      uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1957      int ret;
1958      uint64_t refcount;
1959      int i, j;
1960      bool repair;
1961  
1962      if (fix & BDRV_FIX_ERRORS) {
1963          /* Always repair */
1964          repair = true;
1965      } else if (fix & BDRV_FIX_LEAKS) {
1966          /* Repair only if that seems safe: This function is always
1967           * called after the refcounts have been fixed, so the refcount
1968           * is accurate if that repair was successful */
1969          repair = !res->check_errors && !res->corruptions && !res->leaks;
1970      } else {
1971          repair = false;
1972      }
1973  
1974      for (i = 0; i < s->l1_size; i++) {
1975          uint64_t l1_entry = s->l1_table[i];
1976          uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1977          int l2_dirty = 0;
1978  
1979          if (!l2_offset) {
1980              continue;
1981          }
1982  
1983          ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1984                                   &refcount);
1985          if (ret < 0) {
1986              /* don't print message nor increment check_errors */
1987              continue;
1988          }
1989          if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1990              res->corruptions++;
1991              fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1992                      "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1993                      repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1994              if (repair) {
1995                  s->l1_table[i] = refcount == 1
1996                                 ? l1_entry |  QCOW_OFLAG_COPIED
1997                                 : l1_entry & ~QCOW_OFLAG_COPIED;
1998                  ret = qcow2_write_l1_entry(bs, i);
1999                  if (ret < 0) {
2000                      res->check_errors++;
2001                      goto fail;
2002                  }
2003                  res->corruptions--;
2004                  res->corruptions_fixed++;
2005              }
2006          }
2007  
2008          ret = bdrv_co_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
2009                              l2_table, 0);
2010          if (ret < 0) {
2011              fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
2012                      strerror(-ret));
2013              res->check_errors++;
2014              goto fail;
2015          }
2016  
2017          for (j = 0; j < s->l2_size; j++) {
2018              uint64_t l2_entry = get_l2_entry(s, l2_table, j);
2019              uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
2020              QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
2021  
2022              if (cluster_type == QCOW2_CLUSTER_NORMAL ||
2023                  cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
2024                  if (has_data_file(bs)) {
2025                      refcount = 1;
2026                  } else {
2027                      ret = qcow2_get_refcount(bs,
2028                                               data_offset >> s->cluster_bits,
2029                                               &refcount);
2030                      if (ret < 0) {
2031                          /* don't print message nor increment check_errors */
2032                          continue;
2033                      }
2034                  }
2035                  if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
2036                      res->corruptions++;
2037                      fprintf(stderr, "%s OFLAG_COPIED data cluster: "
2038                              "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
2039                              repair ? "Repairing" : "ERROR", l2_entry, refcount);
2040                      if (repair) {
2041                          set_l2_entry(s, l2_table, j,
2042                                       refcount == 1 ?
2043                                       l2_entry |  QCOW_OFLAG_COPIED :
2044                                       l2_entry & ~QCOW_OFLAG_COPIED);
2045                          l2_dirty++;
2046                      }
2047                  }
2048              }
2049          }
2050  
2051          if (l2_dirty > 0) {
2052              ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
2053                                                  l2_offset, s->cluster_size,
2054                                                  false);
2055              if (ret < 0) {
2056                  fprintf(stderr, "ERROR: Could not write L2 table; metadata "
2057                          "overlap check failed: %s\n", strerror(-ret));
2058                  res->check_errors++;
2059                  goto fail;
2060              }
2061  
2062              ret = bdrv_co_pwrite(bs->file, l2_offset, s->cluster_size, l2_table, 0);
2063              if (ret < 0) {
2064                  fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
2065                          strerror(-ret));
2066                  res->check_errors++;
2067                  goto fail;
2068              }
2069              res->corruptions -= l2_dirty;
2070              res->corruptions_fixed += l2_dirty;
2071          }
2072      }
2073  
2074      ret = 0;
2075  
2076  fail:
2077      qemu_vfree(l2_table);
2078      return ret;
2079  }
2080  
2081  /*
2082   * Checks consistency of refblocks and accounts for each refblock in
2083   * *refcount_table.
2084   */
2085  static int coroutine_fn GRAPH_RDLOCK
2086  check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
2087                  BdrvCheckMode fix, bool *rebuild,
2088                  void **refcount_table, int64_t *nb_clusters)
2089  {
2090      BDRVQcow2State *s = bs->opaque;
2091      int64_t i, size;
2092      int ret;
2093  
    for (i = 0; i < s->refcount_table_size; i++) {
2095          uint64_t offset, cluster;
2096          offset = s->refcount_table[i] & REFT_OFFSET_MASK;
2097          cluster = offset >> s->cluster_bits;
2098  
2099          if (s->refcount_table[i] & REFT_RESERVED_MASK) {
2100              fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
2101                      "reserved bits set\n", i);
2102              res->corruptions++;
2103              *rebuild = true;
2104              continue;
2105          }
2106  
2107          /* Refcount blocks are cluster aligned */
2108          if (offset_into_cluster(s, offset)) {
2109              fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
2110                  "cluster aligned; refcount table entry corrupted\n", i);
2111              res->corruptions++;
2112              *rebuild = true;
2113              continue;
2114          }
2115  
2116          if (cluster >= *nb_clusters) {
2117              res->corruptions++;
2118              fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
2119                      fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
2120  
2121              if (fix & BDRV_FIX_ERRORS) {
2122                  int64_t new_nb_clusters;
2123                  Error *local_err = NULL;
2124  
2125                  if (offset > INT64_MAX - s->cluster_size) {
2126                      ret = -EINVAL;
2127                      goto resize_fail;
2128                  }
2129  
2130                  ret = bdrv_co_truncate(bs->file, offset + s->cluster_size, false,
2131                                         PREALLOC_MODE_OFF, 0, &local_err);
2132                  if (ret < 0) {
2133                      error_report_err(local_err);
2134                      goto resize_fail;
2135                  }
2136                  size = bdrv_co_getlength(bs->file->bs);
2137                  if (size < 0) {
2138                      ret = size;
2139                      goto resize_fail;
2140                  }
2141  
2142                  new_nb_clusters = size_to_clusters(s, size);
2143                  assert(new_nb_clusters >= *nb_clusters);
2144  
2145                  ret = realloc_refcount_array(s, refcount_table,
2146                                               nb_clusters, new_nb_clusters);
2147                  if (ret < 0) {
2148                      res->check_errors++;
2149                      return ret;
2150                  }
2151  
2152                  if (cluster >= *nb_clusters) {
2153                      ret = -EINVAL;
2154                      goto resize_fail;
2155                  }
2156  
2157                  res->corruptions--;
2158                  res->corruptions_fixed++;
2159                  ret = qcow2_inc_refcounts_imrt(bs, res,
2160                                                 refcount_table, nb_clusters,
2161                                                 offset, s->cluster_size);
2162                  if (ret < 0) {
2163                      return ret;
2164                  }
2165                  /* No need to check whether the refcount is now greater than 1:
2166                   * This area was just allocated and zeroed, so it can only be
2167                   * exactly 1 after qcow2_inc_refcounts_imrt() */
2168                  continue;
2169  
2170  resize_fail:
2171                  *rebuild = true;
2172                  fprintf(stderr, "ERROR could not resize image: %s\n",
2173                          strerror(-ret));
2174              }
2175              continue;
2176          }
2177  
2178          if (offset != 0) {
2179              ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2180                                             offset, s->cluster_size);
2181              if (ret < 0) {
2182                  return ret;
2183              }
2184              if (s->get_refcount(*refcount_table, cluster) != 1) {
2185                  fprintf(stderr, "ERROR refcount block %" PRId64
2186                          " refcount=%" PRIu64 "\n", i,
2187                          s->get_refcount(*refcount_table, cluster));
2188                  res->corruptions++;
2189                  *rebuild = true;
2190              }
2191          }
2192      }
2193  
2194      return 0;
2195  }
2196  
2197  /*
2198   * Calculates an in-memory refcount table.
2199   */
2200  static int coroutine_fn GRAPH_RDLOCK
2201  calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2202                      BdrvCheckMode fix, bool *rebuild,
2203                      void **refcount_table, int64_t *nb_clusters)
2204  {
2205      BDRVQcow2State *s = bs->opaque;
2206      int64_t i;
2207      QCowSnapshot *sn;
2208      int ret;
2209  
2210      if (!*refcount_table) {
2211          int64_t old_size = 0;
2212          ret = realloc_refcount_array(s, refcount_table,
2213                                       &old_size, *nb_clusters);
2214          if (ret < 0) {
2215              res->check_errors++;
2216              return ret;
2217          }
2218      }
2219  
2220      /* header */
2221      ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2222                                     0, s->cluster_size);
2223      if (ret < 0) {
2224          return ret;
2225      }
2226  
2227      /* current L1 table */
2228      ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2229                               s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2230                               fix, true);
2231      if (ret < 0) {
2232          return ret;
2233      }
2234  
2235      /* snapshots */
2236      if (has_data_file(bs) && s->nb_snapshots) {
2237          fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2238                  s->nb_snapshots);
2239          res->corruptions++;
2240      }
2241  
2242      for (i = 0; i < s->nb_snapshots; i++) {
2243          sn = s->snapshots + i;
2244          if (offset_into_cluster(s, sn->l1_table_offset)) {
2245              fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2246                      "L1 table is not cluster aligned; snapshot table entry "
2247                      "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2248              res->corruptions++;
2249              continue;
2250          }
2251          if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2252              fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2253                      "L1 table is too large; snapshot table entry corrupted\n",
2254                      sn->id_str, sn->name, sn->l1_size);
2255              res->corruptions++;
2256              continue;
2257          }
2258          ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2259                                   sn->l1_table_offset, sn->l1_size, 0, fix,
2260                                   false);
2261          if (ret < 0) {
2262              return ret;
2263          }
2264      }
2265      ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2266                                     s->snapshots_offset, s->snapshots_size);
2267      if (ret < 0) {
2268          return ret;
2269      }
2270  
2271      /* refcount data */
2272      ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2273                                     s->refcount_table_offset,
2274                                     s->refcount_table_size *
2275                                     REFTABLE_ENTRY_SIZE);
2276      if (ret < 0) {
2277          return ret;
2278      }
2279  
2280      /* encryption */
2281      if (s->crypto_header.length) {
2282          ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2283                                         s->crypto_header.offset,
2284                                         s->crypto_header.length);
2285          if (ret < 0) {
2286              return ret;
2287          }
2288      }
2289  
2290      /* bitmaps */
2291      ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2292      if (ret < 0) {
2293          return ret;
2294      }
2295  
2296      return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2297  }
2298  
2299  /*
2300   * Compares the actual reference count for each cluster in the image against the
2301   * refcount as reported by the refcount structures on-disk.
2302   */
2303  static void coroutine_fn GRAPH_RDLOCK
2304  compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2305                    BdrvCheckMode fix, bool *rebuild,
2306                    int64_t *highest_cluster,
2307                    void *refcount_table, int64_t nb_clusters)
2308  {
2309      BDRVQcow2State *s = bs->opaque;
2310      int64_t i;
2311      uint64_t refcount1, refcount2;
2312      int ret;
2313  
2314      for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2315          ret = qcow2_get_refcount(bs, i, &refcount1);
2316          if (ret < 0) {
2317              fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2318                      i, strerror(-ret));
2319              res->check_errors++;
2320              continue;
2321          }
2322  
2323          refcount2 = s->get_refcount(refcount_table, i);
2324  
2325          if (refcount1 > 0 || refcount2 > 0) {
2326              *highest_cluster = i;
2327          }
2328  
2329          if (refcount1 != refcount2) {
2330              /* Check if we're allowed to fix the mismatch */
2331              int *num_fixed = NULL;
2332              if (refcount1 == 0) {
2333                  *rebuild = true;
2334              } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2335                  num_fixed = &res->leaks_fixed;
2336              } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2337                  num_fixed = &res->corruptions_fixed;
2338              }
2339  
2340              fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2341                      " reference=%" PRIu64 "\n",
2342                     num_fixed != NULL     ? "Repairing" :
2343                     refcount1 < refcount2 ? "ERROR" :
2344                                             "Leaked",
2345                     i, refcount1, refcount2);
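
            /*
             * For example, refcount1 = 3 (on disk) vs. refcount2 = 1
             * (references actually found) means the cluster is leaked;
             * with BDRV_FIX_LEAKS, update_refcount() below decreases the
             * on-disk refcount by refcount_diff(3, 1) = 2.
             */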
2346  
2347              if (num_fixed) {
2348                  ret = update_refcount(bs, i << s->cluster_bits, 1,
2349                                        refcount_diff(refcount1, refcount2),
2350                                        refcount1 > refcount2,
2351                                        QCOW2_DISCARD_ALWAYS);
2352                  if (ret >= 0) {
2353                      (*num_fixed)++;
2354                      continue;
2355                  }
2356              }
2357  
            /* And if we couldn't fix it, account for the error */
2359              if (refcount1 < refcount2) {
2360                  res->corruptions++;
2361              } else {
2362                  res->leaks++;
2363              }
2364          }
2365      }
2366  }
2367  
2368  /*
2369   * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2370   * the on-disk refcount structures.
2371   *
2372   * On input, *first_free_cluster tells where to start looking, and need not
2373   * actually be a free cluster; the returned offset will not be before that
2374   * cluster.  On output, *first_free_cluster points to the first gap found, even
2375   * if that gap was too small to be used as the returned offset.
2376   *
2377   * Note that *first_free_cluster is a cluster index whereas the return value is
2378   * an offset.
2379   */
2380  static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2381                                     int cluster_count,
2382                                     void **refcount_table,
2383                                     int64_t *imrt_nb_clusters,
2384                                     int64_t *first_free_cluster)
2385  {
2386      BDRVQcow2State *s = bs->opaque;
2387      int64_t cluster = *first_free_cluster, i;
2388      bool first_gap = true;
2389      int contiguous_free_clusters;
2390      int ret;
2391  
    /* Starting at *first_free_cluster, find a range of at least cluster_count
     * contiguous free clusters */
2394      for (contiguous_free_clusters = 0;
2395           cluster < *imrt_nb_clusters &&
2396           contiguous_free_clusters < cluster_count;
2397           cluster++)
2398      {
2399          if (!s->get_refcount(*refcount_table, cluster)) {
2400              contiguous_free_clusters++;
2401              if (first_gap) {
2402                  /* If this is the first free cluster found, update
2403                   * *first_free_cluster accordingly */
2404                  *first_free_cluster = cluster;
2405                  first_gap = false;
2406              }
2407          } else if (contiguous_free_clusters) {
2408              contiguous_free_clusters = 0;
2409          }
2410      }
2411  
    /* If contiguous_free_clusters is greater than zero, it contains the number
     * of contiguous free clusters immediately preceding the current cluster;
     * the first free cluster in the current "gap" is therefore
     * cluster - contiguous_free_clusters */
2416  
2417      /* If no such range could be found, grow the in-memory refcount table
2418       * accordingly to append free clusters at the end of the image */
2419      if (contiguous_free_clusters < cluster_count) {
        /* contiguous_free_clusters clusters are already empty at the image end;
         * we need cluster_count clusters; therefore, we have to allocate
         * cluster_count - contiguous_free_clusters new clusters at the end of
         * the image (which is the current value of cluster; note that cluster
         * may exceed the old value of *imrt_nb_clusters if *first_free_cluster
         * pointed beyond the image end) */
2426          ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2427                                       cluster + cluster_count
2428                                       - contiguous_free_clusters);
2429          if (ret < 0) {
2430              return ret;
2431          }
2432      }
2433  
2434      /* Go back to the first free cluster */
2435      cluster -= contiguous_free_clusters;
2436      for (i = 0; i < cluster_count; i++) {
2437          s->set_refcount(*refcount_table, cluster + i, 1);
2438      }
2439  
2440      return cluster << s->cluster_bits;
2441  }
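
/*
 * Usage sketch, as in rebuild_refcounts_write_refblocks(): allocate one
 * cluster for a refblock in the IMRT:
 *
 *     int64_t first_free_cluster = 0;
 *     int64_t offset = alloc_clusters_imrt(bs, 1, refcount_table,
 *                                          nb_clusters, &first_free_cluster);
 *
 * Note the unit difference: if the first free cluster is cluster 5 and
 * cluster_bits = 16, the returned offset is 5 << 16 = 0x50000.
 */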
2442  
2443  /*
2444   * Helper function for rebuild_refcount_structure().
2445   *
2446   * Scan the range of clusters [first_cluster, end_cluster) for allocated
2447   * clusters and write all corresponding refblocks to disk.  The refblock
2448   * and allocation data is taken from the in-memory refcount table
2449   * *refcount_table[] (of size *nb_clusters), which is basically one big
2450   * (unlimited size) refblock for the whole image.
2451   *
2452   * For these refblocks, clusters are allocated using said in-memory
2453   * refcount table.  Care is taken that these allocations are reflected
2454   * in the refblocks written to disk.
2455   *
2456   * The refblocks' offsets are written into a reftable, which is
2457   * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
2458   * that reftable is of insufficient size, it will be resized to fit.
2459   * This reftable is not written to disk.
2460   *
2461   * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
2462   * to point to existing valid refblocks that do not need to be allocated
2463   * again.)
2464   *
2465   * Return whether the on-disk reftable array was resized (true/false),
2466   * or -errno on error.
2467   */
2468  static int coroutine_fn GRAPH_RDLOCK
2469  rebuild_refcounts_write_refblocks(
2470          BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
2471          int64_t first_cluster, int64_t end_cluster,
2472          uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
2473          Error **errp
2474      )
2475  {
2476      BDRVQcow2State *s = bs->opaque;
2477      int64_t cluster;
2478      int64_t refblock_offset, refblock_start, refblock_index;
2479      int64_t first_free_cluster = 0;
2480      uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
2481      uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
2482      void *on_disk_refblock;
2483      bool reftable_grown = false;
2484      int ret;
2485  
2486      for (cluster = first_cluster; cluster < end_cluster; cluster++) {
2487          /* Check all clusters to find refblocks that contain non-zero entries */
2488          if (!s->get_refcount(*refcount_table, cluster)) {
2489              continue;
2490          }
2491  
2492          /*
2493           * This cluster is allocated, so we need to create a refblock
2494           * for it.  The data we will write to disk is just the
2495           * respective slice from *refcount_table, so it will contain
2496           * accurate refcounts for all clusters belonging to this
2497           * refblock.  After we have written it, we will therefore skip
2498           * all remaining clusters in this refblock.
2499           */
2500  
2501          refblock_index = cluster >> s->refcount_block_bits;
2502          refblock_start = refblock_index << s->refcount_block_bits;
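        /*
         * For example, with refcount_block_bits = 15 (64 KiB clusters and
         * 16-bit refcounts), cluster 100000 belongs to refblock_index
         * 100000 >> 15 = 3, which covers clusters [98304, 131072).
         */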
2503  
2504          if (on_disk_reftable_entries > refblock_index &&
2505              on_disk_reftable[refblock_index])
2506          {
2507              /*
2508               * We can get here after a `goto write_refblocks`: We have a
2509               * reftable from a previous run, and the refblock is already
2510               * allocated.  No need to allocate it again.
2511               */
2512              refblock_offset = on_disk_reftable[refblock_index];
2513          } else {
2514              int64_t refblock_cluster_index;
2515  
2516              /* Don't allocate a cluster in a refblock already written to disk */
2517              if (first_free_cluster < refblock_start) {
2518                  first_free_cluster = refblock_start;
2519              }
2520              refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2521                                                    nb_clusters,
2522                                                    &first_free_cluster);
2523              if (refblock_offset < 0) {
2524                  error_setg_errno(errp, -refblock_offset,
2525                                   "ERROR allocating refblock");
2526                  return refblock_offset;
2527              }
2528  
2529              refblock_cluster_index = refblock_offset / s->cluster_size;
2530              if (refblock_cluster_index >= end_cluster) {
2531                  /*
2532                   * We must write the refblock that holds this refblock's
2533                   * refcount
2534                   */
2535                  end_cluster = refblock_cluster_index + 1;
2536              }
2537  
2538              if (on_disk_reftable_entries <= refblock_index) {
2539                  on_disk_reftable_entries =
2540                      ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2541                               s->cluster_size) / REFTABLE_ENTRY_SIZE;
2542                  on_disk_reftable =
2543                      g_try_realloc(on_disk_reftable,
2544                                    on_disk_reftable_entries *
2545                                    REFTABLE_ENTRY_SIZE);
2546                  if (!on_disk_reftable) {
2547                      error_setg(errp, "ERROR allocating reftable memory");
2548                      return -ENOMEM;
2549                  }
2550  
2551                  memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
2552                         (on_disk_reftable_entries -
2553                          *on_disk_reftable_entries_ptr) *
2554                         REFTABLE_ENTRY_SIZE);
2555  
2556                  *on_disk_reftable_ptr = on_disk_reftable;
2557                  *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
2558  
2559                  reftable_grown = true;
2560              } else {
2561                  assert(on_disk_reftable);
2562              }
2563              on_disk_reftable[refblock_index] = refblock_offset;
2564          }
2565  
2566          /* Refblock is allocated, write it to disk */
2567  
2568          ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2569                                              s->cluster_size, false);
2570          if (ret < 0) {
2571              error_setg_errno(errp, -ret, "ERROR writing refblock");
2572              return ret;
2573          }
2574  
2575          /*
2576           * The refblock is simply a slice of *refcount_table.
2577           * Note that the size of *refcount_table is always aligned to
2578           * whole clusters, so the write operation will not result in
2579           * out-of-bounds accesses.
2580           */
2581          on_disk_refblock = (void *)((char *) *refcount_table +
2582                                      refblock_index * s->cluster_size);
2583  
2584          ret = bdrv_co_pwrite(bs->file, refblock_offset, s->cluster_size,
2585                               on_disk_refblock, 0);
2586          if (ret < 0) {
2587              error_setg_errno(errp, -ret, "ERROR writing refblock");
2588              return ret;
2589          }
2590  
2591          /* This refblock is done, skip to its end */
2592          cluster = refblock_start + s->refcount_block_size - 1;
2593      }
2594  
2595      return reftable_grown;
2596  }
2597  
2598  /*
2599   * Creates a new refcount structure based solely on the in-memory information
2600   * given through *refcount_table (this in-memory information is basically just
2601   * the concatenation of all refblocks).  All necessary allocations will be
2602   * reflected in that array.
2603   *
2604   * On success, the old refcount structure is leaked (it will be covered by the
2605   * new refcount structure).
2606   */
2607  static int coroutine_fn GRAPH_RDLOCK
2608  rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res,
2609                             void **refcount_table, int64_t *nb_clusters,
2610                             Error **errp)
2611  {
2612      BDRVQcow2State *s = bs->opaque;
2613      int64_t reftable_offset = -1;
2614      int64_t reftable_length = 0;
2615      int64_t reftable_clusters;
2616      int64_t refblock_index;
2617      uint32_t on_disk_reftable_entries = 0;
2618      uint64_t *on_disk_reftable = NULL;
2619      int ret = 0;
2620      int reftable_size_changed = 0;
2621      struct {
2622          uint64_t reftable_offset;
2623          uint32_t reftable_clusters;
2624      } QEMU_PACKED reftable_offset_and_clusters;
2625  
2626      qcow2_cache_empty(bs, s->refcount_block_cache);
2627  
2628      /*
2629       * For each refblock containing entries, we try to allocate a
2630       * cluster (in the in-memory refcount table) and write its offset
2631       * into on_disk_reftable[].  We then write the whole refblock to
2632       * disk (as a slice of the in-memory refcount table).
2633       * This is done by rebuild_refcounts_write_refblocks().
2634       *
2635       * Once we have scanned all clusters, we try to find space for the
2636       * reftable.  This will dirty the in-memory refcount table (i.e.
2637       * make it differ from the refblocks we have already written), so we
2638       * need to run rebuild_refcounts_write_refblocks() again for the
2639       * range of clusters where the reftable has been allocated.
2640       *
2641       * This second run might make the reftable grow again, in which case
2642       * we will need to allocate another space for it, which is why we
2643       * repeat all this until the reftable stops growing.
2644       *
     * (This loop will terminate, because each time the reftable grows by
     * one cluster, it gains room for many more refblock entries, so at
     * some point it must be able to cover both itself and all refblocks
     * describing it.)
2649       *
2650       * We then convert the reftable to big-endian and write it to disk.
2651       *
2652       * Note that we never free any reftable allocations.  Doing so would
2653       * needlessly complicate the algorithm: The eventual second check
2654       * run we do will clean up all leaks we have caused.
2655       */
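
    /*
     * Rough numbers: with 64 KiB clusters, one reftable cluster holds
     * 65536 / 8 = 8192 refblock entries, and with 16-bit refcounts each
     * refblock covers 32768 clusters; a single additional reftable
     * cluster can thus describe 8192 * 32768 clusters (16 TiB of image
     * data), far more than the growth itself.
     */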
2656  
2657      reftable_size_changed =
2658          rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2659                                            0, *nb_clusters,
2660                                            &on_disk_reftable,
2661                                            &on_disk_reftable_entries, errp);
2662      if (reftable_size_changed < 0) {
2663          res->check_errors++;
2664          ret = reftable_size_changed;
2665          goto fail;
2666      }
2667  
2668      /*
2669       * There was no reftable before, so rebuild_refcounts_write_refblocks()
2670       * must have increased its size (from 0 to something).
2671       */
2672      assert(reftable_size_changed);
2673  
2674      do {
2675          int64_t reftable_start_cluster, reftable_end_cluster;
2676          int64_t first_free_cluster = 0;
2677  
2678          reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
2679          reftable_clusters = size_to_clusters(s, reftable_length);
2680  
2681          reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2682                                                refcount_table, nb_clusters,
2683                                                &first_free_cluster);
2684          if (reftable_offset < 0) {
2685              error_setg_errno(errp, -reftable_offset,
2686                               "ERROR allocating reftable");
2687              res->check_errors++;
2688              ret = reftable_offset;
2689              goto fail;
2690          }
2691  
2692          /*
2693           * We need to update the affected refblocks, so re-run the
2694           * write_refblocks loop for the reftable's range of clusters.
2695           */
2696          assert(offset_into_cluster(s, reftable_offset) == 0);
2697          reftable_start_cluster = reftable_offset / s->cluster_size;
2698          reftable_end_cluster = reftable_start_cluster + reftable_clusters;
2699          reftable_size_changed =
2700              rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2701                                                reftable_start_cluster,
2702                                                reftable_end_cluster,
2703                                                &on_disk_reftable,
2704                                                &on_disk_reftable_entries, errp);
2705          if (reftable_size_changed < 0) {
2706              res->check_errors++;
2707              ret = reftable_size_changed;
2708              goto fail;
2709          }
2710  
2711          /*
2712           * If the reftable size has changed, we will need to find a new
2713           * allocation, repeating the loop.
2714           */
2715      } while (reftable_size_changed);
2716  
2717      /* The above loop must have run at least once */
2718      assert(reftable_offset >= 0);
2719  
2720      /*
2721       * All allocations are done, all refblocks are written, convert the
2722       * reftable to big-endian and write it to disk.
2723       */
2724  
2725      for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2726           refblock_index++)
2727      {
2728          cpu_to_be64s(&on_disk_reftable[refblock_index]);
2729      }
2730  
2731      ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
2732                                          false);
2733      if (ret < 0) {
2734          error_setg_errno(errp, -ret, "ERROR writing reftable");
2735          goto fail;
2736      }
2737  
2738      assert(reftable_length < INT_MAX);
2739      ret = bdrv_co_pwrite(bs->file, reftable_offset, reftable_length,
2740                           on_disk_reftable, 0);
2741      if (ret < 0) {
2742          error_setg_errno(errp, -ret, "ERROR writing reftable");
2743          goto fail;
2744      }
2745  
2746      /* Enter new reftable into the image header */
2747      reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2748      reftable_offset_and_clusters.reftable_clusters =
2749          cpu_to_be32(reftable_clusters);
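    /*
     * This single write updates both header fields at once; it relies on
     * refcount_table_clusters immediately following refcount_table_offset
     * in QCowHeader, which the packed struct above mirrors.
     */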
2750      ret = bdrv_co_pwrite_sync(bs->file,
2751                                offsetof(QCowHeader, refcount_table_offset),
2752                                sizeof(reftable_offset_and_clusters),
2753                                &reftable_offset_and_clusters, 0);
2754      if (ret < 0) {
2755          error_setg_errno(errp, -ret, "ERROR setting reftable");
2756          goto fail;
2757      }
2758  
2759      for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2760           refblock_index++)
2761      {
2762          be64_to_cpus(&on_disk_reftable[refblock_index]);
2763      }
2764      s->refcount_table = on_disk_reftable;
2765      s->refcount_table_offset = reftable_offset;
2766      s->refcount_table_size = on_disk_reftable_entries;
2767      update_max_refcount_table_index(s);
2768  
2769      return 0;
2770  
2771  fail:
2772      g_free(on_disk_reftable);
2773      return ret;
2774  }
2775  
2776  /*
2777   * Checks an image for refcount consistency.
2778   *
2779   * Returns 0 if no errors are found, the number of errors in case the image is
2780   * detected as corrupted, and -errno when an internal error occurred.
2781   */
2782  int coroutine_fn GRAPH_RDLOCK
2783  qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2784  {
2785      BDRVQcow2State *s = bs->opaque;
2786      BdrvCheckResult pre_compare_res;
2787      int64_t size, highest_cluster, nb_clusters;
2788      void *refcount_table = NULL;
2789      bool rebuild = false;
2790      int ret;
2791  
2792      size = bdrv_co_getlength(bs->file->bs);
2793      if (size < 0) {
2794          res->check_errors++;
2795          return size;
2796      }
2797  
2798      nb_clusters = size_to_clusters(s, size);
2799      if (nb_clusters > INT_MAX) {
2800          res->check_errors++;
2801          return -EFBIG;
2802      }
2803  
2804      res->bfi.total_clusters =
2805          size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2806  
2807      ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2808                                &nb_clusters);
2809      if (ret < 0) {
2810          goto fail;
2811      }
2812  
2813      /* In case we don't need to rebuild the refcount structure (but want to fix
2814       * something), compare_refcounts() is called again below, in which case the
2815       * result of this first call should be ignored */
2816      pre_compare_res = *res;
2817      compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2818                        nb_clusters);
2819  
2820      if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2821          BdrvCheckResult old_res = *res;
2822          int fresh_leaks = 0;
2823          Error *local_err = NULL;
2824  
2825          fprintf(stderr, "Rebuilding refcount structure\n");
2826          ret = rebuild_refcount_structure(bs, res, &refcount_table,
2827                                           &nb_clusters, &local_err);
2828          if (ret < 0) {
2829              error_report_err(local_err);
2830              goto fail;
2831          }
2832  
2833          res->corruptions = 0;
2834          res->leaks = 0;
2835  
2836          /* Because the old reftable has been exchanged for a new one, the
2837           * references have to be recalculated */
2838          rebuild = false;
2839          memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2840          ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2841                                    &nb_clusters);
2842          if (ret < 0) {
2843              goto fail;
2844          }
2845  
2846          if (fix & BDRV_FIX_LEAKS) {
2847              /* The old refcount structures are now leaked, so fix them; the
2848               * result can be ignored, aside from leaks which were introduced by
2849               * rebuild_refcount_structure() and could not be fixed */
2850              BdrvCheckResult saved_res = *res;
2851              *res = (BdrvCheckResult){ 0 };
2852  
2853              compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2854                                &highest_cluster, refcount_table, nb_clusters);
2855              if (rebuild) {
2856                  fprintf(stderr, "ERROR rebuilt refcount structure is still "
2857                          "broken\n");
2858              }
2859  
2860              /* Any leaks accounted for here were introduced by
2861               * rebuild_refcount_structure() because that function has created a
2862               * new refcount structure from scratch */
2863              fresh_leaks = res->leaks;
2864              *res = saved_res;
2865          }
2866  
2867          if (res->corruptions < old_res.corruptions) {
2868              res->corruptions_fixed += old_res.corruptions - res->corruptions;
2869          }
2870          if (res->leaks < old_res.leaks) {
2871              res->leaks_fixed += old_res.leaks - res->leaks;
2872          }
2873          res->leaks += fresh_leaks;
2874      } else if (fix) {
2875          if (rebuild) {
2876              fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2877              res->check_errors++;
2878              ret = -EIO;
2879              goto fail;
2880          }
2881  
2882          if (res->leaks || res->corruptions) {
2883              *res = pre_compare_res;
2884              compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2885                                refcount_table, nb_clusters);
2886          }
2887      }
2888  
2889      /* check OFLAG_COPIED */
2890      ret = check_oflag_copied(bs, res, fix);
2891      if (ret < 0) {
2892          goto fail;
2893      }
2894  
2895      res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2896      ret = 0;
2897  
2898  fail:
2899      g_free(refcount_table);
2900  
2901      return ret;
2902  }
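
/*
 * A minimal, hypothetical caller sketch (check_example() is not part of
 * this file): a negative return value means *res is unreliable; otherwise
 * the counts in *res tell whether the image is consistent.
 */
#if 0
static int coroutine_fn GRAPH_RDLOCK check_example(BlockDriverState *bs)
{
    BdrvCheckResult result = {0};
    int ret = qcow2_check_refcounts(bs, &result, BDRV_FIX_ERRORS);

    if (ret < 0) {
        return ret; /* internal error while checking */
    }
    return (result.corruptions || result.leaks) ? -EIO : 0;
}
#endif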
2903  
2904  #define overlaps_with(ofs, sz) \
2905      ranges_overlap(offset, size, ofs, sz)
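
/*
 * Note that overlaps_with() deliberately refers to the local 'offset' and
 * 'size' variables of qcow2_check_metadata_overlap() below; for example,
 * overlaps_with(s->snapshots_offset, s->snapshots_size) expands to
 * ranges_overlap(offset, size, s->snapshots_offset, s->snapshots_size).
 */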
2906  
2907  /*
2908   * Checks if the given offset into the image file is actually free to use by
2909   * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2910   * i.e. a sanity check without relying on the refcount tables.
2911   *
2912   * The ign parameter specifies which checks not to perform (it is a bitmask of
2913   * QCow2MetadataOverlap values), i.e., which sections to ignore.
2914   *
2915   * Returns:
2916   * - 0 if writing to this offset will not affect the mentioned metadata
2917   * - a positive QCow2MetadataOverlap value indicating one overlapping section
2918   * - a negative value (-errno) indicating an error while performing a check,
2919   *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2920   */
2921  int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2922                                   int64_t size)
2923  {
2924      BDRVQcow2State *s = bs->opaque;
2925      int chk = s->overlap_check & ~ign;
2926      int i, j;
2927  
2928      if (!size) {
2929          return 0;
2930      }
2931  
2932      if (chk & QCOW2_OL_MAIN_HEADER) {
2933          if (offset < s->cluster_size) {
2934              return QCOW2_OL_MAIN_HEADER;
2935          }
2936      }
2937  
2938      /* align range to test to cluster boundaries */
2939      size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2940      offset = start_of_cluster(s, offset);
2941  
2942      if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2943          if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
2944              return QCOW2_OL_ACTIVE_L1;
2945          }
2946      }
2947  
2948      if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2949          if (overlaps_with(s->refcount_table_offset,
2950              s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
2951              return QCOW2_OL_REFCOUNT_TABLE;
2952          }
2953      }
2954  
2955      if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2956          if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2957              return QCOW2_OL_SNAPSHOT_TABLE;
2958          }
2959      }
2960  
2961      if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2962          for (i = 0; i < s->nb_snapshots; i++) {
2963              if (s->snapshots[i].l1_size &&
2964                  overlaps_with(s->snapshots[i].l1_table_offset,
2965                  s->snapshots[i].l1_size * L1E_SIZE)) {
2966                  return QCOW2_OL_INACTIVE_L1;
2967              }
2968          }
2969      }
2970  
2971      if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2972          for (i = 0; i < s->l1_size; i++) {
2973              if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2974                  overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2975                  s->cluster_size)) {
2976                  return QCOW2_OL_ACTIVE_L2;
2977              }
2978          }
2979      }
2980  
2981      if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2982          unsigned last_entry = s->max_refcount_table_index;
2983          assert(last_entry < s->refcount_table_size);
2984          assert(last_entry + 1 == s->refcount_table_size ||
2985                 (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2986          for (i = 0; i <= last_entry; i++) {
2987              if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2988                  overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2989                  s->cluster_size)) {
2990                  return QCOW2_OL_REFCOUNT_BLOCK;
2991              }
2992          }
2993      }
2994  
2995      if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2996          for (i = 0; i < s->nb_snapshots; i++) {
2997              uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2998              uint32_t l1_sz  = s->snapshots[i].l1_size;
2999              uint64_t l1_sz2 = l1_sz * L1E_SIZE;
3000              uint64_t *l1;
3001              int ret;
3002  
3003              ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
3004                                         QCOW_MAX_L1_SIZE, "", NULL);
3005              if (ret < 0) {
3006                  return ret;
3007              }
3008  
3009              l1 = g_try_malloc(l1_sz2);
3010  
3011              if (l1_sz2 && l1 == NULL) {
3012                  return -ENOMEM;
3013              }
3014  
3015              ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0);
3016              if (ret < 0) {
3017                  g_free(l1);
3018                  return ret;
3019              }
3020  
3021              for (j = 0; j < l1_sz; j++) {
3022                  uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
3023                  if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
3024                      g_free(l1);
3025                      return QCOW2_OL_INACTIVE_L2;
3026                  }
3027              }
3028  
3029              g_free(l1);
3030          }
3031      }
3032  
3033      if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
3034          (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
3035      {
3036          if (overlaps_with(s->bitmap_directory_offset,
3037                            s->bitmap_directory_size))
3038          {
3039              return QCOW2_OL_BITMAP_DIRECTORY;
3040          }
3041      }
3042  
3043      return 0;
3044  }
3045  
3046  static const char *metadata_ol_names[] = {
3047      [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
3048      [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
3049      [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
3050      [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
3051      [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
3052      [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
3053      [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
3054      [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
3055      [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
3056  };
3057  QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
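
/*
 * Each QCOW2_OL_*_BITNR is the bit position of the corresponding
 * QCow2MetadataOverlap flag, so a single returned flag maps back to its
 * name via ctz32(); e.g. ctz32(QCOW2_OL_ACTIVE_L1) yields
 * QCOW2_OL_ACTIVE_L1_BITNR, which indexes "active L1 table" above.
 */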
3058  
3059  /*
3060   * First performs a check for metadata overlaps (through
3061   * qcow2_check_metadata_overlap); if that fails with a negative value (error
3062   * while performing a check), that value is returned. If an impending overlap
3063   * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
3064   * and -EIO returned.
3065   *
3066   * Returns 0 if there were neither overlaps nor errors while checking for
3067   * overlaps; or a negative value (-errno) on error.
3068   */
3069  int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
3070                                    int64_t size, bool data_file)
3071  {
3072      int ret;
3073  
3074      if (data_file && has_data_file(bs)) {
3075          return 0;
3076      }
3077  
3078      ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
3079      if (ret < 0) {
3080          return ret;
3081      } else if (ret > 0) {
3082          int metadata_ol_bitnr = ctz32(ret);
3083          assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
3084  
3085          qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
3086                                  "write on metadata (overlaps with %s)",
3087                                  metadata_ol_names[metadata_ol_bitnr]);
3088          return -EIO;
3089      }
3090  
3091      return 0;
3092  }
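
/*
 * The typical usage pattern (see flush_refblock() below) is to run this
 * check directly before a metadata write and to abort on any non-zero
 * result. A minimal sketch, assuming 'offset' and 'buf' describe the
 * pending write and 's' is the BDRVQcow2State:
 */
#if 0
    ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size, false);
    if (ret < 0) {
        return ret; /* -EIO on overlap, another -errno on check failure */
    }
    ret = bdrv_pwrite(bs->file, offset, s->cluster_size, buf, 0);
#endif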
3093  
3094  /* A pointer to a function of this type is given to walk_over_reftable(). That
3095   * function will create refblocks and pass them to a RefblockFinishOp once they
3096   * are completed (@refblock). @refblock_empty is set if the refblock is
3097   * completely empty.
3098   *
3099   * Along with the refblock, a corresponding reftable entry is passed, in the
3100   * reftable @reftable (which may be reallocated) at @reftable_index.
3101   *
3102   * @allocated should be set to true if a new cluster has been allocated.
3103   */
3104  typedef int /* GRAPH_RDLOCK_PTR */
3105      (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
3106                         uint64_t reftable_index, uint64_t *reftable_size,
3107                         void *refblock, bool refblock_empty,
3108                         bool *allocated, Error **errp);
3109  
3110  /**
3111   * This "operation" for walk_over_reftable() allocates the refblock on disk (if
3112   * it is not empty) and inserts its offset into the new reftable. The size of
3113   * this new reftable is increased as required.
3114   */
3115  static int GRAPH_RDLOCK
3116  alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
3117                 uint64_t reftable_index, uint64_t *reftable_size,
3118                 void *refblock, bool refblock_empty, bool *allocated,
3119                 Error **errp)
3120  {
3121      BDRVQcow2State *s = bs->opaque;
3122      int64_t offset;
3123  
3124      if (!refblock_empty && reftable_index >= *reftable_size) {
3125          uint64_t *new_reftable;
3126          uint64_t new_reftable_size;
3127  
3128          new_reftable_size = ROUND_UP(reftable_index + 1,
3129                                       s->cluster_size / REFTABLE_ENTRY_SIZE);
3130          if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
3131              error_setg(errp,
3132                         "This operation would make the refcount table grow "
3133                         "beyond the maximum size supported by QEMU, aborting");
3134              return -ENOTSUP;
3135          }
3136  
3137          new_reftable = g_try_realloc(*reftable, new_reftable_size *
3138                                                  REFTABLE_ENTRY_SIZE);
3139          if (!new_reftable) {
3140              error_setg(errp, "Failed to increase reftable buffer size");
3141              return -ENOMEM;
3142          }
3143  
3144          memset(new_reftable + *reftable_size, 0,
3145                 (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
3146  
3147          *reftable      = new_reftable;
3148          *reftable_size = new_reftable_size;
3149      }
3150  
3151      if (!refblock_empty && !(*reftable)[reftable_index]) {
3152          offset = qcow2_alloc_clusters(bs, s->cluster_size);
3153          if (offset < 0) {
3154              error_setg_errno(errp, -offset, "Failed to allocate refblock");
3155              return offset;
3156          }
3157          (*reftable)[reftable_index] = offset;
3158          *allocated = true;
3159      }
3160  
3161      return 0;
3162  }
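
/*
 * A worked example for the growth step above, assuming the default 64 KiB
 * clusters (REFTABLE_ENTRY_SIZE is 8):
 *     s->cluster_size / REFTABLE_ENTRY_SIZE = 65536 / 8 = 8192 entries,
 * so reftable_index == 10000 grows the table to
 *     ROUND_UP(10001, 8192) = 16384 entries (two reftable clusters).
 * Growing in whole clusters keeps the in-memory reftable aligned with the
 * layout that will eventually be written to disk.
 */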
3163  
3164  /**
3165   * This "operation" for walk_over_reftable() writes the refblock to disk at the
3166   * offset specified by the new reftable's entry. It does not modify the new
3167   * reftable or change any refcounts.
3168   */
3169  static int GRAPH_RDLOCK
3170  flush_refblock(BlockDriverState *bs, uint64_t **reftable,
3171                 uint64_t reftable_index, uint64_t *reftable_size,
3172                 void *refblock, bool refblock_empty, bool *allocated,
3173                 Error **errp)
3174  {
3175      BDRVQcow2State *s = bs->opaque;
3176      int64_t offset;
3177      int ret;
3178  
3179      if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
3180          offset = (*reftable)[reftable_index];
3181  
3182          ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
3183                                              false);
3184          if (ret < 0) {
3185              error_setg_errno(errp, -ret, "Overlap check failed");
3186              return ret;
3187          }
3188  
3189          ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0);
3190          if (ret < 0) {
3191              error_setg_errno(errp, -ret, "Failed to write refblock");
3192              return ret;
3193          }
3194      } else {
3195          assert(refblock_empty);
3196      }
3197  
3198      return 0;
3199  }
3200  
3201  /**
3202   * This function walks over the existing reftable and every referenced refblock;
3203   * if @new_set_refcount is non-NULL, it is called for every refcount entry to
3204   * create an equal new entry in the passed @new_refblock. Once that
3205   * @new_refblock is completely filled, @operation will be called.
3206   *
3207   * @status_cb and @cb_opaque are used for the amend operation's status callback.
3208   * @index is the index of the walk_over_reftable() calls and @total is the total
3209   * number of walk_over_reftable() calls per amend operation. Both are used for
3210   * calculating the parameters for the status callback.
3211   *
3212   * @allocated is set to true if a new cluster has been allocated.
3213   */
3214  static int GRAPH_RDLOCK
3215  walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
3216                     uint64_t *new_reftable_index,
3217                     uint64_t *new_reftable_size,
3218                     void *new_refblock, int new_refblock_size,
3219                     int new_refcount_bits,
3220                     RefblockFinishOp *operation, bool *allocated,
3221                     Qcow2SetRefcountFunc *new_set_refcount,
3222                     BlockDriverAmendStatusCB *status_cb,
3223                     void *cb_opaque, int index, int total,
3224                     Error **errp)
3225  {
3226      BDRVQcow2State *s = bs->opaque;
3227      uint64_t reftable_index;
3228      bool new_refblock_empty = true;
3229      int refblock_index;
3230      int new_refblock_index = 0;
3231      int ret;
3232  
3233      for (reftable_index = 0; reftable_index < s->refcount_table_size;
3234           reftable_index++)
3235      {
3236          uint64_t refblock_offset = s->refcount_table[reftable_index]
3237                                   & REFT_OFFSET_MASK;
3238  
3239          status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
3240                    (uint64_t)total * s->refcount_table_size, cb_opaque);
3241  
3242          if (refblock_offset) {
3243              void *refblock;
3244  
3245              if (offset_into_cluster(s, refblock_offset)) {
3246                  qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
3247                                          PRIx64 " unaligned (reftable index: %#"
3248                                          PRIx64 ")", refblock_offset,
3249                                          reftable_index);
3250                  error_setg(errp,
3251                             "Image is corrupt (unaligned refblock offset)");
3252                  return -EIO;
3253              }
3254  
3255              ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
3256                                    &refblock);
3257              if (ret < 0) {
3258                  error_setg_errno(errp, -ret, "Failed to retrieve refblock");
3259                  return ret;
3260              }
3261  
3262              for (refblock_index = 0; refblock_index < s->refcount_block_size;
3263                   refblock_index++)
3264              {
3265                  uint64_t refcount;
3266  
3267                  if (new_refblock_index >= new_refblock_size) {
3268                      /* new_refblock is now complete */
3269                      ret = operation(bs, new_reftable, *new_reftable_index,
3270                                      new_reftable_size, new_refblock,
3271                                      new_refblock_empty, allocated, errp);
3272                      if (ret < 0) {
3273                          qcow2_cache_put(s->refcount_block_cache, &refblock);
3274                          return ret;
3275                      }
3276  
3277                      (*new_reftable_index)++;
3278                      new_refblock_index = 0;
3279                      new_refblock_empty = true;
3280                  }
3281  
3282                  refcount = s->get_refcount(refblock, refblock_index);
3283                  if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3284                      uint64_t offset;
3285  
3286                      qcow2_cache_put(s->refcount_block_cache, &refblock);
3287  
3288                      offset = ((reftable_index << s->refcount_block_bits)
3289                                + refblock_index) << s->cluster_bits;
3290  
3291                      error_setg(errp, "Cannot decrease refcount entry width to "
3292                                 "%i bits: Cluster at offset %#" PRIx64 " has a "
3293                                 "refcount of %" PRIu64, new_refcount_bits,
3294                                 offset, refcount);
3295                      return -EINVAL;
3296                  }
3297  
3298                  if (new_set_refcount) {
3299                      new_set_refcount(new_refblock, new_refblock_index++,
3300                                       refcount);
3301                  } else {
3302                      new_refblock_index++;
3303                  }
3304                  new_refblock_empty = new_refblock_empty && refcount == 0;
3305              }
3306  
3307              qcow2_cache_put(s->refcount_block_cache, &refblock);
3308          } else {
3309              /* No refblock means every refcount is 0 */
3310              for (refblock_index = 0; refblock_index < s->refcount_block_size;
3311                   refblock_index++)
3312              {
3313                  if (new_refblock_index >= new_refblock_size) {
3314                      /* new_refblock is now complete */
3315                      ret = operation(bs, new_reftable, *new_reftable_index,
3316                                      new_reftable_size, new_refblock,
3317                                      new_refblock_empty, allocated, errp);
3318                      if (ret < 0) {
3319                          return ret;
3320                      }
3321  
3322                      (*new_reftable_index)++;
3323                      new_refblock_index = 0;
3324                      new_refblock_empty = true;
3325                  }
3326  
3327                  if (new_set_refcount) {
3328                      new_set_refcount(new_refblock, new_refblock_index++, 0);
3329                  } else {
3330                      new_refblock_index++;
3331                  }
3332              }
3333          }
3334      }
3335  
3336      if (new_refblock_index > 0) {
3337          /* Complete the final refblock, which may be only partially filled */
3338          if (new_set_refcount) {
3339              for (; new_refblock_index < new_refblock_size;
3340                   new_refblock_index++)
3341              {
3342                  new_set_refcount(new_refblock, new_refblock_index, 0);
3343              }
3344          }
3345  
3346          ret = operation(bs, new_reftable, *new_reftable_index,
3347                          new_reftable_size, new_refblock, new_refblock_empty,
3348                          allocated, errp);
3349          if (ret < 0) {
3350              return ret;
3351          }
3352  
3353          (*new_reftable_index)++;
3354      }
3355  
3356      status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3357                (uint64_t)total * s->refcount_table_size, cb_opaque);
3358  
3359      return 0;
3360  }
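
/*
 * A worked example for the refcount width check above: when shrinking to
 * new_refcount_bits == 1, any refcount >= 2 fails the check because
 * refcount >> 1 is then non-zero. The offending cluster's offset is
 * reconstructed from its position in the refcount structures; with 64 KiB
 * clusters and 16-bit refcounts (refcount_block_bits == 15),
 * reftable_index == 1 and refblock_index == 5 yield
 *     ((1 << 15) + 5) << 16 = 0x80050000.
 */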
3361  
3362  int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3363                                  BlockDriverAmendStatusCB *status_cb,
3364                                  void *cb_opaque, Error **errp)
3365  {
3366      BDRVQcow2State *s = bs->opaque;
3367      Qcow2GetRefcountFunc *new_get_refcount;
3368      Qcow2SetRefcountFunc *new_set_refcount;
3369      void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3370      uint64_t *new_reftable = NULL, new_reftable_size = 0;
3371      uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3372      uint64_t new_reftable_index = 0;
3373      uint64_t i;
3374      int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3375      int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3376      int old_refcount_order;
3377      int walk_index = 0;
3378      int ret;
3379      bool new_allocation;
3380  
3381      assert(s->qcow_version >= 3);
3382      assert(refcount_order >= 0 && refcount_order <= 6);
3383  
3384      /* see qcow2_open() */
3385      new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3386  
3387      new_get_refcount = get_refcount_funcs[refcount_order];
3388      new_set_refcount = set_refcount_funcs[refcount_order];
3389  
3390  
3391      do {
3392          int total_walks;
3393  
3394          new_allocation = false;
3395  
3396          /* This walk and the final one that writes the refblocks always
3397           * remain to be done; in addition, this loop normally runs at least
3398           * twice, first to do the allocations and then to verify that
3399           * everything has been allocated correctly, which makes for a
3400           * minimum of three walks in total */
3401          total_walks = MAX(walk_index + 2, 3);
3402  
3403          /* First, allocate the structures so they are present in the refcount
3404           * structures */
3405          ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3406                                   &new_reftable_size, NULL, new_refblock_size,
3407                                   new_refcount_bits, &alloc_refblock,
3408                                   &new_allocation, NULL, status_cb, cb_opaque,
3409                                   walk_index++, total_walks, errp);
3410          if (ret < 0) {
3411              goto done;
3412          }
3413  
3414          new_reftable_index = 0;
3415  
3416          if (new_allocation) {
3417              if (new_reftable_offset) {
3418                  qcow2_free_clusters(
3419                      bs, new_reftable_offset,
3420                      allocated_reftable_size * REFTABLE_ENTRY_SIZE,
3421                      QCOW2_DISCARD_NEVER);
3422              }
3423  
3424              new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3425                                                             REFTABLE_ENTRY_SIZE);
3426              if (new_reftable_offset < 0) {
3427                  error_setg_errno(errp, -new_reftable_offset,
3428                                   "Failed to allocate the new reftable");
3429                  ret = new_reftable_offset;
3430                  goto done;
3431              }
3432              allocated_reftable_size = new_reftable_size;
3433          }
3434      } while (new_allocation);
3435  
3436      /* Second, write the new refblocks */
3437      ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3438                               &new_reftable_size, new_refblock,
3439                               new_refblock_size, new_refcount_bits,
3440                               &flush_refblock, &new_allocation, new_set_refcount,
3441                               status_cb, cb_opaque, walk_index, walk_index + 1,
3442                               errp);
3443      if (ret < 0) {
3444          goto done;
3445      }
3446      assert(!new_allocation);
3447  
3448  
3449      /* Write the new reftable */
3450      ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3451                                          new_reftable_size * REFTABLE_ENTRY_SIZE,
3452                                          false);
3453      if (ret < 0) {
3454          error_setg_errno(errp, -ret, "Overlap check failed");
3455          goto done;
3456      }
3457  
3458      for (i = 0; i < new_reftable_size; i++) {
3459          cpu_to_be64s(&new_reftable[i]);
3460      }
3461  
3462      ret = bdrv_pwrite(bs->file, new_reftable_offset,
3463                        new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable,
3464                        0);
3465  
3466      for (i = 0; i < new_reftable_size; i++) {
3467          be64_to_cpus(&new_reftable[i]);
3468      }
3469  
3470      if (ret < 0) {
3471          error_setg_errno(errp, -ret, "Failed to write the new reftable");
3472          goto done;
3473      }
3474  
3475  
3476      /* Empty the refcount cache */
3477      ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3478      if (ret < 0) {
3479          error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3480          goto done;
3481      }
3482  
3483      /* Update the image header to point to the new reftable; this only updates
3484       * the fields which are relevant to qcow2_update_header(); other fields
3485       * such as s->refcount_table or s->refcount_bits stay stale for now
3486       * (because we have to restore everything if qcow2_update_header() fails) */
3487      old_refcount_order  = s->refcount_order;
3488      old_reftable_size   = s->refcount_table_size;
3489      old_reftable_offset = s->refcount_table_offset;
3490  
3491      s->refcount_order        = refcount_order;
3492      s->refcount_table_size   = new_reftable_size;
3493      s->refcount_table_offset = new_reftable_offset;
3494  
3495      ret = qcow2_update_header(bs);
3496      if (ret < 0) {
3497          s->refcount_order        = old_refcount_order;
3498          s->refcount_table_size   = old_reftable_size;
3499          s->refcount_table_offset = old_reftable_offset;
3500          error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3501          goto done;
3502      }
3503  
3504      /* Now update the rest of the in-memory information */
3505      old_reftable = s->refcount_table;
3506      s->refcount_table = new_reftable;
3507      update_max_refcount_table_index(s);
3508  
3509      s->refcount_bits = 1 << refcount_order;
3510      s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3511      s->refcount_max += s->refcount_max - 1;
3512  
3513      s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3514      s->refcount_block_size = 1 << s->refcount_block_bits;
3515  
3516      s->get_refcount = new_get_refcount;
3517      s->set_refcount = new_set_refcount;
3518  
3519      /* For cleaning up all old refblocks and the old reftable below the "done"
3520       * label */
3521      new_reftable        = old_reftable;
3522      new_reftable_size   = old_reftable_size;
3523      new_reftable_offset = old_reftable_offset;
3524  
3525  done:
3526      if (new_reftable) {
3527          /* On success, new_reftable actually points to the old reftable (and
3528           * new_reftable_size is the old reftable's size); but that is just
3529           * fine */
3530          for (i = 0; i < new_reftable_size; i++) {
3531              uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3532              if (offset) {
3533                  qcow2_free_clusters(bs, offset, s->cluster_size,
3534                                      QCOW2_DISCARD_OTHER);
3535              }
3536          }
3537          g_free(new_reftable);
3538  
3539          if (new_reftable_offset > 0) {
3540              qcow2_free_clusters(bs, new_reftable_offset,
3541                                  new_reftable_size * REFTABLE_ENTRY_SIZE,
3542                                  QCOW2_DISCARD_OTHER);
3543          }
3544      }
3545  
3546      qemu_vfree(new_refblock);
3547      return ret;
3548  }
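
/*
 * A note on the two-step refcount_max computation above: evaluating
 * (UINT64_C(1) << s->refcount_bits) - 1 directly would shift a 64-bit
 * value by 64 for refcount_order == 6, which is undefined behaviour in C.
 * The two steps avoid that:
 *     refcount_bits == 16:  1 << 15 = 0x8000, += 0x7fff -> 0xffff
 *     refcount_bits == 64:  1 << 63 = 0x8000000000000000,
 *                           += 0x7fffffffffffffff -> UINT64_MAX
 */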
3549  
3550  static int64_t coroutine_fn GRAPH_RDLOCK
3551  get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3552  {
3553      BDRVQcow2State *s = bs->opaque;
3554      uint32_t index = offset_to_reftable_index(s, offset);
3555      int64_t covering_refblock_offset = 0;
3556  
3557      if (index < s->refcount_table_size) {
3558          covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3559      }
3560      if (!covering_refblock_offset) {
3561          qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3562                                  "not covered by the refcount structures",
3563                                  offset);
3564          return -EIO;
3565      }
3566  
3567      return covering_refblock_offset;
3568  }
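
/*
 * A coverage example for offset_to_reftable_index(), assuming 64 KiB
 * clusters and 16-bit refcounts: each refblock holds 65536 / 2 = 32768
 * entries, so one reftable entry covers 32768 * 64 KiB = 2 GiB of the
 * image file, and an offset of 5 GiB maps to reftable index 2.
 */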
3569  
3570  static int coroutine_fn GRAPH_RDLOCK
3571  qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
3572  {
3573      BDRVQcow2State *s = bs->opaque;
3574      int64_t refblock_offs;
3575      uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3576      uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3577      void *refblock;
3578      int ret;
3579  
3580      refblock_offs = get_refblock_offset(bs, discard_block_offs);
3581      if (refblock_offs < 0) {
3582          return refblock_offs;
3583      }
3584  
3585      assert(discard_block_offs != 0);
3586  
3587      ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3588                            &refblock);
3589      if (ret < 0) {
3590          return ret;
3591      }
3592  
3593      if (s->get_refcount(refblock, block_index) != 1) {
3594          qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3595                                  " refblock offset %#" PRIx64
3596                                  ", reftable index %u"
3597                                  ", block offset %#" PRIx64
3598                                  ", refcount %#" PRIx64,
3599                                  refblock_offs,
3600                                  offset_to_reftable_index(s, discard_block_offs),
3601                                  discard_block_offs,
3602                                  s->get_refcount(refblock, block_index));
3603          qcow2_cache_put(s->refcount_block_cache, &refblock);
3604          return -EINVAL;
3605      }
3606      s->set_refcount(refblock, block_index, 0);
3607  
3608      qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3609  
3610      qcow2_cache_put(s->refcount_block_cache, &refblock);
3611  
3612      if (cluster_index < s->free_cluster_index) {
3613          s->free_cluster_index = cluster_index;
3614      }
3615  
3616      refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3617                                             discard_block_offs);
3618      if (refblock) {
3619          /* discard the refblock from the cache if it is cached */
3620          qcow2_cache_discard(s->refcount_block_cache, refblock);
3621      }
3622      update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3623  
3624      return 0;
3625  }
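
/*
 * Index math example (64 KiB clusters, 16-bit refcounts): for
 * discard_block_offs == 0x80050000, cluster_index == 0x8005 (32773) and
 * block_index == 32773 & 32767 == 5, i.e. the refcount cleared above is
 * entry 5 of the covering refblock.
 */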
3626  
3627  int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
3628  {
3629      BDRVQcow2State *s = bs->opaque;
3630      uint64_t *reftable_tmp =
3631          g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3632      int i, ret;
3633  
3634      for (i = 0; i < s->refcount_table_size; i++) {
3635          int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3636          void *refblock;
3637          bool unused_block;
3638  
3639          if (refblock_offs == 0) {
3640              reftable_tmp[i] = 0;
3641              continue;
3642          }
3643          ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3644                                &refblock);
3645          if (ret < 0) {
3646              goto out;
3647          }
3648  
3649          /* the refblock covers its own cluster, i.e. it holds its own reference */
3650          if (i == offset_to_reftable_index(s, refblock_offs)) {
3651              uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3652                                     (s->refcount_block_size - 1);
3653              uint64_t refcount = s->get_refcount(refblock, block_index);
3654  
3655              s->set_refcount(refblock, block_index, 0);
3656  
3657              unused_block = buffer_is_zero(refblock, s->cluster_size);
3658  
3659              s->set_refcount(refblock, block_index, refcount);
3660          } else {
3661              unused_block = buffer_is_zero(refblock, s->cluster_size);
3662          }
3663          qcow2_cache_put(s->refcount_block_cache, &refblock);
3664  
3665          reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3666      }
3667  
3668      ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
3669                                s->refcount_table_size * REFTABLE_ENTRY_SIZE,
3670                                reftable_tmp, 0);
3671      /*
3672       * If the write to the reftable failed, the image may contain a partially
3673       * overwritten reftable. In this case it is better to clear the
3674       * reftable in memory to avoid possible image corruption.
3675       */
3676      for (i = 0; i < s->refcount_table_size; i++) {
3677          if (s->refcount_table[i] && !reftable_tmp[i]) {
3678              if (ret == 0) {
3679                  ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3680                                                         REFT_OFFSET_MASK);
3681              }
3682              s->refcount_table[i] = 0;
3683          }
3684      }
3685  
3686      if (!s->cache_discards) {
3687          qcow2_process_discards(bs, ret);
3688      }
3689  
3690  out:
3691      g_free(reftable_tmp);
3692      return ret;
3693  }
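
/*
 * The set-to-zero/restore dance above handles refblocks that cover their
 * own cluster: such a refblock always holds at least its own reference,
 * so it only counts as unused if it is zero everywhere else. Temporarily
 * clearing its own entry makes the buffer_is_zero() test uniform for both
 * cases.
 */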
3694  
3695  int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3696  {
3697      BDRVQcow2State *s = bs->opaque;
3698      int64_t i;
3699  
3700      for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3701          uint64_t refcount;
3702          int ret = qcow2_get_refcount(bs, i, &refcount);
3703          if (ret < 0) {
3704              fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3705                      i, strerror(-ret));
3706              return ret;
3707          }
3708          if (refcount > 0) {
3709              return i;
3710          }
3711      }
3712      qcow2_signal_corruption(bs, true, -1, -1,
3713                              "There are no references in the refcount table.");
3714      return -EIO;
3715  }
3716  
3717  int coroutine_fn GRAPH_RDLOCK
3718  qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3719  {
3720      BDRVQcow2State *s = bs->opaque;
3721      int64_t i, end_cluster, cluster_count = 0, threshold;
3722      int64_t file_length, real_allocation, real_clusters;
3723  
3724      qemu_co_mutex_assert_locked(&s->lock);
3725  
3726      file_length = bdrv_co_getlength(bs->file->bs);
3727      if (file_length < 0) {
3728          return file_length;
3729      }
3730  
3731      real_allocation = bdrv_co_get_allocated_file_size(bs->file->bs);
3732      if (real_allocation < 0) {
3733          return real_allocation;
3734      }
3735  
3736      real_clusters = real_allocation / s->cluster_size;
3737      threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3738  
3739      end_cluster = size_to_clusters(s, file_length);
3740      for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3741          uint64_t refcount;
3742          int ret = qcow2_get_refcount(bs, i, &refcount);
3743          if (ret < 0) {
3744              return ret;
3745          }
3746          cluster_count += !!refcount;
3747      }
3748  
3749      return cluster_count >= threshold;
3750  }
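
/*
 * Threshold example: for real_clusters == 90, threshold ==
 * MAX(90 * 10 / 9, 92) == 100, so finding at least 100 referenced
 * clusters while only 90 clusters' worth of data is physically allocated
 * indicates that the metadata was preallocated on an otherwise sparse
 * file.
 */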
3751