xref: /qemu/block/qcow2.c (revision 5ea929e3d13622e5d06ca8795819f1590644cda2)
1585f8587Sbellard /*
2585f8587Sbellard  * Block driver for the QCOW version 2 format
3585f8587Sbellard  *
4585f8587Sbellard  * Copyright (c) 2004-2006 Fabrice Bellard
5585f8587Sbellard  *
6585f8587Sbellard  * Permission is hereby granted, free of charge, to any person obtaining a copy
7585f8587Sbellard  * of this software and associated documentation files (the "Software"), to deal
8585f8587Sbellard  * in the Software without restriction, including without limitation the rights
9585f8587Sbellard  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10585f8587Sbellard  * copies of the Software, and to permit persons to whom the Software is
11585f8587Sbellard  * furnished to do so, subject to the following conditions:
12585f8587Sbellard  *
13585f8587Sbellard  * The above copyright notice and this permission notice shall be included in
14585f8587Sbellard  * all copies or substantial portions of the Software.
15585f8587Sbellard  *
16585f8587Sbellard  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17585f8587Sbellard  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18585f8587Sbellard  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19585f8587Sbellard  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20585f8587Sbellard  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21585f8587Sbellard  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22585f8587Sbellard  * THE SOFTWARE.
23585f8587Sbellard  */
24faf07963Spbrook #include "qemu-common.h"
25585f8587Sbellard #include "block_int.h"
265efa9d5aSAnthony Liguori #include "module.h"
27585f8587Sbellard #include <zlib.h>
28585f8587Sbellard #include "aes.h"
29f7d0fe02SKevin Wolf #include "block/qcow2.h"
30a9420734SKevin Wolf #include "qemu-error.h"
31585f8587Sbellard 
32585f8587Sbellard /*
33585f8587Sbellard   Differences with QCOW:
34585f8587Sbellard 
35585f8587Sbellard   - Support for multiple incremental snapshots.
36585f8587Sbellard   - Memory management by reference counts.
37585f8587Sbellard   - Clusters which have a reference count of one have the bit
38585f8587Sbellard     QCOW_OFLAG_COPIED to optimize write performance.
39585f8587Sbellard   - Size of compressed clusters is stored in sectors to reduce bit usage
40585f8587Sbellard     in the cluster offsets.
41585f8587Sbellard   - Support for storing additional data (such as the VM state) in the
42585f8587Sbellard     snapshots.
43585f8587Sbellard   - If a backing store is used, the cluster size is not constrained
44585f8587Sbellard     (could be backported to QCOW).
45585f8587Sbellard   - L2 tables have always a size of one cluster.
46585f8587Sbellard */
47585f8587Sbellard 
489b80ddf3Saliguori 
/*
 * Descriptor that precedes every header extension stored after the fixed
 * image header.  Both fields are stored big-endian on disk and are
 * byte-swapped by the reader before use.
 */
typedef struct {
    uint32_t magic;     /* identifies the kind of extension */
    uint32_t len;       /* number of payload bytes following the descriptor */
} QCowExtension;

/* Extension magic values this driver knows about */
#define QCOW2_EXT_MAGIC_END             0           /* stops extension parsing */
#define QCOW2_EXT_MAGIC_BACKING_FORMAT  0xE2792ACA  /* backing format name */
559b80ddf3Saliguori 
567c80ab3fSJes Sorensen static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
57585f8587Sbellard {
58585f8587Sbellard     const QCowHeader *cow_header = (const void *)buf;
59585f8587Sbellard 
60585f8587Sbellard     if (buf_size >= sizeof(QCowHeader) &&
61585f8587Sbellard         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
62585f8587Sbellard         be32_to_cpu(cow_header->version) == QCOW_VERSION)
63585f8587Sbellard         return 100;
64585f8587Sbellard     else
65585f8587Sbellard         return 0;
66585f8587Sbellard }
67585f8587Sbellard 
689b80ddf3Saliguori 
699b80ddf3Saliguori /*
709b80ddf3Saliguori  * read qcow2 extension and fill bs
719b80ddf3Saliguori  * start reading from start_offset
729b80ddf3Saliguori  * finish reading upon magic of value 0 or when end_offset reached
739b80ddf3Saliguori  * unknown magic is skipped (future extension this version knows nothing about)
749b80ddf3Saliguori  * return 0 upon success, non-0 otherwise
759b80ddf3Saliguori  */
767c80ab3fSJes Sorensen static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
779b80ddf3Saliguori                                  uint64_t end_offset)
789b80ddf3Saliguori {
799b80ddf3Saliguori     QCowExtension ext;
809b80ddf3Saliguori     uint64_t offset;
819b80ddf3Saliguori 
829b80ddf3Saliguori #ifdef DEBUG_EXT
837c80ab3fSJes Sorensen     printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
849b80ddf3Saliguori #endif
859b80ddf3Saliguori     offset = start_offset;
869b80ddf3Saliguori     while (offset < end_offset) {
879b80ddf3Saliguori 
889b80ddf3Saliguori #ifdef DEBUG_EXT
899b80ddf3Saliguori         /* Sanity check */
909b80ddf3Saliguori         if (offset > s->cluster_size)
917c80ab3fSJes Sorensen             printf("qcow2_read_extension: suspicious offset %lu\n", offset);
929b80ddf3Saliguori 
939b80ddf3Saliguori         printf("attemting to read extended header in offset %lu\n", offset);
949b80ddf3Saliguori #endif
959b80ddf3Saliguori 
9666f82ceeSKevin Wolf         if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
977c80ab3fSJes Sorensen             fprintf(stderr, "qcow2_read_extension: ERROR: "
980bfcd599SBlue Swirl                     "pread fail from offset %" PRIu64 "\n",
990bfcd599SBlue Swirl                     offset);
1009b80ddf3Saliguori             return 1;
1019b80ddf3Saliguori         }
1029b80ddf3Saliguori         be32_to_cpus(&ext.magic);
1039b80ddf3Saliguori         be32_to_cpus(&ext.len);
1049b80ddf3Saliguori         offset += sizeof(ext);
1059b80ddf3Saliguori #ifdef DEBUG_EXT
1069b80ddf3Saliguori         printf("ext.magic = 0x%x\n", ext.magic);
1079b80ddf3Saliguori #endif
1089b80ddf3Saliguori         switch (ext.magic) {
1097c80ab3fSJes Sorensen         case QCOW2_EXT_MAGIC_END:
1109b80ddf3Saliguori             return 0;
111f965509cSaliguori 
1127c80ab3fSJes Sorensen         case QCOW2_EXT_MAGIC_BACKING_FORMAT:
113f965509cSaliguori             if (ext.len >= sizeof(bs->backing_format)) {
114f965509cSaliguori                 fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
1154c978075Saliguori                         " (>=%zu)\n",
116f965509cSaliguori                         ext.len, sizeof(bs->backing_format));
117f965509cSaliguori                 return 2;
118f965509cSaliguori             }
11966f82ceeSKevin Wolf             if (bdrv_pread(bs->file, offset , bs->backing_format,
120f965509cSaliguori                            ext.len) != ext.len)
121f965509cSaliguori                 return 3;
122f965509cSaliguori             bs->backing_format[ext.len] = '\0';
123f965509cSaliguori #ifdef DEBUG_EXT
124f965509cSaliguori             printf("Qcow2: Got format extension %s\n", bs->backing_format);
125f965509cSaliguori #endif
126e1c7f0e3SKevin Wolf             offset = ((offset + ext.len + 7) & ~7);
127f965509cSaliguori             break;
128f965509cSaliguori 
1299b80ddf3Saliguori         default:
1309b80ddf3Saliguori             /* unknown magic -- just skip it */
131e1c7f0e3SKevin Wolf             offset = ((offset + ext.len + 7) & ~7);
1329b80ddf3Saliguori             break;
1339b80ddf3Saliguori         }
1349b80ddf3Saliguori     }
1359b80ddf3Saliguori 
1369b80ddf3Saliguori     return 0;
1379b80ddf3Saliguori }
1389b80ddf3Saliguori 
1399b80ddf3Saliguori 
1407c80ab3fSJes Sorensen static int qcow2_open(BlockDriverState *bs, int flags)
141585f8587Sbellard {
142585f8587Sbellard     BDRVQcowState *s = bs->opaque;
1436d85a57eSJes Sorensen     int len, i, ret = 0;
144585f8587Sbellard     QCowHeader header;
1459b80ddf3Saliguori     uint64_t ext_end;
14629c1a730SKevin Wolf     bool writethrough;
147585f8587Sbellard 
1486d85a57eSJes Sorensen     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
1496d85a57eSJes Sorensen     if (ret < 0) {
150585f8587Sbellard         goto fail;
1516d85a57eSJes Sorensen     }
152585f8587Sbellard     be32_to_cpus(&header.magic);
153585f8587Sbellard     be32_to_cpus(&header.version);
154585f8587Sbellard     be64_to_cpus(&header.backing_file_offset);
155585f8587Sbellard     be32_to_cpus(&header.backing_file_size);
156585f8587Sbellard     be64_to_cpus(&header.size);
157585f8587Sbellard     be32_to_cpus(&header.cluster_bits);
158585f8587Sbellard     be32_to_cpus(&header.crypt_method);
159585f8587Sbellard     be64_to_cpus(&header.l1_table_offset);
160585f8587Sbellard     be32_to_cpus(&header.l1_size);
161585f8587Sbellard     be64_to_cpus(&header.refcount_table_offset);
162585f8587Sbellard     be32_to_cpus(&header.refcount_table_clusters);
163585f8587Sbellard     be64_to_cpus(&header.snapshots_offset);
164585f8587Sbellard     be32_to_cpus(&header.nb_snapshots);
165585f8587Sbellard 
1666d85a57eSJes Sorensen     if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION) {
1676d85a57eSJes Sorensen         ret = -EINVAL;
168585f8587Sbellard         goto fail;
1696d85a57eSJes Sorensen     }
170d191d12dSStefan Weil     if (header.cluster_bits < MIN_CLUSTER_BITS ||
1716d85a57eSJes Sorensen         header.cluster_bits > MAX_CLUSTER_BITS) {
1726d85a57eSJes Sorensen         ret = -EINVAL;
173585f8587Sbellard         goto fail;
1746d85a57eSJes Sorensen     }
1756d85a57eSJes Sorensen     if (header.crypt_method > QCOW_CRYPT_AES) {
1766d85a57eSJes Sorensen         ret = -EINVAL;
177585f8587Sbellard         goto fail;
1786d85a57eSJes Sorensen     }
179585f8587Sbellard     s->crypt_method_header = header.crypt_method;
1806d85a57eSJes Sorensen     if (s->crypt_method_header) {
181585f8587Sbellard         bs->encrypted = 1;
1826d85a57eSJes Sorensen     }
183585f8587Sbellard     s->cluster_bits = header.cluster_bits;
184585f8587Sbellard     s->cluster_size = 1 << s->cluster_bits;
185585f8587Sbellard     s->cluster_sectors = 1 << (s->cluster_bits - 9);
186585f8587Sbellard     s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
187585f8587Sbellard     s->l2_size = 1 << s->l2_bits;
188585f8587Sbellard     bs->total_sectors = header.size / 512;
189585f8587Sbellard     s->csize_shift = (62 - (s->cluster_bits - 8));
190585f8587Sbellard     s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
191585f8587Sbellard     s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
192585f8587Sbellard     s->refcount_table_offset = header.refcount_table_offset;
193585f8587Sbellard     s->refcount_table_size =
194585f8587Sbellard         header.refcount_table_clusters << (s->cluster_bits - 3);
195585f8587Sbellard 
196585f8587Sbellard     s->snapshots_offset = header.snapshots_offset;
197585f8587Sbellard     s->nb_snapshots = header.nb_snapshots;
198585f8587Sbellard 
199585f8587Sbellard     /* read the level 1 table */
200585f8587Sbellard     s->l1_size = header.l1_size;
201419b19d9SStefan Hajnoczi     s->l1_vm_state_index = size_to_l1(s, header.size);
202585f8587Sbellard     /* the L1 table must contain at least enough entries to put
203585f8587Sbellard        header.size bytes */
2046d85a57eSJes Sorensen     if (s->l1_size < s->l1_vm_state_index) {
2056d85a57eSJes Sorensen         ret = -EINVAL;
206585f8587Sbellard         goto fail;
2076d85a57eSJes Sorensen     }
208585f8587Sbellard     s->l1_table_offset = header.l1_table_offset;
209d191d12dSStefan Weil     if (s->l1_size > 0) {
2103f6a3ee5SKevin Wolf         s->l1_table = qemu_mallocz(
2113f6a3ee5SKevin Wolf             align_offset(s->l1_size * sizeof(uint64_t), 512));
2126d85a57eSJes Sorensen         ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
2136d85a57eSJes Sorensen                          s->l1_size * sizeof(uint64_t));
2146d85a57eSJes Sorensen         if (ret < 0) {
215585f8587Sbellard             goto fail;
2166d85a57eSJes Sorensen         }
217585f8587Sbellard         for(i = 0;i < s->l1_size; i++) {
218585f8587Sbellard             be64_to_cpus(&s->l1_table[i]);
219585f8587Sbellard         }
220d191d12dSStefan Weil     }
22129c1a730SKevin Wolf 
22229c1a730SKevin Wolf     /* alloc L2 table/refcount block cache */
22329c1a730SKevin Wolf     writethrough = ((flags & BDRV_O_CACHE_MASK) == 0);
22429c1a730SKevin Wolf     s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE, writethrough);
22529c1a730SKevin Wolf     s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE,
22629c1a730SKevin Wolf         writethrough);
22729c1a730SKevin Wolf 
228585f8587Sbellard     s->cluster_cache = qemu_malloc(s->cluster_size);
229585f8587Sbellard     /* one more sector for decompressed data alignment */
230095a9c58Saliguori     s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
231095a9c58Saliguori                                   + 512);
232585f8587Sbellard     s->cluster_cache_offset = -1;
233585f8587Sbellard 
2346d85a57eSJes Sorensen     ret = qcow2_refcount_init(bs);
2356d85a57eSJes Sorensen     if (ret != 0) {
236585f8587Sbellard         goto fail;
2376d85a57eSJes Sorensen     }
238585f8587Sbellard 
23972cf2d4fSBlue Swirl     QLIST_INIT(&s->cluster_allocs);
240f214978aSKevin Wolf 
2419b80ddf3Saliguori     /* read qcow2 extensions */
2426d85a57eSJes Sorensen     if (header.backing_file_offset) {
2439b80ddf3Saliguori         ext_end = header.backing_file_offset;
2446d85a57eSJes Sorensen     } else {
2459b80ddf3Saliguori         ext_end = s->cluster_size;
2466d85a57eSJes Sorensen     }
2476d85a57eSJes Sorensen     if (qcow2_read_extensions(bs, sizeof(header), ext_end)) {
2486d85a57eSJes Sorensen         ret = -EINVAL;
2499b80ddf3Saliguori         goto fail;
2506d85a57eSJes Sorensen     }
2519b80ddf3Saliguori 
252585f8587Sbellard     /* read the backing file name */
253585f8587Sbellard     if (header.backing_file_offset != 0) {
254585f8587Sbellard         len = header.backing_file_size;
2556d85a57eSJes Sorensen         if (len > 1023) {
256585f8587Sbellard             len = 1023;
2576d85a57eSJes Sorensen         }
2586d85a57eSJes Sorensen         ret = bdrv_pread(bs->file, header.backing_file_offset,
2596d85a57eSJes Sorensen                          bs->backing_file, len);
2606d85a57eSJes Sorensen         if (ret < 0) {
261585f8587Sbellard             goto fail;
2626d85a57eSJes Sorensen         }
263585f8587Sbellard         bs->backing_file[len] = '\0';
264585f8587Sbellard     }
2656d85a57eSJes Sorensen     if (qcow2_read_snapshots(bs) < 0) {
2666d85a57eSJes Sorensen         ret = -EINVAL;
267585f8587Sbellard         goto fail;
2686d85a57eSJes Sorensen     }
269585f8587Sbellard 
270585f8587Sbellard #ifdef DEBUG_ALLOC
27114899cdfSFilip Navara     qcow2_check_refcounts(bs);
272585f8587Sbellard #endif
2736d85a57eSJes Sorensen     return ret;
274585f8587Sbellard 
275585f8587Sbellard  fail:
276ed6ccf0fSKevin Wolf     qcow2_free_snapshots(bs);
277ed6ccf0fSKevin Wolf     qcow2_refcount_close(bs);
278585f8587Sbellard     qemu_free(s->l1_table);
27929c1a730SKevin Wolf     if (s->l2_table_cache) {
28029c1a730SKevin Wolf         qcow2_cache_destroy(bs, s->l2_table_cache);
28129c1a730SKevin Wolf     }
282585f8587Sbellard     qemu_free(s->cluster_cache);
283585f8587Sbellard     qemu_free(s->cluster_data);
2846d85a57eSJes Sorensen     return ret;
285585f8587Sbellard }
286585f8587Sbellard 
2877c80ab3fSJes Sorensen static int qcow2_set_key(BlockDriverState *bs, const char *key)
288585f8587Sbellard {
289585f8587Sbellard     BDRVQcowState *s = bs->opaque;
290585f8587Sbellard     uint8_t keybuf[16];
291585f8587Sbellard     int len, i;
292585f8587Sbellard 
293585f8587Sbellard     memset(keybuf, 0, 16);
294585f8587Sbellard     len = strlen(key);
295585f8587Sbellard     if (len > 16)
296585f8587Sbellard         len = 16;
297585f8587Sbellard     /* XXX: we could compress the chars to 7 bits to increase
298585f8587Sbellard        entropy */
299585f8587Sbellard     for(i = 0;i < len;i++) {
300585f8587Sbellard         keybuf[i] = key[i];
301585f8587Sbellard     }
302585f8587Sbellard     s->crypt_method = s->crypt_method_header;
303585f8587Sbellard 
304585f8587Sbellard     if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
305585f8587Sbellard         return -1;
306585f8587Sbellard     if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
307585f8587Sbellard         return -1;
308585f8587Sbellard #if 0
309585f8587Sbellard     /* test */
310585f8587Sbellard     {
311585f8587Sbellard         uint8_t in[16];
312585f8587Sbellard         uint8_t out[16];
313585f8587Sbellard         uint8_t tmp[16];
314585f8587Sbellard         for(i=0;i<16;i++)
315585f8587Sbellard             in[i] = i;
316585f8587Sbellard         AES_encrypt(in, tmp, &s->aes_encrypt_key);
317585f8587Sbellard         AES_decrypt(tmp, out, &s->aes_decrypt_key);
318585f8587Sbellard         for(i = 0; i < 16; i++)
319585f8587Sbellard             printf(" %02x", tmp[i]);
320585f8587Sbellard         printf("\n");
321585f8587Sbellard         for(i = 0; i < 16; i++)
322585f8587Sbellard             printf(" %02x", out[i]);
323585f8587Sbellard         printf("\n");
324585f8587Sbellard     }
325585f8587Sbellard #endif
326585f8587Sbellard     return 0;
327585f8587Sbellard }
328585f8587Sbellard 
3297c80ab3fSJes Sorensen static int qcow2_is_allocated(BlockDriverState *bs, int64_t sector_num,
330585f8587Sbellard                               int nb_sectors, int *pnum)
331585f8587Sbellard {
332585f8587Sbellard     uint64_t cluster_offset;
3331c46efaaSKevin Wolf     int ret;
334585f8587Sbellard 
335095a9c58Saliguori     *pnum = nb_sectors;
3361c46efaaSKevin Wolf     /* FIXME We can get errors here, but the bdrv_is_allocated interface can't
3371c46efaaSKevin Wolf      * pass them on today */
3381c46efaaSKevin Wolf     ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
3391c46efaaSKevin Wolf     if (ret < 0) {
3401c46efaaSKevin Wolf         *pnum = 0;
3411c46efaaSKevin Wolf     }
342095a9c58Saliguori 
343585f8587Sbellard     return (cluster_offset != 0);
344585f8587Sbellard }
345585f8587Sbellard 
346a9465922Sbellard /* handle reading after the end of the backing file */
347bd28f835SKevin Wolf int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
348bd28f835SKevin Wolf                   int64_t sector_num, int nb_sectors)
349a9465922Sbellard {
350a9465922Sbellard     int n1;
351a9465922Sbellard     if ((sector_num + nb_sectors) <= bs->total_sectors)
352a9465922Sbellard         return nb_sectors;
353a9465922Sbellard     if (sector_num >= bs->total_sectors)
354a9465922Sbellard         n1 = 0;
355a9465922Sbellard     else
356a9465922Sbellard         n1 = bs->total_sectors - sector_num;
357bd28f835SKevin Wolf 
358bd28f835SKevin Wolf     qemu_iovec_memset(qiov, 0, 512 * (nb_sectors - n1));
359bd28f835SKevin Wolf 
360a9465922Sbellard     return n1;
361a9465922Sbellard }
362a9465922Sbellard 
363ce1a14dcSpbrook typedef struct QCowAIOCB {
364ce1a14dcSpbrook     BlockDriverAIOCB common;
365585f8587Sbellard     int64_t sector_num;
366f141eafeSaliguori     QEMUIOVector *qiov;
3677b88e48bSChristoph Hellwig     int remaining_sectors;
3687b88e48bSChristoph Hellwig     int cur_nr_sectors;	/* number of sectors in current iteration */
369bd28f835SKevin Wolf     uint64_t bytes_done;
370585f8587Sbellard     uint64_t cluster_offset;
371585f8587Sbellard     uint8_t *cluster_data;
372585f8587Sbellard     BlockDriverAIOCB *hd_aiocb;
373c87c0672Saliguori     QEMUIOVector hd_qiov;
3741490791fSaliguori     QEMUBH *bh;
375e976c6a1Saliguori     QCowL2Meta l2meta;
37672cf2d4fSBlue Swirl     QLIST_ENTRY(QCowAIOCB) next_depend;
377585f8587Sbellard } QCowAIOCB;
378585f8587Sbellard 
3797c80ab3fSJes Sorensen static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb)
380c16b5a2cSChristoph Hellwig {
381b666d239SKevin Wolf     QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
382c16b5a2cSChristoph Hellwig     if (acb->hd_aiocb)
383c16b5a2cSChristoph Hellwig         bdrv_aio_cancel(acb->hd_aiocb);
384c16b5a2cSChristoph Hellwig     qemu_aio_release(acb);
385c16b5a2cSChristoph Hellwig }
386c16b5a2cSChristoph Hellwig 
3877c80ab3fSJes Sorensen static AIOPool qcow2_aio_pool = {
388c16b5a2cSChristoph Hellwig     .aiocb_size         = sizeof(QCowAIOCB),
3897c80ab3fSJes Sorensen     .cancel             = qcow2_aio_cancel,
390c16b5a2cSChristoph Hellwig };
391c16b5a2cSChristoph Hellwig 
3927c80ab3fSJes Sorensen static void qcow2_aio_read_cb(void *opaque, int ret);
3937c80ab3fSJes Sorensen static void qcow2_aio_read_bh(void *opaque)
3941490791fSaliguori {
3951490791fSaliguori     QCowAIOCB *acb = opaque;
3961490791fSaliguori     qemu_bh_delete(acb->bh);
3971490791fSaliguori     acb->bh = NULL;
3987c80ab3fSJes Sorensen     qcow2_aio_read_cb(opaque, 0);
3991490791fSaliguori }
4001490791fSaliguori 
4017c80ab3fSJes Sorensen static int qcow2_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
402a32ef786Saliguori {
403a32ef786Saliguori     if (acb->bh)
404a32ef786Saliguori         return -EIO;
405a32ef786Saliguori 
406a32ef786Saliguori     acb->bh = qemu_bh_new(cb, acb);
407a32ef786Saliguori     if (!acb->bh)
408a32ef786Saliguori         return -EIO;
409a32ef786Saliguori 
410a32ef786Saliguori     qemu_bh_schedule(acb->bh);
411a32ef786Saliguori 
412a32ef786Saliguori     return 0;
413a32ef786Saliguori }
414a32ef786Saliguori 
4157c80ab3fSJes Sorensen static void qcow2_aio_read_cb(void *opaque, int ret)
416585f8587Sbellard {
417ce1a14dcSpbrook     QCowAIOCB *acb = opaque;
418ce1a14dcSpbrook     BlockDriverState *bs = acb->common.bs;
419585f8587Sbellard     BDRVQcowState *s = bs->opaque;
420a9465922Sbellard     int index_in_cluster, n1;
421585f8587Sbellard 
422ce1a14dcSpbrook     acb->hd_aiocb = NULL;
423f141eafeSaliguori     if (ret < 0)
424f141eafeSaliguori         goto done;
425585f8587Sbellard 
426585f8587Sbellard     /* post process the read buffer */
427ce1a14dcSpbrook     if (!acb->cluster_offset) {
428585f8587Sbellard         /* nothing to do */
429ce1a14dcSpbrook     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
430585f8587Sbellard         /* nothing to do */
431585f8587Sbellard     } else {
432585f8587Sbellard         if (s->crypt_method) {
433bd28f835SKevin Wolf             qcow2_encrypt_sectors(s, acb->sector_num,  acb->cluster_data,
434bd28f835SKevin Wolf                 acb->cluster_data, acb->cur_nr_sectors, 0, &s->aes_decrypt_key);
435bd28f835SKevin Wolf             qemu_iovec_reset(&acb->hd_qiov);
436bd28f835SKevin Wolf             qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
437bd28f835SKevin Wolf                 acb->cur_nr_sectors * 512);
438bd28f835SKevin Wolf             qemu_iovec_from_buffer(&acb->hd_qiov, acb->cluster_data,
439bd28f835SKevin Wolf                 512 * acb->cur_nr_sectors);
440585f8587Sbellard         }
441585f8587Sbellard     }
442585f8587Sbellard 
4437b88e48bSChristoph Hellwig     acb->remaining_sectors -= acb->cur_nr_sectors;
4447b88e48bSChristoph Hellwig     acb->sector_num += acb->cur_nr_sectors;
445bd28f835SKevin Wolf     acb->bytes_done += acb->cur_nr_sectors * 512;
446585f8587Sbellard 
4477b88e48bSChristoph Hellwig     if (acb->remaining_sectors == 0) {
448585f8587Sbellard         /* request completed */
449f141eafeSaliguori         ret = 0;
450f141eafeSaliguori         goto done;
451585f8587Sbellard     }
452585f8587Sbellard 
453585f8587Sbellard     /* prepare next AIO request */
4547b88e48bSChristoph Hellwig     acb->cur_nr_sectors = acb->remaining_sectors;
455bd28f835SKevin Wolf     if (s->crypt_method) {
456bd28f835SKevin Wolf         acb->cur_nr_sectors = MIN(acb->cur_nr_sectors,
457bd28f835SKevin Wolf             QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
458bd28f835SKevin Wolf     }
459bd28f835SKevin Wolf 
4601c46efaaSKevin Wolf     ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9,
4611c46efaaSKevin Wolf         &acb->cur_nr_sectors, &acb->cluster_offset);
4621c46efaaSKevin Wolf     if (ret < 0) {
4631c46efaaSKevin Wolf         goto done;
4641c46efaaSKevin Wolf     }
4651c46efaaSKevin Wolf 
466095a9c58Saliguori     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
467585f8587Sbellard 
468bd28f835SKevin Wolf     qemu_iovec_reset(&acb->hd_qiov);
469bd28f835SKevin Wolf     qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
470bd28f835SKevin Wolf         acb->cur_nr_sectors * 512);
471bd28f835SKevin Wolf 
472ce1a14dcSpbrook     if (!acb->cluster_offset) {
473bd28f835SKevin Wolf 
474585f8587Sbellard         if (bs->backing_hd) {
475585f8587Sbellard             /* read from the base image */
476bd28f835SKevin Wolf             n1 = qcow2_backing_read1(bs->backing_hd, &acb->hd_qiov,
477bd28f835SKevin Wolf                 acb->sector_num, acb->cur_nr_sectors);
478a9465922Sbellard             if (n1 > 0) {
47966f82ceeSKevin Wolf                 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
480c87c0672Saliguori                 acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
4817b88e48bSChristoph Hellwig                                     &acb->hd_qiov, acb->cur_nr_sectors,
4827c80ab3fSJes Sorensen 				    qcow2_aio_read_cb, acb);
483ce1a14dcSpbrook                 if (acb->hd_aiocb == NULL)
484f141eafeSaliguori                     goto done;
485585f8587Sbellard             } else {
4867c80ab3fSJes Sorensen                 ret = qcow2_schedule_bh(qcow2_aio_read_bh, acb);
487a32ef786Saliguori                 if (ret < 0)
488f141eafeSaliguori                     goto done;
4891490791fSaliguori             }
490a9465922Sbellard         } else {
491585f8587Sbellard             /* Note: in this case, no need to wait */
492bd28f835SKevin Wolf             qemu_iovec_memset(&acb->hd_qiov, 0, 512 * acb->cur_nr_sectors);
4937c80ab3fSJes Sorensen             ret = qcow2_schedule_bh(qcow2_aio_read_bh, acb);
494a32ef786Saliguori             if (ret < 0)
495f141eafeSaliguori                 goto done;
4961490791fSaliguori         }
497ce1a14dcSpbrook     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
498585f8587Sbellard         /* add AIO support for compressed blocks ? */
49966f82ceeSKevin Wolf         if (qcow2_decompress_cluster(bs, acb->cluster_offset) < 0)
500f141eafeSaliguori             goto done;
501bd28f835SKevin Wolf 
502bd28f835SKevin Wolf         qemu_iovec_from_buffer(&acb->hd_qiov,
503bd28f835SKevin Wolf             s->cluster_cache + index_in_cluster * 512,
5047b88e48bSChristoph Hellwig             512 * acb->cur_nr_sectors);
505bd28f835SKevin Wolf 
5067c80ab3fSJes Sorensen         ret = qcow2_schedule_bh(qcow2_aio_read_bh, acb);
507a32ef786Saliguori         if (ret < 0)
508f141eafeSaliguori             goto done;
509585f8587Sbellard     } else {
510ce1a14dcSpbrook         if ((acb->cluster_offset & 511) != 0) {
511585f8587Sbellard             ret = -EIO;
512f141eafeSaliguori             goto done;
513585f8587Sbellard         }
514c87c0672Saliguori 
515bd28f835SKevin Wolf         if (s->crypt_method) {
516bd28f835SKevin Wolf             /*
517bd28f835SKevin Wolf              * For encrypted images, read everything into a temporary
518bd28f835SKevin Wolf              * contiguous buffer on which the AES functions can work.
519bd28f835SKevin Wolf              */
520bd28f835SKevin Wolf             if (!acb->cluster_data) {
521bd28f835SKevin Wolf                 acb->cluster_data =
522bd28f835SKevin Wolf                     qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
523bd28f835SKevin Wolf             }
524bd28f835SKevin Wolf 
525bd28f835SKevin Wolf             assert(acb->cur_nr_sectors <=
526bd28f835SKevin Wolf                 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
527bd28f835SKevin Wolf             qemu_iovec_reset(&acb->hd_qiov);
528bd28f835SKevin Wolf             qemu_iovec_add(&acb->hd_qiov, acb->cluster_data,
529bd28f835SKevin Wolf                 512 * acb->cur_nr_sectors);
530bd28f835SKevin Wolf         }
531bd28f835SKevin Wolf 
53266f82ceeSKevin Wolf         BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
53366f82ceeSKevin Wolf         acb->hd_aiocb = bdrv_aio_readv(bs->file,
534ce1a14dcSpbrook                             (acb->cluster_offset >> 9) + index_in_cluster,
5357b88e48bSChristoph Hellwig                             &acb->hd_qiov, acb->cur_nr_sectors,
5367c80ab3fSJes Sorensen                             qcow2_aio_read_cb, acb);
537171e3d6bSKevin Wolf         if (acb->hd_aiocb == NULL) {
538171e3d6bSKevin Wolf             ret = -EIO;
539f141eafeSaliguori             goto done;
540585f8587Sbellard         }
541171e3d6bSKevin Wolf     }
542f141eafeSaliguori 
543f141eafeSaliguori     return;
544f141eafeSaliguori done:
545f141eafeSaliguori     acb->common.cb(acb->common.opaque, ret);
546bd28f835SKevin Wolf     qemu_iovec_destroy(&acb->hd_qiov);
547f141eafeSaliguori     qemu_aio_release(acb);
548585f8587Sbellard }
549585f8587Sbellard 
5507c80ab3fSJes Sorensen static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
5517c80ab3fSJes Sorensen                                   QEMUIOVector *qiov, int nb_sectors,
5527c80ab3fSJes Sorensen                                   BlockDriverCompletionFunc *cb,
5537c80ab3fSJes Sorensen                                   void *opaque, int is_write)
554585f8587Sbellard {
555ce1a14dcSpbrook     QCowAIOCB *acb;
556585f8587Sbellard 
5577c80ab3fSJes Sorensen     acb = qemu_aio_get(&qcow2_aio_pool, bs, cb, opaque);
558ce1a14dcSpbrook     if (!acb)
559ce1a14dcSpbrook         return NULL;
560ce1a14dcSpbrook     acb->hd_aiocb = NULL;
561ce1a14dcSpbrook     acb->sector_num = sector_num;
562f141eafeSaliguori     acb->qiov = qiov;
563bd28f835SKevin Wolf 
564bd28f835SKevin Wolf     qemu_iovec_init(&acb->hd_qiov, qiov->niov);
565bd28f835SKevin Wolf 
566bd28f835SKevin Wolf     acb->bytes_done = 0;
5677b88e48bSChristoph Hellwig     acb->remaining_sectors = nb_sectors;
5687b88e48bSChristoph Hellwig     acb->cur_nr_sectors = 0;
569ce1a14dcSpbrook     acb->cluster_offset = 0;
570e976c6a1Saliguori     acb->l2meta.nb_clusters = 0;
57172cf2d4fSBlue Swirl     QLIST_INIT(&acb->l2meta.dependent_requests);
572ce1a14dcSpbrook     return acb;
573ce1a14dcSpbrook }
574ce1a14dcSpbrook 
5757c80ab3fSJes Sorensen static BlockDriverAIOCB *qcow2_aio_readv(BlockDriverState *bs,
5767c80ab3fSJes Sorensen                                          int64_t sector_num,
5777c80ab3fSJes Sorensen                                          QEMUIOVector *qiov, int nb_sectors,
5787c80ab3fSJes Sorensen                                          BlockDriverCompletionFunc *cb,
5797c80ab3fSJes Sorensen                                          void *opaque)
580ce1a14dcSpbrook {
581ce1a14dcSpbrook     QCowAIOCB *acb;
582ce1a14dcSpbrook 
5837c80ab3fSJes Sorensen     acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
584ce1a14dcSpbrook     if (!acb)
585ce1a14dcSpbrook         return NULL;
586585f8587Sbellard 
5877c80ab3fSJes Sorensen     qcow2_aio_read_cb(acb, 0);
588ce1a14dcSpbrook     return &acb->common;
589585f8587Sbellard }
590585f8587Sbellard 
5917c80ab3fSJes Sorensen static void qcow2_aio_write_cb(void *opaque, int ret);
592f214978aSKevin Wolf 
593f214978aSKevin Wolf static void run_dependent_requests(QCowL2Meta *m)
594f214978aSKevin Wolf {
595f214978aSKevin Wolf     QCowAIOCB *req;
596f214978aSKevin Wolf     QCowAIOCB *next;
597f214978aSKevin Wolf 
598f214978aSKevin Wolf     /* Take the request off the list of running requests */
599f214978aSKevin Wolf     if (m->nb_clusters != 0) {
60072cf2d4fSBlue Swirl         QLIST_REMOVE(m, next_in_flight);
601f214978aSKevin Wolf     }
602f214978aSKevin Wolf 
603d4c146f0SStefan Hajnoczi     /* Restart all dependent requests */
604d4c146f0SStefan Hajnoczi     QLIST_FOREACH_SAFE(req, &m->dependent_requests, next_depend, next) {
6057c80ab3fSJes Sorensen         qcow2_aio_write_cb(req, 0);
606f214978aSKevin Wolf     }
607f214978aSKevin Wolf 
608f214978aSKevin Wolf     /* Empty the list for the next part of the request */
60972cf2d4fSBlue Swirl     QLIST_INIT(&m->dependent_requests);
610f214978aSKevin Wolf }
611f214978aSKevin Wolf 
6127c80ab3fSJes Sorensen static void qcow2_aio_write_cb(void *opaque, int ret)
613585f8587Sbellard {
614ce1a14dcSpbrook     QCowAIOCB *acb = opaque;
615ce1a14dcSpbrook     BlockDriverState *bs = acb->common.bs;
616585f8587Sbellard     BDRVQcowState *s = bs->opaque;
617585f8587Sbellard     int index_in_cluster;
618095a9c58Saliguori     int n_end;
619585f8587Sbellard 
620ce1a14dcSpbrook     acb->hd_aiocb = NULL;
621ce1a14dcSpbrook 
622f214978aSKevin Wolf     if (ret >= 0) {
623148da7eaSKevin Wolf         ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta);
624f214978aSKevin Wolf     }
625f214978aSKevin Wolf 
626f214978aSKevin Wolf     run_dependent_requests(&acb->l2meta);
627f214978aSKevin Wolf 
628f141eafeSaliguori     if (ret < 0)
629f141eafeSaliguori         goto done;
630585f8587Sbellard 
6317b88e48bSChristoph Hellwig     acb->remaining_sectors -= acb->cur_nr_sectors;
6327b88e48bSChristoph Hellwig     acb->sector_num += acb->cur_nr_sectors;
6336f5f060bSKevin Wolf     acb->bytes_done += acb->cur_nr_sectors * 512;
634585f8587Sbellard 
6357b88e48bSChristoph Hellwig     if (acb->remaining_sectors == 0) {
636585f8587Sbellard         /* request completed */
637f141eafeSaliguori         ret = 0;
638f141eafeSaliguori         goto done;
639585f8587Sbellard     }
640585f8587Sbellard 
641ce1a14dcSpbrook     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
6427b88e48bSChristoph Hellwig     n_end = index_in_cluster + acb->remaining_sectors;
643095a9c58Saliguori     if (s->crypt_method &&
644095a9c58Saliguori         n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
645095a9c58Saliguori         n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
646095a9c58Saliguori 
647148da7eaSKevin Wolf     ret = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9,
6487b88e48bSChristoph Hellwig         index_in_cluster, n_end, &acb->cur_nr_sectors, &acb->l2meta);
649148da7eaSKevin Wolf     if (ret < 0) {
650148da7eaSKevin Wolf         goto done;
651148da7eaSKevin Wolf     }
652148da7eaSKevin Wolf 
653148da7eaSKevin Wolf     acb->cluster_offset = acb->l2meta.cluster_offset;
654f214978aSKevin Wolf 
655f214978aSKevin Wolf     /* Need to wait for another request? If so, we are done for now. */
656148da7eaSKevin Wolf     if (acb->l2meta.nb_clusters == 0 && acb->l2meta.depends_on != NULL) {
65772cf2d4fSBlue Swirl         QLIST_INSERT_HEAD(&acb->l2meta.depends_on->dependent_requests,
658f214978aSKevin Wolf             acb, next_depend);
659f214978aSKevin Wolf         return;
660f214978aSKevin Wolf     }
661f214978aSKevin Wolf 
662148da7eaSKevin Wolf     assert((acb->cluster_offset & 511) == 0);
663148da7eaSKevin Wolf 
6646f5f060bSKevin Wolf     qemu_iovec_reset(&acb->hd_qiov);
6656f5f060bSKevin Wolf     qemu_iovec_copy(&acb->hd_qiov, acb->qiov, acb->bytes_done,
6666f5f060bSKevin Wolf         acb->cur_nr_sectors * 512);
6676f5f060bSKevin Wolf 
668585f8587Sbellard     if (s->crypt_method) {
669ce1a14dcSpbrook         if (!acb->cluster_data) {
670095a9c58Saliguori             acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
671095a9c58Saliguori                                              s->cluster_size);
672585f8587Sbellard         }
6736f5f060bSKevin Wolf 
6746f5f060bSKevin Wolf         assert(acb->hd_qiov.size <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
6756f5f060bSKevin Wolf         qemu_iovec_to_buffer(&acb->hd_qiov, acb->cluster_data);
6766f5f060bSKevin Wolf 
6776f5f060bSKevin Wolf         qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data,
6786f5f060bSKevin Wolf             acb->cluster_data, acb->cur_nr_sectors, 1, &s->aes_encrypt_key);
6796f5f060bSKevin Wolf 
6806f5f060bSKevin Wolf         qemu_iovec_reset(&acb->hd_qiov);
6816f5f060bSKevin Wolf         qemu_iovec_add(&acb->hd_qiov, acb->cluster_data,
6826f5f060bSKevin Wolf             acb->cur_nr_sectors * 512);
683585f8587Sbellard     }
6846f5f060bSKevin Wolf 
68566f82ceeSKevin Wolf     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
68666f82ceeSKevin Wolf     acb->hd_aiocb = bdrv_aio_writev(bs->file,
687e976c6a1Saliguori                                     (acb->cluster_offset >> 9) + index_in_cluster,
6887b88e48bSChristoph Hellwig                                     &acb->hd_qiov, acb->cur_nr_sectors,
6897c80ab3fSJes Sorensen                                     qcow2_aio_write_cb, acb);
690171e3d6bSKevin Wolf     if (acb->hd_aiocb == NULL) {
691171e3d6bSKevin Wolf         ret = -EIO;
692c644db3dSKevin Wolf         goto fail;
693171e3d6bSKevin Wolf     }
694f141eafeSaliguori 
695f141eafeSaliguori     return;
696f141eafeSaliguori 
697c644db3dSKevin Wolf fail:
698c644db3dSKevin Wolf     if (acb->l2meta.nb_clusters != 0) {
699c644db3dSKevin Wolf         QLIST_REMOVE(&acb->l2meta, next_in_flight);
700c644db3dSKevin Wolf     }
701f141eafeSaliguori done:
702f141eafeSaliguori     acb->common.cb(acb->common.opaque, ret);
7036f5f060bSKevin Wolf     qemu_iovec_destroy(&acb->hd_qiov);
704f141eafeSaliguori     qemu_aio_release(acb);
705585f8587Sbellard }
706585f8587Sbellard 
7077c80ab3fSJes Sorensen static BlockDriverAIOCB *qcow2_aio_writev(BlockDriverState *bs,
7087c80ab3fSJes Sorensen                                           int64_t sector_num,
7097c80ab3fSJes Sorensen                                           QEMUIOVector *qiov, int nb_sectors,
7107c80ab3fSJes Sorensen                                           BlockDriverCompletionFunc *cb,
7117c80ab3fSJes Sorensen                                           void *opaque)
712585f8587Sbellard {
713585f8587Sbellard     BDRVQcowState *s = bs->opaque;
714ce1a14dcSpbrook     QCowAIOCB *acb;
715585f8587Sbellard 
716585f8587Sbellard     s->cluster_cache_offset = -1; /* disable compressed cache */
717585f8587Sbellard 
7187c80ab3fSJes Sorensen     acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
719ce1a14dcSpbrook     if (!acb)
720ce1a14dcSpbrook         return NULL;
721585f8587Sbellard 
7227c80ab3fSJes Sorensen     qcow2_aio_write_cb(acb, 0);
723ce1a14dcSpbrook     return &acb->common;
724585f8587Sbellard }
725585f8587Sbellard 
7267c80ab3fSJes Sorensen static void qcow2_close(BlockDriverState *bs)
727585f8587Sbellard {
728585f8587Sbellard     BDRVQcowState *s = bs->opaque;
729585f8587Sbellard     qemu_free(s->l1_table);
73029c1a730SKevin Wolf 
73129c1a730SKevin Wolf     qcow2_cache_flush(bs, s->l2_table_cache);
73229c1a730SKevin Wolf     qcow2_cache_flush(bs, s->refcount_block_cache);
73329c1a730SKevin Wolf 
73429c1a730SKevin Wolf     qcow2_cache_destroy(bs, s->l2_table_cache);
73529c1a730SKevin Wolf     qcow2_cache_destroy(bs, s->refcount_block_cache);
73629c1a730SKevin Wolf 
737585f8587Sbellard     qemu_free(s->cluster_cache);
738585f8587Sbellard     qemu_free(s->cluster_data);
739ed6ccf0fSKevin Wolf     qcow2_refcount_close(bs);
740585f8587Sbellard }
741585f8587Sbellard 
742756e6736SKevin Wolf /*
743756e6736SKevin Wolf  * Updates the variable length parts of the qcow2 header, i.e. the backing file
744756e6736SKevin Wolf  * name and all extensions. qcow2 was not designed to allow such changes, so if
745756e6736SKevin Wolf  * we run out of space (we can only use the first cluster) this function may
746756e6736SKevin Wolf  * fail.
747756e6736SKevin Wolf  *
748756e6736SKevin Wolf  * Returns 0 on success, -errno in error cases.
749756e6736SKevin Wolf  */
750756e6736SKevin Wolf static int qcow2_update_ext_header(BlockDriverState *bs,
751756e6736SKevin Wolf     const char *backing_file, const char *backing_fmt)
752756e6736SKevin Wolf {
753756e6736SKevin Wolf     size_t backing_file_len = 0;
754756e6736SKevin Wolf     size_t backing_fmt_len = 0;
755756e6736SKevin Wolf     BDRVQcowState *s = bs->opaque;
756756e6736SKevin Wolf     QCowExtension ext_backing_fmt = {0, 0};
757756e6736SKevin Wolf     int ret;
758756e6736SKevin Wolf 
759756e6736SKevin Wolf     /* Backing file format doesn't make sense without a backing file */
760756e6736SKevin Wolf     if (backing_fmt && !backing_file) {
761756e6736SKevin Wolf         return -EINVAL;
762756e6736SKevin Wolf     }
763756e6736SKevin Wolf 
764756e6736SKevin Wolf     /* Prepare the backing file format extension if needed */
765756e6736SKevin Wolf     if (backing_fmt) {
766756e6736SKevin Wolf         ext_backing_fmt.len = cpu_to_be32(strlen(backing_fmt));
7677c80ab3fSJes Sorensen         ext_backing_fmt.magic = cpu_to_be32(QCOW2_EXT_MAGIC_BACKING_FORMAT);
768756e6736SKevin Wolf         backing_fmt_len = ((sizeof(ext_backing_fmt)
769756e6736SKevin Wolf             + strlen(backing_fmt) + 7) & ~7);
770756e6736SKevin Wolf     }
771756e6736SKevin Wolf 
772756e6736SKevin Wolf     /* Check if we can fit the new header into the first cluster */
773756e6736SKevin Wolf     if (backing_file) {
774756e6736SKevin Wolf         backing_file_len = strlen(backing_file);
775756e6736SKevin Wolf     }
776756e6736SKevin Wolf 
777756e6736SKevin Wolf     size_t header_size = sizeof(QCowHeader) + backing_file_len
778756e6736SKevin Wolf         + backing_fmt_len;
779756e6736SKevin Wolf 
780756e6736SKevin Wolf     if (header_size > s->cluster_size) {
781756e6736SKevin Wolf         return -ENOSPC;
782756e6736SKevin Wolf     }
783756e6736SKevin Wolf 
784756e6736SKevin Wolf     /* Rewrite backing file name and qcow2 extensions */
785756e6736SKevin Wolf     size_t ext_size = header_size - sizeof(QCowHeader);
786756e6736SKevin Wolf     uint8_t buf[ext_size];
787756e6736SKevin Wolf     size_t offset = 0;
788756e6736SKevin Wolf     size_t backing_file_offset = 0;
789756e6736SKevin Wolf 
790756e6736SKevin Wolf     if (backing_file) {
791756e6736SKevin Wolf         if (backing_fmt) {
792756e6736SKevin Wolf             int padding = backing_fmt_len -
793756e6736SKevin Wolf                 (sizeof(ext_backing_fmt) + strlen(backing_fmt));
794756e6736SKevin Wolf 
795756e6736SKevin Wolf             memcpy(buf + offset, &ext_backing_fmt, sizeof(ext_backing_fmt));
796756e6736SKevin Wolf             offset += sizeof(ext_backing_fmt);
797756e6736SKevin Wolf 
798756e6736SKevin Wolf             memcpy(buf + offset, backing_fmt, strlen(backing_fmt));
799756e6736SKevin Wolf             offset += strlen(backing_fmt);
800756e6736SKevin Wolf 
801756e6736SKevin Wolf             memset(buf + offset, 0, padding);
802756e6736SKevin Wolf             offset += padding;
803756e6736SKevin Wolf         }
804756e6736SKevin Wolf 
805756e6736SKevin Wolf         memcpy(buf + offset, backing_file, backing_file_len);
806756e6736SKevin Wolf         backing_file_offset = sizeof(QCowHeader) + offset;
807756e6736SKevin Wolf     }
808756e6736SKevin Wolf 
8098b3b7206SKevin Wolf     ret = bdrv_pwrite_sync(bs->file, sizeof(QCowHeader), buf, ext_size);
810756e6736SKevin Wolf     if (ret < 0) {
811756e6736SKevin Wolf         goto fail;
812756e6736SKevin Wolf     }
813756e6736SKevin Wolf 
814756e6736SKevin Wolf     /* Update header fields */
815756e6736SKevin Wolf     uint64_t be_backing_file_offset = cpu_to_be64(backing_file_offset);
816756e6736SKevin Wolf     uint32_t be_backing_file_size = cpu_to_be32(backing_file_len);
817756e6736SKevin Wolf 
8188b3b7206SKevin Wolf     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, backing_file_offset),
819756e6736SKevin Wolf         &be_backing_file_offset, sizeof(uint64_t));
820756e6736SKevin Wolf     if (ret < 0) {
821756e6736SKevin Wolf         goto fail;
822756e6736SKevin Wolf     }
823756e6736SKevin Wolf 
8248b3b7206SKevin Wolf     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, backing_file_size),
825756e6736SKevin Wolf         &be_backing_file_size, sizeof(uint32_t));
826756e6736SKevin Wolf     if (ret < 0) {
827756e6736SKevin Wolf         goto fail;
828756e6736SKevin Wolf     }
829756e6736SKevin Wolf 
830756e6736SKevin Wolf     ret = 0;
831756e6736SKevin Wolf fail:
832756e6736SKevin Wolf     return ret;
833756e6736SKevin Wolf }
834756e6736SKevin Wolf 
835756e6736SKevin Wolf static int qcow2_change_backing_file(BlockDriverState *bs,
836756e6736SKevin Wolf     const char *backing_file, const char *backing_fmt)
837756e6736SKevin Wolf {
838756e6736SKevin Wolf     return qcow2_update_ext_header(bs, backing_file, backing_fmt);
839756e6736SKevin Wolf }
840756e6736SKevin Wolf 
841a35e1c17SKevin Wolf static int preallocate(BlockDriverState *bs)
842a35e1c17SKevin Wolf {
843a35e1c17SKevin Wolf     uint64_t nb_sectors;
844a35e1c17SKevin Wolf     uint64_t offset;
845a35e1c17SKevin Wolf     int num;
846148da7eaSKevin Wolf     int ret;
847a35e1c17SKevin Wolf     QCowL2Meta meta;
848a35e1c17SKevin Wolf 
849a35e1c17SKevin Wolf     nb_sectors = bdrv_getlength(bs) >> 9;
850a35e1c17SKevin Wolf     offset = 0;
85172cf2d4fSBlue Swirl     QLIST_INIT(&meta.dependent_requests);
852148da7eaSKevin Wolf     meta.cluster_offset = 0;
853a35e1c17SKevin Wolf 
854a35e1c17SKevin Wolf     while (nb_sectors) {
855a35e1c17SKevin Wolf         num = MIN(nb_sectors, INT_MAX >> 9);
856148da7eaSKevin Wolf         ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
857148da7eaSKevin Wolf         if (ret < 0) {
85819dbcbf7SKevin Wolf             return ret;
859a35e1c17SKevin Wolf         }
860a35e1c17SKevin Wolf 
86119dbcbf7SKevin Wolf         ret = qcow2_alloc_cluster_link_l2(bs, &meta);
86219dbcbf7SKevin Wolf         if (ret < 0) {
863148da7eaSKevin Wolf             qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters);
86419dbcbf7SKevin Wolf             return ret;
865a35e1c17SKevin Wolf         }
866a35e1c17SKevin Wolf 
867f214978aSKevin Wolf         /* There are no dependent requests, but we need to remove our request
868f214978aSKevin Wolf          * from the list of in-flight requests */
869f214978aSKevin Wolf         run_dependent_requests(&meta);
870f214978aSKevin Wolf 
871a35e1c17SKevin Wolf         /* TODO Preallocate data if requested */
872a35e1c17SKevin Wolf 
873a35e1c17SKevin Wolf         nb_sectors -= num;
874a35e1c17SKevin Wolf         offset += num << 9;
875a35e1c17SKevin Wolf     }
876a35e1c17SKevin Wolf 
877a35e1c17SKevin Wolf     /*
878a35e1c17SKevin Wolf      * It is expected that the image file is large enough to actually contain
879a35e1c17SKevin Wolf      * all of the allocated clusters (otherwise we get failing reads after
880a35e1c17SKevin Wolf      * EOF). Extend the image to the last allocated sector.
881a35e1c17SKevin Wolf      */
882148da7eaSKevin Wolf     if (meta.cluster_offset != 0) {
883ea80b906SKevin Wolf         uint8_t buf[512];
884ea80b906SKevin Wolf         memset(buf, 0, 512);
88519dbcbf7SKevin Wolf         ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1);
88619dbcbf7SKevin Wolf         if (ret < 0) {
88719dbcbf7SKevin Wolf             return ret;
88819dbcbf7SKevin Wolf         }
889a35e1c17SKevin Wolf     }
890a35e1c17SKevin Wolf 
891a35e1c17SKevin Wolf     return 0;
892a35e1c17SKevin Wolf }
893a35e1c17SKevin Wolf 
8947c80ab3fSJes Sorensen static int qcow2_create2(const char *filename, int64_t total_size,
895a9420734SKevin Wolf                          const char *backing_file, const char *backing_format,
896a9420734SKevin Wolf                          int flags, size_t cluster_size, int prealloc,
897a9420734SKevin Wolf                          QEMUOptionParameter *options)
898a9420734SKevin Wolf {
899a9420734SKevin Wolf     /* Calulate cluster_bits */
900a9420734SKevin Wolf     int cluster_bits;
901a9420734SKevin Wolf     cluster_bits = ffs(cluster_size) - 1;
902a9420734SKevin Wolf     if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
903a9420734SKevin Wolf         (1 << cluster_bits) != cluster_size)
904a9420734SKevin Wolf     {
905a9420734SKevin Wolf         error_report(
906a9420734SKevin Wolf             "Cluster size must be a power of two between %d and %dk\n",
907a9420734SKevin Wolf             1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
908a9420734SKevin Wolf         return -EINVAL;
909a9420734SKevin Wolf     }
910a9420734SKevin Wolf 
911a9420734SKevin Wolf     /*
912a9420734SKevin Wolf      * Open the image file and write a minimal qcow2 header.
913a9420734SKevin Wolf      *
914a9420734SKevin Wolf      * We keep things simple and start with a zero-sized image. We also
915a9420734SKevin Wolf      * do without refcount blocks or a L1 table for now. We'll fix the
916a9420734SKevin Wolf      * inconsistency later.
917a9420734SKevin Wolf      *
918a9420734SKevin Wolf      * We do need a refcount table because growing the refcount table means
919a9420734SKevin Wolf      * allocating two new refcount blocks - the seconds of which would be at
920a9420734SKevin Wolf      * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
921a9420734SKevin Wolf      * size for any qcow2 image.
922a9420734SKevin Wolf      */
923a9420734SKevin Wolf     BlockDriverState* bs;
924a9420734SKevin Wolf     QCowHeader header;
925a9420734SKevin Wolf     uint8_t* refcount_table;
926a9420734SKevin Wolf     int ret;
927a9420734SKevin Wolf 
928a9420734SKevin Wolf     ret = bdrv_create_file(filename, options);
929a9420734SKevin Wolf     if (ret < 0) {
930a9420734SKevin Wolf         return ret;
931a9420734SKevin Wolf     }
932a9420734SKevin Wolf 
933a9420734SKevin Wolf     ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
934a9420734SKevin Wolf     if (ret < 0) {
935a9420734SKevin Wolf         return ret;
936a9420734SKevin Wolf     }
937a9420734SKevin Wolf 
938a9420734SKevin Wolf     /* Write the header */
939a9420734SKevin Wolf     memset(&header, 0, sizeof(header));
940a9420734SKevin Wolf     header.magic = cpu_to_be32(QCOW_MAGIC);
941a9420734SKevin Wolf     header.version = cpu_to_be32(QCOW_VERSION);
942a9420734SKevin Wolf     header.cluster_bits = cpu_to_be32(cluster_bits);
943a9420734SKevin Wolf     header.size = cpu_to_be64(0);
944a9420734SKevin Wolf     header.l1_table_offset = cpu_to_be64(0);
945a9420734SKevin Wolf     header.l1_size = cpu_to_be32(0);
946a9420734SKevin Wolf     header.refcount_table_offset = cpu_to_be64(cluster_size);
947a9420734SKevin Wolf     header.refcount_table_clusters = cpu_to_be32(1);
948a9420734SKevin Wolf 
949a9420734SKevin Wolf     if (flags & BLOCK_FLAG_ENCRYPT) {
950a9420734SKevin Wolf         header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
951a9420734SKevin Wolf     } else {
952a9420734SKevin Wolf         header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
953a9420734SKevin Wolf     }
954a9420734SKevin Wolf 
955a9420734SKevin Wolf     ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
956a9420734SKevin Wolf     if (ret < 0) {
957a9420734SKevin Wolf         goto out;
958a9420734SKevin Wolf     }
959a9420734SKevin Wolf 
960a9420734SKevin Wolf     /* Write an empty refcount table */
961a9420734SKevin Wolf     refcount_table = qemu_mallocz(cluster_size);
962a9420734SKevin Wolf     ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
963a9420734SKevin Wolf     qemu_free(refcount_table);
964a9420734SKevin Wolf 
965a9420734SKevin Wolf     if (ret < 0) {
966a9420734SKevin Wolf         goto out;
967a9420734SKevin Wolf     }
968a9420734SKevin Wolf 
969a9420734SKevin Wolf     bdrv_close(bs);
970a9420734SKevin Wolf 
971a9420734SKevin Wolf     /*
972a9420734SKevin Wolf      * And now open the image and make it consistent first (i.e. increase the
973a9420734SKevin Wolf      * refcount of the cluster that is occupied by the header and the refcount
974a9420734SKevin Wolf      * table)
975a9420734SKevin Wolf      */
976a9420734SKevin Wolf     BlockDriver* drv = bdrv_find_format("qcow2");
977a9420734SKevin Wolf     assert(drv != NULL);
978a9420734SKevin Wolf     ret = bdrv_open(bs, filename, BDRV_O_RDWR | BDRV_O_NO_FLUSH, drv);
979a9420734SKevin Wolf     if (ret < 0) {
980a9420734SKevin Wolf         goto out;
981a9420734SKevin Wolf     }
982a9420734SKevin Wolf 
983a9420734SKevin Wolf     ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
984a9420734SKevin Wolf     if (ret < 0) {
985a9420734SKevin Wolf         goto out;
986a9420734SKevin Wolf 
987a9420734SKevin Wolf     } else if (ret != 0) {
988a9420734SKevin Wolf         error_report("Huh, first cluster in empty image is already in use?");
989a9420734SKevin Wolf         abort();
990a9420734SKevin Wolf     }
991a9420734SKevin Wolf 
992a9420734SKevin Wolf     /* Okay, now that we have a valid image, let's give it the right size */
993a9420734SKevin Wolf     ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
994a9420734SKevin Wolf     if (ret < 0) {
995a9420734SKevin Wolf         goto out;
996a9420734SKevin Wolf     }
997a9420734SKevin Wolf 
998a9420734SKevin Wolf     /* Want a backing file? There you go.*/
999a9420734SKevin Wolf     if (backing_file) {
1000a9420734SKevin Wolf         ret = bdrv_change_backing_file(bs, backing_file, backing_format);
1001a9420734SKevin Wolf         if (ret < 0) {
1002a9420734SKevin Wolf             goto out;
1003a9420734SKevin Wolf         }
1004a9420734SKevin Wolf     }
1005a9420734SKevin Wolf 
1006a9420734SKevin Wolf     /* And if we're supposed to preallocate metadata, do that now */
1007a9420734SKevin Wolf     if (prealloc) {
1008a9420734SKevin Wolf         ret = preallocate(bs);
1009a9420734SKevin Wolf         if (ret < 0) {
1010a9420734SKevin Wolf             goto out;
1011a9420734SKevin Wolf         }
1012a9420734SKevin Wolf     }
1013a9420734SKevin Wolf 
1014a9420734SKevin Wolf     ret = 0;
1015a9420734SKevin Wolf out:
1016a9420734SKevin Wolf     bdrv_delete(bs);
1017a9420734SKevin Wolf     return ret;
1018a9420734SKevin Wolf }
1019de5f3f40SKevin Wolf 
10207c80ab3fSJes Sorensen static int qcow2_create(const char *filename, QEMUOptionParameter *options)
1021de5f3f40SKevin Wolf {
1022de5f3f40SKevin Wolf     const char *backing_file = NULL;
1023de5f3f40SKevin Wolf     const char *backing_fmt = NULL;
1024de5f3f40SKevin Wolf     uint64_t sectors = 0;
1025de5f3f40SKevin Wolf     int flags = 0;
1026de5f3f40SKevin Wolf     size_t cluster_size = 65536;
1027de5f3f40SKevin Wolf     int prealloc = 0;
1028de5f3f40SKevin Wolf 
1029de5f3f40SKevin Wolf     /* Read out options */
1030de5f3f40SKevin Wolf     while (options && options->name) {
1031de5f3f40SKevin Wolf         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1032de5f3f40SKevin Wolf             sectors = options->value.n / 512;
1033de5f3f40SKevin Wolf         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1034de5f3f40SKevin Wolf             backing_file = options->value.s;
1035de5f3f40SKevin Wolf         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
1036de5f3f40SKevin Wolf             backing_fmt = options->value.s;
1037de5f3f40SKevin Wolf         } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
1038de5f3f40SKevin Wolf             flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
1039de5f3f40SKevin Wolf         } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
1040de5f3f40SKevin Wolf             if (options->value.n) {
1041de5f3f40SKevin Wolf                 cluster_size = options->value.n;
1042de5f3f40SKevin Wolf             }
1043de5f3f40SKevin Wolf         } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1044de5f3f40SKevin Wolf             if (!options->value.s || !strcmp(options->value.s, "off")) {
1045de5f3f40SKevin Wolf                 prealloc = 0;
1046de5f3f40SKevin Wolf             } else if (!strcmp(options->value.s, "metadata")) {
1047de5f3f40SKevin Wolf                 prealloc = 1;
1048de5f3f40SKevin Wolf             } else {
1049de5f3f40SKevin Wolf                 fprintf(stderr, "Invalid preallocation mode: '%s'\n",
1050de5f3f40SKevin Wolf                     options->value.s);
1051de5f3f40SKevin Wolf                 return -EINVAL;
1052de5f3f40SKevin Wolf             }
1053de5f3f40SKevin Wolf         }
1054de5f3f40SKevin Wolf         options++;
1055de5f3f40SKevin Wolf     }
1056de5f3f40SKevin Wolf 
1057de5f3f40SKevin Wolf     if (backing_file && prealloc) {
1058de5f3f40SKevin Wolf         fprintf(stderr, "Backing file and preallocation cannot be used at "
1059de5f3f40SKevin Wolf             "the same time\n");
1060de5f3f40SKevin Wolf         return -EINVAL;
1061de5f3f40SKevin Wolf     }
1062de5f3f40SKevin Wolf 
10637c80ab3fSJes Sorensen     return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
1064a9420734SKevin Wolf                          cluster_size, prealloc, options);
1065de5f3f40SKevin Wolf }
1066de5f3f40SKevin Wolf 
10677c80ab3fSJes Sorensen static int qcow2_make_empty(BlockDriverState *bs)
106820d97356SBlue Swirl {
106920d97356SBlue Swirl #if 0
107020d97356SBlue Swirl     /* XXX: not correct */
107120d97356SBlue Swirl     BDRVQcowState *s = bs->opaque;
107220d97356SBlue Swirl     uint32_t l1_length = s->l1_size * sizeof(uint64_t);
107320d97356SBlue Swirl     int ret;
107420d97356SBlue Swirl 
107520d97356SBlue Swirl     memset(s->l1_table, 0, l1_length);
107666f82ceeSKevin Wolf     if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
107720d97356SBlue Swirl         return -1;
107866f82ceeSKevin Wolf     ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
107920d97356SBlue Swirl     if (ret < 0)
108020d97356SBlue Swirl         return ret;
108120d97356SBlue Swirl 
108220d97356SBlue Swirl     l2_cache_reset(bs);
108320d97356SBlue Swirl #endif
108420d97356SBlue Swirl     return 0;
108520d97356SBlue Swirl }
108620d97356SBlue Swirl 
1087*5ea929e3SKevin Wolf static int qcow2_discard(BlockDriverState *bs, int64_t sector_num,
1088*5ea929e3SKevin Wolf     int nb_sectors)
1089*5ea929e3SKevin Wolf {
1090*5ea929e3SKevin Wolf     return qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1091*5ea929e3SKevin Wolf         nb_sectors);
1092*5ea929e3SKevin Wolf }
1093*5ea929e3SKevin Wolf 
1094419b19d9SStefan Hajnoczi static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
1095419b19d9SStefan Hajnoczi {
1096419b19d9SStefan Hajnoczi     BDRVQcowState *s = bs->opaque;
1097419b19d9SStefan Hajnoczi     int ret, new_l1_size;
1098419b19d9SStefan Hajnoczi 
1099419b19d9SStefan Hajnoczi     if (offset & 511) {
1100419b19d9SStefan Hajnoczi         return -EINVAL;
1101419b19d9SStefan Hajnoczi     }
1102419b19d9SStefan Hajnoczi 
1103419b19d9SStefan Hajnoczi     /* cannot proceed if image has snapshots */
1104419b19d9SStefan Hajnoczi     if (s->nb_snapshots) {
1105419b19d9SStefan Hajnoczi         return -ENOTSUP;
1106419b19d9SStefan Hajnoczi     }
1107419b19d9SStefan Hajnoczi 
1108419b19d9SStefan Hajnoczi     /* shrinking is currently not supported */
1109419b19d9SStefan Hajnoczi     if (offset < bs->total_sectors * 512) {
1110419b19d9SStefan Hajnoczi         return -ENOTSUP;
1111419b19d9SStefan Hajnoczi     }
1112419b19d9SStefan Hajnoczi 
1113419b19d9SStefan Hajnoczi     new_l1_size = size_to_l1(s, offset);
111472893756SStefan Hajnoczi     ret = qcow2_grow_l1_table(bs, new_l1_size, true);
1115419b19d9SStefan Hajnoczi     if (ret < 0) {
1116419b19d9SStefan Hajnoczi         return ret;
1117419b19d9SStefan Hajnoczi     }
1118419b19d9SStefan Hajnoczi 
1119419b19d9SStefan Hajnoczi     /* write updated header.size */
1120419b19d9SStefan Hajnoczi     offset = cpu_to_be64(offset);
11218b3b7206SKevin Wolf     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
1122419b19d9SStefan Hajnoczi                            &offset, sizeof(uint64_t));
1123419b19d9SStefan Hajnoczi     if (ret < 0) {
1124419b19d9SStefan Hajnoczi         return ret;
1125419b19d9SStefan Hajnoczi     }
1126419b19d9SStefan Hajnoczi 
1127419b19d9SStefan Hajnoczi     s->l1_vm_state_index = new_l1_size;
1128419b19d9SStefan Hajnoczi     return 0;
1129419b19d9SStefan Hajnoczi }
1130419b19d9SStefan Hajnoczi 
113120d97356SBlue Swirl /* XXX: put compressed sectors first, then all the cluster aligned
113220d97356SBlue Swirl    tables to avoid losing bytes in alignment */
11337c80ab3fSJes Sorensen static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
113420d97356SBlue Swirl                                   const uint8_t *buf, int nb_sectors)
113520d97356SBlue Swirl {
113620d97356SBlue Swirl     BDRVQcowState *s = bs->opaque;
113720d97356SBlue Swirl     z_stream strm;
113820d97356SBlue Swirl     int ret, out_len;
113920d97356SBlue Swirl     uint8_t *out_buf;
114020d97356SBlue Swirl     uint64_t cluster_offset;
114120d97356SBlue Swirl 
114220d97356SBlue Swirl     if (nb_sectors == 0) {
114320d97356SBlue Swirl         /* align end of file to a sector boundary to ease reading with
114420d97356SBlue Swirl            sector based I/Os */
114566f82ceeSKevin Wolf         cluster_offset = bdrv_getlength(bs->file);
114620d97356SBlue Swirl         cluster_offset = (cluster_offset + 511) & ~511;
114766f82ceeSKevin Wolf         bdrv_truncate(bs->file, cluster_offset);
114820d97356SBlue Swirl         return 0;
114920d97356SBlue Swirl     }
115020d97356SBlue Swirl 
115120d97356SBlue Swirl     if (nb_sectors != s->cluster_sectors)
115220d97356SBlue Swirl         return -EINVAL;
115320d97356SBlue Swirl 
115420d97356SBlue Swirl     out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
115520d97356SBlue Swirl 
115620d97356SBlue Swirl     /* best compression, small window, no zlib header */
115720d97356SBlue Swirl     memset(&strm, 0, sizeof(strm));
115820d97356SBlue Swirl     ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
115920d97356SBlue Swirl                        Z_DEFLATED, -12,
116020d97356SBlue Swirl                        9, Z_DEFAULT_STRATEGY);
116120d97356SBlue Swirl     if (ret != 0) {
116220d97356SBlue Swirl         qemu_free(out_buf);
116320d97356SBlue Swirl         return -1;
116420d97356SBlue Swirl     }
116520d97356SBlue Swirl 
116620d97356SBlue Swirl     strm.avail_in = s->cluster_size;
116720d97356SBlue Swirl     strm.next_in = (uint8_t *)buf;
116820d97356SBlue Swirl     strm.avail_out = s->cluster_size;
116920d97356SBlue Swirl     strm.next_out = out_buf;
117020d97356SBlue Swirl 
117120d97356SBlue Swirl     ret = deflate(&strm, Z_FINISH);
117220d97356SBlue Swirl     if (ret != Z_STREAM_END && ret != Z_OK) {
117320d97356SBlue Swirl         qemu_free(out_buf);
117420d97356SBlue Swirl         deflateEnd(&strm);
117520d97356SBlue Swirl         return -1;
117620d97356SBlue Swirl     }
117720d97356SBlue Swirl     out_len = strm.next_out - out_buf;
117820d97356SBlue Swirl 
117920d97356SBlue Swirl     deflateEnd(&strm);
118020d97356SBlue Swirl 
118120d97356SBlue Swirl     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
118220d97356SBlue Swirl         /* could not compress: write normal cluster */
118320d97356SBlue Swirl         bdrv_write(bs, sector_num, buf, s->cluster_sectors);
118420d97356SBlue Swirl     } else {
118520d97356SBlue Swirl         cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
118620d97356SBlue Swirl             sector_num << 9, out_len);
118720d97356SBlue Swirl         if (!cluster_offset)
118820d97356SBlue Swirl             return -1;
118920d97356SBlue Swirl         cluster_offset &= s->cluster_offset_mask;
119066f82ceeSKevin Wolf         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
119166f82ceeSKevin Wolf         if (bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len) != out_len) {
119220d97356SBlue Swirl             qemu_free(out_buf);
119320d97356SBlue Swirl             return -1;
119420d97356SBlue Swirl         }
119520d97356SBlue Swirl     }
119620d97356SBlue Swirl 
119720d97356SBlue Swirl     qemu_free(out_buf);
119820d97356SBlue Swirl     return 0;
119920d97356SBlue Swirl }
120020d97356SBlue Swirl 
12017c80ab3fSJes Sorensen static int qcow2_flush(BlockDriverState *bs)
120220d97356SBlue Swirl {
120329c1a730SKevin Wolf     BDRVQcowState *s = bs->opaque;
120429c1a730SKevin Wolf     int ret;
120529c1a730SKevin Wolf 
120629c1a730SKevin Wolf     ret = qcow2_cache_flush(bs, s->l2_table_cache);
120729c1a730SKevin Wolf     if (ret < 0) {
120829c1a730SKevin Wolf         return ret;
120929c1a730SKevin Wolf     }
121029c1a730SKevin Wolf 
121129c1a730SKevin Wolf     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
121229c1a730SKevin Wolf     if (ret < 0) {
121329c1a730SKevin Wolf         return ret;
121429c1a730SKevin Wolf     }
121529c1a730SKevin Wolf 
1216205ef796SKevin Wolf     return bdrv_flush(bs->file);
121720d97356SBlue Swirl }
121820d97356SBlue Swirl 
12197c80ab3fSJes Sorensen static BlockDriverAIOCB *qcow2_aio_flush(BlockDriverState *bs,
12207c80ab3fSJes Sorensen                                          BlockDriverCompletionFunc *cb,
12217c80ab3fSJes Sorensen                                          void *opaque)
122220d97356SBlue Swirl {
122329c1a730SKevin Wolf     BDRVQcowState *s = bs->opaque;
122429c1a730SKevin Wolf     int ret;
122529c1a730SKevin Wolf 
122629c1a730SKevin Wolf     ret = qcow2_cache_flush(bs, s->l2_table_cache);
122729c1a730SKevin Wolf     if (ret < 0) {
122829c1a730SKevin Wolf         return NULL;
122929c1a730SKevin Wolf     }
123029c1a730SKevin Wolf 
123129c1a730SKevin Wolf     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
123229c1a730SKevin Wolf     if (ret < 0) {
123329c1a730SKevin Wolf         return NULL;
123429c1a730SKevin Wolf     }
123529c1a730SKevin Wolf 
123666f82ceeSKevin Wolf     return bdrv_aio_flush(bs->file, cb, opaque);
123720d97356SBlue Swirl }
123820d97356SBlue Swirl 
12397c80ab3fSJes Sorensen static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
124020d97356SBlue Swirl {
124120d97356SBlue Swirl 	return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
124220d97356SBlue Swirl }
124320d97356SBlue Swirl 
12447c80ab3fSJes Sorensen static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
124520d97356SBlue Swirl {
124620d97356SBlue Swirl     BDRVQcowState *s = bs->opaque;
124720d97356SBlue Swirl     bdi->cluster_size = s->cluster_size;
12487c80ab3fSJes Sorensen     bdi->vm_state_offset = qcow2_vm_state_offset(s);
124920d97356SBlue Swirl     return 0;
125020d97356SBlue Swirl }
125120d97356SBlue Swirl 
125220d97356SBlue Swirl 
12537c80ab3fSJes Sorensen static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result)
125420d97356SBlue Swirl {
12559ac228e0SKevin Wolf     return qcow2_check_refcounts(bs, result);
125620d97356SBlue Swirl }
125720d97356SBlue Swirl 
#if 0
/*
 * Debug helper (compiled out): print the refcount of every cluster in the
 * underlying file, collapsing runs of consecutive clusters that share the
 * same refcount into a single line of the form
 *
 *     <first cluster>: refcount=<value> nb=<run length>
 */
static void dump_refcounts(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    int64_t nb_clusters, k, k1, size;
    int refcount;

    size = bdrv_getlength(bs->file);
    nb_clusters = size_to_clusters(s, size);
    for (k = 0; k < nb_clusters;) {
        /* k1 remembers where the current run starts */
        k1 = k;
        refcount = get_refcount(bs, k);
        k++;
        while (k < nb_clusters && get_refcount(bs, k) == refcount) {
            k++;
        }
        /*
         * Label the run with its first cluster (k1); k already points one
         * past the end of the run and would mislabel it.
         */
        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k1, refcount,
               k - k1);
    }
}
#endif
127820d97356SBlue Swirl 
12797c80ab3fSJes Sorensen static int qcow2_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
128020d97356SBlue Swirl                               int64_t pos, int size)
128120d97356SBlue Swirl {
128220d97356SBlue Swirl     BDRVQcowState *s = bs->opaque;
128320d97356SBlue Swirl     int growable = bs->growable;
128420d97356SBlue Swirl     int ret;
128520d97356SBlue Swirl 
128666f82ceeSKevin Wolf     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
128720d97356SBlue Swirl     bs->growable = 1;
12887c80ab3fSJes Sorensen     ret = bdrv_pwrite(bs, qcow2_vm_state_offset(s) + pos, buf, size);
128920d97356SBlue Swirl     bs->growable = growable;
129020d97356SBlue Swirl 
129120d97356SBlue Swirl     return ret;
129220d97356SBlue Swirl }
129320d97356SBlue Swirl 
12947c80ab3fSJes Sorensen static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
129520d97356SBlue Swirl                               int64_t pos, int size)
129620d97356SBlue Swirl {
129720d97356SBlue Swirl     BDRVQcowState *s = bs->opaque;
129820d97356SBlue Swirl     int growable = bs->growable;
129920d97356SBlue Swirl     int ret;
130020d97356SBlue Swirl 
130166f82ceeSKevin Wolf     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
130220d97356SBlue Swirl     bs->growable = 1;
13037c80ab3fSJes Sorensen     ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
130420d97356SBlue Swirl     bs->growable = growable;
130520d97356SBlue Swirl 
130620d97356SBlue Swirl     return ret;
130720d97356SBlue Swirl }
130820d97356SBlue Swirl 
13097c80ab3fSJes Sorensen static QEMUOptionParameter qcow2_create_options[] = {
131020d97356SBlue Swirl     {
131120d97356SBlue Swirl         .name = BLOCK_OPT_SIZE,
131220d97356SBlue Swirl         .type = OPT_SIZE,
131320d97356SBlue Swirl         .help = "Virtual disk size"
131420d97356SBlue Swirl     },
131520d97356SBlue Swirl     {
131620d97356SBlue Swirl         .name = BLOCK_OPT_BACKING_FILE,
131720d97356SBlue Swirl         .type = OPT_STRING,
131820d97356SBlue Swirl         .help = "File name of a base image"
131920d97356SBlue Swirl     },
132020d97356SBlue Swirl     {
132120d97356SBlue Swirl         .name = BLOCK_OPT_BACKING_FMT,
132220d97356SBlue Swirl         .type = OPT_STRING,
132320d97356SBlue Swirl         .help = "Image format of the base image"
132420d97356SBlue Swirl     },
132520d97356SBlue Swirl     {
132620d97356SBlue Swirl         .name = BLOCK_OPT_ENCRYPT,
132720d97356SBlue Swirl         .type = OPT_FLAG,
132820d97356SBlue Swirl         .help = "Encrypt the image"
132920d97356SBlue Swirl     },
133020d97356SBlue Swirl     {
133120d97356SBlue Swirl         .name = BLOCK_OPT_CLUSTER_SIZE,
133220d97356SBlue Swirl         .type = OPT_SIZE,
133320d97356SBlue Swirl         .help = "qcow2 cluster size"
133420d97356SBlue Swirl     },
133520d97356SBlue Swirl     {
133620d97356SBlue Swirl         .name = BLOCK_OPT_PREALLOC,
133720d97356SBlue Swirl         .type = OPT_STRING,
133820d97356SBlue Swirl         .help = "Preallocation mode (allowed values: off, metadata)"
133920d97356SBlue Swirl     },
134020d97356SBlue Swirl     { NULL }
134120d97356SBlue Swirl };
134220d97356SBlue Swirl 
134320d97356SBlue Swirl static BlockDriver bdrv_qcow2 = {
134420d97356SBlue Swirl     .format_name        = "qcow2",
134520d97356SBlue Swirl     .instance_size      = sizeof(BDRVQcowState),
13467c80ab3fSJes Sorensen     .bdrv_probe         = qcow2_probe,
13477c80ab3fSJes Sorensen     .bdrv_open          = qcow2_open,
13487c80ab3fSJes Sorensen     .bdrv_close         = qcow2_close,
13497c80ab3fSJes Sorensen     .bdrv_create        = qcow2_create,
13507c80ab3fSJes Sorensen     .bdrv_flush         = qcow2_flush,
13517c80ab3fSJes Sorensen     .bdrv_is_allocated  = qcow2_is_allocated,
13527c80ab3fSJes Sorensen     .bdrv_set_key       = qcow2_set_key,
13537c80ab3fSJes Sorensen     .bdrv_make_empty    = qcow2_make_empty,
135420d97356SBlue Swirl 
13557c80ab3fSJes Sorensen     .bdrv_aio_readv     = qcow2_aio_readv,
13567c80ab3fSJes Sorensen     .bdrv_aio_writev    = qcow2_aio_writev,
13577c80ab3fSJes Sorensen     .bdrv_aio_flush     = qcow2_aio_flush,
1358419b19d9SStefan Hajnoczi 
1359*5ea929e3SKevin Wolf     .bdrv_discard           = qcow2_discard,
1360419b19d9SStefan Hajnoczi     .bdrv_truncate          = qcow2_truncate,
13617c80ab3fSJes Sorensen     .bdrv_write_compressed  = qcow2_write_compressed,
136220d97356SBlue Swirl 
136320d97356SBlue Swirl     .bdrv_snapshot_create   = qcow2_snapshot_create,
136420d97356SBlue Swirl     .bdrv_snapshot_goto     = qcow2_snapshot_goto,
136520d97356SBlue Swirl     .bdrv_snapshot_delete   = qcow2_snapshot_delete,
136620d97356SBlue Swirl     .bdrv_snapshot_list     = qcow2_snapshot_list,
136751ef6727Sedison     .bdrv_snapshot_load_tmp     = qcow2_snapshot_load_tmp,
13687c80ab3fSJes Sorensen     .bdrv_get_info      = qcow2_get_info,
136920d97356SBlue Swirl 
13707c80ab3fSJes Sorensen     .bdrv_save_vmstate    = qcow2_save_vmstate,
13717c80ab3fSJes Sorensen     .bdrv_load_vmstate    = qcow2_load_vmstate,
137220d97356SBlue Swirl 
137320d97356SBlue Swirl     .bdrv_change_backing_file   = qcow2_change_backing_file,
137420d97356SBlue Swirl 
13757c80ab3fSJes Sorensen     .create_options = qcow2_create_options,
13767c80ab3fSJes Sorensen     .bdrv_check = qcow2_check,
137720d97356SBlue Swirl };
137820d97356SBlue Swirl 
13795efa9d5aSAnthony Liguori static void bdrv_qcow2_init(void)
13805efa9d5aSAnthony Liguori {
13815efa9d5aSAnthony Liguori     bdrv_register(&bdrv_qcow2);
13825efa9d5aSAnthony Liguori }
13835efa9d5aSAnthony Liguori 
13845efa9d5aSAnthony Liguori block_init(bdrv_qcow2_init);
1385