xref: /qemu/block/vpc.c (revision 70ce076fa6dff60585c229a4b641b13e64bf03cf)
1 /*
2  * Block driver for Connectix / Microsoft Virtual PC images
3  *
4  * Copyright (c) 2005 Alex Beregszaszi
5  * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #include "qemu/osdep.h"
27 #include "qapi/error.h"
28 #include "block/block_int.h"
29 #include "block/qdict.h"
30 #include "system/block-backend.h"
31 #include "qemu/module.h"
32 #include "qemu/option.h"
33 #include "migration/blocker.h"
34 #include "qemu/bswap.h"
35 #include "qemu/uuid.h"
36 #include "qemu/memalign.h"
37 #include "qobject/qdict.h"
38 #include "qapi/qobject-input-visitor.h"
39 #include "qapi/qapi-visit-block-core.h"
40 
41 /**************************************************************/
42 
43 //#define CACHE
44 
/*
 * Disk type values, as compared against the (big-endian) 'type' field of
 * the VHD footer by this driver.
 */
enum vhd_type {
    /* Raw data followed by a single footer at the end of the file */
    VHD_FIXED           = 2,
    /* Sparse image: footer copy, dynamic disk header, BAT, then blocks */
    VHD_DYNAMIC         = 3,
    /* NOTE(review): presumably an image with a parent (backing) disk */
    VHD_DIFFERENCING    = 4,
};
50 
/*
 * Unix timestamp of Jan 1, 2000 0:00:00 (UTC), the epoch that VHD footer
 * timestamps are counted from (in seconds).
 */
#define VHD_TIMESTAMP_BASE 946684800

/* Largest cylinder/head/sector values of the maximum CHS geometry */
#define VHD_CHS_MAX_C   65535LL
#define VHD_CHS_MAX_H   16
#define VHD_CHS_MAX_S   255

#define VHD_MAX_SECTORS       0xff000000    /* 2040 GiB max image size */
/* Number of sectors described by the maximum CHS geometry */
#define VHD_MAX_GEOMETRY      (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S)

/* NOTE(review): presumably the name of the image creation option */
#define VPC_OPT_FORCE_SIZE "force_size"
62 
/*
 * On-disk VHD footer (512 bytes, packed).  All multi-byte integer fields
 * are stored big-endian and must be converted before use.
 *
 * For dynamic images a copy is also found at offset 0 of the file; for
 * fixed images the footer only exists at the very end of the file.
 */
typedef struct vhd_footer {
    char        creator[8]; /* "conectix" */
    uint32_t    features;
    uint32_t    version;

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Seconds since Jan 1, 2000 0:00:00 (UTC) */
    uint32_t    timestamp;

    char        creator_app[4]; /*  e.g., "vpc " */
    uint16_t    major;
    uint16_t    minor;
    char        creator_os[4]; /* "Wi2k" */

    uint64_t    orig_size;
    /* Image size in bytes; used instead of CHS for some creator apps */
    uint64_t    current_size;

    /* CHS geometry; its product gives the image size in sectors */
    uint16_t    cyls;
    uint8_t     heads;
    uint8_t     secs_per_cyl;

    /* One of enum vhd_type */
    uint32_t    type;

    /* Checksum of the Hard Disk Footer ("one's complement of the sum of all
       the bytes in the footer without the checksum field") */
    uint32_t    checksum;

    /* UUID used to identify a parent hard disk (backing file) */
    QemuUUID    uuid;

    uint8_t     in_saved_state;
    uint8_t     reserved[427];
} QEMU_PACKED VHDFooter;

/* The footer must occupy exactly one 512-byte sector */
QEMU_BUILD_BUG_ON(sizeof(VHDFooter) != 512);
101 
/*
 * On-disk dynamic disk header (1024 bytes, packed), located at the offset
 * given by the footer's data_offset field.  The integer fields read by
 * this driver are stored big-endian.
 */
typedef struct vhd_dyndisk_header {
    char        magic[8]; /* "cxsparse" */

    /* Offset of next header structure, 0xFFFFFFFF if none */
    uint64_t    data_offset;

    /* Offset of the Block Allocation Table (BAT) */
    uint64_t    table_offset;

    uint32_t    version;
    uint32_t    max_table_entries; /* 32bit/entry */

    /* 2 MB by default, must be a power of two */
    uint32_t    block_size;

    /* Computed with the checksum field itself set to zero */
    uint32_t    checksum;
    uint8_t     parent_uuid[16];
    uint32_t    parent_timestamp;
    uint32_t    reserved;

    /* Backing file name (in UTF-16) */
    uint8_t     parent_name[512];

    struct {
        uint32_t    platform;
        uint32_t    data_space;
        uint32_t    data_length;
        uint32_t    reserved;
        uint64_t    data_offset;
    } parent_locator[8];
    uint8_t     reserved2[256];
} QEMU_PACKED VHDDynDiskHeader;

/* The dynamic disk header must occupy exactly two 512-byte sectors */
QEMU_BUILD_BUG_ON(sizeof(VHDDynDiskHeader) != 1024);
136 
/* Per-image driver state, stored in bs->opaque */
typedef struct BDRVVPCState {
    /* Held around BAT lookups and allocation in the dynamic I/O paths */
    CoMutex lock;
    /* Footer as read from the image (fields remain big-endian) */
    VHDFooter footer;
    /* Byte offset where the next block is allocated / footer is rewritten */
    uint64_t free_data_block_offset;
    /* Number of BAT entries, from the dynamic disk header */
    int max_table_entries;
    /* In-memory BAT in host byte order; 0xFFFFFFFF marks an unallocated
     * block, other values are the block's start in 512-byte units */
    uint32_t *pagetable;
    /* Byte offset of the BAT in the image file */
    uint64_t bat_offset;
    /* Offset of the block bitmap most recently filled with ones on write */
    uint64_t last_bitmap_offset;

    /* Size of the data part of a block, in bytes */
    uint32_t block_size;
    /* Size of the per-block sector bitmap, in bytes (multiple of 512) */
    uint32_t bitmap_size;
    /* User overrides of the image size calculation mode */
    bool force_use_chs;
    bool force_use_sz;

#ifdef CACHE
    /* Only built when CACHE is defined (it is commented out above) */
    uint8_t *pageentry_u8;
    uint32_t *pageentry_u32;
    uint16_t *pageentry_u16;

    uint64_t last_bitmap;
#endif

    /* Registered at open time: the format does not support live migration */
    Error *migration_blocker;
} BDRVVPCState;
161 
/* Name of the runtime option that overrides the size calculation mode */
#define VPC_OPT_SIZE_CALC "force_size_calc"

/*
 * Runtime (open-time) options understood by the driver.  The only entry
 * is a string option whose accepted values are "chs" and "current_size".
 */
static QemuOptsList vpc_runtime_opts = {
    .name = "vpc-runtime-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head),
    .desc = {
        {
            .name = VPC_OPT_SIZE_CALC,
            .type = QEMU_OPT_STRING,
            .help = "Force disk size calculation to use either CHS geometry, "
                    "or use the disk current_size specified in the VHD footer. "
                    "{chs, current_size}"
        },
        { /* end of list */ }
    }
};

/* Tentative declaration; the definition is not in this part of the file */
static QemuOptsList vpc_create_opts;
179 
/*
 * Computes the VHD checksum of a buffer: the one's complement of the sum
 * of all its bytes, accumulated modulo 2^32.
 *
 * @p:    start of the buffer to sum; it is only read
 * @size: number of bytes to include
 *
 * Returns the checksum in host byte order.  An empty buffer yields
 * 0xFFFFFFFF.  When checksumming a structure that embeds its own checksum
 * field, the caller is expected to zero that field first.
 */
static uint32_t vpc_checksum(const void *p, size_t size)
{
    const uint8_t *buf = p;
    uint32_t res = 0;
    size_t i;   /* same type as @size: no signed/unsigned comparison */

    for (i = 0; i < size; i++) {
        res += buf[i];
    }

    return ~res;
}
191 
192 
/*
 * Format probe: reports a score of 100 when the buffer starts with the
 * 8-byte "conectix" cookie and 0 otherwise, including when fewer than
 * 8 bytes are available.  @filename is not consulted.
 */
static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    if (buf_size < 8) {
        return 0;
    }

    return strncmp((const char *)buf, "conectix", 8) == 0 ? 100 : 0;
}
199 
200 static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts,
201                               Error **errp)
202 {
203     BDRVVPCState *s = bs->opaque;
204     const char *size_calc;
205 
206     size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC);
207 
208     if (!size_calc) {
209        /* no override, use autodetect only */
210     } else if (!strcmp(size_calc, "current_size")) {
211         s->force_use_sz = true;
212     } else if (!strcmp(size_calc, "chs")) {
213         s->force_use_chs = true;
214     } else {
215         error_setg(errp, "Invalid size calculation mode: '%s'", size_calc);
216     }
217 }
218 
219 /*
220  * Microsoft Virtual PC and Microsoft Hyper-V produce and read
221  * VHD image sizes differently.  VPC will rely on CHS geometry,
222  * while Hyper-V and disk2vhd use the size specified in the footer.
223  *
224  * We use a couple of approaches to try and determine the correct method:
225  * look at the Creator App field, and look for images that have CHS
226  * geometry that is the maximum value.
227  *
228  * If the CHS geometry is the maximum CHS geometry, then we assume that
229  * the size is the footer->current_size to avoid truncation.  Otherwise,
230  * we follow the table based on footer->creator_app:
231  *
232  *  Known creator apps:
233  *      'vpc '  :  CHS              Virtual PC (uses disk geometry)
234  *      'qemu'  :  CHS              QEMU (uses disk geometry)
235  *      'qem2'  :  current_size     QEMU (uses current_size)
236  *      'win '  :  current_size     Hyper-V
237  *      'd2v '  :  current_size     Disk2vhd
238  *      'tap\0' :  current_size     XenServer
239  *      'CTXS'  :  current_size     XenConverter
240  *      'wa\0\0':  current_size     Azure
241  *
242  *  The user can override the table values via drive options, however
243  *  even with an override we will still use current_size for images
244  *  that have CHS geometry of the maximum size.
245  */
246 static bool vpc_ignore_current_size(VHDFooter *footer)
247 {
248     return !strncmp(footer->creator_app, "vpc ", 4) ||
249            !strncmp(footer->creator_app, "qemu", 4);
250 }
251 
252 static int vpc_open(BlockDriverState *bs, QDict *options, int flags,
253                     Error **errp)
254 {
255     BDRVVPCState *s = bs->opaque;
256     int i;
257     VHDFooter *footer;
258     QemuOpts *opts = NULL;
259     Error *local_err = NULL;
260     bool use_chs;
261     VHDDynDiskHeader dyndisk_header;
262     uint32_t checksum;
263     uint64_t computed_size;
264     uint64_t pagetable_size;
265     int disk_type = VHD_DYNAMIC;
266     int ret;
267     int64_t bs_size;
268 
269     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
270     if (ret < 0) {
271         return ret;
272     }
273 
274     GRAPH_RDLOCK_GUARD_MAINLOOP();
275 
276     opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort);
277     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
278         ret = -EINVAL;
279         goto fail;
280     }
281 
282     vpc_parse_options(bs, opts, &local_err);
283     if (local_err) {
284         error_propagate(errp, local_err);
285         ret = -EINVAL;
286         goto fail;
287     }
288 
289     ret = bdrv_pread(bs->file, 0, sizeof(s->footer), &s->footer, 0);
290     if (ret < 0) {
291         error_setg(errp, "Unable to read VHD header");
292         goto fail;
293     }
294 
295     footer = &s->footer;
296     if (strncmp(footer->creator, "conectix", 8)) {
297         int64_t offset = bdrv_getlength(bs->file->bs);
298         if (offset < 0) {
299             ret = offset;
300             error_setg(errp, "Invalid file size");
301             goto fail;
302         } else if (offset < sizeof(*footer)) {
303             ret = -EINVAL;
304             error_setg(errp, "File too small for a VHD header");
305             goto fail;
306         }
307 
308         /* If a fixed disk, the footer is found only at the end of the file */
309         ret = bdrv_pread(bs->file, offset - sizeof(*footer), sizeof(*footer),
310                          footer, 0);
311         if (ret < 0) {
312             goto fail;
313         }
314         if (strncmp(footer->creator, "conectix", 8) ||
315             be32_to_cpu(footer->type) != VHD_FIXED) {
316             error_setg(errp, "invalid VPC image");
317             ret = -EINVAL;
318             goto fail;
319         }
320         disk_type = VHD_FIXED;
321     }
322 
323     checksum = be32_to_cpu(footer->checksum);
324     footer->checksum = 0;
325     if (vpc_checksum(footer, sizeof(*footer)) != checksum) {
326         error_setg(errp, "Incorrect header checksum");
327         ret = -EINVAL;
328         goto fail;
329     }
330 
331     /* Write 'checksum' back to footer, or else will leave it with zero. */
332     footer->checksum = cpu_to_be32(checksum);
333 
334     /* The visible size of a image in Virtual PC depends on the geometry
335        rather than on the size stored in the footer (the size in the footer
336        is too large usually) */
337     bs->total_sectors = (int64_t)
338         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
339 
340     /* Use CHS or current_size to determine the image size. */
341     use_chs = vpc_ignore_current_size(footer) || s->force_use_chs;
342 
343     if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) {
344         bs->total_sectors = be64_to_cpu(footer->current_size) /
345                                         BDRV_SECTOR_SIZE;
346     }
347 
348     /* Allow a maximum disk size of 2040 GiB */
349     if (bs->total_sectors > VHD_MAX_SECTORS) {
350         ret = -EFBIG;
351         goto fail;
352     }
353 
354     if (disk_type == VHD_DYNAMIC) {
355         ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset),
356                          sizeof(dyndisk_header), &dyndisk_header, 0);
357         if (ret < 0) {
358             error_setg(errp, "Error reading dynamic VHD header");
359             goto fail;
360         }
361 
362         if (strncmp(dyndisk_header.magic, "cxsparse", 8)) {
363             error_setg(errp, "Invalid header magic");
364             ret = -EINVAL;
365             goto fail;
366         }
367 
368         s->block_size = be32_to_cpu(dyndisk_header.block_size);
369         if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) {
370             error_setg(errp, "Invalid block size %" PRIu32, s->block_size);
371             ret = -EINVAL;
372             goto fail;
373         }
374         s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
375 
376         s->max_table_entries = be32_to_cpu(dyndisk_header.max_table_entries);
377 
378         if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) {
379             error_setg(errp, "Too many blocks");
380             ret = -EINVAL;
381             goto fail;
382         }
383 
384         computed_size = (uint64_t) s->max_table_entries * s->block_size;
385         if (computed_size < bs->total_sectors * 512) {
386             error_setg(errp, "Page table too small");
387             ret = -EINVAL;
388             goto fail;
389         }
390 
391         if (s->max_table_entries > SIZE_MAX / 4 ||
392             s->max_table_entries > (int) INT_MAX / 4) {
393             error_setg(errp, "Max Table Entries too large (%" PRId32 ")",
394                         s->max_table_entries);
395             ret = -EINVAL;
396             goto fail;
397         }
398 
399         pagetable_size = (uint64_t) s->max_table_entries * 4;
400 
401         s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size);
402         if (s->pagetable == NULL) {
403             error_setg(errp, "Unable to allocate memory for page table");
404             ret = -ENOMEM;
405             goto fail;
406         }
407 
408         s->bat_offset = be64_to_cpu(dyndisk_header.table_offset);
409 
410         ret = bdrv_pread(bs->file, s->bat_offset, pagetable_size,
411                          s->pagetable, 0);
412         if (ret < 0) {
413             error_setg(errp, "Error reading pagetable");
414             goto fail;
415         }
416 
417         s->free_data_block_offset =
418             ROUND_UP(s->bat_offset + pagetable_size, 512);
419 
420         for (i = 0; i < s->max_table_entries; i++) {
421             be32_to_cpus(&s->pagetable[i]);
422             if (s->pagetable[i] != 0xFFFFFFFF) {
423                 int64_t next = (512 * (int64_t) s->pagetable[i]) +
424                     s->bitmap_size + s->block_size;
425 
426                 if (next > s->free_data_block_offset) {
427                     s->free_data_block_offset = next;
428                 }
429             }
430         }
431 
432         bs_size = bdrv_getlength(bs->file->bs);
433         if (bs_size < 0) {
434             error_setg_errno(errp, -bs_size, "Unable to learn image size");
435             ret = bs_size;
436             goto fail;
437         }
438         if (s->free_data_block_offset > bs_size) {
439             error_setg(errp, "block-vpc: free_data_block_offset points after "
440                              "the end of file. The image has been truncated.");
441             ret = -EINVAL;
442             goto fail;
443         }
444 
445         s->last_bitmap_offset = (int64_t) -1;
446 
447 #ifdef CACHE
448         s->pageentry_u8 = g_malloc(512);
449         s->pageentry_u32 = s->pageentry_u8;
450         s->pageentry_u16 = s->pageentry_u8;
451         s->last_pagetable = -1;
452 #endif
453     }
454 
455     /* Disable migration when VHD images are used */
456     error_setg(&s->migration_blocker, "The vpc format used by node '%s' "
457                "does not support live migration",
458                bdrv_get_device_or_node_name(bs));
459 
460     ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
461     if (ret < 0) {
462         goto fail;
463     }
464 
465     qemu_co_mutex_init(&s->lock);
466     qemu_opts_del(opts);
467 
468     return 0;
469 
470 fail:
471     qemu_opts_del(opts);
472     qemu_vfree(s->pagetable);
473 #ifdef CACHE
474     g_free(s->pageentry_u8);
475 #endif
476     return ret;
477 }
478 
479 static int vpc_reopen_prepare(BDRVReopenState *state,
480                               BlockReopenQueue *queue, Error **errp)
481 {
482     return 0;
483 }
484 
485 /*
486  * Returns the absolute byte offset of the given sector in the image file.
487  * If the sector is not allocated, -1 is returned instead.
488  * If an error occurred trying to write an updated block bitmap back to
489  * the file, -2 is returned, and the error value is written to *err.
490  * This can only happen for a write operation.
491  *
492  * The parameter write must be 1 if the offset will be used for a write
493  * operation (the block bitmaps is updated then), 0 otherwise.
494  * If write is true then err must not be NULL.
495  */
496 static int64_t coroutine_fn GRAPH_RDLOCK
497 get_image_offset(BlockDriverState *bs, uint64_t offset, bool write, int *err)
498 {
499     BDRVVPCState *s = bs->opaque;
500     uint64_t bitmap_offset, block_offset;
501     uint32_t pagetable_index, offset_in_block;
502 
503     assert(!(write && err == NULL));
504 
505     pagetable_index = offset / s->block_size;
506     offset_in_block = offset % s->block_size;
507 
508     if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
509         return -1; /* not allocated */
510 
511     bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
512     block_offset = bitmap_offset + s->bitmap_size + offset_in_block;
513 
514     /* We must ensure that we don't write to any sectors which are marked as
515        unused in the bitmap. We get away with setting all bits in the block
516        bitmap each time we write to a new block. This might cause Virtual PC to
517        miss sparse read optimization, but it's not a problem in terms of
518        correctness. */
519     if (write && (s->last_bitmap_offset != bitmap_offset)) {
520         g_autofree uint8_t *bitmap = g_malloc(s->bitmap_size);
521         int r;
522 
523         s->last_bitmap_offset = bitmap_offset;
524         memset(bitmap, 0xff, s->bitmap_size);
525         r = bdrv_co_pwrite_sync(bs->file, bitmap_offset, s->bitmap_size, bitmap, 0);
526         if (r < 0) {
527             *err = r;
528             return -2;
529         }
530     }
531 
532     return block_offset;
533 }
534 
535 /*
536  * Writes the footer to the end of the image file. This is needed when the
537  * file grows as it overwrites the old footer
538  *
539  * Returns 0 on success and < 0 on error
540  */
541 static int coroutine_fn GRAPH_RDLOCK rewrite_footer(BlockDriverState *bs)
542 {
543     int ret;
544     BDRVVPCState *s = bs->opaque;
545     int64_t offset = s->free_data_block_offset;
546 
547     ret = bdrv_co_pwrite_sync(bs->file, offset, sizeof(s->footer), &s->footer, 0);
548     if (ret < 0)
549         return ret;
550 
551     return 0;
552 }
553 
554 /*
555  * Allocates a new block. This involves writing a new footer and updating
556  * the Block Allocation Table to use the space at the old end of the image
557  * file (overwriting the old footer)
558  *
559  * Returns the sectors' offset in the image file on success and < 0 on error
560  */
561 static int64_t coroutine_fn GRAPH_RDLOCK
562 alloc_block(BlockDriverState *bs, int64_t offset)
563 {
564     BDRVVPCState *s = bs->opaque;
565     int64_t bat_offset;
566     uint32_t index, bat_value;
567     int ret;
568     g_autofree uint8_t *bitmap = g_malloc(s->bitmap_size);
569 
570     /* Check if sector_num is valid */
571     if ((offset < 0) || (offset > bs->total_sectors * BDRV_SECTOR_SIZE)) {
572         return -EINVAL;
573     }
574 
575     /* Write entry into in-memory BAT */
576     index = offset / s->block_size;
577     assert(s->pagetable[index] == 0xFFFFFFFF);
578     s->pagetable[index] = s->free_data_block_offset / 512;
579 
580     /* Initialize the block's bitmap */
581     memset(bitmap, 0xff, s->bitmap_size);
582     ret = bdrv_co_pwrite_sync(bs->file, s->free_data_block_offset,
583                               s->bitmap_size, bitmap, 0);
584     if (ret < 0) {
585         return ret;
586     }
587 
588     /* Write new footer (the old one will be overwritten) */
589     s->free_data_block_offset += s->block_size + s->bitmap_size;
590     ret = rewrite_footer(bs);
591     if (ret < 0)
592         goto fail;
593 
594     /* Write BAT entry to disk */
595     bat_offset = s->bat_offset + (4 * index);
596     bat_value = cpu_to_be32(s->pagetable[index]);
597     ret = bdrv_co_pwrite_sync(bs->file, bat_offset, 4, &bat_value, 0);
598     if (ret < 0)
599         goto fail;
600 
601     return get_image_offset(bs, offset, false, NULL);
602 
603 fail:
604     s->free_data_block_offset -= (s->block_size + s->bitmap_size);
605     return ret;
606 }
607 
608 static int coroutine_fn
609 vpc_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
610 {
611     BDRVVPCState *s = (BDRVVPCState *)bs->opaque;
612 
613     if (be32_to_cpu(s->footer.type) != VHD_FIXED) {
614         bdi->cluster_size = s->block_size;
615     }
616 
617     return 0;
618 }
619 
620 static int coroutine_fn GRAPH_RDLOCK
621 vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
622               QEMUIOVector *qiov, BdrvRequestFlags flags)
623 {
624     BDRVVPCState *s = bs->opaque;
625     int ret;
626     int64_t image_offset;
627     int64_t n_bytes;
628     int64_t bytes_done = 0;
629     QEMUIOVector local_qiov;
630 
631     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
632         return bdrv_co_preadv(bs->file, offset, bytes, qiov, 0);
633     }
634 
635     qemu_co_mutex_lock(&s->lock);
636     qemu_iovec_init(&local_qiov, qiov->niov);
637 
638     while (bytes > 0) {
639         image_offset = get_image_offset(bs, offset, false, NULL);
640         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
641 
642         if (image_offset == -1) {
643             qemu_iovec_memset(qiov, bytes_done, 0, n_bytes);
644         } else {
645             qemu_iovec_reset(&local_qiov);
646             qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
647 
648             qemu_co_mutex_unlock(&s->lock);
649             ret = bdrv_co_preadv(bs->file, image_offset, n_bytes,
650                                  &local_qiov, 0);
651             qemu_co_mutex_lock(&s->lock);
652             if (ret < 0) {
653                 goto fail;
654             }
655         }
656 
657         bytes -= n_bytes;
658         offset += n_bytes;
659         bytes_done += n_bytes;
660     }
661 
662     ret = 0;
663 fail:
664     qemu_iovec_destroy(&local_qiov);
665     qemu_co_mutex_unlock(&s->lock);
666 
667     return ret;
668 }
669 
670 static int coroutine_fn GRAPH_RDLOCK
671 vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
672                QEMUIOVector *qiov, BdrvRequestFlags flags)
673 {
674     BDRVVPCState *s = bs->opaque;
675     int64_t image_offset;
676     int64_t n_bytes;
677     int64_t bytes_done = 0;
678     int ret = 0;
679     QEMUIOVector local_qiov;
680 
681     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
682         return bdrv_co_pwritev(bs->file, offset, bytes, qiov, 0);
683     }
684 
685     qemu_co_mutex_lock(&s->lock);
686     qemu_iovec_init(&local_qiov, qiov->niov);
687 
688     while (bytes > 0) {
689         image_offset = get_image_offset(bs, offset, true, &ret);
690         if (image_offset == -2) {
691             /* Failed to write block bitmap: can't proceed with write */
692             goto fail;
693         }
694         n_bytes = MIN(bytes, s->block_size - (offset % s->block_size));
695 
696         if (image_offset == -1) {
697             image_offset = alloc_block(bs, offset);
698             if (image_offset < 0) {
699                 ret = image_offset;
700                 goto fail;
701             }
702         }
703 
704         qemu_iovec_reset(&local_qiov);
705         qemu_iovec_concat(&local_qiov, qiov, bytes_done, n_bytes);
706 
707         qemu_co_mutex_unlock(&s->lock);
708         ret = bdrv_co_pwritev(bs->file, image_offset, n_bytes,
709                               &local_qiov, 0);
710         qemu_co_mutex_lock(&s->lock);
711         if (ret < 0) {
712             goto fail;
713         }
714 
715         bytes -= n_bytes;
716         offset += n_bytes;
717         bytes_done += n_bytes;
718     }
719 
720     ret = 0;
721 fail:
722     qemu_iovec_destroy(&local_qiov);
723     qemu_co_mutex_unlock(&s->lock);
724 
725     return ret;
726 }
727 
728 static int coroutine_fn GRAPH_RDLOCK
729 vpc_co_block_status(BlockDriverState *bs, bool want_zero,
730                     int64_t offset, int64_t bytes,
731                     int64_t *pnum, int64_t *map,
732                     BlockDriverState **file)
733 {
734     BDRVVPCState *s = bs->opaque;
735     int64_t image_offset;
736     bool allocated;
737     int ret;
738     int64_t n;
739 
740     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
741         *pnum = bytes;
742         *map = offset;
743         *file = bs->file->bs;
744         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_RECURSE;
745     }
746 
747     qemu_co_mutex_lock(&s->lock);
748 
749     image_offset = get_image_offset(bs, offset, false, NULL);
750     allocated = (image_offset != -1);
751     *pnum = 0;
752     ret = BDRV_BLOCK_ZERO;
753 
754     do {
755         /* All sectors in a block are contiguous (without using the bitmap) */
756         n = ROUND_UP(offset + 1, s->block_size) - offset;
757         n = MIN(n, bytes);
758 
759         *pnum += n;
760         offset += n;
761         bytes -= n;
762         /* *pnum can't be greater than one block for allocated
763          * sectors since there is always a bitmap in between. */
764         if (allocated) {
765             *file = bs->file->bs;
766             *map = image_offset;
767             ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
768             break;
769         }
770         if (bytes == 0) {
771             break;
772         }
773         image_offset = get_image_offset(bs, offset, false, NULL);
774     } while (image_offset == -1);
775 
776     qemu_co_mutex_unlock(&s->lock);
777     return ret;
778 }
779 
780 /*
781  * Calculates the number of cylinders, heads and sectors per cylinder
782  * based on a given number of sectors. This is the algorithm described
783  * in the VHD specification.
784  *
785  * Note that the geometry doesn't always exactly match total_sectors but
786  * may round it down.
787  *
788  * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override
789  * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB)
790  * and instead allow up to 255 heads.
791  */
792 static int calculate_geometry(int64_t total_sectors, uint16_t *cyls,
793     uint8_t *heads, uint8_t *secs_per_cyl)
794 {
795     uint32_t cyls_times_heads;
796 
797     total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY);
798 
799     if (total_sectors >= 65535LL * 16 * 63) {
800         *secs_per_cyl = 255;
801         *heads = 16;
802         cyls_times_heads = total_sectors / *secs_per_cyl;
803     } else {
804         *secs_per_cyl = 17;
805         cyls_times_heads = total_sectors / *secs_per_cyl;
806         *heads = DIV_ROUND_UP(cyls_times_heads, 1024);
807 
808         if (*heads < 4) {
809             *heads = 4;
810         }
811 
812         if (cyls_times_heads >= (*heads * 1024) || *heads > 16) {
813             *secs_per_cyl = 31;
814             *heads = 16;
815             cyls_times_heads = total_sectors / *secs_per_cyl;
816         }
817 
818         if (cyls_times_heads >= (*heads * 1024)) {
819             *secs_per_cyl = 63;
820             *heads = 16;
821             cyls_times_heads = total_sectors / *secs_per_cyl;
822         }
823     }
824 
825     *cyls = cyls_times_heads / *heads;
826 
827     return 0;
828 }
829 
830 static int coroutine_fn create_dynamic_disk(BlockBackend *blk, VHDFooter *footer,
831                                             int64_t total_sectors)
832 {
833     VHDDynDiskHeader dyndisk_header;
834     uint8_t bat_sector[512];
835     size_t block_size, num_bat_entries;
836     int i;
837     int ret;
838     int64_t offset = 0;
839 
840     /* Write the footer (twice: at the beginning and at the end) */
841     block_size = 0x200000;
842     num_bat_entries = DIV_ROUND_UP(total_sectors, block_size / 512);
843 
844     ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0);
845     if (ret < 0) {
846         goto fail;
847     }
848 
849     offset = 1536 + ((num_bat_entries * 4 + 511) & ~511);
850     ret = blk_co_pwrite(blk, offset, sizeof(*footer), footer, 0);
851     if (ret < 0) {
852         goto fail;
853     }
854 
855     /* Write the initial BAT */
856     offset = 3 * 512;
857 
858     memset(bat_sector, 0xFF, 512);
859     for (i = 0; i < DIV_ROUND_UP(num_bat_entries * 4, 512); i++) {
860         ret = blk_co_pwrite(blk, offset, 512, bat_sector, 0);
861         if (ret < 0) {
862             goto fail;
863         }
864         offset += 512;
865     }
866 
867     /* Prepare the Dynamic Disk Header */
868     memset(&dyndisk_header, 0, sizeof(dyndisk_header));
869 
870     memcpy(dyndisk_header.magic, "cxsparse", 8);
871 
872     /*
873      * Note: The spec is actually wrong here for data_offset, it says
874      * 0xFFFFFFFF, but MS tools expect all 64 bits to be set.
875      */
876     dyndisk_header.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
877     dyndisk_header.table_offset = cpu_to_be64(3 * 512);
878     dyndisk_header.version = cpu_to_be32(0x00010000);
879     dyndisk_header.block_size = cpu_to_be32(block_size);
880     dyndisk_header.max_table_entries = cpu_to_be32(num_bat_entries);
881 
882     dyndisk_header.checksum = cpu_to_be32(
883         vpc_checksum(&dyndisk_header, sizeof(dyndisk_header)));
884 
885     /* Write the header */
886     offset = 512;
887 
888     ret = blk_co_pwrite(blk, offset, sizeof(dyndisk_header), &dyndisk_header, 0);
889     if (ret < 0) {
890         goto fail;
891     }
892 
893     ret = 0;
894  fail:
895     return ret;
896 }
897 
898 static int coroutine_fn create_fixed_disk(BlockBackend *blk, VHDFooter *footer,
899                                           int64_t total_size, Error **errp)
900 {
901     int ret;
902 
903     /* Add footer to total size */
904     total_size += sizeof(*footer);
905 
906     ret = blk_co_truncate(blk, total_size, false, PREALLOC_MODE_OFF, 0, errp);
907     if (ret < 0) {
908         return ret;
909     }
910 
911     ret = blk_co_pwrite(blk, total_size - sizeof(*footer), sizeof(*footer),
912                         footer, 0);
913     if (ret < 0) {
914         error_setg_errno(errp, -ret, "Unable to write VHD header");
915         return ret;
916     }
917 
918     return 0;
919 }
920 
921 static int calculate_rounded_image_size(BlockdevCreateOptionsVpc *vpc_opts,
922                                         uint16_t *out_cyls,
923                                         uint8_t *out_heads,
924                                         uint8_t *out_secs_per_cyl,
925                                         int64_t *out_total_sectors,
926                                         Error **errp)
927 {
928     int64_t total_size = vpc_opts->size;
929     uint16_t cyls = 0;
930     uint8_t heads = 0;
931     uint8_t secs_per_cyl = 0;
932     int64_t total_sectors;
933     int i;
934 
935     /*
936      * Calculate matching total_size and geometry. Increase the number of
937      * sectors requested until we get enough (or fail). This ensures that
938      * qemu-img convert doesn't truncate images, but rather rounds up.
939      *
940      * If the image size can't be represented by a spec conformant CHS geometry,
941      * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use
942      * the image size from the VHD footer to calculate total_sectors.
943      */
944     if (vpc_opts->force_size) {
945         /* This will force the use of total_size for sector count, below */
946         cyls         = VHD_CHS_MAX_C;
947         heads        = VHD_CHS_MAX_H;
948         secs_per_cyl = VHD_CHS_MAX_S;
949     } else {
950         total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE);
951         for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) {
952             calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl);
953         }
954     }
955 
956     if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) {
957         total_sectors = total_size / BDRV_SECTOR_SIZE;
958         /* Allow a maximum disk size of 2040 GiB */
959         if (total_sectors > VHD_MAX_SECTORS) {
960             error_setg(errp, "Disk size is too large, max size is 2040 GiB");
961             return -EFBIG;
962         }
963     } else {
964         total_sectors = (int64_t) cyls * heads * secs_per_cyl;
965     }
966 
967     *out_total_sectors = total_sectors;
968     if (out_cyls) {
969         *out_cyls = cyls;
970         *out_heads = heads;
971         *out_secs_per_cyl = secs_per_cyl;
972     }
973 
974     return 0;
975 }
976 
977 static int coroutine_fn GRAPH_UNLOCKED
978 vpc_co_create(BlockdevCreateOptions *opts, Error **errp)
979 {
980     BlockdevCreateOptionsVpc *vpc_opts;
981     BlockBackend *blk = NULL;
982     BlockDriverState *bs = NULL;
983 
984     VHDFooter footer;
985     uint16_t cyls = 0;
986     uint8_t heads = 0;
987     uint8_t secs_per_cyl = 0;
988     int64_t total_sectors;
989     int64_t total_size;
990     int disk_type;
991     int ret = -EIO;
992     QemuUUID uuid;
993 
994     assert(opts->driver == BLOCKDEV_DRIVER_VPC);
995     vpc_opts = &opts->u.vpc;
996 
997     /* Validate options and set default values */
998     total_size = vpc_opts->size;
999 
1000     if (!vpc_opts->has_subformat) {
1001         vpc_opts->subformat = BLOCKDEV_VPC_SUBFORMAT_DYNAMIC;
1002     }
1003     switch (vpc_opts->subformat) {
1004     case BLOCKDEV_VPC_SUBFORMAT_DYNAMIC:
1005         disk_type = VHD_DYNAMIC;
1006         break;
1007     case BLOCKDEV_VPC_SUBFORMAT_FIXED:
1008         disk_type = VHD_FIXED;
1009         break;
1010     default:
1011         g_assert_not_reached();
1012     }
1013 
1014     /* Create BlockBackend to write to the image */
1015     bs = bdrv_co_open_blockdev_ref(vpc_opts->file, errp);
1016     if (bs == NULL) {
1017         return -EIO;
1018     }
1019 
1020     blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
1021                              errp);
1022     if (!blk) {
1023         ret = -EPERM;
1024         goto out;
1025     }
1026     blk_set_allow_write_beyond_eof(blk, true);
1027 
1028     /* Get geometry and check that it matches the image size*/
1029     ret = calculate_rounded_image_size(vpc_opts, &cyls, &heads, &secs_per_cyl,
1030                                        &total_sectors, errp);
1031     if (ret < 0) {
1032         goto out;
1033     }
1034 
1035     if (total_size != total_sectors * BDRV_SECTOR_SIZE) {
1036         error_setg(errp, "The requested image size cannot be represented in "
1037                          "CHS geometry");
1038         error_append_hint(errp, "Try size=%llu or force-size=on (the "
1039                                 "latter makes the image incompatible with "
1040                                 "Virtual PC)",
1041                           total_sectors * BDRV_SECTOR_SIZE);
1042         ret = -EINVAL;
1043         goto out;
1044     }
1045 
1046     /* Prepare the Hard Disk Footer */
1047     memset(&footer, 0, sizeof(footer));
1048 
1049     memcpy(footer.creator, "conectix", 8);
1050     if (vpc_opts->force_size) {
1051         memcpy(footer.creator_app, "qem2", 4);
1052     } else {
1053         memcpy(footer.creator_app, "qemu", 4);
1054     }
1055     memcpy(footer.creator_os, "Wi2k", 4);
1056 
1057     footer.features = cpu_to_be32(0x02);
1058     footer.version = cpu_to_be32(0x00010000);
1059     if (disk_type == VHD_DYNAMIC) {
1060         footer.data_offset = cpu_to_be64(sizeof(footer));
1061     } else {
1062         footer.data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL);
1063     }
1064     footer.timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
1065 
1066     /* Version of Virtual PC 2007 */
1067     footer.major = cpu_to_be16(0x0005);
1068     footer.minor = cpu_to_be16(0x0003);
1069     footer.orig_size = cpu_to_be64(total_size);
1070     footer.current_size = cpu_to_be64(total_size);
1071     footer.cyls = cpu_to_be16(cyls);
1072     footer.heads = heads;
1073     footer.secs_per_cyl = secs_per_cyl;
1074 
1075     footer.type = cpu_to_be32(disk_type);
1076 
1077     qemu_uuid_generate(&uuid);
1078     footer.uuid = uuid;
1079 
1080     footer.checksum = cpu_to_be32(vpc_checksum(&footer, sizeof(footer)));
1081 
1082     if (disk_type == VHD_DYNAMIC) {
1083         ret = create_dynamic_disk(blk, &footer, total_sectors);
1084         if (ret < 0) {
1085             error_setg(errp, "Unable to create or write VHD header");
1086         }
1087     } else {
1088         ret = create_fixed_disk(blk, &footer, total_size, errp);
1089     }
1090 
1091 out:
1092     blk_co_unref(blk);
1093     bdrv_co_unref(bs);
1094     return ret;
1095 }
1096 
1097 static int coroutine_fn GRAPH_UNLOCKED
1098 vpc_co_create_opts(BlockDriver *drv, const char *filename,
1099                    QemuOpts *opts, Error **errp)
1100 {
1101     BlockdevCreateOptions *create_options = NULL;
1102     QDict *qdict;
1103     Visitor *v;
1104     BlockDriverState *bs = NULL;
1105     int ret;
1106 
1107     static const QDictRenames opt_renames[] = {
1108         { VPC_OPT_FORCE_SIZE,           "force-size" },
1109         { NULL, NULL },
1110     };
1111 
1112     /* Parse options and convert legacy syntax */
1113     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vpc_create_opts, true);
1114 
1115     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
1116         ret = -EINVAL;
1117         goto fail;
1118     }
1119 
1120     /* Create and open the file (protocol layer) */
1121     ret = bdrv_co_create_file(filename, opts, errp);
1122     if (ret < 0) {
1123         goto fail;
1124     }
1125 
1126     bs = bdrv_co_open(filename, NULL, NULL,
1127                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
1128     if (bs == NULL) {
1129         ret = -EIO;
1130         goto fail;
1131     }
1132 
1133     /* Now get the QAPI type BlockdevCreateOptions */
1134     qdict_put_str(qdict, "driver", "vpc");
1135     qdict_put_str(qdict, "file", bs->node_name);
1136 
1137     v = qobject_input_visitor_new_flat_confused(qdict, errp);
1138     if (!v) {
1139         ret = -EINVAL;
1140         goto fail;
1141     }
1142 
1143     visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1144     visit_free(v);
1145     if (!create_options) {
1146         ret = -EINVAL;
1147         goto fail;
1148     }
1149 
1150     /* Silently round up size */
1151     assert(create_options->driver == BLOCKDEV_DRIVER_VPC);
1152     create_options->u.vpc.size =
1153         ROUND_UP(create_options->u.vpc.size, BDRV_SECTOR_SIZE);
1154 
1155     if (!create_options->u.vpc.force_size) {
1156         int64_t total_sectors;
1157         ret = calculate_rounded_image_size(&create_options->u.vpc, NULL, NULL,
1158                                            NULL, &total_sectors, errp);
1159         if (ret < 0) {
1160             goto fail;
1161         }
1162 
1163         create_options->u.vpc.size = total_sectors * BDRV_SECTOR_SIZE;
1164     }
1165 
1166 
1167     /* Create the vpc image (format layer) */
1168     ret = vpc_co_create(create_options, errp);
1169 
1170 fail:
1171     qobject_unref(qdict);
1172     bdrv_co_unref(bs);
1173     qapi_free_BlockdevCreateOptions(create_options);
1174     return ret;
1175 }
1176 
1177 
1178 static int GRAPH_RDLOCK vpc_has_zero_init(BlockDriverState *bs)
1179 {
1180     BDRVVPCState *s = bs->opaque;
1181 
1182     if (be32_to_cpu(s->footer.type) == VHD_FIXED) {
1183         return bdrv_has_zero_init(bs->file->bs);
1184     } else {
1185         return 1;
1186     }
1187 }
1188 
1189 static void vpc_close(BlockDriverState *bs)
1190 {
1191     BDRVVPCState *s = bs->opaque;
1192     qemu_vfree(s->pagetable);
1193 #ifdef CACHE
1194     g_free(s->pageentry_u8);
1195 #endif
1196 
1197     migrate_del_blocker(&s->migration_blocker);
1198 }
1199 
/*
 * Image creation options understood by the vpc driver. The list is installed
 * as bdrv_vpc.create_opts and is also used by vpc_co_create_opts() to filter
 * the legacy options before converting them to a QDict.
 */
static QemuOptsList vpc_create_opts = {
    .name = "vpc-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head),
    .desc = {
        {
            /* Requested virtual disk size */
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            /* Image subformat: dynamic or fixed */
            .name = BLOCK_OPT_SUBFMT,
            .type = QEMU_OPT_STRING,
            .help =
                "Type of virtual hard disk format. Supported formats are "
                "{dynamic (default) | fixed} "
        },
        {
            /* Renamed to "force-size" by vpc_co_create_opts() */
            .name = VPC_OPT_FORCE_SIZE,
            .type = QEMU_OPT_BOOL,
            .help = "Force disk size calculation to use the actual size "
                    "specified, rather than using the nearest CHS-based "
                    "calculation"
        },
        { /* end of list */ }
    }
};
1226 
/*
 * NULL-terminated list of option names installed as
 * bdrv_vpc.strong_runtime_opts.
 */
static const char *const vpc_strong_runtime_opts[] = {
    VPC_OPT_SIZE_CALC,

    NULL
};
1232 
/*
 * Driver description for the "vpc" format. It ties the callbacks implemented
 * in this file together and is handed to bdrv_register() by bdrv_vpc_init().
 */
static BlockDriver bdrv_vpc = {
    .format_name    = "vpc",
    .instance_size  = sizeof(BDRVVPCState),

    /* Probing, open/close/reopen and image creation */
    .bdrv_probe             = vpc_probe,
    .bdrv_open              = vpc_open,
    .bdrv_close             = vpc_close,
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
    .bdrv_child_perm        = bdrv_default_perms,
    .bdrv_co_create         = vpc_co_create,
    .bdrv_co_create_opts    = vpc_co_create_opts,

    /* Data path */
    .bdrv_co_preadv             = vpc_co_preadv,
    .bdrv_co_pwritev            = vpc_co_pwritev,
    .bdrv_co_block_status       = vpc_co_block_status,

    .bdrv_co_get_info       = vpc_co_get_info,

    /* Static properties and option tables */
    .is_format              = true,
    .create_opts            = &vpc_create_opts,
    .bdrv_has_zero_init     = vpc_has_zero_init,
    .strong_runtime_opts    = vpc_strong_runtime_opts,
};
1256 
/* Register the vpc driver description with the block layer. */
static void bdrv_vpc_init(void)
{
    bdrv_register(&bdrv_vpc);
}

/*
 * Hook bdrv_vpc_init() into the block_init() mechanism
 * (presumably run at block module initialisation - see qemu/module.h).
 */
block_init(bdrv_vpc_init);
1263