#!/usr/bin/env python3

# This tool reads a disk image in any format and converts it to qcow2,
# writing the result directly to stdout.
#
# Copyright (C) 2024 Igalia, S.L.
#
# Authors: Alberto Garcia <berto@igalia.com>
#          Madeeha Javed <javed@igalia.com>
#
# SPDX-License-Identifier: GPL-2.0-or-later
#
# qcow2 files produced by this script are always arranged like this:
#
# - qcow2 header
# - refcount table
# - refcount blocks
# - L1 table
# - L2 tables
# - Data clusters
#
# A note about variable names: in qcow2 there is one refcount table
# and one (active) L1 table, although each can occupy several
# clusters. For the sake of simplicity the code sometimes talks about
# refcount tables and L1 tables when referring to those clusters.
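#
# Example usage (stdout must be redirected, the script refuses to
# write to a terminal):
#
#   qcow2-to-stdout.py -f qcow2 source.qcow2 > destination.qcow2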

import argparse
import errno
import math
import os
import signal
import struct
import subprocess
import sys
import tempfile
import time
from contextlib import contextmanager

QCOW2_DEFAULT_CLUSTER_SIZE = 65536
QCOW2_DEFAULT_REFCOUNT_BITS = 16
QCOW2_FEATURE_NAME_TABLE = 0x6803F857
QCOW2_DATA_FILE_NAME_STRING = 0x44415441
QCOW2_V3_HEADER_LENGTH = 112  # Header length in QEMU 9.0. Must be a multiple of 8
QCOW2_INCOMPAT_DATA_FILE_BIT = 2
QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT = 1
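# QCOW_OFLAG_COPIED is set on L1/L2 entries that point to clusters with a
# refcount of exactly 1, i.e. they can be written without copy-on-write.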
QCOW_OFLAG_COPIED = 1 << 63
QEMU_STORAGE_DAEMON = "qemu-storage-daemon"


def bitmap_set(bitmap, idx):
    bitmap[idx // 8] |= 1 << (idx % 8)


def bitmap_is_set(bitmap, idx):
    return (bitmap[idx // 8] & (1 << (idx % 8))) != 0


def bitmap_iterator(bitmap, length):
    for idx in range(length):
        if bitmap_is_set(bitmap, idx):
            yield idx


def align_up(num, d):
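    # Round num up to the next multiple of d, e.g. align_up(3000, 512) == 3072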
    return d * math.ceil(num / d)


# Holes in the input file contain only zeroes so we can skip them and
# save time. This function returns the indexes of the clusters that
# are known to contain data. Those are the ones that we need to read.
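# (On filesystems without SEEK_HOLE/SEEK_DATA support the whole file is
# normally reported as a single data extent, so every cluster is read.)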
def clusters_with_data(fd, cluster_size):
    data_to = 0
    while True:
        try:
            data_from = os.lseek(fd, data_to, os.SEEK_DATA)
            data_to = align_up(os.lseek(fd, data_from, os.SEEK_HOLE), cluster_size)
            for idx in range(data_from // cluster_size, data_to // cluster_size):
                yield idx
        except OSError as err:
            if err.errno == errno.ENXIO:  # End of file reached
                break
            raise err


# write_qcow2_content() expects a raw input file. If we have a different
# format we can use qemu-storage-daemon to make it appear as raw.
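# The daemon exposes the image as a raw file via a FUSE export mounted on a
# temporary file, which is cleaned up again when the context manager exits.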
@contextmanager
def get_input_as_raw_file(input_file, input_format):
    if input_format == "raw":
        yield input_file
        return
    try:
        temp_dir = tempfile.mkdtemp()
        pid_file = os.path.join(temp_dir, "pid")
        raw_file = os.path.join(temp_dir, "raw")
        open(raw_file, "wb").close()
        ret = subprocess.run(
            [
                QEMU_STORAGE_DAEMON,
                "--daemonize",
                "--pidfile", pid_file,
                "--blockdev", f"driver=file,node-name=file0,filename={input_file},read-only=on",
                "--blockdev", f"driver={input_format},node-name=disk0,file=file0,read-only=on",
                "--export", f"type=fuse,id=export0,node-name=disk0,mountpoint={raw_file},writable=off",
            ],
            capture_output=True,
        )
        if ret.returncode != 0:
            sys.exit("[Error] Could not start the qemu-storage-daemon:\n" +
                     ret.stderr.decode().rstrip('\n'))
        yield raw_file
    finally:
        # Kill the storage daemon on exit
        # and remove all temporary files
        if os.path.exists(pid_file):
            with open(pid_file, "r") as f:
                pid = int(f.readline())
            os.kill(pid, signal.SIGTERM)
            while os.path.exists(pid_file):
                time.sleep(0.1)
        os.unlink(raw_file)
        os.rmdir(temp_dir)


def write_features(cluster, offset, data_file_name):
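    # Each header extension written here consists of a 4-byte magic, a
    # 4-byte length and the extension data padded to a multiple of 8 bytes.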
    if data_file_name is not None:
        encoded_name = data_file_name.encode("utf-8")
        padded_name_len = align_up(len(encoded_name), 8)
        struct.pack_into(f">II{padded_name_len}s", cluster, offset,
                         QCOW2_DATA_FILE_NAME_STRING,
                         len(encoded_name),
                         encoded_name)
        offset += 8 + padded_name_len

    qcow2_features = [
        # Incompatible
        (0, 0, "dirty bit"),
        (0, 1, "corrupt bit"),
        (0, 2, "external data file"),
        (0, 3, "compression type"),
        (0, 4, "extended L2 entries"),
        # Compatible
        (1, 0, "lazy refcounts"),
        # Autoclear
        (2, 0, "bitmaps"),
        (2, 1, "raw external data"),
    ]
    struct.pack_into(">I", cluster, offset, QCOW2_FEATURE_NAME_TABLE)
    struct.pack_into(">I", cluster, offset + 4, len(qcow2_features) * 48)
    offset += 8
    for feature_type, feature_bit, feature_name in qcow2_features:
        struct.pack_into(">BB46s", cluster, offset,
                         feature_type, feature_bit, feature_name.encode("ascii"))
        offset += 48


def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, data_file_raw):
    # Some basic values
    l1_entries_per_table = cluster_size // 8
    l2_entries_per_table = cluster_size // 8
    refcounts_per_table  = cluster_size // 8
    refcounts_per_block  = cluster_size * 8 // refcount_bits
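    # With the defaults (64 KB clusters, 16-bit refcounts) each of these
    # tables holds 8192 entries and each refcount block holds 32768.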

    # Virtual disk size, number of data clusters and L1 entries
    disk_size = align_up(os.path.getsize(input_file), 512)
    total_data_clusters = math.ceil(disk_size / cluster_size)
    l1_entries = math.ceil(total_data_clusters / l2_entries_per_table)
    allocated_l1_tables = math.ceil(l1_entries / l1_entries_per_table)

    # Max L1 table size is 32 MB (QCOW_MAX_L1_SIZE in block/qcow2.h)
    if (l1_entries * 8) > (32 * 1024 * 1024):
        sys.exit("[Error] The image size is too large. Try using a larger cluster size.")

    # Two bitmaps indicating which L1 and L2 entries are set
    l1_bitmap = bytearray(allocated_l1_tables * l1_entries_per_table // 8)
    l2_bitmap = bytearray(l1_entries * l2_entries_per_table // 8)
    allocated_l2_tables = 0
    allocated_data_clusters = 0

    if data_file_raw:
        # If data_file_raw is set then all clusters are allocated and
        # we don't need to read the input file at all.
        allocated_l2_tables = l1_entries
        for idx in range(l1_entries):
            bitmap_set(l1_bitmap, idx)
        for idx in range(total_data_clusters):
            bitmap_set(l2_bitmap, idx)
    else:
        # Open the input file for reading
        fd = os.open(input_file, os.O_RDONLY)
        zero_cluster = bytes(cluster_size)
        # Read all the clusters that contain data
        for idx in clusters_with_data(fd, cluster_size):
            cluster = os.pread(fd, cluster_size, cluster_size * idx)
            # If the last cluster is smaller than cluster_size pad it with zeroes
            if len(cluster) < cluster_size:
                cluster += bytes(cluster_size - len(cluster))
            # If a cluster has non-zero data then it must be allocated
            # in the output file and its L2 entry must be set
            if cluster != zero_cluster:
                bitmap_set(l2_bitmap, idx)
                allocated_data_clusters += 1
                # Allocated data clusters also need their corresponding L1 entry and L2 table
                l1_idx = math.floor(idx / l2_entries_per_table)
                if not bitmap_is_set(l1_bitmap, l1_idx):
                    bitmap_set(l1_bitmap, l1_idx)
                    allocated_l2_tables += 1

    # Total amount of allocated clusters excluding the refcount blocks and table
    # (the initial 1 is the cluster that holds the header and its extensions)
    total_allocated_clusters = 1 + allocated_l1_tables + allocated_l2_tables
    if data_file_name is None:
        total_allocated_clusters += allocated_data_clusters

    # Clusters allocated for the refcount blocks and table
    allocated_refcount_blocks = math.ceil(total_allocated_clusters  / refcounts_per_block)
    allocated_refcount_tables = math.ceil(allocated_refcount_blocks / refcounts_per_table)

    # Now we have a problem because allocated_refcount_blocks and allocated_refcount_tables...
    # (a) increase total_allocated_clusters, and
    # (b) need to be recalculated when total_allocated_clusters is increased
    # So we need to repeat the calculation as long as the numbers change
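    # (the loop converges quickly: each additional refcount block covers
    # refcounts_per_block clusters, far more than the clusters it adds)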
    while True:
        new_total_allocated_clusters = total_allocated_clusters + allocated_refcount_tables + allocated_refcount_blocks
        new_allocated_refcount_blocks = math.ceil(new_total_allocated_clusters / refcounts_per_block)
        if new_allocated_refcount_blocks > allocated_refcount_blocks:
            allocated_refcount_blocks = new_allocated_refcount_blocks
            allocated_refcount_tables = math.ceil(allocated_refcount_blocks / refcounts_per_table)
        else:
            break

    # Now that we have the final numbers we can update total_allocated_clusters
    total_allocated_clusters += allocated_refcount_tables + allocated_refcount_blocks

    # At this point we have the exact number of clusters that the output
    # image is going to use so we can calculate all the offsets.
    current_cluster_idx = 1

    refcount_table_offset = current_cluster_idx * cluster_size
    current_cluster_idx += allocated_refcount_tables

    refcount_block_offset = current_cluster_idx * cluster_size
    current_cluster_idx += allocated_refcount_blocks

    l1_table_offset = current_cluster_idx * cluster_size
    current_cluster_idx += allocated_l1_tables

    l2_table_offset = current_cluster_idx * cluster_size
    current_cluster_idx += allocated_l2_tables

    data_clusters_offset = current_cluster_idx * cluster_size

    # Calculate some values used in the qcow2 header
    if allocated_l1_tables == 0:
        l1_table_offset = 0

    hdr_cluster_bits = int(math.log2(cluster_size))
    hdr_refcount_bits = int(math.log2(refcount_bits))
    hdr_length = QCOW2_V3_HEADER_LENGTH
    hdr_incompat_features = 0
    if data_file_name is not None:
        hdr_incompat_features |= 1 << QCOW2_INCOMPAT_DATA_FILE_BIT
    hdr_autoclear_features = 0
    if data_file_raw:
        hdr_autoclear_features |= 1 << QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT

    ### Write qcow2 header
    cluster = bytearray(cluster_size)
    struct.pack_into(">4sIQIIQIIQQIIQQQQII", cluster, 0,
        b"QFI\xfb",            # QCOW magic string
        3,                     # version
        0,                     # backing file offset
        0,                     # backing file size
        hdr_cluster_bits,
        disk_size,
        0,                     # encryption method
        l1_entries,
        l1_table_offset,
        refcount_table_offset,
        allocated_refcount_tables,
        0,                     # number of snapshots
        0,                     # snapshot table offset
        hdr_incompat_features,
        0,                     # compatible features
        hdr_autoclear_features,
        hdr_refcount_bits,
        hdr_length,
    )
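    # The remaining bytes up to hdr_length (compression type and padding)
    # are left as zero, i.e. the default zlib compression type.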

    write_features(cluster, hdr_length, data_file_name)

    sys.stdout.buffer.write(cluster)

    ### Write refcount table
    cur_offset = refcount_block_offset
    remaining_refcount_table_entries = allocated_refcount_blocks  # Each entry is a pointer to a refcount block
    while remaining_refcount_table_entries > 0:
        cluster = bytearray(cluster_size)
        to_write = min(remaining_refcount_table_entries, refcounts_per_table)
        remaining_refcount_table_entries -= to_write
        for idx in range(to_write):
            struct.pack_into(">Q", cluster, idx * 8, cur_offset)
            cur_offset += cluster_size
        sys.stdout.buffer.write(cluster)

    ### Write refcount blocks
    remaining_refcount_block_entries = total_allocated_clusters  # One entry for each allocated cluster
    for tbl in range(allocated_refcount_blocks):
        cluster = bytearray(cluster_size)
        to_write = min(remaining_refcount_block_entries, refcounts_per_block)
        remaining_refcount_block_entries -= to_write
        # All refcount entries contain the number 1. The only difference
        # is their bit width, defined when the image is created.
        for idx in range(to_write):
            if refcount_bits == 64:
                struct.pack_into(">Q", cluster, idx * 8, 1)
            elif refcount_bits == 32:
                struct.pack_into(">L", cluster, idx * 4, 1)
            elif refcount_bits == 16:
                struct.pack_into(">H", cluster, idx * 2, 1)
            elif refcount_bits == 8:
                cluster[idx] = 1
            elif refcount_bits == 4:
                cluster[idx // 2] |= 1 << ((idx % 2) * 4)
            elif refcount_bits == 2:
                cluster[idx // 4] |= 1 << ((idx % 4) * 2)
            elif refcount_bits == 1:
                cluster[idx // 8] |= 1 << (idx % 8)
        sys.stdout.buffer.write(cluster)

    ### Write L1 table
    cur_offset = l2_table_offset
    for tbl in range(allocated_l1_tables):
        cluster = bytearray(cluster_size)
        for idx in range(l1_entries_per_table):
            l1_idx = tbl * l1_entries_per_table + idx
            if bitmap_is_set(l1_bitmap, l1_idx):
                struct.pack_into(">Q", cluster, idx * 8, cur_offset | QCOW_OFLAG_COPIED)
                cur_offset += cluster_size
        sys.stdout.buffer.write(cluster)

    ### Write L2 tables
    cur_offset = data_clusters_offset
    for tbl in range(l1_entries):
        # Skip the empty L2 tables. We can identify them because
        # there is no L1 entry pointing at them.
        if bitmap_is_set(l1_bitmap, tbl):
            cluster = bytearray(cluster_size)
            for idx in range(l2_entries_per_table):
                l2_idx = tbl * l2_entries_per_table + idx
                if bitmap_is_set(l2_bitmap, l2_idx):
                    if data_file_name is None:
                        struct.pack_into(">Q", cluster, idx * 8, cur_offset | QCOW_OFLAG_COPIED)
                        cur_offset += cluster_size
                    else:
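                        # With an external data file the guest data stays in the
                        # input file, so each entry points to the same offset there.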
                        struct.pack_into(">Q", cluster, idx * 8, (l2_idx * cluster_size) | QCOW_OFLAG_COPIED)
            sys.stdout.buffer.write(cluster)

    ### Write data clusters
    if data_file_name is None:
        for idx in bitmap_iterator(l2_bitmap, total_data_clusters):
            cluster = os.pread(fd, cluster_size, cluster_size * idx)
            # If the last cluster is smaller than cluster_size pad it with zeroes
            if len(cluster) < cluster_size:
                cluster += bytes(cluster_size - len(cluster))
            sys.stdout.buffer.write(cluster)

    if not data_file_raw:
        os.close(fd)


def main():
    # Command-line arguments
    parser = argparse.ArgumentParser(
        description="This program converts a QEMU disk image to qcow2 "
        "and writes it to the standard output"
    )
    parser.add_argument("input_file", help="name of the input file")
    parser.add_argument(
        "-f",
        dest="input_format",
        metavar="input_format",
        help="format of the input file (default: raw)",
        default="raw",
    )
    parser.add_argument(
        "-c",
        dest="cluster_size",
        metavar="cluster_size",
        help=f"qcow2 cluster size (default: {QCOW2_DEFAULT_CLUSTER_SIZE})",
        default=QCOW2_DEFAULT_CLUSTER_SIZE,
        type=int,
        choices=[1 << x for x in range(9, 22)],
    )
    parser.add_argument(
        "-r",
        dest="refcount_bits",
        metavar="refcount_bits",
        help=f"width of the reference count entries (default: {QCOW2_DEFAULT_REFCOUNT_BITS})",
        default=QCOW2_DEFAULT_REFCOUNT_BITS,
        type=int,
        choices=[1 << x for x in range(7)],
    )
    parser.add_argument(
        "-d",
        dest="data_file",
        help="create an image with input_file as an external data file",
        action="store_true",
    )
    parser.add_argument(
        "-R",
        dest="data_file_raw",
        help="enable data_file_raw on the generated image (implies -d)",
        action="store_true",
    )
    args = parser.parse_args()

    if args.data_file_raw:
        args.data_file = True

    if not os.path.isfile(args.input_file):
        sys.exit(f"[Error] {args.input_file} does not exist or is not a regular file.")

    if args.data_file and args.input_format != "raw":
        sys.exit("[Error] External data files can only be used with raw input images")

    # A 512 byte header is too small for the data file name extension
    if args.data_file and args.cluster_size == 512:
        sys.exit("[Error] External data files require a larger cluster size")

    if sys.stdout.isatty():
        sys.exit("[Error] Refusing to write to a tty. Try redirecting stdout.")

    if args.data_file:
        data_file_name = args.input_file
    else:
        data_file_name = None

    with get_input_as_raw_file(args.input_file, args.input_format) as raw_file:
        write_qcow2_content(
            raw_file,
            args.cluster_size,
            args.refcount_bits,
            data_file_name,
            args.data_file_raw,
        )


if __name__ == "__main__":
    main()