xref: /kvmtool/disk/qcow.c (revision fe8bdde0bd72210f0cfeb36bcdd802708d66c4ea)
186835cedSPrasad Joshi #include "kvm/qcow.h"
286835cedSPrasad Joshi 
386835cedSPrasad Joshi #include "kvm/disk-image.h"
486835cedSPrasad Joshi #include "kvm/read-write.h"
586835cedSPrasad Joshi #include "kvm/util.h"
686835cedSPrasad Joshi 
786835cedSPrasad Joshi #include <sys/types.h>
886835cedSPrasad Joshi #include <sys/stat.h>
986835cedSPrasad Joshi #include <stdbool.h>
1086835cedSPrasad Joshi #include <stdlib.h>
1186835cedSPrasad Joshi #include <string.h>
1286835cedSPrasad Joshi #include <unistd.h>
1386835cedSPrasad Joshi #include <fcntl.h>
1486835cedSPrasad Joshi 
1586835cedSPrasad Joshi #include <linux/byteorder.h>
16865c675fSPrasad Joshi #include <linux/kernel.h>
170df6b4d9SPekka Enberg #include <linux/types.h>
1886835cedSPrasad Joshi 
19473d58ffSPekka Enberg static int insert(struct rb_root *root, struct qcow_l2_table *new)
203309045fSPrasad Joshi {
213309045fSPrasad Joshi 	struct rb_node **link = &(root->rb_node), *parent = NULL;
223309045fSPrasad Joshi 	u64 offset = new->offset;
233309045fSPrasad Joshi 
243309045fSPrasad Joshi 	/* search the tree */
253309045fSPrasad Joshi 	while (*link) {
26473d58ffSPekka Enberg 		struct qcow_l2_table *t;
273309045fSPrasad Joshi 
28473d58ffSPekka Enberg 		t = rb_entry(*link, struct qcow_l2_table, node);
293309045fSPrasad Joshi 		if (!t)
303309045fSPrasad Joshi 			goto error;
313309045fSPrasad Joshi 
323309045fSPrasad Joshi 		parent = *link;
333309045fSPrasad Joshi 
343309045fSPrasad Joshi 		if (t->offset > offset)
353309045fSPrasad Joshi 			link = &(*link)->rb_left;
363309045fSPrasad Joshi 		else if (t->offset < offset)
373309045fSPrasad Joshi 			link = &(*link)->rb_right;
383309045fSPrasad Joshi 		else
393309045fSPrasad Joshi 			goto out;
403309045fSPrasad Joshi 	}
413309045fSPrasad Joshi 
423309045fSPrasad Joshi 	/* add new node */
433309045fSPrasad Joshi 	rb_link_node(&new->node, parent, link);
443309045fSPrasad Joshi 	rb_insert_color(&new->node, root);
453309045fSPrasad Joshi out:
463309045fSPrasad Joshi 	return 0;
473309045fSPrasad Joshi error:
483309045fSPrasad Joshi 	return -1;
493309045fSPrasad Joshi }
503309045fSPrasad Joshi 
51473d58ffSPekka Enberg static struct qcow_l2_table *search(struct rb_root *root, u64 offset)
523309045fSPrasad Joshi {
533309045fSPrasad Joshi 	struct rb_node *link = root->rb_node;
543309045fSPrasad Joshi 
553309045fSPrasad Joshi 	while (link) {
56473d58ffSPekka Enberg 		struct qcow_l2_table *t;
573309045fSPrasad Joshi 
58473d58ffSPekka Enberg 		t = rb_entry(link, struct qcow_l2_table, node);
593309045fSPrasad Joshi 		if (!t)
603309045fSPrasad Joshi 			goto out;
613309045fSPrasad Joshi 
623309045fSPrasad Joshi 		if (t->offset > offset)
633309045fSPrasad Joshi 			link = link->rb_left;
643309045fSPrasad Joshi 		else if (t->offset < offset)
653309045fSPrasad Joshi 			link = link->rb_right;
663309045fSPrasad Joshi 		else
673309045fSPrasad Joshi 			return t;
683309045fSPrasad Joshi 	}
693309045fSPrasad Joshi out:
703309045fSPrasad Joshi 	return NULL;
713309045fSPrasad Joshi }
723309045fSPrasad Joshi 
733309045fSPrasad Joshi static void free_cache(struct qcow *q)
743309045fSPrasad Joshi {
753309045fSPrasad Joshi 	struct list_head *pos, *n;
76473d58ffSPekka Enberg 	struct qcow_l2_table *t;
773309045fSPrasad Joshi 	struct rb_root *r = &q->root;
783309045fSPrasad Joshi 
793309045fSPrasad Joshi 	list_for_each_safe(pos, n, &q->lru_list) {
803309045fSPrasad Joshi 		/* Remove cache table from the list and RB tree */
813309045fSPrasad Joshi 		list_del(pos);
82473d58ffSPekka Enberg 		t = list_entry(pos, struct qcow_l2_table, list);
833309045fSPrasad Joshi 		rb_erase(&t->node, r);
843309045fSPrasad Joshi 
853309045fSPrasad Joshi 		/* Free the cached node */
863309045fSPrasad Joshi 		free(t);
873309045fSPrasad Joshi 	}
883309045fSPrasad Joshi }
893309045fSPrasad Joshi 
90473d58ffSPekka Enberg static int cache_table(struct qcow *q, struct qcow_l2_table *c)
913309045fSPrasad Joshi {
923309045fSPrasad Joshi 	struct rb_root *r = &q->root;
93473d58ffSPekka Enberg 	struct qcow_l2_table *lru;
943309045fSPrasad Joshi 
953309045fSPrasad Joshi 	if (q->nr_cached == MAX_CACHE_NODES) {
963309045fSPrasad Joshi 		/*
973309045fSPrasad Joshi 		 * The node at the head of the list is least recently used
983309045fSPrasad Joshi 		 * node. Remove it from the list and replaced with a new node.
993309045fSPrasad Joshi 		 */
100473d58ffSPekka Enberg 		lru = list_first_entry(&q->lru_list, struct qcow_l2_table, list);
1013309045fSPrasad Joshi 
1023309045fSPrasad Joshi 		/* Remove the node from the cache */
1033309045fSPrasad Joshi 		rb_erase(&lru->node, r);
1043309045fSPrasad Joshi 		list_del_init(&lru->list);
1053309045fSPrasad Joshi 		q->nr_cached--;
1063309045fSPrasad Joshi 
1073309045fSPrasad Joshi 		/* Free the LRUed node */
1083309045fSPrasad Joshi 		free(lru);
1093309045fSPrasad Joshi 	}
1103309045fSPrasad Joshi 
1113309045fSPrasad Joshi 	/* Add new node in RB Tree: Helps in searching faster */
1123309045fSPrasad Joshi 	if (insert(r, c) < 0)
1133309045fSPrasad Joshi 		goto error;
1143309045fSPrasad Joshi 
1153309045fSPrasad Joshi 	/* Add in LRU replacement list */
1163309045fSPrasad Joshi 	list_add_tail(&c->list, &q->lru_list);
1173309045fSPrasad Joshi 	q->nr_cached++;
1183309045fSPrasad Joshi 
1193309045fSPrasad Joshi 	return 0;
1203309045fSPrasad Joshi error:
1213309045fSPrasad Joshi 	return -1;
1223309045fSPrasad Joshi }
1233309045fSPrasad Joshi 
124*fe8bdde0SPekka Enberg static struct qcow_l2_table *search_table(struct qcow *q, u64 offset)
1253309045fSPrasad Joshi {
126*fe8bdde0SPekka Enberg 	struct qcow_l2_table *l2t;
1273309045fSPrasad Joshi 
128*fe8bdde0SPekka Enberg 	l2t = search(&q->root, offset);
129*fe8bdde0SPekka Enberg 	if (!l2t)
130*fe8bdde0SPekka Enberg 		return NULL;
1313309045fSPrasad Joshi 
1323309045fSPrasad Joshi 	/* Update the LRU state, by moving the searched node to list tail */
133*fe8bdde0SPekka Enberg 	list_move_tail(&l2t->list, &q->lru_list);
1343309045fSPrasad Joshi 
135*fe8bdde0SPekka Enberg 	return l2t;
1363309045fSPrasad Joshi }
1373309045fSPrasad Joshi 
1383309045fSPrasad Joshi /* Allocates a new node for caching L2 table */
139473d58ffSPekka Enberg static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset)
1403309045fSPrasad Joshi {
1413309045fSPrasad Joshi 	struct qcow_header *header = q->header;
142473d58ffSPekka Enberg 	struct qcow_l2_table *c;
1433309045fSPrasad Joshi 	u64 l2t_sz;
1443309045fSPrasad Joshi 	u64 size;
1453309045fSPrasad Joshi 
1463309045fSPrasad Joshi 	l2t_sz = 1 << header->l2_bits;
1473309045fSPrasad Joshi 	size   = sizeof(*c) + l2t_sz * sizeof(u64);
1483309045fSPrasad Joshi 	c      = calloc(1, size);
1493309045fSPrasad Joshi 	if (!c)
1503309045fSPrasad Joshi 		goto out;
1513309045fSPrasad Joshi 
1523309045fSPrasad Joshi 	c->offset = offset;
1533309045fSPrasad Joshi 	RB_CLEAR_NODE(&c->node);
1543309045fSPrasad Joshi 	INIT_LIST_HEAD(&c->list);
1553309045fSPrasad Joshi out:
1563309045fSPrasad Joshi 	return c;
1573309045fSPrasad Joshi }
1583309045fSPrasad Joshi 
159742fce76SPrasad Joshi static inline u64 get_l1_index(struct qcow *q, u64 offset)
16086835cedSPrasad Joshi {
161ad627d62SPekka Enberg 	struct qcow_header *header = q->header;
16286835cedSPrasad Joshi 
16386835cedSPrasad Joshi 	return offset >> (header->l2_bits + header->cluster_bits);
16486835cedSPrasad Joshi }
16586835cedSPrasad Joshi 
166742fce76SPrasad Joshi static inline u64 get_l2_index(struct qcow *q, u64 offset)
16786835cedSPrasad Joshi {
168ad627d62SPekka Enberg 	struct qcow_header *header = q->header;
16986835cedSPrasad Joshi 
17086835cedSPrasad Joshi 	return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1);
17186835cedSPrasad Joshi }
17286835cedSPrasad Joshi 
173742fce76SPrasad Joshi static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
17486835cedSPrasad Joshi {
175ad627d62SPekka Enberg 	struct qcow_header *header = q->header;
17686835cedSPrasad Joshi 
17786835cedSPrasad Joshi 	return offset & ((1 << header->cluster_bits)-1);
17886835cedSPrasad Joshi }
17986835cedSPrasad Joshi 
180*fe8bdde0SPekka Enberg static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset)
1813309045fSPrasad Joshi {
1823309045fSPrasad Joshi 	struct qcow_header *header = q->header;
183*fe8bdde0SPekka Enberg 	struct qcow_l2_table *l2t;
1843309045fSPrasad Joshi 	u64 size;
1853309045fSPrasad Joshi 	u64 i;
1863309045fSPrasad Joshi 
1873309045fSPrasad Joshi 	size = 1 << header->l2_bits;
1883309045fSPrasad Joshi 
1893309045fSPrasad Joshi 	/* search an entry for offset in cache */
190*fe8bdde0SPekka Enberg 	l2t = search_table(q, offset);
191*fe8bdde0SPekka Enberg 	if (l2t)
192*fe8bdde0SPekka Enberg 		return l2t;
1933309045fSPrasad Joshi 
1943309045fSPrasad Joshi 	/* allocate new node for caching l2 table */
195*fe8bdde0SPekka Enberg 	l2t = new_cache_table(q, offset);
196*fe8bdde0SPekka Enberg 	if (!l2t)
1973309045fSPrasad Joshi 		goto error;
1983309045fSPrasad Joshi 
1993309045fSPrasad Joshi 	/* table not cached: read from the disk */
200*fe8bdde0SPekka Enberg 	if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0)
2013309045fSPrasad Joshi 		goto error;
2023309045fSPrasad Joshi 
2033309045fSPrasad Joshi 	/* cache the table */
204*fe8bdde0SPekka Enberg 	if (cache_table(q, l2t) < 0)
2053309045fSPrasad Joshi 		goto error;
2063309045fSPrasad Joshi 
2073309045fSPrasad Joshi 	/* change cached table to CPU's byte-order */
2083309045fSPrasad Joshi 	for (i = 0; i < size; i++)
209*fe8bdde0SPekka Enberg 		be64_to_cpus(&l2t->table[i]);
2103309045fSPrasad Joshi 
211*fe8bdde0SPekka Enberg 	return l2t;
2123309045fSPrasad Joshi error:
213*fe8bdde0SPekka Enberg 	free(l2t);
214*fe8bdde0SPekka Enberg 	return NULL;
2153309045fSPrasad Joshi }
2163309045fSPrasad Joshi 
217b1c84095SPekka Enberg static ssize_t qcow_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
21886835cedSPrasad Joshi {
219ad627d62SPekka Enberg 	struct qcow_header *header = q->header;
2203dac48d4SPrasad Joshi 	struct qcow_table *table  = &q->table;
221*fe8bdde0SPekka Enberg 	struct qcow_l2_table *l2_table;
222742fce76SPrasad Joshi 	u64 l2_table_offset;
223742fce76SPrasad Joshi 	u64 l2_table_size;
2243dac48d4SPrasad Joshi 	u64 cluster_size;
225742fce76SPrasad Joshi 	u64 clust_offset;
226742fce76SPrasad Joshi 	u64 clust_start;
227a51948ceSPekka Enberg 	size_t length;
228742fce76SPrasad Joshi 	u64 l1_idx;
229742fce76SPrasad Joshi 	u64 l2_idx;
23086835cedSPrasad Joshi 
231dae803fbSPekka Enberg 	cluster_size = 1 << header->cluster_bits;
23286835cedSPrasad Joshi 
233c5e0624bSPrasad Joshi 	l1_idx = get_l1_index(q, offset);
2343dac48d4SPrasad Joshi 	if (l1_idx >= table->table_size)
23586835cedSPrasad Joshi 		goto out_error;
23686835cedSPrasad Joshi 
2373dac48d4SPrasad Joshi 	clust_offset = get_cluster_offset(q, offset);
2383dac48d4SPrasad Joshi 	if (clust_offset >= cluster_size)
2393dac48d4SPrasad Joshi 		goto out_error;
2403dac48d4SPrasad Joshi 
2413dac48d4SPrasad Joshi 	length = cluster_size - clust_offset;
2423dac48d4SPrasad Joshi 	if (length > dst_len)
2433dac48d4SPrasad Joshi 		length = dst_len;
2443dac48d4SPrasad Joshi 
245ad627d62SPekka Enberg 	l2_table_offset = table->l1_table[l1_idx] & ~header->oflag_mask;
24686835cedSPrasad Joshi 	if (!l2_table_offset)
2473dac48d4SPrasad Joshi 		goto zero_cluster;
24886835cedSPrasad Joshi 
24986835cedSPrasad Joshi 	l2_table_size = 1 << header->l2_bits;
25086835cedSPrasad Joshi 
2513309045fSPrasad Joshi 	/* read and cache level 2 table */
252*fe8bdde0SPekka Enberg 	l2_table = qcow_read_l2_table(q, l2_table_offset);
253*fe8bdde0SPekka Enberg 	if (!l2_table)
254b6edb0ecSSasha Levin 		goto out_error;
25586835cedSPrasad Joshi 
256c5e0624bSPrasad Joshi 	l2_idx = get_l2_index(q, offset);
25786835cedSPrasad Joshi 	if (l2_idx >= l2_table_size)
258b6edb0ecSSasha Levin 		goto out_error;
25986835cedSPrasad Joshi 
260*fe8bdde0SPekka Enberg 	clust_start = l2_table->table[l2_idx] & ~header->oflag_mask;
26186835cedSPrasad Joshi 	if (!clust_start)
2623dac48d4SPrasad Joshi 		goto zero_cluster;
26386835cedSPrasad Joshi 
2643dac48d4SPrasad Joshi 	if (pread_in_full(q->fd, dst, length, clust_start + clust_offset) < 0)
265b6edb0ecSSasha Levin 		goto out_error;
26686835cedSPrasad Joshi 
267179b71f0SPekka Enberg out:
2683dac48d4SPrasad Joshi 	return length;
26986835cedSPrasad Joshi 
270179b71f0SPekka Enberg zero_cluster:
271179b71f0SPekka Enberg 	memset(dst, 0, length);
272179b71f0SPekka Enberg 	goto out;
273179b71f0SPekka Enberg 
27486835cedSPrasad Joshi out_error:
275179b71f0SPekka Enberg 	length = -1;
276179b71f0SPekka Enberg 	goto out;
2773dac48d4SPrasad Joshi }
278b6edb0ecSSasha Levin 
279b1c84095SPekka Enberg static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector, void *dst, u32 dst_len)
2803dac48d4SPrasad Joshi {
28143835ac9SSasha Levin 	struct qcow *q = disk->priv;
282ad627d62SPekka Enberg 	struct qcow_header *header = q->header;
283d8eea993SPekka Enberg 	u32 nr_read;
2840df6b4d9SPekka Enberg 	u64 offset;
2850df6b4d9SPekka Enberg 	char *buf;
2863dac48d4SPrasad Joshi 	u32 nr;
2873dac48d4SPrasad Joshi 
2880df6b4d9SPekka Enberg 	buf		= dst;
289d8eea993SPekka Enberg 	nr_read		= 0;
2900df6b4d9SPekka Enberg 
291d8eea993SPekka Enberg 	while (nr_read < dst_len) {
2923dac48d4SPrasad Joshi 		offset		= sector << SECTOR_SHIFT;
2933dac48d4SPrasad Joshi 		if (offset >= header->size)
2940df6b4d9SPekka Enberg 			return -1;
2953dac48d4SPrasad Joshi 
296b1c84095SPekka Enberg 		nr = qcow_read_cluster(q, offset, buf, dst_len - nr_read);
297a51948ceSPekka Enberg 		if (nr <= 0)
2980df6b4d9SPekka Enberg 			return -1;
2993dac48d4SPrasad Joshi 
300d8eea993SPekka Enberg 		nr_read		+= nr;
3013dac48d4SPrasad Joshi 		buf		+= nr;
3023dac48d4SPrasad Joshi 		sector		+= (nr >> SECTOR_SHIFT);
3033dac48d4SPrasad Joshi 	}
3040df6b4d9SPekka Enberg 
30572133dd2SAsias He 	return dst_len;
30686835cedSPrasad Joshi }
30786835cedSPrasad Joshi 
308865c675fSPrasad Joshi static inline u64 file_size(int fd)
309865c675fSPrasad Joshi {
310865c675fSPrasad Joshi 	struct stat st;
3110df6b4d9SPekka Enberg 
312865c675fSPrasad Joshi 	if (fstat(fd, &st) < 0)
313865c675fSPrasad Joshi 		return 0;
3140df6b4d9SPekka Enberg 
315865c675fSPrasad Joshi 	return st.st_size;
316865c675fSPrasad Joshi }
317865c675fSPrasad Joshi 
/*
 * Write @count bytes from @buf at file @offset and then fdatasync() the
 * descriptor.
 *
 * Returns -1 if the write fails, otherwise the result of fdatasync().
 */
static inline int qcow_pwrite_sync(int fd, void *buf, size_t count, off_t offset)
{
	int ret = -1;

	if (pwrite_in_full(fd, buf, count, offset) >= 0)
		ret = fdatasync(fd);

	return ret;
}
325865c675fSPrasad Joshi 
326865c675fSPrasad Joshi /* Writes a level 2 table at the end of the file. */
327b1c84095SPekka Enberg static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
328865c675fSPrasad Joshi {
329865c675fSPrasad Joshi 	struct qcow_header *header = q->header;
330865c675fSPrasad Joshi 	u64 clust_sz;
331865c675fSPrasad Joshi 	u64 f_sz;
3320df6b4d9SPekka Enberg 	u64 off;
3330df6b4d9SPekka Enberg 	u64 sz;
334865c675fSPrasad Joshi 
335865c675fSPrasad Joshi 	f_sz		= file_size(q->fd);
336865c675fSPrasad Joshi 	if (!f_sz)
337865c675fSPrasad Joshi 		return 0;
338865c675fSPrasad Joshi 
339865c675fSPrasad Joshi 	sz		= 1 << header->l2_bits;
340865c675fSPrasad Joshi 	clust_sz	= 1 << header->cluster_bits;
341865c675fSPrasad Joshi 	off		= ALIGN(f_sz, clust_sz);
342865c675fSPrasad Joshi 
3430df6b4d9SPekka Enberg 	if (qcow_pwrite_sync(q->fd, table, sz * sizeof(u64), off) < 0)
344865c675fSPrasad Joshi 		return 0;
3450df6b4d9SPekka Enberg 
346865c675fSPrasad Joshi 	return off;
347865c675fSPrasad Joshi }
348865c675fSPrasad Joshi 
349865c675fSPrasad Joshi /*
350865c675fSPrasad Joshi  * QCOW file might grow during a write operation. Not only data but metadata is
351865c675fSPrasad Joshi  * also written at the end of the file. Therefore it is necessary to ensure
3520df6b4d9SPekka Enberg  * every write is committed to disk. Hence we use uses qcow_pwrite_sync() to
353865c675fSPrasad Joshi  * synchronize the in-core state of QCOW image to disk.
354865c675fSPrasad Joshi  *
355865c675fSPrasad Joshi  * We also try to restore the image to a consistent state if the metdata
356865c675fSPrasad Joshi  * operation fails. The two metadat operations are: level 1 and level 2 table
357865c675fSPrasad Joshi  * update. If either of them fails the image is truncated to a consistent state.
358865c675fSPrasad Joshi  */
359b1c84095SPekka Enberg static ssize_t qcow_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
360865c675fSPrasad Joshi {
361865c675fSPrasad Joshi 	struct qcow_header *header = q->header;
362865c675fSPrasad Joshi 	struct qcow_table  *table  = &q->table;
363*fe8bdde0SPekka Enberg 	struct qcow_l2_table *l2t;
3640df6b4d9SPekka Enberg 	bool update_meta;
3650df6b4d9SPekka Enberg 	u64 clust_start;
3660df6b4d9SPekka Enberg 	u64 clust_off;
367865c675fSPrasad Joshi 	u64 clust_sz;
368865c675fSPrasad Joshi 	u64 l1t_idx;
369865c675fSPrasad Joshi 	u64 l2t_idx;
3700df6b4d9SPekka Enberg 	u64 l2t_off;
3710df6b4d9SPekka Enberg 	u64 l2t_sz;
372865c675fSPrasad Joshi 	u64 f_sz;
3730df6b4d9SPekka Enberg 	u64 len;
374865c675fSPrasad Joshi 	u64 t;
375865c675fSPrasad Joshi 
376*fe8bdde0SPekka Enberg 	l2t		= NULL;
377865c675fSPrasad Joshi 	l2t_sz		= 1 << header->l2_bits;
378865c675fSPrasad Joshi 	clust_sz	= 1 << header->cluster_bits;
379865c675fSPrasad Joshi 
380865c675fSPrasad Joshi 	l1t_idx		= get_l1_index(q, offset);
381865c675fSPrasad Joshi 	if (l1t_idx >= table->table_size)
382865c675fSPrasad Joshi 		goto error;
383865c675fSPrasad Joshi 
384865c675fSPrasad Joshi 	l2t_idx		= get_l2_index(q, offset);
385865c675fSPrasad Joshi 	if (l2t_idx >= l2t_sz)
386865c675fSPrasad Joshi 		goto error;
387865c675fSPrasad Joshi 
388865c675fSPrasad Joshi 	clust_off	= get_cluster_offset(q, offset);
389865c675fSPrasad Joshi 	if (clust_off >= clust_sz)
390865c675fSPrasad Joshi 		goto error;
391865c675fSPrasad Joshi 
392865c675fSPrasad Joshi 	len		= clust_sz - clust_off;
393865c675fSPrasad Joshi 	if (len > src_len)
394865c675fSPrasad Joshi 		len = src_len;
395865c675fSPrasad Joshi 
396865c675fSPrasad Joshi 	l2t_off		= table->l1_table[l1t_idx] & ~header->oflag_mask;
397865c675fSPrasad Joshi 	if (l2t_off) {
3983309045fSPrasad Joshi 		/* read and cache l2 table */
399*fe8bdde0SPekka Enberg 		l2t = qcow_read_l2_table(q, l2t_off);
400*fe8bdde0SPekka Enberg 		if (!l2t)
4013309045fSPrasad Joshi 			goto error;
402865c675fSPrasad Joshi 	} else {
403*fe8bdde0SPekka Enberg 		l2t = new_cache_table(q, l2t_off);
404*fe8bdde0SPekka Enberg 		if (!l2t)
4053309045fSPrasad Joshi 			goto error;
4063309045fSPrasad Joshi 
4070df6b4d9SPekka Enberg 		/* Capture the state of the consistent QCOW image */
408865c675fSPrasad Joshi 		f_sz		= file_size(q->fd);
409865c675fSPrasad Joshi 		if (!f_sz)
4103309045fSPrasad Joshi 			goto free_cache;
411865c675fSPrasad Joshi 
412865c675fSPrasad Joshi 		/* Write the l2 table of 0's at the end of the file */
413*fe8bdde0SPekka Enberg 		l2t_off		= qcow_write_l2_table(q, l2t->table);
414865c675fSPrasad Joshi 		if (!l2t_off)
4153309045fSPrasad Joshi 			goto free_cache;
416865c675fSPrasad Joshi 
417865c675fSPrasad Joshi 		/* Metadata update: update on disk level 1 table */
418865c675fSPrasad Joshi 		t		= cpu_to_be64(l2t_off);
4190df6b4d9SPekka Enberg 
4200df6b4d9SPekka Enberg 		if (qcow_pwrite_sync(q->fd, &t, sizeof(t), header->l1_table_offset + l1t_idx * sizeof(u64)) < 0) {
421865c675fSPrasad Joshi 			/* restore file to consistent state */
422865c675fSPrasad Joshi 			if (ftruncate(q->fd, f_sz) < 0)
4233309045fSPrasad Joshi 				goto free_cache;
4240df6b4d9SPekka Enberg 
4253309045fSPrasad Joshi 			goto free_cache;
4263309045fSPrasad Joshi 		}
4273309045fSPrasad Joshi 
428*fe8bdde0SPekka Enberg 		if (cache_table(q, l2t) < 0) {
4293309045fSPrasad Joshi 			if (ftruncate(q->fd, f_sz) < 0)
4303309045fSPrasad Joshi 				goto free_cache;
4313309045fSPrasad Joshi 
4323309045fSPrasad Joshi 			goto free_cache;
433865c675fSPrasad Joshi 		}
434865c675fSPrasad Joshi 
4350df6b4d9SPekka Enberg 		/* Update the in-core entry */
436865c675fSPrasad Joshi 		table->l1_table[l1t_idx] = l2t_off;
437865c675fSPrasad Joshi 	}
438865c675fSPrasad Joshi 
4390df6b4d9SPekka Enberg 	/* Capture the state of the consistent QCOW image */
440865c675fSPrasad Joshi 	f_sz		= file_size(q->fd);
441865c675fSPrasad Joshi 	if (!f_sz)
4423309045fSPrasad Joshi 		goto error;
443865c675fSPrasad Joshi 
444*fe8bdde0SPekka Enberg 	clust_start	= l2t->table[l2t_idx] & ~header->oflag_mask;
445865c675fSPrasad Joshi 	if (!clust_start) {
446865c675fSPrasad Joshi 		clust_start	= ALIGN(f_sz, clust_sz);
447865c675fSPrasad Joshi 		update_meta	= true;
4480df6b4d9SPekka Enberg 	} else
4490df6b4d9SPekka Enberg 		update_meta	= false;
450865c675fSPrasad Joshi 
4510df6b4d9SPekka Enberg 	/* Write actual data */
452865c675fSPrasad Joshi 	if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
453865c675fSPrasad Joshi 		goto error;
454865c675fSPrasad Joshi 
455865c675fSPrasad Joshi 	if (update_meta) {
456865c675fSPrasad Joshi 		t = cpu_to_be64(clust_start);
4570df6b4d9SPekka Enberg 		if (qcow_pwrite_sync(q->fd, &t, sizeof(t), l2t_off + l2t_idx * sizeof(u64)) < 0) {
4580df6b4d9SPekka Enberg 			/* Restore the file to consistent state */
459865c675fSPrasad Joshi 			if (ftruncate(q->fd, f_sz) < 0)
460865c675fSPrasad Joshi 				goto error;
4610df6b4d9SPekka Enberg 
462865c675fSPrasad Joshi 			goto error;
463865c675fSPrasad Joshi 		}
4643309045fSPrasad Joshi 
4653309045fSPrasad Joshi 		/* Update the cached level2 entry */
466*fe8bdde0SPekka Enberg 		l2t->table[l2t_idx] = clust_start;
467865c675fSPrasad Joshi 	}
4680df6b4d9SPekka Enberg 
469865c675fSPrasad Joshi 	return len;
4703309045fSPrasad Joshi 
4713309045fSPrasad Joshi free_cache:
472*fe8bdde0SPekka Enberg 	free(l2t);
473865c675fSPrasad Joshi error:
474865c675fSPrasad Joshi 	return -1;
475865c675fSPrasad Joshi }
476865c675fSPrasad Joshi 
477b1c84095SPekka Enberg static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector, void *src, u32 src_len)
47886835cedSPrasad Joshi {
479865c675fSPrasad Joshi 	struct qcow *q = disk->priv;
480865c675fSPrasad Joshi 	struct qcow_header *header = q->header;
481c4acb611SIngo Molnar 	u32 nr_written;
4820df6b4d9SPekka Enberg 	char *buf;
483865c675fSPrasad Joshi 	u64 offset;
484865c675fSPrasad Joshi 	ssize_t nr;
485865c675fSPrasad Joshi 
4860df6b4d9SPekka Enberg 	buf		= src;
4870df6b4d9SPekka Enberg 	nr_written	= 0;
488865c675fSPrasad Joshi 	offset		= sector << SECTOR_SHIFT;
4890df6b4d9SPekka Enberg 
4900df6b4d9SPekka Enberg 	while (nr_written < src_len) {
491865c675fSPrasad Joshi 		if (offset >= header->size)
4920df6b4d9SPekka Enberg 			return -1;
493865c675fSPrasad Joshi 
494b1c84095SPekka Enberg 		nr = qcow_write_cluster(q, offset, buf, src_len - nr_written);
495865c675fSPrasad Joshi 		if (nr < 0)
4960df6b4d9SPekka Enberg 			return -1;
497865c675fSPrasad Joshi 
4980df6b4d9SPekka Enberg 		nr_written	+= nr;
499865c675fSPrasad Joshi 		buf		+= nr;
500865c675fSPrasad Joshi 		offset		+= nr;
501865c675fSPrasad Joshi 	}
5020df6b4d9SPekka Enberg 
50372133dd2SAsias He 	return nr_written;
50486835cedSPrasad Joshi }
50586835cedSPrasad Joshi 
506b1c84095SPekka Enberg static ssize_t qcow_nowrite_sector(struct disk_image *disk, u64 sector, void *src, u32 src_len)
507f10860caSPekka Enberg {
508f10860caSPekka Enberg 	/* I/O error */
509b1c84095SPekka Enberg 	pr_info("%s: no write support\n", __func__);
510f10860caSPekka Enberg 	return -1;
511f10860caSPekka Enberg }
512f10860caSPekka Enberg 
513b1c84095SPekka Enberg static int qcow_disk_close(struct disk_image *disk)
51486835cedSPrasad Joshi {
51586835cedSPrasad Joshi 	struct qcow *q;
51686835cedSPrasad Joshi 
51743835ac9SSasha Levin 	if (!disk)
51872133dd2SAsias He 		return 0;
51986835cedSPrasad Joshi 
52043835ac9SSasha Levin 	q = disk->priv;
52186835cedSPrasad Joshi 
5223309045fSPrasad Joshi 	free_cache(q);
5236c6f79b6SPrasad Joshi 	free(q->table.l1_table);
52486835cedSPrasad Joshi 	free(q->header);
52586835cedSPrasad Joshi 	free(q);
52672133dd2SAsias He 
52772133dd2SAsias He 	return 0;
52886835cedSPrasad Joshi }
52986835cedSPrasad Joshi 
530b1c84095SPekka Enberg static struct disk_image_operations qcow_disk_readonly_ops = {
531b1c84095SPekka Enberg 	.read_sector		= qcow_read_sector,
532b1c84095SPekka Enberg 	.write_sector		= qcow_nowrite_sector,
533b1c84095SPekka Enberg 	.close			= qcow_disk_close,
534f10860caSPekka Enberg };
535f10860caSPekka Enberg 
536b1c84095SPekka Enberg static struct disk_image_operations qcow_disk_ops = {
537b1c84095SPekka Enberg 	.read_sector		= qcow_read_sector,
538b1c84095SPekka Enberg 	.write_sector		= qcow_write_sector,
539b1c84095SPekka Enberg 	.close			= qcow_disk_close,
54086835cedSPrasad Joshi };
54186835cedSPrasad Joshi 
54286835cedSPrasad Joshi static int qcow_read_l1_table(struct qcow *q)
54386835cedSPrasad Joshi {
544ad627d62SPekka Enberg 	struct qcow_header *header = q->header;
54500adcc1bSPrasad Joshi 	struct qcow_table *table = &q->table;
54600adcc1bSPrasad Joshi 	u64 i;
54786835cedSPrasad Joshi 
548ad627d62SPekka Enberg 	table->table_size	= header->l1_size;
54986835cedSPrasad Joshi 
55000adcc1bSPrasad Joshi 	table->l1_table	= calloc(table->table_size, sizeof(u64));
55100adcc1bSPrasad Joshi 	if (!table->l1_table)
55286835cedSPrasad Joshi 		return -1;
55386835cedSPrasad Joshi 
55400adcc1bSPrasad Joshi 	if (pread_in_full(q->fd, table->l1_table, sizeof(u64) *
55500adcc1bSPrasad Joshi 				table->table_size, header->l1_table_offset) < 0)
55686835cedSPrasad Joshi 		return -1;
55786835cedSPrasad Joshi 
55800adcc1bSPrasad Joshi 	for (i = 0; i < table->table_size; i++)
55900adcc1bSPrasad Joshi 		be64_to_cpus(&table->l1_table[i]);
56000adcc1bSPrasad Joshi 
56186835cedSPrasad Joshi 	return 0;
56286835cedSPrasad Joshi }
56386835cedSPrasad Joshi 
564ad627d62SPekka Enberg static void *qcow2_read_header(int fd)
56586835cedSPrasad Joshi {
566ad627d62SPekka Enberg 	struct qcow2_header_disk f_header;
567ad627d62SPekka Enberg 	struct qcow_header *header;
56886835cedSPrasad Joshi 
569ad627d62SPekka Enberg 	header = malloc(sizeof(struct qcow_header));
57086835cedSPrasad Joshi 	if (!header)
57186835cedSPrasad Joshi 		return NULL;
57286835cedSPrasad Joshi 
5730657f33dSPrasad Joshi 	if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) {
5740657f33dSPrasad Joshi 		free(header);
57586835cedSPrasad Joshi 		return NULL;
5760657f33dSPrasad Joshi 	}
57786835cedSPrasad Joshi 
578ad627d62SPekka Enberg 	be32_to_cpus(&f_header.magic);
579ad627d62SPekka Enberg 	be32_to_cpus(&f_header.version);
580ad627d62SPekka Enberg 	be64_to_cpus(&f_header.backing_file_offset);
581ad627d62SPekka Enberg 	be32_to_cpus(&f_header.backing_file_size);
582ad627d62SPekka Enberg 	be32_to_cpus(&f_header.cluster_bits);
583ad627d62SPekka Enberg 	be64_to_cpus(&f_header.size);
584ad627d62SPekka Enberg 	be32_to_cpus(&f_header.crypt_method);
585ad627d62SPekka Enberg 	be32_to_cpus(&f_header.l1_size);
586ad627d62SPekka Enberg 	be64_to_cpus(&f_header.l1_table_offset);
587ad627d62SPekka Enberg 	be64_to_cpus(&f_header.refcount_table_offset);
588ad627d62SPekka Enberg 	be32_to_cpus(&f_header.refcount_table_clusters);
589ad627d62SPekka Enberg 	be32_to_cpus(&f_header.nb_snapshots);
590ad627d62SPekka Enberg 	be64_to_cpus(&f_header.snapshots_offset);
591ad627d62SPekka Enberg 
592ad627d62SPekka Enberg 	*header		= (struct qcow_header) {
593ad627d62SPekka Enberg 		.size			= f_header.size,
594ad627d62SPekka Enberg 		.l1_table_offset	= f_header.l1_table_offset,
595ad627d62SPekka Enberg 		.l1_size		= f_header.l1_size,
596ad627d62SPekka Enberg 		.cluster_bits		= f_header.cluster_bits,
597ad627d62SPekka Enberg 		.l2_bits		= f_header.cluster_bits - 3,
598ad627d62SPekka Enberg 		.oflag_mask		= QCOW2_OFLAG_MASK,
599ad627d62SPekka Enberg 	};
600ad627d62SPekka Enberg 
601ad627d62SPekka Enberg 	return header;
602ad627d62SPekka Enberg }
603ad627d62SPekka Enberg 
604f10860caSPekka Enberg static struct disk_image *qcow2_probe(int fd, bool readonly)
605ad627d62SPekka Enberg {
606ad627d62SPekka Enberg 	struct qcow *q;
607ad627d62SPekka Enberg 	struct qcow_header *h;
608ad627d62SPekka Enberg 	struct disk_image *disk_image;
609ad627d62SPekka Enberg 
610ad627d62SPekka Enberg 	q = calloc(1, sizeof(struct qcow));
611ad627d62SPekka Enberg 	if (!q)
612ad627d62SPekka Enberg 		goto error;
613ad627d62SPekka Enberg 
614ad627d62SPekka Enberg 	q->fd = fd;
6153309045fSPrasad Joshi 	q->root = RB_ROOT;
6163309045fSPrasad Joshi 	INIT_LIST_HEAD(&q->lru_list);
617ad627d62SPekka Enberg 
618ad627d62SPekka Enberg 	h = q->header = qcow2_read_header(fd);
619ad627d62SPekka Enberg 	if (!h)
620ad627d62SPekka Enberg 		goto error;
621ad627d62SPekka Enberg 
622ad627d62SPekka Enberg 	if (qcow_read_l1_table(q) < 0)
623ad627d62SPekka Enberg 		goto error;
624ad627d62SPekka Enberg 
6257d22135fSAsias He 	/*
6267d22135fSAsias He 	 * Do not use mmap use read/write instead
6277d22135fSAsias He 	 */
628f10860caSPekka Enberg 	if (readonly)
629b1c84095SPekka Enberg 		disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_NOMMAP);
630f10860caSPekka Enberg 	else
631b1c84095SPekka Enberg 		disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_NOMMAP);
632f10860caSPekka Enberg 
633ad627d62SPekka Enberg 	if (!disk_image)
634ad627d62SPekka Enberg 		goto error;
635ad627d62SPekka Enberg 	disk_image->priv = q;
636ad627d62SPekka Enberg 
637ad627d62SPekka Enberg 	return disk_image;
638ad627d62SPekka Enberg error:
639ad627d62SPekka Enberg 	if (!q)
640ad627d62SPekka Enberg 		return NULL;
641ad627d62SPekka Enberg 
642ad627d62SPekka Enberg 	free(q->table.l1_table);
643ad627d62SPekka Enberg 	free(q->header);
644ad627d62SPekka Enberg 	free(q);
645ad627d62SPekka Enberg 
646ad627d62SPekka Enberg 	return NULL;
647ad627d62SPekka Enberg }
648ad627d62SPekka Enberg 
649ad627d62SPekka Enberg static bool qcow2_check_image(int fd)
650ad627d62SPekka Enberg {
651ad627d62SPekka Enberg 	struct qcow2_header_disk f_header;
652ad627d62SPekka Enberg 
653ad627d62SPekka Enberg 	if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0)
654ad627d62SPekka Enberg 		return false;
655ad627d62SPekka Enberg 
656ad627d62SPekka Enberg 	be32_to_cpus(&f_header.magic);
657ad627d62SPekka Enberg 	be32_to_cpus(&f_header.version);
658ad627d62SPekka Enberg 
659ad627d62SPekka Enberg 	if (f_header.magic != QCOW_MAGIC)
660ad627d62SPekka Enberg 		return false;
661ad627d62SPekka Enberg 
662ad627d62SPekka Enberg 	if (f_header.version != QCOW2_VERSION)
663ad627d62SPekka Enberg 		return false;
664ad627d62SPekka Enberg 
665ad627d62SPekka Enberg 	return true;
666ad627d62SPekka Enberg }
667ad627d62SPekka Enberg 
668ad627d62SPekka Enberg static void *qcow1_read_header(int fd)
669ad627d62SPekka Enberg {
670ad627d62SPekka Enberg 	struct qcow1_header_disk f_header;
671ad627d62SPekka Enberg 	struct qcow_header *header;
672ad627d62SPekka Enberg 
673ad627d62SPekka Enberg 	header = malloc(sizeof(struct qcow_header));
674ad627d62SPekka Enberg 	if (!header)
675ad627d62SPekka Enberg 		return NULL;
676ad627d62SPekka Enberg 
677d39cefd2SSasha Levin 	if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) {
678d39cefd2SSasha Levin 		free(header);
679ad627d62SPekka Enberg 		return NULL;
680d39cefd2SSasha Levin 	}
681ad627d62SPekka Enberg 
682ad627d62SPekka Enberg 	be32_to_cpus(&f_header.magic);
683ad627d62SPekka Enberg 	be32_to_cpus(&f_header.version);
684ad627d62SPekka Enberg 	be64_to_cpus(&f_header.backing_file_offset);
685ad627d62SPekka Enberg 	be32_to_cpus(&f_header.backing_file_size);
686ad627d62SPekka Enberg 	be32_to_cpus(&f_header.mtime);
687ad627d62SPekka Enberg 	be64_to_cpus(&f_header.size);
688ad627d62SPekka Enberg 	be32_to_cpus(&f_header.crypt_method);
689ad627d62SPekka Enberg 	be64_to_cpus(&f_header.l1_table_offset);
690ad627d62SPekka Enberg 
691ad627d62SPekka Enberg 	*header		= (struct qcow_header) {
692ad627d62SPekka Enberg 		.size			= f_header.size,
693ad627d62SPekka Enberg 		.l1_table_offset	= f_header.l1_table_offset,
694ad627d62SPekka Enberg 		.l1_size		= f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)),
695ad627d62SPekka Enberg 		.cluster_bits		= f_header.cluster_bits,
696ad627d62SPekka Enberg 		.l2_bits		= f_header.l2_bits,
697ad627d62SPekka Enberg 		.oflag_mask		= QCOW1_OFLAG_MASK,
698ad627d62SPekka Enberg 	};
69986835cedSPrasad Joshi 
70086835cedSPrasad Joshi 	return header;
70186835cedSPrasad Joshi }
70286835cedSPrasad Joshi 
703f10860caSPekka Enberg static struct disk_image *qcow1_probe(int fd, bool readonly)
70486835cedSPrasad Joshi {
70586835cedSPrasad Joshi 	struct qcow *q;
706ad627d62SPekka Enberg 	struct qcow_header *h;
70786835cedSPrasad Joshi 	struct disk_image *disk_image;
70886835cedSPrasad Joshi 
70986835cedSPrasad Joshi 	q = calloc(1, sizeof(struct qcow));
71086835cedSPrasad Joshi 	if (!q)
71186835cedSPrasad Joshi 		goto error;
71286835cedSPrasad Joshi 
71386835cedSPrasad Joshi 	q->fd = fd;
7143309045fSPrasad Joshi 	q->root = RB_ROOT;
7153309045fSPrasad Joshi 	INIT_LIST_HEAD(&q->lru_list);
71686835cedSPrasad Joshi 
71786835cedSPrasad Joshi 	h = q->header = qcow1_read_header(fd);
71886835cedSPrasad Joshi 	if (!h)
71986835cedSPrasad Joshi 		goto error;
72086835cedSPrasad Joshi 
72186835cedSPrasad Joshi 	if (qcow_read_l1_table(q) < 0)
72286835cedSPrasad Joshi 		goto error;
72386835cedSPrasad Joshi 
7247d22135fSAsias He 	/*
7257d22135fSAsias He 	 * Do not use mmap use read/write instead
7267d22135fSAsias He 	 */
727f10860caSPekka Enberg 	if (readonly)
728b1c84095SPekka Enberg 		disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_NOMMAP);
729f10860caSPekka Enberg 	else
730b1c84095SPekka Enberg 		disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_NOMMAP);
731f10860caSPekka Enberg 
73286835cedSPrasad Joshi 	if (!disk_image)
73386835cedSPrasad Joshi 		goto error;
73486835cedSPrasad Joshi 	disk_image->priv = q;
73586835cedSPrasad Joshi 
73686835cedSPrasad Joshi 	return disk_image;
73786835cedSPrasad Joshi error:
73886835cedSPrasad Joshi 	if (!q)
73986835cedSPrasad Joshi 		return NULL;
74086835cedSPrasad Joshi 
7416c6f79b6SPrasad Joshi 	free(q->table.l1_table);
74286835cedSPrasad Joshi 	free(q->header);
74386835cedSPrasad Joshi 	free(q);
74486835cedSPrasad Joshi 
74586835cedSPrasad Joshi 	return NULL;
74686835cedSPrasad Joshi }
74786835cedSPrasad Joshi 
748ad627d62SPekka Enberg static bool qcow1_check_image(int fd)
74986835cedSPrasad Joshi {
750ad627d62SPekka Enberg 	struct qcow1_header_disk f_header;
75186835cedSPrasad Joshi 
752ad627d62SPekka Enberg 	if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0)
753ad627d62SPekka Enberg 		return false;
75486835cedSPrasad Joshi 
755ad627d62SPekka Enberg 	be32_to_cpus(&f_header.magic);
756ad627d62SPekka Enberg 	be32_to_cpus(&f_header.version);
75786835cedSPrasad Joshi 
758ad627d62SPekka Enberg 	if (f_header.magic != QCOW_MAGIC)
759ad627d62SPekka Enberg 		return false;
76086835cedSPrasad Joshi 
761ad627d62SPekka Enberg 	if (f_header.version != QCOW1_VERSION)
762ad627d62SPekka Enberg 		return false;
76386835cedSPrasad Joshi 
764ad627d62SPekka Enberg 	return true;
76586835cedSPrasad Joshi }
76686835cedSPrasad Joshi 
767f10860caSPekka Enberg struct disk_image *qcow_probe(int fd, bool readonly)
76886835cedSPrasad Joshi {
769ad627d62SPekka Enberg 	if (qcow1_check_image(fd))
770f10860caSPekka Enberg 		return qcow1_probe(fd, readonly);
771ad627d62SPekka Enberg 
772ad627d62SPekka Enberg 	if (qcow2_check_image(fd))
773f10860caSPekka Enberg 		return qcow2_probe(fd, readonly);
774ad627d62SPekka Enberg 
775ad627d62SPekka Enberg 	return NULL;
77686835cedSPrasad Joshi }
777