xref: /src/sys/contrib/openzfs/cmd/zstream/zstream_redup.c (revision 80aae8a3f8aa70712930664572be9e6885dc0be7)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * This file and its contents are supplied under the terms of the
6  * Common Development and Distribution License ("CDDL"), version 1.0.
7  * You may only use this file in accordance with the terms of version
8  * 1.0 of the CDDL.
9  *
10  * A full copy of the text of the CDDL should have accompanied this
11  * source.  A copy of the CDDL is also available via the Internet at
12  * http://www.illumos.org/license/CDDL.
13  *
14  * CDDL HEADER END
15  */
16 
17 /*
18  * Copyright (c) 2020 by Delphix. All rights reserved.
19  */
20 
21 #include <assert.h>
22 #include <cityhash.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <fcntl.h>
26 #include <libzfs.h>
27 #include <libzutil.h>
28 #include <stddef.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <umem.h>
33 #include <unistd.h>
34 #include <sys/debug.h>
35 #include <sys/stat.h>
36 #include <sys/zfs_ioctl.h>
37 #include <sys/zio_checksum.h>
38 #include "zfs_fletcher.h"
39 #include "zstream.h"
40 #include "zstream_util.h"
41 
42 
/*
 * Sizing knobs for the redup hash table.
 *
 * MAX_RDT_PHYSMEM_PERCENT is the share of physical memory (in percent)
 * used as the table's size budget.  SMALLEST_POSSIBLE_MAX_RDT_MB is the
 * lower bound of that budget in megabytes; callers shift it left by 20
 * to obtain bytes.
 */
#define	MAX_RDT_PHYSMEM_PERCENT		20
#define	SMALLEST_POSSIBLE_MAX_RDT_MB		128
45 
/*
 * One node of a redup hash chain.  The (rde_guid, rde_object, rde_offset)
 * triple is the lookup key; rde_stream_offset is the value stored for it.
 */
typedef struct redup_entry {
	struct redup_entry	*rde_next;	/* next node in the same bucket */
	uint64_t		rde_guid;	/* key: guid */
	uint64_t		rde_object;	/* key: object number */
	uint64_t		rde_offset;	/* key: offset within object */
	uint64_t		rde_stream_offset; /* value: input stream offset */
} redup_entry_t;
53 
54 typedef struct redup_table {
55 	redup_entry_t	**redup_hash_array;
56 	umem_cache_t	*ddecache;
57 	uint64_t	ddt_count;
58 	int		numhashbits;
59 } redup_table_t;
60 
/*
 * Safe version of pread(): fills buf with exactly count bytes read from
 * fd starting at offset, or prints a diagnostic and exits.
 *
 * pread() is allowed to transfer fewer bytes than requested without that
 * being an error, and it can be interrupted by a signal, so keep reading
 * until the whole request is satisfied.  Only a genuine I/O error or
 * reaching end-of-file before count bytes were read is fatal.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	char *dst = buf;
	size_t done = 0;

	while (done < count) {
		ssize_t got = pread(fd, dst + done, count - done,
		    offset + (off_t)done);

		if (got == -1) {
			/* An interrupted call transferred nothing; retry. */
			if (errno == EINTR)
				continue;
			(void) fprintf(stderr,
			    "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}
		if (got == 0) {
			/* End-of-file before the full request was read. */
			(void) fprintf(stderr,
			    "Error while reading file: short read\n");
			exit(1);
		}
		done += (size_t)got;
	}
}
79 
80 static void
rdt_insert(redup_table_t * rdt,uint64_t guid,uint64_t object,uint64_t offset,uint64_t stream_offset)81 rdt_insert(redup_table_t *rdt,
82     uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
83 {
84 	uint64_t ch = cityhash3(guid, object, offset);
85 	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
86 	redup_entry_t **rdepp;
87 
88 	rdepp = &(rdt->redup_hash_array[hashcode]);
89 	redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
90 	rde->rde_next = *rdepp;
91 	rde->rde_guid = guid;
92 	rde->rde_object = object;
93 	rde->rde_offset = offset;
94 	rde->rde_stream_offset = stream_offset;
95 	*rdepp = rde;
96 	rdt->ddt_count++;
97 }
98 
99 static void
rdt_lookup(redup_table_t * rdt,uint64_t guid,uint64_t object,uint64_t offset,uint64_t * stream_offsetp)100 rdt_lookup(redup_table_t *rdt,
101     uint64_t guid, uint64_t object, uint64_t offset,
102     uint64_t *stream_offsetp)
103 {
104 	uint64_t ch = cityhash3(guid, object, offset);
105 	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
106 
107 	for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
108 	    rde != NULL; rde = rde->rde_next) {
109 		if (rde->rde_guid == guid &&
110 		    rde->rde_object == object &&
111 		    rde->rde_offset == offset) {
112 			*stream_offsetp = rde->rde_stream_offset;
113 			return;
114 		}
115 	}
116 	assert(!"could not find expected redup table entry");
117 }
118 
119 /*
120  * Convert a dedup stream (generated by "zfs send -D") to a
121  * non-deduplicated stream.  The entire infd will be converted, including
122  * any substreams in a stream package (generated by "zfs send -RD"). The
123  * infd must be seekable.
124  */
125 static void
zfs_redup_stream(int infd,int outfd,boolean_t verbose)126 zfs_redup_stream(int infd, int outfd, boolean_t verbose)
127 {
128 	int bufsz = SPA_MAXBLOCKSIZE;
129 	dmu_replay_record_t thedrr;
130 	dmu_replay_record_t *drr = &thedrr;
131 	redup_table_t rdt;
132 	zio_cksum_t stream_cksum;
133 	uint64_t numbuckets;
134 	uint64_t num_records = 0;
135 	uint64_t num_write_byref_records = 0;
136 
137 	memset(&thedrr, 0, sizeof (dmu_replay_record_t));
138 
139 #ifdef _ILP32
140 	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
141 #else
142 	uint64_t physbytes = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
143 	uint64_t max_rde_size =
144 	    MAX((physbytes * MAX_RDT_PHYSMEM_PERCENT) / 100,
145 	    SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
146 #endif
147 
148 	numbuckets = max_rde_size / (sizeof (redup_entry_t));
149 
150 	/*
151 	 * numbuckets must be a power of 2.  Increase number to
152 	 * a power of 2 if necessary.
153 	 */
154 	if (!ISP2(numbuckets))
155 		numbuckets = 1ULL << highbit64(numbuckets);
156 
157 	rdt.redup_hash_array =
158 	    safe_calloc(numbuckets * sizeof (redup_entry_t *));
159 	rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
160 	    NULL, NULL, NULL, NULL, NULL, 0);
161 	rdt.numhashbits = highbit64(numbuckets) - 1;
162 	rdt.ddt_count = 0;
163 
164 	char *buf = safe_calloc(bufsz);
165 	FILE *ofp = fdopen(infd, "r");
166 	long offset = ftell(ofp);
167 	int begin = 0;
168 	boolean_t seen = B_FALSE;
169 	while (sfread(drr, sizeof (*drr), ofp) != 0) {
170 		num_records++;
171 
172 		/*
173 		 * We need to regenerate the checksum.
174 		 */
175 		if (drr->drr_type != DRR_BEGIN) {
176 			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
177 			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
178 		}
179 
180 		uint64_t payload_size = 0;
181 		switch (drr->drr_type) {
182 		case DRR_BEGIN:
183 		{
184 			struct drr_begin *drrb = &drr->drr_u.drr_begin;
185 			int fflags;
186 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
187 			VERIFY0(begin++);
188 			seen = B_TRUE;
189 
190 			assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
191 
192 			/* clear the DEDUP feature flag for this stream */
193 			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
194 			fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
195 			    DMU_BACKUP_FEATURE_DEDUPPROPS);
196 			/* cppcheck-suppress syntaxError */
197 			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
198 
199 			uint32_t sz = drr->drr_payloadlen;
200 
201 			VERIFY3U(sz, <=, 1U << 28);
202 
203 			if (sz != 0) {
204 				if (sz > bufsz) {
205 					free(buf);
206 					buf = safe_calloc(sz);
207 					bufsz = sz;
208 				}
209 				(void) sfread(buf, sz, ofp);
210 			}
211 			payload_size = sz;
212 			break;
213 		}
214 
215 		case DRR_END:
216 		{
217 			struct drr_end *drre = &drr->drr_u.drr_end;
218 			/*
219 			 * We would prefer to just check --begin == 0, but
220 			 * replication streams have an end of stream END
221 			 * record, so we must avoid tripping it.
222 			 */
223 			VERIFY3B(seen, ==, B_TRUE);
224 			begin--;
225 			/*
226 			 * Use the recalculated checksum, unless this is
227 			 * the END record of a stream package, which has
228 			 * no checksum.
229 			 */
230 			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
231 				drre->drr_checksum = stream_cksum;
232 			break;
233 		}
234 
235 		case DRR_OBJECT:
236 		{
237 			struct drr_object *drro = &drr->drr_u.drr_object;
238 			VERIFY3S(begin, ==, 1);
239 
240 			if (drro->drr_bonuslen > 0) {
241 				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
242 				(void) sfread(buf, payload_size, ofp);
243 			}
244 			break;
245 		}
246 
247 		case DRR_SPILL:
248 		{
249 			struct drr_spill *drrs = &drr->drr_u.drr_spill;
250 			VERIFY3S(begin, ==, 1);
251 			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
252 			(void) sfread(buf, payload_size, ofp);
253 			break;
254 		}
255 
256 		case DRR_WRITE_BYREF:
257 		{
258 			struct drr_write_byref drrwb =
259 			    drr->drr_u.drr_write_byref;
260 			VERIFY3S(begin, ==, 1);
261 
262 			num_write_byref_records++;
263 
264 			/*
265 			 * Look up in hash table by drrwb->drr_refguid,
266 			 * drr_refobject, drr_refoffset.  Replace this
267 			 * record with the found WRITE record, but with
268 			 * drr_object,drr_offset,drr_toguid replaced with ours.
269 			 */
270 			uint64_t stream_offset = 0;
271 			rdt_lookup(&rdt, drrwb.drr_refguid,
272 			    drrwb.drr_refobject, drrwb.drr_refoffset,
273 			    &stream_offset);
274 
275 			spread(infd, drr, sizeof (*drr), stream_offset);
276 
277 			assert(drr->drr_type == DRR_WRITE);
278 			struct drr_write *drrw = &drr->drr_u.drr_write;
279 			assert(drrw->drr_toguid == drrwb.drr_refguid);
280 			assert(drrw->drr_object == drrwb.drr_refobject);
281 			assert(drrw->drr_offset == drrwb.drr_refoffset);
282 
283 			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
284 			spread(infd, buf, payload_size,
285 			    stream_offset + sizeof (*drr));
286 
287 			drrw->drr_toguid = drrwb.drr_toguid;
288 			drrw->drr_object = drrwb.drr_object;
289 			drrw->drr_offset = drrwb.drr_offset;
290 			break;
291 		}
292 
293 		case DRR_WRITE:
294 		{
295 			struct drr_write *drrw = &drr->drr_u.drr_write;
296 			VERIFY3S(begin, ==, 1);
297 			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
298 			(void) sfread(buf, payload_size, ofp);
299 
300 			rdt_insert(&rdt, drrw->drr_toguid,
301 			    drrw->drr_object, drrw->drr_offset, offset);
302 			break;
303 		}
304 
305 		case DRR_WRITE_EMBEDDED:
306 		{
307 			struct drr_write_embedded *drrwe =
308 			    &drr->drr_u.drr_write_embedded;
309 			VERIFY3S(begin, ==, 1);
310 			payload_size =
311 			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
312 			(void) sfread(buf, payload_size, ofp);
313 			break;
314 		}
315 
316 		case DRR_FREEOBJECTS:
317 		case DRR_FREE:
318 		case DRR_OBJECT_RANGE:
319 			VERIFY3S(begin, ==, 1);
320 			break;
321 
322 		default:
323 			(void) fprintf(stderr, "INVALID record type 0x%x\n",
324 			    drr->drr_type);
325 			/* should never happen, so assert */
326 			assert(B_FALSE);
327 		}
328 
329 		if (feof(ofp)) {
330 			fprintf(stderr, "Error: unexpected end-of-file\n");
331 			exit(1);
332 		}
333 		if (ferror(ofp)) {
334 			fprintf(stderr, "Error while reading file: %s\n",
335 			    strerror(errno));
336 			exit(1);
337 		}
338 
339 		/*
340 		 * We need to recalculate the checksum, and it needs to be
341 		 * initially zero to do that.  BEGIN records don't have
342 		 * a checksum.
343 		 */
344 		if (drr->drr_type != DRR_BEGIN) {
345 			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
346 			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
347 		}
348 		if (dump_record(drr, buf, payload_size,
349 		    &stream_cksum, outfd) != 0)
350 			break;
351 		if (drr->drr_type == DRR_END) {
352 			/*
353 			 * Typically the END record is either the last
354 			 * thing in the stream, or it is followed
355 			 * by a BEGIN record (which also zeros the checksum).
356 			 * However, a stream package ends with two END
357 			 * records.  The last END record's checksum starts
358 			 * from zero.
359 			 */
360 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
361 		}
362 		offset = ftell(ofp);
363 	}
364 
365 	if (verbose) {
366 		char mem_str[16];
367 		zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
368 		    mem_str, sizeof (mem_str));
369 		fprintf(stderr, "converted stream with %llu total records, "
370 		    "including %llu dedup records, using %sB memory.\n",
371 		    (long long)num_records,
372 		    (long long)num_write_byref_records,
373 		    mem_str);
374 	}
375 
376 	umem_cache_destroy(rdt.ddecache);
377 	free(rdt.redup_hash_array);
378 	free(buf);
379 	(void) fclose(ofp);
380 }
381 
382 int
zstream_do_redup(int argc,char * argv[])383 zstream_do_redup(int argc, char *argv[])
384 {
385 	boolean_t verbose = B_FALSE;
386 	int c;
387 
388 	while ((c = getopt(argc, argv, "v")) != -1) {
389 		switch (c) {
390 		case 'v':
391 			verbose = B_TRUE;
392 			break;
393 		case '?':
394 			(void) fprintf(stderr, "invalid option '%c'\n",
395 			    optopt);
396 			zstream_usage();
397 			break;
398 		}
399 	}
400 
401 	argc -= optind;
402 	argv += optind;
403 
404 	if (argc != 1)
405 		zstream_usage();
406 
407 	const char *filename = argv[0];
408 
409 	if (isatty(STDOUT_FILENO)) {
410 		(void) fprintf(stderr,
411 		    "Error: Stream can not be written to a terminal.\n"
412 		    "You must redirect standard output.\n");
413 		return (1);
414 	}
415 
416 	int fd = open(filename, O_RDONLY);
417 	if (fd == -1) {
418 		(void) fprintf(stderr,
419 		    "Error while opening file '%s': %s\n",
420 		    filename, strerror(errno));
421 		exit(1);
422 	}
423 
424 	fletcher_4_init();
425 	zfs_redup_stream(fd, STDOUT_FILENO, verbose);
426 	fletcher_4_fini();
427 
428 	close(fd);
429 
430 	return (0);
431 }
432