/* SPDX-License-Identifier: MIT */
/*
 * Description: uring_cmd based ublk
 */

#include <linux/fs.h>
#include <sys/un.h>
#include "kublk.h"

#define MAX_NR_TGT_ARG	64

unsigned int ublk_dbg_mask = UBLK_LOG;
static const struct ublk_tgt_ops *tgt_ops_list[] = {
	&null_tgt_ops,
	&loop_tgt_ops,
	&stripe_tgt_ops,
	&fault_inject_tgt_ops,
};

static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
{
	int i;

	if (name == NULL)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
		if (strcmp(tgt_ops_list[i]->name, name) == 0)
			return tgt_ops_list[i];
	return NULL;
}

static inline int ublk_setup_ring(struct io_uring *r, int depth,
		int cq_depth, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags | IORING_SETUP_CQSIZE;
	p.cq_entries = cq_depth;

	return io_uring_queue_init_params(depth, r, &p);
}

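/*
 * Control commands are encoded as IORING_OP_URING_CMD SQEs. The control
 * ring is created with IORING_SETUP_SQE128 so that the whole
 * struct ublksrv_ctrl_cmd fits into the SQE's inline command area,
 * which ublk_ctrl_init_cmd() below fills in.
 */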
static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
		struct io_uring_sqe *sqe,
		struct ublk_ctrl_cmd_data *data)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);

	sqe->fd = dev->ctrl_fd;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->ioprio = 0;

	if (data->flags & CTRL_CMD_HAS_BUF) {
		cmd->addr = data->addr;
		cmd->len = data->len;
	}

	if (data->flags & CTRL_CMD_HAS_DATA)
		cmd->data[0] = data->data[0];

	cmd->dev_id = info->dev_id;
	cmd->queue_id = -1;

	ublk_set_sqe_cmd_op(sqe, data->cmd_op);

	io_uring_sqe_set_data(sqe, cmd);
}

static int __ublk_ctrl_cmd(struct ublk_dev *dev,
		struct ublk_ctrl_cmd_data *data)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret = -EINVAL;

	sqe = io_uring_get_sqe(&dev->ring);
	if (!sqe) {
		ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
		return ret;
	}

	ublk_ctrl_init_cmd(dev, sqe, data);

	ret = io_uring_submit(&dev->ring);
	if (ret < 0) {
		ublk_err("uring submit ret %d\n", ret);
		return ret;
	}

	ret = io_uring_wait_cqe(&dev->ring, &cqe);
	if (ret < 0) {
		ublk_err("wait cqe: %s\n", strerror(-ret));
		return ret;
	}
	io_uring_cqe_seen(&dev->ring, cqe);

	return cqe->res;
}

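/*
 * __ublk_ctrl_cmd() is fully synchronous: it queues one control SQE,
 * submits it, and blocks until the matching CQE arrives, returning
 * cqe->res. The thin wrappers below differ only in cmd_op and in which
 * buf/data fields they fill, e.g. (illustrative sketch):
 *
 *	struct ublk_ctrl_cmd_data data = {
 *		.cmd_op	= UBLK_U_CMD_GET_DEV_INFO,
 *		.flags	= CTRL_CMD_HAS_BUF,
 *		.addr	= (__u64) (uintptr_t) &dev->dev_info,
 *		.len	= sizeof(struct ublksrv_ctrl_dev_info),
 *	};
 *	int ret = __ublk_ctrl_cmd(dev, &data);
 */
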
static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_STOP_DEV,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_TRY_STOP_DEV,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_start_dev(struct ublk_dev *dev,
		int daemon_pid)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_START_DEV,
		.flags	= CTRL_CMD_HAS_DATA,
	};

	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_START_USER_RECOVERY,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_END_USER_RECOVERY,
		.flags	= CTRL_CMD_HAS_DATA,
	};

	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_add_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_ADD_DEV,
		.flags	= CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) &dev->dev_info,
		.len = sizeof(struct ublksrv_ctrl_dev_info),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_del_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_DEL_DEV,
		.flags = 0,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_info(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_GET_DEV_INFO,
		.flags	= CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) &dev->dev_info,
		.len = sizeof(struct ublksrv_ctrl_dev_info),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_set_params(struct ublk_dev *dev,
		struct ublk_params *params)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_SET_PARAMS,
		.flags	= CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) params,
		.len = sizeof(*params),
	};
	params->len = sizeof(*params);
	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_params(struct ublk_dev *dev,
		struct ublk_params *params)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_GET_PARAMS,
		.flags	= CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) params,
		.len = sizeof(*params),
	};

	params->len = sizeof(*params);

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_features(struct ublk_dev *dev,
		__u64 *features)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_GET_FEATURES,
		.flags	= CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) features,
		.len = sizeof(*features),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_update_size(struct ublk_dev *dev,
		__u64 nr_sects)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_UPDATE_SIZE,
		.flags	= CTRL_CMD_HAS_DATA,
	};

	data.data[0] = nr_sects;
	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
				 unsigned int timeout_ms)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_QUIESCE_DEV,
		.flags	= CTRL_CMD_HAS_DATA,
	};

	data.data[0] = timeout_ms;
	return __ublk_ctrl_cmd(dev, &data);
}

static const char *ublk_dev_state_desc(struct ublk_dev *dev)
{
	switch (dev->dev_info.state) {
	case UBLK_S_DEV_DEAD:
		return "DEAD";
	case UBLK_S_DEV_LIVE:
		return "LIVE";
	case UBLK_S_DEV_QUIESCED:
		return "QUIESCED";
	default:
		return "UNKNOWN";
	}
}

static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
{
	unsigned done = 0;
	int i;

	for (i = 0; i < CPU_SETSIZE; i++) {
		if (CPU_ISSET(i, set))
			done += snprintf(&buf[done], len - done, "%d ", i);
	}
}

static void ublk_adjust_affinity(cpu_set_t *set)
{
	int j, updated = 0;

	/*
	 * Just keep the 1st CPU now.
	 *
	 * In future, auto affinity selection can be tried.
	 */
	for (j = 0; j < CPU_SETSIZE; j++) {
		if (CPU_ISSET(j, set)) {
			if (!updated) {
				updated = 1;
				continue;
			}
			CPU_CLR(j, set);
		}
	}
}

/* Caller must free the allocated buffer */
static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op	= UBLK_U_CMD_GET_QUEUE_AFFINITY,
		.flags	= CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
	};
	cpu_set_t *buf;
	int i, ret;

	buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
	if (!buf)
		return -ENOMEM;

	for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
		data.data[0] = i;
		data.len = sizeof(cpu_set_t);
		data.addr = (__u64) (uintptr_t) &buf[i];

		ret = __ublk_ctrl_cmd(ctrl_dev, &data);
		if (ret < 0) {
			free(buf);
			return ret;
		}
		ublk_adjust_affinity(&buf[i]);
	}

	*ptr_buf = buf;
	return 0;
}

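/*
 * Note: ublk_ctrl_get_affinity() returns an array of nr_hw_queues
 * cpu_set_t entries, one per queue, each already trimmed by
 * ublk_adjust_affinity() to its first CPU. Callers index it as
 * affinity[q_id] and must free() it, as ublk_ctrl_dump() below does.
 */
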
static void ublk_ctrl_dump(struct ublk_dev *dev)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublk_params p;
	cpu_set_t *affinity;
	int ret;

	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		return;
	}

	ret = ublk_ctrl_get_affinity(dev, &affinity);
	if (ret < 0) {
		ublk_err("failed to get affinity %m\n");
		return;
	}

	ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
			info->dev_id, info->nr_hw_queues, info->queue_depth,
			1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
	ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
			info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
			ublk_dev_state_desc(dev));

	if (affinity) {
		char buf[512];
		int i;

		for (i = 0; i < info->nr_hw_queues; i++) {
			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
			printf("\tqueue %u: affinity(%s)\n",
					i, buf);
		}
		free(affinity);
	}

	fflush(stdout);
}

static void ublk_ctrl_deinit(struct ublk_dev *dev)
{
	close(dev->ctrl_fd);
	free(dev);
}

static struct ublk_dev *ublk_ctrl_init(void)
{
	struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
	struct ublksrv_ctrl_dev_info *info;
	int ret;

	/* calloc() may fail; bail out before touching the device */
	if (!dev)
		return NULL;

	info = &dev->dev_info;
	dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
	if (dev->ctrl_fd < 0) {
		free(dev);
		return NULL;
	}

	info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;

	ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
			UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
	if (ret < 0) {
		ublk_err("queue_init: %s\n", strerror(-ret));
		close(dev->ctrl_fd);
		free(dev);
		return NULL;
	}
	dev->nr_fds = 1;

	return dev;
}

static int __ublk_queue_cmd_buf_sz(unsigned depth)
{
	int size = depth * sizeof(struct ublksrv_io_desc);
	unsigned int page_sz = getpagesize();

	return round_up(size, page_sz);
}

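/*
 * Worked example of the rounding above (assuming 4K pages): with a queue
 * depth of 128 and the 24-byte struct ublksrv_io_desc, 128 * 24 = 3072
 * bytes rounds up to a single 4096-byte page. ublk_queue_init() computes
 * each queue's mmap offset from the *maximum* depth, so every queue's
 * command buffer sits at a fixed, depth-independent stride.
 */
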
static int ublk_queue_max_cmd_buf_sz(void)
{
	return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
}

static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
{
	return __ublk_queue_cmd_buf_sz(q->q_depth);
}

static void ublk_queue_deinit(struct ublk_queue *q)
{
	int i;
	int nr_ios = q->q_depth;

	if (q->io_cmd_buf)
		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));

	for (i = 0; i < nr_ios; i++) {
		free(q->ios[i].buf_addr);
		free(q->ios[i].integrity_buf);
	}
}

static void ublk_thread_deinit(struct ublk_thread *t)
{
	io_uring_unregister_buffers(&t->ring);

	ublk_batch_free_buf(t);

	io_uring_unregister_ring_fd(&t->ring);

	if (t->ring.ring_fd > 0) {
		io_uring_unregister_files(&t->ring);
		close(t->ring.ring_fd);
		t->ring.ring_fd = -1;
	}
}

static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags,
			   __u8 metadata_size)
{
	struct ublk_dev *dev = q->dev;
	int depth = dev->dev_info.queue_depth;
	int i;
	int cmd_buf_size, io_buf_size, integrity_size;
	unsigned long off;

	pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE);
	q->tgt_ops = dev->tgt.ops;
	q->q_depth = depth;
	q->flags = dev->dev_info.flags;
	q->flags |= extra_flags;
	q->metadata_size = metadata_size;

	/* Cache fd in queue for fast path access */
	q->ublk_fd = dev->fds[0];

	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
			MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
	if (q->io_cmd_buf == MAP_FAILED) {
		ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
				q->dev->dev_info.dev_id, q->q_id);
		goto fail;
	}

	io_buf_size = dev->dev_info.max_io_buf_bytes;
	integrity_size = ublk_integrity_len(q, io_buf_size);
	for (i = 0; i < q->q_depth; i++) {
		q->ios[i].buf_addr = NULL;
		q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
		q->ios[i].tag = i;

		if (integrity_size) {
			q->ios[i].integrity_buf = malloc(integrity_size);
			if (!q->ios[i].integrity_buf) {
				ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n",
					 dev->dev_info.dev_id, q->q_id, i,
					 integrity_size);
				goto fail;
			}
		}

		if (ublk_queue_no_buf(q))
			continue;

		if (posix_memalign((void **)&q->ios[i].buf_addr,
					getpagesize(), io_buf_size)) {
			ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
					dev->dev_info.dev_id, q->q_id, i);
			goto fail;
		}
	}

	return 0;
 fail:
	ublk_queue_deinit(q);
	ublk_err("ublk dev %d queue %d failed\n",
			dev->dev_info.dev_id, q->q_id);
	return -ENOMEM;
}

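/*
 * Note on ublk_queue_init() above: io_cmd_buf is mapped read-only since
 * the driver fills it, placing one struct ublksrv_io_desc per tag for
 * ublk_get_iod() to consult. Per-io data buffers are allocated only when
 * ublk_queue_no_buf() says the queue actually needs them.
 */
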
static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
{
	struct ublk_dev *dev = t->dev;
	unsigned long long flags = dev->dev_info.flags | extra_flags;
	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
	int ret;

	/* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */
	if (ublk_dev_batch_io(dev))
		cq_depth += dev->dev_info.queue_depth * 2;

	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
			IORING_SETUP_COOP_TASKRUN |
			IORING_SETUP_SINGLE_ISSUER |
			IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0) {
		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
				dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;

		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
		t->nr_bufs = max_nr_ios_per_thread;
	} else {
		t->nr_bufs = 0;
	}

	if (ublk_dev_batch_io(dev))
		ublk_batch_prepare(t);

	if (t->nr_bufs) {
		ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs);
		if (ret) {
			ublk_err("ublk dev %d thread %d register sparse buffers failed %d\n",
					dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	if (ublk_dev_batch_io(dev)) {
		ret = ublk_batch_alloc_buf(t);
		if (ret) {
			ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n",
				dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	io_uring_register_ring_fd(&t->ring);

	if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
		/* Register only backing files starting from index 1, exclude ublk control device */
		if (dev->nr_fds > 1) {
			ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
		} else {
			/* No backing files to register, skip file registration */
			ret = 0;
		}
	} else {
		ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
	}
	if (ret) {
		ublk_err("ublk dev %d thread %d register files failed %d\n",
				t->dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	return 0;
fail:
	ublk_thread_deinit(t);
	ublk_err("ublk dev %d thread %d init failed\n",
			dev->dev_info.dev_id, t->idx);
	return -ENOMEM;
}

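/*
 * The nr_bufs computation in ublk_thread_init() is a ceiling division:
 * for example, 4 queues of depth 128 served by 3 threads gives
 * nr_ios = 512, so each thread registers 512 / 3 + 1 = 171 sparse buffer
 * slots, enough for the worst-case share of ios mapped to one thread.
 */
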
#define WAIT_USEC	100000
#define MAX_WAIT_USEC	(3 * 1000000)
static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	int dev_id = dev->dev_info.dev_id;
	unsigned int wait_usec = 0;
	int ret = 0, fd = -1;
	char buf[64];

	snprintf(buf, sizeof(buf), "%s%d", UBLKC_DEV, dev_id);

	while (wait_usec < MAX_WAIT_USEC) {
		fd = open(buf, O_RDWR);
		if (fd >= 0)
			break;
		usleep(WAIT_USEC);
		wait_usec += WAIT_USEC;
	}
	if (fd < 0) {
		ublk_err("can't open %s %s\n", buf, strerror(errno));
		return -1;
	}

	dev->fds[0] = fd;
	if (dev->tgt.ops->init_tgt)
		ret = dev->tgt.ops->init_tgt(ctx, dev);
	if (ret)
		close(dev->fds[0]);
	return ret;
}

static void ublk_dev_unprep(struct ublk_dev *dev)
{
	if (dev->tgt.ops->deinit_tgt)
		dev->tgt.ops->deinit_tgt(dev);
	close(dev->fds[0]);
}

static void ublk_set_auto_buf_reg(const struct ublk_thread *t,
				  const struct ublk_queue *q,
				  struct io_uring_sqe *sqe,
				  unsigned short tag)
{
	struct ublk_auto_buf_reg buf = {};

	if (q->tgt_ops->buf_index)
		buf.index = q->tgt_ops->buf_index(t, q, tag);
	else
		buf.index = ublk_io_buf_idx(t, q, tag);

	if (ublk_queue_auto_zc_fallback(q))
		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;

	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
}

/* Copy in pieces to test the buffer offset logic */
#define UBLK_USER_COPY_LEN 2048

static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
{
	const struct ublk_queue *q = ublk_io_to_queue(io);
	const struct ublksrv_io_desc *iod = ublk_get_iod(q, io->tag);
	__u64 off = ublk_user_copy_offset(q->q_id, io->tag);
	__u8 ublk_op = ublksrv_get_op(iod);
	__u32 len = iod->nr_sectors << 9;
	void *addr = io->buf_addr;
	ssize_t copied;

	if (ublk_op != match_ublk_op)
		return;

	while (len) {
		__u32 copy_len = min(len, UBLK_USER_COPY_LEN);

		if (ublk_op == UBLK_IO_OP_WRITE)
			copied = pread(q->ublk_fd, addr, copy_len, off);
		else if (ublk_op == UBLK_IO_OP_READ)
			copied = pwrite(q->ublk_fd, addr, copy_len, off);
		else
			assert(0);
		assert(copied == (ssize_t)copy_len);
		addr += copy_len;
		off += copy_len;
		len -= copy_len;
	}

	if (!(iod->op_flags & UBLK_IO_F_INTEGRITY))
		return;

	len = ublk_integrity_len(q, iod->nr_sectors << 9);
	off = ublk_user_copy_offset(q->q_id, io->tag);
	off |= UBLKSRV_IO_INTEGRITY_FLAG;
	if (ublk_op == UBLK_IO_OP_WRITE)
		copied = pread(q->ublk_fd, io->integrity_buf, len, off);
	else if (ublk_op == UBLK_IO_OP_READ)
		copied = pwrite(q->ublk_fd, io->integrity_buf, len, off);
	else
		assert(0);
	assert(copied == (ssize_t)len);
}

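/*
 * Direction note for ublk_user_copy(): the ublk char device acts as the
 * transfer window, so a block-layer WRITE is served by pread()-ing the
 * payload out of the device into the daemon buffer, and a READ by
 * pwrite()-ing the daemon buffer back in. Integrity metadata travels
 * through the same per-io offset with UBLKSRV_IO_INTEGRITY_FLAG or-ed in.
 */
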
int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
{
	struct ublk_queue *q = ublk_io_to_queue(io);
	struct ublksrv_io_cmd *cmd;
	struct io_uring_sqe *sqe[1];
	unsigned int cmd_op = 0;
	__u64 user_data;

	/* only freed io can be issued */
	if (!(io->flags & UBLKS_IO_FREE))
		return 0;

	/*
	 * Only issue an io command when we need to fetch, commit, or
	 * get data.
	 */
	if (!(io->flags &
		(UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
		return 0;

	if (io->flags & UBLKS_IO_NEED_GET_DATA)
		cmd_op = UBLK_U_IO_NEED_GET_DATA;
	else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) {
		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_READ);

		cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
	} else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
		cmd_op = UBLK_U_IO_FETCH_REQ;

	if (io_uring_sq_space_left(&t->ring) < 1)
		io_uring_submit(&t->ring);

	ublk_io_alloc_sqes(t, sqe, 1);
	if (!sqe[0]) {
		ublk_err("%s: run out of sqe. thread %u, tag %d\n",
				__func__, t->idx, io->tag);
		return -1;
	}

	cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);

	if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
		cmd->result = io->result;

	/* These fields should be written once, never change */
	ublk_set_sqe_cmd_op(sqe[0], cmd_op);
	sqe[0]->fd	= ublk_get_registered_fd(q, 0);	/* dev->fds[0] */
	sqe[0]->opcode	= IORING_OP_URING_CMD;
	if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
		sqe[0]->flags	= 0;  /* Use raw FD, not fixed file */
	else
		sqe[0]->flags	= IOSQE_FIXED_FILE;
	sqe[0]->rw_flags	= 0;
	cmd->tag	= io->tag;
	cmd->q_id	= q->q_id;
	if (!ublk_queue_no_buf(q) && !ublk_queue_use_user_copy(q))
		cmd->addr	= (__u64) (uintptr_t) io->buf_addr;
	else
		cmd->addr	= 0;

	if (ublk_queue_use_auto_zc(q))
		ublk_set_auto_buf_reg(t, q, sqe[0], io->tag);

	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
	io_uring_sqe_set_data64(sqe[0], user_data);

	io->flags = 0;

	t->cmd_inflight += 1;

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
			__func__, t->idx, q->q_id, io->tag, cmd_op,
			io->flags, !!(t->state & UBLKS_T_STOPPING));
	return 1;
}

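/*
 * ublk_queue_io_cmd() above implements the FETCH/COMMIT state machine:
 * an io is (re)armed only when it is both free and flagged as needing a
 * fetch, commit, or get-data round trip. COMMIT_AND_FETCH piggybacks the
 * result of the previous request onto the SQE that fetches the next one,
 * so the steady state costs a single uring_cmd per request.
 */
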
static void ublk_submit_fetch_commands(struct ublk_thread *t)
{
	struct ublk_queue *q;
	struct ublk_io *io;
	int i = 0, j = 0;

	if (t->dev->per_io_tasks) {
		/*
		 * Lexicographically order all the (qid,tag) pairs, with
		 * qid taking priority (so (1,0) > (0,1)). Then make
		 * this thread the daemon for every Nth entry in this
		 * list (N is the number of threads), starting at this
		 * thread's index. This ensures that each queue is
		 * handled by as many ublk server threads as possible,
		 * so that load that is concentrated on one or a few
		 * queues can make use of all ublk server threads.
		 */
		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;

		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
			int q_id = i / dinfo->queue_depth;
			int tag = i % dinfo->queue_depth;

			q = &t->dev->q[q_id];
			io = &q->ios[tag];
			io->buf_index = j++;
			if (q->tgt_ops->pre_fetch_io)
				q->tgt_ops->pre_fetch_io(t, q, tag, false);
			ublk_queue_io_cmd(t, io);
		}
	} else {
		/*
		 * Service exclusively the queue whose q_id matches our
		 * thread index.
		 */
		struct ublk_queue *q = &t->dev->q[t->idx];

		for (i = 0; i < q->q_depth; i++) {
			io = &q->ios[i];
			io->buf_index = i;
			if (q->tgt_ops->pre_fetch_io)
				q->tgt_ops->pre_fetch_io(t, q, i, false);
			ublk_queue_io_cmd(t, io);
		}
	}
}

static int ublk_thread_is_idle(struct ublk_thread *t)
{
	return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
}

static int ublk_thread_is_done(struct ublk_thread *t)
{
	return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight;
}

static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
					  struct ublk_queue *q,
					  struct io_uring_cqe *cqe)
{
	if (cqe->res < 0 && cqe->res != -EAGAIN)
		ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
			__func__, cqe->res, q->q_id,
			user_data_to_tag(cqe->user_data),
			user_data_to_op(cqe->user_data));

	if (q->tgt_ops->tgt_io_done)
		q->tgt_ops->tgt_io_done(t, q, cqe);
}

static void ublk_handle_uring_cmd(struct ublk_thread *t,
				  struct ublk_queue *q,
				  const struct io_uring_cqe *cqe)
{
	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
		!(t->state & UBLKS_T_STOPPING);
	unsigned tag = user_data_to_tag(cqe->user_data);
	struct ublk_io *io = &q->ios[tag];

	t->cmd_inflight--;

	if (!fetch) {
		t->state |= UBLKS_T_STOPPING;
		io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
	}

	if (cqe->res == UBLK_IO_RES_OK) {
		ublk_assert(tag < q->q_depth);

		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_WRITE);

		if (q->tgt_ops->queue_io)
			q->tgt_ops->queue_io(t, q, tag);
	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
		io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
		ublk_queue_io_cmd(t, io);
	} else {
		/*
		 * COMMIT_REQ will be completed immediately since no fetching
		 * piggyback is required.
		 *
		 * Marking IO_FREE only, then this io won't be issued since
		 * we only issue io with (UBLKS_IO_FREE | UBLKSRV_NEED_*)
		 */
		io->flags = UBLKS_IO_FREE;
	}
}

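/*
 * CQE demultiplexing below relies on the user_data layout produced by
 * build_user_data(): tag, command op, q_id and a target-io marker are
 * all recoverable via the user_data_to_*() helpers, so a completion can
 * be routed to either the target's tgt_io_done() hook or the uring_cmd
 * handler above without any lookup table.
 */
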
static void ublk_handle_cqe(struct ublk_thread *t,
		struct io_uring_cqe *cqe, void *data)
{
	struct ublk_dev *dev = t->dev;
	unsigned q_id = user_data_to_q_id(cqe->user_data);
	unsigned cmd_op = user_data_to_op(cqe->user_data);

	if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS)
		ublk_err("%s: res %d userdata %llx thread state %x\n", __func__,
				cqe->res, cqe->user_data, t->state);

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x "
			"data %lx target %d/%d) stopping %d\n",
			__func__, cqe->res, t->idx, q_id,
			user_data_to_tag(cqe->user_data),
			cmd_op, cqe->user_data, is_target_io(cqe->user_data),
			user_data_to_tgt_data(cqe->user_data),
			(t->state & UBLKS_T_STOPPING));

	/* Don't retrieve io in case of target io */
	if (is_target_io(cqe->user_data)) {
		ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe);
		return;
	}

	if (ublk_thread_batch_io(t))
		ublk_batch_compl_cmd(t, cqe);
	else
		ublk_handle_uring_cmd(t, &dev->q[q_id], cqe);
}

static int ublk_reap_events_uring(struct ublk_thread *t)
{
	struct io_uring_cqe *cqe;
	unsigned head;
	int count = 0;

	io_uring_for_each_cqe(&t->ring, head, cqe) {
		ublk_handle_cqe(t, cqe, NULL);
		count += 1;
	}
	io_uring_cq_advance(&t->ring, count);

	return count;
}

static int ublk_process_io(struct ublk_thread *t)
{
	int ret, reapped;

	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
				t->dev->dev_info.dev_id,
				t->idx, io_uring_sq_ready(&t->ring),
				t->cmd_inflight,
				(t->state & UBLKS_T_STOPPING));

	if (ublk_thread_is_done(t))
		return -ENODEV;

	ret = io_uring_submit_and_wait(&t->ring, 1);
	if (ublk_thread_batch_io(t)) {
		ublk_batch_prep_commit(t);
		reapped = ublk_reap_events_uring(t);
		ublk_batch_commit_io_cmds(t);
	} else {
		reapped = ublk_reap_events_uring(t);
	}

	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
			ret, reapped, (t->state & UBLKS_T_STOPPING),
			(t->state & UBLKS_T_IDLE));

	return reapped;
}

struct ublk_thread_info {
	struct ublk_dev		*dev;
	pthread_t		thread;
	unsigned		idx;
	sem_t			*ready;
	cpu_set_t		*affinity;
	unsigned long long	extra_flags;
	unsigned char		(*q_thread_map)[UBLK_MAX_QUEUES];
};

static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
{
	/* pthread_setaffinity_np() returns 0 or a positive error number */
	if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) != 0)
		ublk_err("ublk dev %u thread %u set affinity failed\n",
				info->dev->dev_info.dev_id, info->idx);
}

static void ublk_batch_setup_queues(struct ublk_thread *t)
{
	int i;

	for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
		struct ublk_queue *q = &t->dev->q[i];
		int ret;

		/*
		 * Only prepare io commands in the mapped thread context,
		 * otherwise the io command buffer index may not work as
		 * expected.
		 */
		if (t->q_map[i] == 0)
			continue;

		if (q->tgt_ops->pre_fetch_io)
			q->tgt_ops->pre_fetch_io(t, q, 0, true);

		ret = ublk_batch_queue_prep_io_cmds(t, q);
		ublk_assert(ret >= 0);
	}
}

static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
{
	struct ublk_thread t = {
		.dev = info->dev,
		.idx = info->idx,
	};
	int dev_id = info->dev->dev_info.dev_id;
	int ret;

	/* Copy per-thread queue mapping into thread-local variable */
	if (info->q_thread_map)
		memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map));

	ret = ublk_thread_init(&t, info->extra_flags);
	if (ret) {
		ublk_err("ublk dev %d thread %u init failed\n",
				dev_id, t.idx);
		return ret;
	}
	sem_post(info->ready);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
			gettid(), dev_id, t.idx);

	if (!ublk_thread_batch_io(&t)) {
		/* submit all io commands to ublk driver */
		ublk_submit_fetch_commands(&t);
	} else {
		ublk_batch_setup_queues(&t);
		ublk_batch_start_fetch(&t);
	}

	do {
		if (ublk_process_io(&t) < 0)
			break;
	} while (1);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
		 gettid(), dev_id, t.idx);
	ublk_thread_deinit(&t);
	return 0;
}

static void *ublk_io_handler_fn(void *data)
{
	struct ublk_thread_info *info = data;

	/*
	 * IO performance is sensitive to queue/pthread affinity on NUMA
	 * machines.
	 *
	 * Set the scheduling affinity first, so that memory/pages allocated
	 * afterwards are CPU/NUMA aware.
	 */
	if (info->affinity)
		ublk_thread_set_sched_affinity(info);

	__ublk_io_handler_fn(info);

	return NULL;
}

static void ublk_set_parameters(struct ublk_dev *dev)
{
	int ret;

	ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
	if (ret)
		ublk_err("dev %d set basic parameter failed %d\n",
				dev->dev_info.dev_id, ret);
}

static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
{
	uint64_t id;
	int evtfd = ctx->_evtfd;

	if (evtfd < 0)
		return -EBADF;

	if (dev_id >= 0)
		id = dev_id + 1;
	else
		id = ERROR_EVTFD_DEVID;

	if (dev && ctx->shadow_dev)
		memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));

	if (write(evtfd, &id, sizeof(id)) != sizeof(id))
		return -EINVAL;

	close(evtfd);
	shmdt(ctx->shadow_dev);

	return 0;
}

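/*
 * The eventfd written by ublk_send_dev_event() above is the
 * daemonization handshake: the foreground parent blocks in read(2) on
 * ctx->_evtfd (see cmd_dev_add()) until the detached child reports
 * either dev_id + 1 on success or ERROR_EVTFD_DEVID on failure, with
 * queue state mirrored through the shared-memory shadow_dev.
 */
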
/*
 * Shared memory registration socket listener.
 *
 * The parent daemon context listens on a per-device unix socket at
 * /run/ublk/ublkb<dev_id>.sock for shared memory registration requests
 * from clients. Clients send a memfd via SCM_RIGHTS; the server
 * registers it with the kernel, mmaps it, and returns the assigned index.
 */
#define UBLK_SHMEM_SOCK_DIR	"/run/ublk"

/* defined in kublk.h, shared with file_backed.c (loop target) */
struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
int shmem_count;

static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len)
{
	snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id);
}

static int ublk_shmem_sock_create(int dev_id)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX };
	char path[108];	/* matches sizeof(addr.sun_path) on Linux */
	int fd;

	mkdir(UBLK_SHMEM_SOCK_DIR, 0755);
	ublk_shmem_sock_path(dev_id, path, sizeof(path));
	unlink(path);

	fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
	if (fd < 0)
		return -1;

	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}

	listen(fd, 4);
	ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path);
	return fd;
}

static void ublk_shmem_sock_destroy(int dev_id, int sock_fd)
{
	char path[108];	/* matches sizeof(sun_path) on Linux */

	if (sock_fd >= 0)
		close(sock_fd);
	ublk_shmem_sock_path(dev_id, path, sizeof(path));
	unlink(path);
}

/* Receive a memfd from a client via SCM_RIGHTS */
static int ublk_shmem_recv_fd(int client_fd)
{
	char buf[1];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	union {
		char cmsg_buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.cmsg_buf,
		.msg_controllen = sizeof(u.cmsg_buf),
	};
	struct cmsghdr *cmsg;

	if (recvmsg(client_fd, &msg, 0) <= 0)
		return -1;

	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS)
		return -1;

	return *(int *)CMSG_DATA(cmsg);
}

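/*
 * A client registers memory by connecting to the per-device socket and
 * passing its memfd via SCM_RIGHTS. Minimal client-side sketch (not part
 * of this file; names illustrative, error handling elided):
 *
 *	int sk = socket(AF_UNIX, SOCK_STREAM, 0);
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	snprintf(a.sun_path, sizeof(a.sun_path),
 *		 "/run/ublk/ublkb%d.sock", dev_id);
 *	connect(sk, (struct sockaddr *)&a, sizeof(a));
 *
 *	char c = 0;
 *	struct iovec iov = { .iov_base = &c, .iov_len = 1 };
 *	union { char b[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
 *	struct msghdr m = { .msg_iov = &iov, .msg_iovlen = 1,
 *			    .msg_control = u.b, .msg_controllen = sizeof(u.b) };
 *	struct cmsghdr *cm;
 *
 *	memset(&u, 0, sizeof(u));
 *	cm = CMSG_FIRSTHDR(&m);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &memfd, sizeof(int));
 *	sendmsg(sk, &m, 0);
 *
 *	int32_t idx;
 *	recv(sk, &idx, sizeof(idx), 0);	// assigned index, or < 0 on error
 */
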
/*
 * Register a shared memory buffer: store the fd and its (single) mapping,
 * return the assigned index. The caller owns the mapping until this
 * succeeds; taking base/size as arguments avoids mapping the memfd twice.
 */
static int ublk_shmem_register(int shmem_fd, void *base, off_t size)
{
	int idx;

	if (shmem_count >= UBLK_BUF_MAX)
		return -1;

	if (size <= 0 || base == MAP_FAILED || !base)
		return -1;

	idx = shmem_count++;
	shmem_table[idx].fd = shmem_fd;
	shmem_table[idx].mmap_base = base;
	shmem_table[idx].size = size;

	ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n",
		 idx, shmem_fd, (size_t)size);
	return idx;
}

static void ublk_shmem_unregister_all(void)
{
	int i;

	for (i = 0; i < shmem_count; i++) {
		if (shmem_table[i].mmap_base) {
			munmap(shmem_table[i].mmap_base,
			       shmem_table[i].size);
			close(shmem_table[i].fd);
			shmem_table[i].mmap_base = NULL;
		}
	}
	shmem_count = 0;
}

static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size,
			     __u32 flags)
{
	struct ublk_shmem_buf_reg buf_reg = {
		.addr = (unsigned long)addr,
		.len = size,
		.flags = flags,
	};
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_REG_BUF,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (unsigned long)&buf_reg,
		.len = sizeof(buf_reg),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

/*
 * Handle one client connection: receive memfd, mmap it, register
 * the VA range with kernel, send back the assigned index.
 */
static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev)
{
	int client_fd, memfd, idx, ret;
	int32_t reply;
	off_t size;
	void *base;

	client_fd = accept(sock_fd, NULL, NULL);
	if (client_fd < 0)
		return;

	memfd = ublk_shmem_recv_fd(client_fd);
	if (memfd < 0) {
		reply = -1;
		goto out;
	}

	/* mmap the memfd in server address space */
	size = lseek(memfd, 0, SEEK_END);
	if (size <= 0) {
		reply = -1;
		close(memfd);
		goto out;
	}
	base = mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, memfd, 0);
	if (base == MAP_FAILED) {
		reply = -1;
		close(memfd);
		goto out;
	}

	/* Register server's VA range with kernel for PFN matching */
	ret = ublk_ctrl_reg_buf(dev, base, size, 0);
	if (ret < 0) {
		ublk_dbg(UBLK_DBG_DEV,
			 "shmem_zc: kernel reg failed %d\n", ret);
		munmap(base, size);
		close(memfd);
		reply = ret;
		goto out;
	}

	/* Store in table for I/O handling */
	idx = ublk_shmem_register(memfd, base, size);
	if (idx < 0) {
		/* table full: drop our mapping and fd */
		munmap(base, size);
		close(memfd);
	}
	reply = idx;
out:
	send(client_fd, &reply, sizeof(reply), 0);
	close(client_fd);
}

struct shmem_listener_info {
	int dev_id;
	int stop_efd;		/* eventfd to signal listener to stop */
	int sock_fd;		/* listener socket fd (output) */
	struct ublk_dev *dev;
};

/*
 * Socket listener thread: runs in the parent daemon context alongside
 * the I/O threads. Accepts shared memory registration requests from
 * clients via SCM_RIGHTS. Exits when stop_efd is signaled.
 */
static void *ublk_shmem_listener_fn(void *data)
{
	struct shmem_listener_info *info = data;
	struct pollfd pfds[2];

	info->sock_fd = ublk_shmem_sock_create(info->dev_id);
	if (info->sock_fd < 0)
		return NULL;

	pfds[0].fd = info->sock_fd;
	pfds[0].events = POLLIN;
	pfds[1].fd = info->stop_efd;
	pfds[1].events = POLLIN;

	while (1) {
		int ret = poll(pfds, 2, -1);

		if (ret < 0)
			break;

		/* Stop signal from parent */
		if (pfds[1].revents & POLLIN)
			break;

		/* Client connection */
		if (pfds[0].revents & POLLIN)
			ublk_shmem_handle_client(info->sock_fd, info->dev);
	}

	return NULL;
}

static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx,
				 struct ublk_dev *dev)
{
	int fd, idx, ret;
	struct stat st;
	void *base;

	fd = open(ctx->htlb_path, O_RDWR);
	if (fd < 0) {
		ublk_err("htlb: can't open %s\n", ctx->htlb_path);
		return -errno;
	}

	if (fstat(fd, &st) < 0 || st.st_size <= 0) {
		ublk_err("htlb: invalid file size\n");
		close(fd);
		return -EINVAL;
	}

	base = mmap(NULL, st.st_size,
		    ctx->rdonly_shmem_buf ? PROT_READ : PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, fd, 0);
	if (base == MAP_FAILED) {
		ublk_err("htlb: mmap failed\n");
		close(fd);
		return -ENOMEM;
	}

	ret = ublk_ctrl_reg_buf(dev, base, st.st_size,
			       ctx->rdonly_shmem_buf ? UBLK_SHMEM_BUF_READ_ONLY : 0);
	if (ret < 0) {
		ublk_err("htlb: reg_buf failed: %d\n", ret);
		munmap(base, st.st_size);
		close(fd);
		return ret;
	}

	idx = ublk_shmem_register(fd, base, st.st_size);
	if (idx < 0) {
		munmap(base, st.st_size);
		close(fd);
		return -ENOMEM;
	}

	ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n",
		 idx, (size_t)st.st_size);
	return 0;
}

static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
	struct shmem_listener_info linfo = {};
	struct ublk_thread_info *tinfo;
	unsigned long long extra_flags = 0;
	cpu_set_t *affinity_buf = NULL;
	unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
	uint64_t stop_val = 1;
	pthread_t listener;
	void *thread_ret;
	sem_t ready;
	int ret, i;

	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);

	tinfo = calloc(dev->nthreads, sizeof(struct ublk_thread_info));
	if (!tinfo)
		return -ENOMEM;

	sem_init(&ready, 0, 0);
	ret = ublk_dev_prep(ctx, dev);
	if (ret) {
		free(tinfo);
		return ret;
	}

	ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
	if (ret)
		goto fail;

	if (ublk_dev_batch_io(dev)) {
		q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map));
		if (!q_thread_map) {
			ret = -ENOMEM;
			goto fail;
		}
		ublk_batch_setup_map(q_thread_map, dev->nthreads,
				     dinfo->nr_hw_queues);
	}

	if (ctx->auto_zc_fallback)
		extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
	if (ctx->no_ublk_fixed_fd)
		extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;

	for (i = 0; i < dinfo->nr_hw_queues; i++) {
		dev->q[i].dev = dev;
		dev->q[i].q_id = i;

		ret = ublk_queue_init(&dev->q[i], extra_flags,
				      ctx->metadata_size);
		if (ret) {
			ublk_err("ublk dev %d queue %d init queue failed\n",
				 dinfo->dev_id, i);
			goto fail;
		}
	}

	for (i = 0; i < dev->nthreads; i++) {
		tinfo[i].dev = dev;
		tinfo[i].idx = i;
		tinfo[i].ready = &ready;
		tinfo[i].extra_flags = extra_flags;
		tinfo[i].q_thread_map = q_thread_map;

		/*
		 * If threads are not tied 1:1 to queues, setting thread
		 * affinity based on queue affinity makes little sense.
		 * However, thread CPU affinity has significant impact
		 * on performance, so to compare fairly, we'll still set
		 * thread CPU affinity based on queue affinity where
		 * possible.
		 */
		if (dev->nthreads == dinfo->nr_hw_queues)
			tinfo[i].affinity = &affinity_buf[i];
		pthread_create(&tinfo[i].thread, NULL,
				ublk_io_handler_fn,
				&tinfo[i]);
	}

	for (i = 0; i < dev->nthreads; i++)
		sem_wait(&ready);
	/* all threads have copied what they need; release the maps */
	free(affinity_buf);
	free(q_thread_map);
	affinity_buf = NULL;
	q_thread_map = NULL;

	/* everything is fine now, start us */
	if (ctx->recovery)
		ret = ublk_ctrl_end_user_recovery(dev, getpid());
	else {
		ublk_set_parameters(dev);
		ret = ublk_ctrl_start_dev(dev, getpid());
	}
	if (ret < 0) {
		ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
		/* stop device so that inflight uring_cmd can be cancelled */
		ublk_ctrl_stop_dev(dev);
		goto fail_start;
	}

	if (ctx->htlb_path) {
		ret = ublk_shmem_htlb_setup(ctx, dev);
		if (ret < 0) {
			ublk_err("htlb setup failed: %d\n", ret);
			ublk_ctrl_stop_dev(dev);
			goto fail_start;
		}
	}

	ublk_ctrl_get_info(dev);
	if (ctx->fg)
		ublk_ctrl_dump(dev);
	else
		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
fail_start:
	/*
	 * Wait for I/O threads to exit. While waiting, a listener
	 * thread accepts shared memory registration requests from
	 * clients via a per-device unix socket (SCM_RIGHTS fd passing).
	 */
	linfo.dev_id = dinfo->dev_id;
	linfo.dev = dev;
	linfo.stop_efd = eventfd(0, 0);
	if (linfo.stop_efd >= 0)
		pthread_create(&listener, NULL,
			       ublk_shmem_listener_fn, &linfo);

	for (i = 0; i < (int)dev->nthreads; i++)
		pthread_join(tinfo[i].thread, &thread_ret);

	/* Signal listener thread to stop and wait for it */
	if (linfo.stop_efd >= 0) {
		write(linfo.stop_efd, &stop_val, sizeof(stop_val));
		pthread_join(listener, NULL);
		close(linfo.stop_efd);
		ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd);
	}
	ublk_shmem_unregister_all();
 fail:
	for (i = 0; i < dinfo->nr_hw_queues; i++)
		ublk_queue_deinit(&dev->q[i]);
	ublk_dev_unprep(dev);
	/* released on both the success and the error paths */
	free(affinity_buf);
	free(q_thread_map);
	free(tinfo);
	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);

	return ret;
}

static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
{
#define EV_SIZE (sizeof(struct inotify_event))
#define EV_BUF_LEN (128 * (EV_SIZE + 16))
	struct pollfd pfd;
	int fd, wd;
	int ret = -EINVAL;
	const char *dev_name = basename(path);

	fd = inotify_init();
	if (fd < 0) {
		ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
		return fd;
	}

	wd = inotify_add_watch(fd, "/dev", evt_mask);
	if (wd == -1) {
		ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
		goto fail;
	}

	pfd.fd = fd;
	pfd.events = POLLIN;
	while (1) {
		int i = 0;
		char buffer[EV_BUF_LEN];

		ret = poll(&pfd, 1, 1000 * timeout);
		if (ret == -1) {
			ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
			goto rm_watch;
		} else if (ret == 0) {
			ublk_err("%s: poll inotify timeout\n", __func__);
			ret = -ETIMEDOUT;
			goto rm_watch;
		}

		ret = read(fd, buffer, EV_BUF_LEN);
		if (ret < 0) {
			ublk_err("%s: read inotify fd failed\n", __func__);
			goto rm_watch;
		}

		while (i < ret) {
			struct inotify_event *event = (struct inotify_event *)&buffer[i];

			ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
					__func__, event->mask, event->name);
			if (event->mask & evt_mask) {
				if (!strcmp(event->name, dev_name)) {
					ret = 0;
					goto rm_watch;
				}
			}
			i += EV_SIZE + event->len;
		}
	}
rm_watch:
	inotify_rm_watch(fd, wd);
fail:
	close(fd);
	return ret;
}

static int ublk_stop_io_daemon(const struct ublk_dev *dev)
{
	int daemon_pid = dev->dev_info.ublksrv_pid;
	int dev_id = dev->dev_info.dev_id;
	char ublkc[64];
	int ret = 0;

	if (daemon_pid < 0)
		return 0;

	/* daemon may be dead already */
	if (kill(daemon_pid, 0) < 0)
		goto wait;

	snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);

	/* ublk char device may be gone already */
	if (access(ublkc, F_OK) != 0)
		goto wait;

	/* Wait until the ublk char device is closed, i.e. the daemon has shut down */
	ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
	/* double check, since the device may have been closed before inotify started */
	if (ret == -ETIMEDOUT)
		ret = kill(daemon_pid, 0) < 0;
wait:
	waitpid(daemon_pid, NULL, 0);
	ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
			__func__, daemon_pid, dev_id, ret);

	return ret;
}

static int __cmd_dev_add(const struct dev_ctx *ctx)
{
	unsigned nthreads = ctx->nthreads;
	unsigned nr_queues = ctx->nr_hw_queues;
	const char *tgt_type = ctx->tgt_type;
	unsigned depth = ctx->queue_depth;
	__u64 features;
	const struct ublk_tgt_ops *ops;
	struct ublksrv_ctrl_dev_info *info;
	struct ublk_dev *dev = NULL;
	int dev_id = ctx->dev_id;
	int ret, i;

	ops = ublk_find_tgt(tgt_type);
	if (!ops) {
		ublk_err("%s: no such tgt type, type %s\n",
				__func__, tgt_type);
		ret = -ENODEV;
		goto fail;
	}

	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
		ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
				__func__, nr_queues, depth);
		ret = -EINVAL;
		goto fail;
	}

	/* default to 1:1 threads:queues if nthreads is unspecified */
	if (!nthreads)
		nthreads = nr_queues;

	if (nthreads > UBLK_MAX_THREADS) {
		ublk_err("%s: %u is too many threads (max %u)\n",
				__func__, nthreads, UBLK_MAX_THREADS);
		ret = -EINVAL;
		goto fail;
	}

	if (nthreads != nr_queues && (!ctx->per_io_tasks &&
				!(ctx->flags & UBLK_F_BATCH_IO))) {
		ublk_err("%s: threads %u must be same as queues %u if "
			"not using per_io_tasks\n",
			__func__, nthreads, nr_queues);
		ret = -EINVAL;
		goto fail;
	}

	dev = ublk_ctrl_init();
	if (!dev) {
		ublk_err("%s: can't alloc dev id %d, type %s\n",
				__func__, dev_id, tgt_type);
		ret = -ENOMEM;
		goto fail;
	}

	/* a failing GET_FEATURES means the kernel doesn't support it */
	ret = ublk_ctrl_get_features(dev, &features);
	if (ret < 0) {
		ret = -EINVAL;
		goto fail;
	}

	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
		ret = -ENOTSUP;
		goto fail;
	}

	info = &dev->dev_info;
	info->dev_id = ctx->dev_id;
	info->nr_hw_queues = nr_queues;
	info->queue_depth = depth;
	info->flags = ctx->flags;
	if ((features & UBLK_F_QUIESCE) &&
			(info->flags & UBLK_F_USER_RECOVERY))
		info->flags |= UBLK_F_QUIESCE;
	dev->nthreads = nthreads;
	dev->per_io_tasks = ctx->per_io_tasks;
	dev->tgt.ops = ops;
	dev->tgt.sq_depth = depth;
	dev->tgt.cq_depth = depth;

	for (i = 0; i < MAX_BACK_FILES; i++) {
		if (ctx->files[i]) {
			strcpy(dev->tgt.backing_file[i], ctx->files[i]);
			dev->tgt.nr_backing_files++;
		}
	}

	if (ctx->recovery)
		ret = ublk_ctrl_start_user_recovery(dev);
	else
		ret = ublk_ctrl_add_dev(dev);
	if (ret < 0) {
		ublk_err("%s: can't add dev id %d, type %s ret %d\n",
				__func__, dev_id, tgt_type, ret);
		goto fail;
	}

	ret = ublk_start_daemon(ctx, dev);
	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
	if (ret < 0)
		ublk_ctrl_del_dev(dev);

fail:
	if (ret < 0)
		ublk_send_dev_event(ctx, dev, -1);
	if (dev)
		ublk_ctrl_deinit(dev);
	return ret;
}

static int __cmd_dev_list(struct dev_ctx *ctx);

static int cmd_dev_add(struct dev_ctx *ctx)
{
	int res;

	if (ctx->fg)
		goto run;

	ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
	if (ctx->_shmid < 0) {
		ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
	if (ctx->shadow_dev == (struct ublk_dev *)-1) {
		ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->_evtfd = eventfd(0, 0);
	if (ctx->_evtfd < 0) {
		ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
		exit(-1);
	}

	res = fork();
	if (res == 0) {
		int res2;

		setsid();
		res2 = fork();
		if (res2 == 0) {
			/* prepare for detaching */
			close(STDIN_FILENO);
			close(STDOUT_FILENO);
			close(STDERR_FILENO);
run:
			res = __cmd_dev_add(ctx);
			return res;
		} else {
			/* detached from the foreground task */
			exit(EXIT_SUCCESS);
		}
	} else if (res > 0) {
		uint64_t id;
		int exit_code = EXIT_FAILURE;

		res = read(ctx->_evtfd, &id, sizeof(id));
		close(ctx->_evtfd);
		if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
			ctx->dev_id = id - 1;
			if (__cmd_dev_list(ctx) >= 0)
				exit_code = EXIT_SUCCESS;
		}
		shmdt(ctx->shadow_dev);
		shmctl(ctx->_shmid, IPC_RMID, NULL);
		/* wait for child and detach from it */
		wait(NULL);
		if (exit_code == EXIT_FAILURE)
			ublk_err("%s: command failed\n", __func__);
		exit(exit_code);
	} else {
		exit(EXIT_FAILURE);
	}
}

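/*
 * cmd_dev_add() above uses the classic double fork to daemonize: the
 * first child calls setsid() to drop the controlling terminal, the
 * grandchild becomes the ublk daemon, and the intermediate process exits
 * immediately so the daemon is reparented to init. The parent learns the
 * outcome through the eventfd/shared-memory handshake (see
 * ublk_send_dev_event()).
 */
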
__cmd_dev_del(struct dev_ctx * ctx)1817 static int __cmd_dev_del(struct dev_ctx *ctx)
1818 {
1819 	int number = ctx->dev_id;
1820 	struct ublk_dev *dev;
1821 	int ret;
1822 
1823 	dev = ublk_ctrl_init();
1824 	dev->dev_info.dev_id = number;
1825 
1826 	ret = ublk_ctrl_get_info(dev);
1827 	if (ret < 0)
1828 		goto fail;
1829 
1830 	ret = ublk_ctrl_stop_dev(dev);
1831 	if (ret < 0)
1832 		ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);
1833 
1834 	ret = ublk_stop_io_daemon(dev);
1835 	if (ret < 0)
1836 		ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
1837 				__func__, dev->dev_info.ublksrv_pid, number, ret);
1838 	ublk_ctrl_del_dev(dev);
1839 fail:
1840 	ublk_ctrl_deinit(dev);
1841 
1842 	return (ret >= 0) ? 0 : ret;
1843 }
1844 
cmd_dev_del(struct dev_ctx * ctx)1845 static int cmd_dev_del(struct dev_ctx *ctx)
1846 {
1847 	int i;
1848 
1849 	if (ctx->dev_id >= 0 || !ctx->all)
1850 		return __cmd_dev_del(ctx);
1851 
1852 	for (i = 0; i < 255; i++) {
1853 		ctx->dev_id = i;
1854 		__cmd_dev_del(ctx);
1855 	}
1856 	return 0;
1857 }
1858 
cmd_dev_stop(struct dev_ctx * ctx)1859 static int cmd_dev_stop(struct dev_ctx *ctx)
1860 {
1861 	int number = ctx->dev_id;
1862 	struct ublk_dev *dev;
1863 	int ret;
1864 
1865 	if (number < 0) {
1866 		ublk_err("%s: device id is required\n", __func__);
1867 		return -EINVAL;
1868 	}
1869 
1870 	dev = ublk_ctrl_init();
1871 	dev->dev_info.dev_id = number;
1872 
1873 	ret = ublk_ctrl_get_info(dev);
1874 	if (ret < 0)
1875 		goto fail;
1876 
	if (ctx->safe_stop) {
		ret = ublk_ctrl_try_stop_dev(dev);
		if (ret < 0)
			ublk_err("%s: try_stop dev %d failed ret %d\n",
					__func__, number, ret);
	} else {
		ret = ublk_ctrl_stop_dev(dev);
		if (ret < 0)
			ublk_err("%s: stop dev %d failed ret %d\n",
					__func__, number, ret);
	}

fail:
	ublk_ctrl_deinit(dev);

	return ret;
}

static int __cmd_dev_list(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret;

	if (!dev)
		return -ENODEV;

	dev->dev_info.dev_id = ctx->dev_id;

	ret = ublk_ctrl_get_info(dev);
	if (ret < 0) {
		if (ctx->logging)
			ublk_err("%s: can't get dev info from %d: %d\n",
					__func__, ctx->dev_id, ret);
	} else {
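		/*
		 * When called from the parent right after "add", pull the
		 * queue state published by the daemon through shared memory
		 * so the dump can show per-queue details.
		 */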
		if (ctx->shadow_dev)
			memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));

		ublk_ctrl_dump(dev);
	}

	ublk_ctrl_deinit(dev);

	return ret;
}

static int cmd_dev_list(struct dev_ctx *ctx)
{
	int i;

	if (ctx->dev_id >= 0 || !ctx->all)
		return __cmd_dev_list(ctx);

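	/* -a: probe all ids with logging off, since most ids won't exist */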
	ctx->logging = false;
	for (i = 0; i < 255; i++) {
		ctx->dev_id = i;
		__cmd_dev_list(ctx);
	}
	return 0;
}

static int cmd_dev_get_features(void)
{
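	/*
	 * FEAT_NAME() places each flag's name at its bit index via a
	 * designated initializer, so the table below can be indexed
	 * directly by bit number.
	 */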
#define const_ilog2(x) (63 - __builtin_clzll(x))
#define FEAT_NAME(f) [const_ilog2(f)] = #f
	static const char *feat_map[] = {
		FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
		FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
		FEAT_NAME(UBLK_F_NEED_GET_DATA),
		FEAT_NAME(UBLK_F_USER_RECOVERY),
		FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE),
		FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV),
		FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE),
		FEAT_NAME(UBLK_F_USER_COPY),
		FEAT_NAME(UBLK_F_ZONED),
		FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO),
		FEAT_NAME(UBLK_F_UPDATE_SIZE),
		FEAT_NAME(UBLK_F_AUTO_BUF_REG),
		FEAT_NAME(UBLK_F_QUIESCE),
		FEAT_NAME(UBLK_F_PER_IO_DAEMON),
		FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
		FEAT_NAME(UBLK_F_INTEGRITY),
		FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
		FEAT_NAME(UBLK_F_BATCH_IO),
		FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
		FEAT_NAME(UBLK_F_SHMEM_ZC),
	};
	struct ublk_dev *dev;
	__u64 features = 0;
	int ret;

	dev = ublk_ctrl_init();
	if (!dev) {
		fprintf(stderr, "ublk_ctrl_init failed\n");
		return -EOPNOTSUPP;
	}

	ret = ublk_ctrl_get_features(dev, &features);
	if (!ret) {
		int i;

		printf("ublk_drv features: 0x%llx\n", features);

		for (i = 0; i < sizeof(features) * 8; i++) {
			const char *feat;

			if (!((1ULL << i) & features))
				continue;
			/* guard against gaps in the sparse name table */
			if (i < ARRAY_SIZE(feat_map) && feat_map[i])
				feat = feat_map[i];
			else
				feat = "unknown";
			printf("0x%-16llx: %s\n", 1ULL << i, feat);
		}
	}

	ublk_ctrl_deinit(dev);
	return ret;
}

static int cmd_dev_update_size(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	struct ublk_params p;
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided\n");
		goto out;
	}

	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		goto out;
	}

	if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
		ublk_err("size isn't aligned with logical block size\n");
		ret = -EINVAL;
		goto out;
	}

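	/* the control command takes the new size in 512-byte sectors */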
	ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static int cmd_dev_quiesce(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided for quiesce\n");
		goto out;
	}
	dev->dev_info.dev_id = ctx->dev_id;
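	/* wait up to 10000 (presumably milliseconds) for the quiesce */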
	ret = ublk_ctrl_quiesce_dev(dev, 10000);

out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static void __cmd_create_help(char *exe, bool recovery)
{
	int i;

	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
			exe, recovery ? "recover" : "add");
	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
	printf("\t[-e 0|1] [-i 0|1] [--no_ublk_fixed_fd]\n");
	printf("\t[--nthreads threads] [--per_io_tasks]\n");
	printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
		 "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
	printf("\t[--batch|-b] [--no_auto_part_scan]\n");
	printf("\t[target options] [backfile1] [backfile2] ...\n");
	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
	printf("\tdefault: nthreads=nr_queues\n");

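	/* let each target append its own usage lines */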
	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
		const struct ublk_tgt_ops *ops = tgt_ops_list[i];

		if (ops->usage)
			ops->usage(ops);
	}
}

static void cmd_add_help(char *exe)
{
	__cmd_create_help(exe, false);
	printf("\n");
}

static void cmd_recover_help(char *exe)
{
	__cmd_create_help(exe, true);
	printf("\tPlease provide the exact command line used to create this device, with the real dev_id\n");
	printf("\n");
}

static int cmd_dev_help(char *exe)
{
	cmd_add_help(exe);
	cmd_recover_help(exe);

	printf("%s del [-n dev_id | -a]\n", exe);
	printf("\t -a delete all devices, -n delete the specified device\n\n");
	printf("%s stop -n dev_id [--safe]\n", exe);
	printf("\t --safe only stop if the device has no active openers\n\n");
	printf("%s list [-n dev_id | -a]\n", exe);
	printf("\t -a list all devices, -n list the specified device, default -a\n\n");
	printf("%s features\n", exe);
	printf("%s update_size -n dev_id -s|--size size_in_bytes\n", exe);
	printf("%s quiesce -n dev_id\n", exe);
	return 0;
}

int main(int argc, char *argv[])
{
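	/* has_arg column: 0 = no_argument, 1 = required_argument */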
	static const struct option longopts[] = {
		{ "all",		0,	NULL, 'a' },
		{ "type",		1,	NULL, 't' },
		{ "number",		1,	NULL, 'n' },
		{ "queues",		1,	NULL, 'q' },
		{ "depth",		1,	NULL, 'd' },
		{ "debug_mask",		1,	NULL,  0  },
		{ "quiet",		0,	NULL,  0  },
		{ "zero_copy",		0,	NULL, 'z' },
		{ "foreground",		0,	NULL,  0  },
		{ "recovery",		1,	NULL, 'r' },
		{ "recovery_fail_io",	1,	NULL, 'e' },
		{ "recovery_reissue",	1,	NULL, 'i' },
		{ "get_data",		1,	NULL, 'g' },
		{ "auto_zc",		0,	NULL,  0  },
		{ "auto_zc_fallback",	0,	NULL,  0  },
		{ "user_copy",		0,	NULL, 'u' },
		{ "size",		1,	NULL, 's' },
		{ "nthreads",		1,	NULL,  0  },
		{ "per_io_tasks",	0,	NULL,  0  },
		{ "no_ublk_fixed_fd",	0,	NULL,  0  },
		{ "integrity_capable",	0,	NULL,  0  },
		{ "integrity_reftag",	0,	NULL,  0  },
		{ "metadata_size",	1,	NULL,  0  },
		{ "pi_offset",		1,	NULL,  0  },
		{ "csum_type",		1,	NULL,  0  },
		{ "tag_size",		1,	NULL,  0  },
		{ "safe",		0,	NULL,  0  },
		{ "batch",		0,	NULL, 'b' },
		{ "no_auto_part_scan",	0,	NULL,  0  },
		{ "shmem_zc",		0,	NULL,  0  },
		{ "htlb",		1,	NULL,  0  },
		{ "rdonly_shmem_buf",	0,	NULL,  0  },
		{ 0, 0, 0, 0 }
	};
	const struct ublk_tgt_ops *ops = NULL;
	int option_idx, opt;
	const char *cmd = argv[1];
	struct dev_ctx ctx = {
		._evtfd		=	-1,
		.queue_depth	=	128,
		.nr_hw_queues	=	2,
		.dev_id		=	-1,
		.tgt_type	=	"unknown",
		.csum_type	=	LBMD_PI_CSUM_NONE,
	};
	int ret = -EINVAL, i;
	int tgt_argc = 1;
	char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
	int value;

	if (argc == 1)
		return ret;

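	/*
	 * argv[1] is the subcommand, so start option parsing at argv[2];
	 * opterr = 0 keeps getopt quiet so unknown (target) options fall
	 * through to the '?' case below.
	 */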
	opterr = 0;
	optind = 2;
	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub",
				  longopts, &option_idx)) != -1) {
		switch (opt) {
		case 'a':
			ctx.all = 1;
			break;
		case 'b':
			ctx.flags |= UBLK_F_BATCH_IO;
			break;
		case 'n':
			ctx.dev_id = strtol(optarg, NULL, 10);
			break;
		case 't':
			if (strlen(optarg) < sizeof(ctx.tgt_type))
				strcpy(ctx.tgt_type, optarg);
			break;
		case 'q':
			ctx.nr_hw_queues = strtol(optarg, NULL, 10);
			break;
		case 'd':
			ctx.queue_depth = strtol(optarg, NULL, 10);
			break;
		case 'z':
			ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY;
			break;
		case 'r':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY;
			break;
		case 'e':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
			break;
		case 'i':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
			break;
		case 'g':
			ctx.flags |= UBLK_F_NEED_GET_DATA;
			break;
		case 'u':
			ctx.flags |= UBLK_F_USER_COPY;
			break;
		case 's':
			ctx.size = strtoull(optarg, NULL, 10);
			break;
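		/* long-only options are registered with val 0 in longopts */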
		case 0:
			if (!strcmp(longopts[option_idx].name, "debug_mask"))
				ublk_dbg_mask = strtol(optarg, NULL, 16);
			if (!strcmp(longopts[option_idx].name, "quiet"))
				ublk_dbg_mask = 0;
			if (!strcmp(longopts[option_idx].name, "foreground"))
				ctx.fg = 1;
			if (!strcmp(longopts[option_idx].name, "auto_zc"))
				ctx.flags |= UBLK_F_AUTO_BUF_REG;
			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
				ctx.auto_zc_fallback = 1;
			if (!strcmp(longopts[option_idx].name, "nthreads"))
				ctx.nthreads = strtol(optarg, NULL, 10);
			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
				ctx.per_io_tasks = 1;
			if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
				ctx.no_ublk_fixed_fd = 1;
			if (!strcmp(longopts[option_idx].name, "integrity_capable"))
				ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY;
			if (!strcmp(longopts[option_idx].name, "integrity_reftag"))
				ctx.integrity_flags |= LBMD_PI_CAP_REFTAG;
			if (!strcmp(longopts[option_idx].name, "metadata_size"))
				ctx.metadata_size = strtoul(optarg, NULL, 0);
			if (!strcmp(longopts[option_idx].name, "pi_offset"))
				ctx.pi_offset = strtoul(optarg, NULL, 0);
			if (!strcmp(longopts[option_idx].name, "csum_type")) {
				if (!strcmp(optarg, "ip")) {
					ctx.csum_type = LBMD_PI_CSUM_IP;
				} else if (!strcmp(optarg, "t10dif")) {
					ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF;
				} else if (!strcmp(optarg, "nvme")) {
					ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME;
				} else {
					ublk_err("invalid csum_type: %s\n", optarg);
					return -EINVAL;
				}
			}
			if (!strcmp(longopts[option_idx].name, "tag_size"))
				ctx.tag_size = strtoul(optarg, NULL, 0);
			if (!strcmp(longopts[option_idx].name, "safe"))
				ctx.safe_stop = 1;
			if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
				ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
			if (!strcmp(longopts[option_idx].name, "shmem_zc"))
				ctx.flags |= UBLK_F_SHMEM_ZC;
			if (!strcmp(longopts[option_idx].name, "htlb"))
				ctx.htlb_path = strdup(optarg);
			if (!strcmp(longopts[option_idx].name, "rdonly_shmem_buf"))
				ctx.rdonly_shmem_buf = 1;
			break;
		case '?':
			/*
			 * Unrecognized options are collected for the target,
			 * and every target option must take an argument.
			 */
			if (!argv[optind] || argv[optind][0] == '-' ||
					argv[optind - 1][0] != '-') {
				fprintf(stderr, "every target option requires an argument: %s %s\n",
						argv[optind - 1],
						argv[optind] ? argv[optind] : "");
				exit(EXIT_FAILURE);
			}

			if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
				tgt_argv[tgt_argc++] = argv[optind - 1];
				tgt_argv[tgt_argc++] = argv[optind];
			} else {
				fprintf(stderr, "too many target options\n");
				exit(EXIT_FAILURE);
			}
			optind += 1;
			break;
		}
	}

	if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) {
		ublk_err("per_io_tasks and F_BATCH_IO conflict\n");
		return -EINVAL;
	}

	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
	if (ctx.auto_zc_fallback &&
	    !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
		    (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
		ublk_err("%s: auto_zc_fallback requires both F_AUTO_BUF_REG "
				"and F_SUPPORT_ZERO_COPY to be enabled\n",
					__func__);
		return -EINVAL;
	}

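	/*
	 * The copy modes are mutually exclusive: count how many of
	 * NEED_GET_DATA, USER_COPY, plain zero-copy, plain auto-buf-reg and
	 * auto_zc_fallback are selected, and reject more than one.
	 */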
	if (!!(ctx.flags & UBLK_F_NEED_GET_DATA) +
	    !!(ctx.flags & UBLK_F_USER_COPY) +
	    (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY && !ctx.auto_zc_fallback) +
	    (ctx.flags & UBLK_F_AUTO_BUF_REG && !ctx.auto_zc_fallback) +
	    ctx.auto_zc_fallback > 1) {
		fprintf(stderr, "too many data copy modes specified\n");
		return -EINVAL;
	}

	if (ctx.metadata_size) {
		if (!(ctx.flags & UBLK_F_USER_COPY)) {
			ublk_err("integrity requires user_copy\n");
			return -EINVAL;
		}

		ctx.flags |= UBLK_F_INTEGRITY;
	} else if (ctx.integrity_flags ||
		   ctx.pi_offset ||
		   ctx.csum_type != LBMD_PI_CSUM_NONE ||
		   ctx.tag_size) {
		ublk_err("integrity parameters require metadata_size\n");
		return -EINVAL;
	}

	if ((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
			(ctx.flags & UBLK_F_BATCH_IO) &&
			(ctx.nthreads > ctx.nr_hw_queues)) {
		ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n");
		return -EINVAL;
	}

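	/* remaining positional arguments are backing files for the target */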
	i = optind;
	while (i < argc && ctx.nr_files < MAX_BACK_FILES)
		ctx.files[ctx.nr_files++] = argv[i++];

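	/*
	 * Hand any collected target options to the target's own parser;
	 * resetting optind to 0 makes getopt_long() reinitialize its state
	 * for the fresh tgt_argv scan.
	 */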
	ops = ublk_find_tgt(ctx.tgt_type);
	if (ops && ops->parse_cmd_line) {
		optind = 0;

		tgt_argv[0] = ctx.tgt_type;
		ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
	}

	if (!strcmp(cmd, "add"))
		ret = cmd_dev_add(&ctx);
	else if (!strcmp(cmd, "recover")) {
		if (ctx.dev_id < 0) {
			fprintf(stderr, "device id isn't provided for recovery\n");
			ret = -EINVAL;
		} else {
			ctx.recovery = 1;
			ret = cmd_dev_add(&ctx);
		}
	} else if (!strcmp(cmd, "del"))
		ret = cmd_dev_del(&ctx);
	else if (!strcmp(cmd, "stop"))
		ret = cmd_dev_stop(&ctx);
	else if (!strcmp(cmd, "list")) {
		ctx.all = 1;
		ret = cmd_dev_list(&ctx);
	} else if (!strcmp(cmd, "help"))
		ret = cmd_dev_help(argv[0]);
	else if (!strcmp(cmd, "features"))
		ret = cmd_dev_get_features();
	else if (!strcmp(cmd, "update_size"))
		ret = cmd_dev_update_size(&ctx);
	else if (!strcmp(cmd, "quiesce"))
		ret = cmd_dev_quiesce(&ctx);
	else
		cmd_dev_help(argv[0]);

	return ret;
}