xref: /linux/tools/testing/selftests/ublk/kublk.c (revision ab93e0dd72c37d378dd936f031ffb83ff2bd87ce)
1 /* SPDX-License-Identifier: MIT */
2 /*
3  * Description: uring_cmd based ublk
4  */
5 
6 #include "kublk.h"
7 
8 #define MAX_NR_TGT_ARG 	64
9 
10 unsigned int ublk_dbg_mask = UBLK_LOG;
11 static const struct ublk_tgt_ops *tgt_ops_list[] = {
12 	&null_tgt_ops,
13 	&loop_tgt_ops,
14 	&stripe_tgt_ops,
15 	&fault_inject_tgt_ops,
16 };
17 
18 static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
19 {
20 	int i;
21 
22 	if (name == NULL)
23 		return NULL;
24 
25 	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
26 		if (strcmp(tgt_ops_list[i]->name, name) == 0)
27 			return tgt_ops_list[i];
28 	return NULL;
29 }
30 
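/*
 * Set up an io_uring instance with an explicitly sized CQ: both the
 * control ring and the per-thread I/O rings below go through here,
 * only the depths and setup flags differ.
 */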
31 static inline int ublk_setup_ring(struct io_uring *r, int depth,
32 		int cq_depth, unsigned flags)
33 {
34 	struct io_uring_params p;
35 
36 	memset(&p, 0, sizeof(p));
37 	p.flags = flags | IORING_SETUP_CQSIZE;
38 	p.cq_entries = cq_depth;
39 
40 	return io_uring_queue_init_params(depth, r, &p);
41 }
42 
43 static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
44 		struct io_uring_sqe *sqe,
45 		struct ublk_ctrl_cmd_data *data)
46 {
47 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
48 	struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);
49 
50 	sqe->fd = dev->ctrl_fd;
51 	sqe->opcode = IORING_OP_URING_CMD;
52 	sqe->ioprio = 0;
53 
54 	if (data->flags & CTRL_CMD_HAS_BUF) {
55 		cmd->addr = data->addr;
56 		cmd->len = data->len;
57 	}
58 
59 	if (data->flags & CTRL_CMD_HAS_DATA)
60 		cmd->data[0] = data->data[0];
61 
62 	cmd->dev_id = info->dev_id;
63 	cmd->queue_id = -1;
64 
65 	ublk_set_sqe_cmd_op(sqe, data->cmd_op);
66 
67 	io_uring_sqe_set_data(sqe, cmd);
68 }
69 
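/*
 * Issue one control command synchronously: encode it into an SQE,
 * submit it on the control ring and wait for its single CQE. The
 * CQE result is returned to the caller.
 */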
70 static int __ublk_ctrl_cmd(struct ublk_dev *dev,
71 		struct ublk_ctrl_cmd_data *data)
72 {
73 	struct io_uring_sqe *sqe;
74 	struct io_uring_cqe *cqe;
75 	int ret = -EINVAL;
76 
77 	sqe = io_uring_get_sqe(&dev->ring);
78 	if (!sqe) {
79 		ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
80 		return ret;
81 	}
82 
83 	ublk_ctrl_init_cmd(dev, sqe, data);
84 
85 	ret = io_uring_submit(&dev->ring);
86 	if (ret < 0) {
87 		ublk_err("uring submit ret %d\n", ret);
88 		return ret;
89 	}
90 
91 	ret = io_uring_wait_cqe(&dev->ring, &cqe);
92 	if (ret < 0) {
93 		ublk_err("wait cqe: %s\n", strerror(-ret));
94 		return ret;
95 	}
96 	io_uring_cqe_seen(&dev->ring, cqe);
97 
98 	return cqe->res;
99 }
100 
101 static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
102 {
103 	struct ublk_ctrl_cmd_data data = {
104 		.cmd_op	= UBLK_U_CMD_STOP_DEV,
105 	};
106 
107 	return __ublk_ctrl_cmd(dev, &data);
108 }
109 
110 static int ublk_ctrl_start_dev(struct ublk_dev *dev,
111 		int daemon_pid)
112 {
113 	struct ublk_ctrl_cmd_data data = {
114 		.cmd_op	= UBLK_U_CMD_START_DEV,
115 		.flags	= CTRL_CMD_HAS_DATA,
116 	};
117 
118 	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
119 
120 	return __ublk_ctrl_cmd(dev, &data);
121 }
122 
123 static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
124 {
125 	struct ublk_ctrl_cmd_data data = {
126 		.cmd_op	= UBLK_U_CMD_START_USER_RECOVERY,
127 	};
128 
129 	return __ublk_ctrl_cmd(dev, &data);
130 }
131 
132 static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
133 {
134 	struct ublk_ctrl_cmd_data data = {
135 		.cmd_op	= UBLK_U_CMD_END_USER_RECOVERY,
136 		.flags	= CTRL_CMD_HAS_DATA,
137 	};
138 
139 	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
140 
141 	return __ublk_ctrl_cmd(dev, &data);
142 }
143 
144 static int ublk_ctrl_add_dev(struct ublk_dev *dev)
145 {
146 	struct ublk_ctrl_cmd_data data = {
147 		.cmd_op	= UBLK_U_CMD_ADD_DEV,
148 		.flags	= CTRL_CMD_HAS_BUF,
149 		.addr = (__u64) (uintptr_t) &dev->dev_info,
150 		.len = sizeof(struct ublksrv_ctrl_dev_info),
151 	};
152 
153 	return __ublk_ctrl_cmd(dev, &data);
154 }
155 
156 static int ublk_ctrl_del_dev(struct ublk_dev *dev)
157 {
158 	struct ublk_ctrl_cmd_data data = {
159 		.cmd_op = UBLK_U_CMD_DEL_DEV,
160 		.flags = 0,
161 	};
162 
163 	return __ublk_ctrl_cmd(dev, &data);
164 }
165 
166 static int ublk_ctrl_get_info(struct ublk_dev *dev)
167 {
168 	struct ublk_ctrl_cmd_data data = {
169 		.cmd_op	= UBLK_U_CMD_GET_DEV_INFO,
170 		.flags	= CTRL_CMD_HAS_BUF,
171 		.addr = (__u64) (uintptr_t) &dev->dev_info,
172 		.len = sizeof(struct ublksrv_ctrl_dev_info),
173 	};
174 
175 	return __ublk_ctrl_cmd(dev, &data);
176 }
177 
178 static int ublk_ctrl_set_params(struct ublk_dev *dev,
179 		struct ublk_params *params)
180 {
181 	struct ublk_ctrl_cmd_data data = {
182 		.cmd_op	= UBLK_U_CMD_SET_PARAMS,
183 		.flags	= CTRL_CMD_HAS_BUF,
184 		.addr = (__u64) (uintptr_t) params,
185 		.len = sizeof(*params),
186 	};
187 	params->len = sizeof(*params);
188 	return __ublk_ctrl_cmd(dev, &data);
189 }
190 
191 static int ublk_ctrl_get_params(struct ublk_dev *dev,
192 		struct ublk_params *params)
193 {
194 	struct ublk_ctrl_cmd_data data = {
195 		.cmd_op	= UBLK_U_CMD_GET_PARAMS,
196 		.flags	= CTRL_CMD_HAS_BUF,
197 		.addr = (__u64)params,
198 		.len = sizeof(*params),
199 	};
200 
201 	params->len = sizeof(*params);
202 
203 	return __ublk_ctrl_cmd(dev, &data);
204 }
205 
206 static int ublk_ctrl_get_features(struct ublk_dev *dev,
207 		__u64 *features)
208 {
209 	struct ublk_ctrl_cmd_data data = {
210 		.cmd_op	= UBLK_U_CMD_GET_FEATURES,
211 		.flags	= CTRL_CMD_HAS_BUF,
212 		.addr = (__u64) (uintptr_t) features,
213 		.len = sizeof(*features),
214 	};
215 
216 	return __ublk_ctrl_cmd(dev, &data);
217 }
218 
219 static int ublk_ctrl_update_size(struct ublk_dev *dev,
220 		__u64 nr_sects)
221 {
222 	struct ublk_ctrl_cmd_data data = {
223 		.cmd_op	= UBLK_U_CMD_UPDATE_SIZE,
224 		.flags	= CTRL_CMD_HAS_DATA,
225 	};
226 
227 	data.data[0] = nr_sects;
228 	return __ublk_ctrl_cmd(dev, &data);
229 }
230 
231 static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
232 				 unsigned int timeout_ms)
233 {
234 	struct ublk_ctrl_cmd_data data = {
235 		.cmd_op	= UBLK_U_CMD_QUIESCE_DEV,
236 		.flags	= CTRL_CMD_HAS_DATA,
237 	};
238 
239 	data.data[0] = timeout_ms;
240 	return __ublk_ctrl_cmd(dev, &data);
241 }
242 
243 static const char *ublk_dev_state_desc(struct ublk_dev *dev)
244 {
245 	switch (dev->dev_info.state) {
246 	case UBLK_S_DEV_DEAD:
247 		return "DEAD";
248 	case UBLK_S_DEV_LIVE:
249 		return "LIVE";
250 	case UBLK_S_DEV_QUIESCED:
251 		return "QUIESCED";
252 	default:
253 		return "UNKNOWN";
254 	};
255 }
256 
257 static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
258 {
259 	unsigned done = 0;
260 	int i;
261 
262 	for (i = 0; i < CPU_SETSIZE; i++) {
263 		if (CPU_ISSET(i, set))
264 			done += snprintf(&buf[done], len - done, "%d ", i);
265 	}
266 }
267 
268 static void ublk_adjust_affinity(cpu_set_t *set)
269 {
270 	int j, updated = 0;
271 
272 	/*
273 	 * Just keep the 1st CPU now.
274 	 *
275 	 * In the future, automatic affinity selection can be tried.
276 	 */
277 	for (j = 0; j < CPU_SETSIZE; j++) {
278 		if (CPU_ISSET(j, set)) {
279 			if (!updated) {
280 				updated = 1;
281 				continue;
282 			}
283 			CPU_CLR(j, set);
284 		}
285 	}
286 }
287 
288 /* Caller must free the allocated buffer */
289 static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
290 {
291 	struct ublk_ctrl_cmd_data data = {
292 		.cmd_op	= UBLK_U_CMD_GET_QUEUE_AFFINITY,
293 		.flags	= CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
294 	};
295 	cpu_set_t *buf;
296 	int i, ret;
297 
298 	buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
299 	if (!buf)
300 		return -ENOMEM;
301 
302 	for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
303 		data.data[0] = i;
304 		data.len = sizeof(cpu_set_t);
305 		data.addr = (__u64)&buf[i];
306 
307 		ret = __ublk_ctrl_cmd(ctrl_dev, &data);
308 		if (ret < 0) {
309 			free(buf);
310 			return ret;
311 		}
312 		ublk_adjust_affinity(&buf[i]);
313 	}
314 
315 	*ptr_buf = buf;
316 	return 0;
317 }
318 
319 static void ublk_ctrl_dump(struct ublk_dev *dev)
320 {
321 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
322 	struct ublk_params p;
323 	cpu_set_t *affinity;
324 	int ret;
325 
326 	ret = ublk_ctrl_get_params(dev, &p);
327 	if (ret < 0) {
328 		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
329 		return;
330 	}
331 
332 	ret = ublk_ctrl_get_affinity(dev, &affinity);
333 	if (ret < 0) {
334 		ublk_err("failed to get affinity %m\n");
335 		return;
336 	}
337 
338 	ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
339 			info->dev_id, info->nr_hw_queues, info->queue_depth,
340 			1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
341 	ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
342 			info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
343 			ublk_dev_state_desc(dev));
344 
345 	if (affinity) {
346 		char buf[512];
347 		int i;
348 
349 		for (i = 0; i < info->nr_hw_queues; i++) {
350 			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
351 			printf("\tqueue %u: affinity(%s)\n",
352 					i, buf);
353 		}
354 		free(affinity);
355 	}
356 
357 	fflush(stdout);
358 }
359 
360 static void ublk_ctrl_deinit(struct ublk_dev *dev)
361 {
362 	close(dev->ctrl_fd);
363 	free(dev);
364 }
365 
366 static struct ublk_dev *ublk_ctrl_init(void)
367 {
368 	struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
369 	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
370 	int ret;
371 
372 	dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
373 	if (dev->ctrl_fd < 0) {
374 		free(dev);
375 		return NULL;
376 	}
377 
378 	info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;
379 
380 	ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
381 			UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
382 	if (ret < 0) {
383 		ublk_err("queue_init: %s\n", strerror(-ret));
384 		free(dev);
385 		return NULL;
386 	}
387 	dev->nr_fds = 1;
388 
389 	return dev;
390 }
391 
392 static int __ublk_queue_cmd_buf_sz(unsigned depth)
393 {
394 	int size = depth * sizeof(struct ublksrv_io_desc);
395 	unsigned int page_sz = getpagesize();
396 
397 	return round_up(size, page_sz);
398 }
399 
400 static int ublk_queue_max_cmd_buf_sz(void)
401 {
402 	return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
403 }
404 
405 static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
406 {
407 	return __ublk_queue_cmd_buf_sz(q->q_depth);
408 }
409 
410 static void ublk_queue_deinit(struct ublk_queue *q)
411 {
412 	int i;
413 	int nr_ios = q->q_depth;
414 
415 	if (q->io_cmd_buf)
416 		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
417 
418 	for (i = 0; i < nr_ios; i++)
419 		free(q->ios[i].buf_addr);
420 }
421 
422 static void ublk_thread_deinit(struct ublk_thread *t)
423 {
424 	io_uring_unregister_buffers(&t->ring);
425 
426 	io_uring_unregister_ring_fd(&t->ring);
427 
428 	if (t->ring.ring_fd > 0) {
429 		io_uring_unregister_files(&t->ring);
430 		close(t->ring.ring_fd);
431 		t->ring.ring_fd = -1;
432 	}
433 }
434 
435 static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
436 {
437 	struct ublk_dev *dev = q->dev;
438 	int depth = dev->dev_info.queue_depth;
439 	int i;
440 	int cmd_buf_size, io_buf_size;
441 	unsigned long off;
442 
443 	q->tgt_ops = dev->tgt.ops;
444 	q->flags = 0;
445 	q->q_depth = depth;
446 	q->flags = dev->dev_info.flags;
447 	q->flags |= extra_flags;
448 
449 	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
450 	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
451 	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
452 			MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
453 	if (q->io_cmd_buf == MAP_FAILED) {
454 		ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
455 				q->dev->dev_info.dev_id, q->q_id);
456 		goto fail;
457 	}
458 
459 	io_buf_size = dev->dev_info.max_io_buf_bytes;
460 	for (i = 0; i < q->q_depth; i++) {
461 		q->ios[i].buf_addr = NULL;
462 		q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
463 		q->ios[i].tag = i;
464 
465 		if (ublk_queue_no_buf(q))
466 			continue;
467 
468 		if (posix_memalign((void **)&q->ios[i].buf_addr,
469 					getpagesize(), io_buf_size)) {
470 			ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
471 					dev->dev_info.dev_id, q->q_id, i);
472 			goto fail;
473 		}
474 	}
475 
476 	return 0;
477  fail:
478 	ublk_queue_deinit(q);
479 	ublk_err("ublk dev %d queue %d failed\n",
480 			dev->dev_info.dev_id, q->q_id);
481 	return -ENOMEM;
482 }
483 
484 static int ublk_thread_init(struct ublk_thread *t)
485 {
486 	struct ublk_dev *dev = t->dev;
487 	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
488 	int ret;
489 
490 	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
491 			IORING_SETUP_COOP_TASKRUN |
492 			IORING_SETUP_SINGLE_ISSUER |
493 			IORING_SETUP_DEFER_TASKRUN);
494 	if (ret < 0) {
495 		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
496 				dev->dev_info.dev_id, t->idx, ret);
497 		goto fail;
498 	}
499 
500 	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
501 		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
502 		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
503 		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
504 		ret = io_uring_register_buffers_sparse(
505 			&t->ring, max_nr_ios_per_thread);
506 		if (ret) {
507 			ublk_err("ublk dev %d thread %d register sparse buffers failed %d\n",
508 					dev->dev_info.dev_id, t->idx, ret);
509 			goto fail;
510 		}
511 	}
512 
513 	io_uring_register_ring_fd(&t->ring);
514 
515 	ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
516 	if (ret) {
517 		ublk_err("ublk dev %d thread %d register files failed %d\n",
518 				t->dev->dev_info.dev_id, t->idx, ret);
519 		goto fail;
520 	}
521 
522 	return 0;
523 fail:
524 	ublk_thread_deinit(t);
525 	ublk_err("ublk dev %d thread %d init failed\n",
526 			dev->dev_info.dev_id, t->idx);
527 	return -ENOMEM;
528 }
529 
530 #define WAIT_USEC 	100000
531 #define MAX_WAIT_USEC 	(3 * 1000000)
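/*
 * Open the ublk char device (/dev/ublkc<N>) and run the target's
 * init_tgt() hook. The device node may show up a little after ADD_DEV
 * returns, so poll for it in WAIT_USEC steps for up to MAX_WAIT_USEC
 * (3 seconds) before giving up.
 */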
532 static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
533 {
534 	int dev_id = dev->dev_info.dev_id;
535 	unsigned int wait_usec = 0;
536 	int ret = 0, fd = -1;
537 	char buf[64];
538 
539 	snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);
540 
541 	while (wait_usec < MAX_WAIT_USEC) {
542 		fd = open(buf, O_RDWR);
543 		if (fd >= 0)
544 			break;
545 		usleep(WAIT_USEC);
546 		wait_usec += WAIT_USEC;
547 	}
548 	if (fd < 0) {
549 		ublk_err("can't open %s %s\n", buf, strerror(errno));
550 		return -1;
551 	}
552 
553 	dev->fds[0] = fd;
554 	if (dev->tgt.ops->init_tgt)
555 		ret = dev->tgt.ops->init_tgt(ctx, dev);
556 	if (ret)
557 		close(dev->fds[0]);
558 	return ret;
559 }
560 
561 static void ublk_dev_unprep(struct ublk_dev *dev)
562 {
563 	if (dev->tgt.ops->deinit_tgt)
564 		dev->tgt.ops->deinit_tgt(dev);
565 	close(dev->fds[0]);
566 }
567 
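/*
 * Fill in the auto buffer registration descriptor carried in sqe->addr:
 * the buffer index comes from the target's buf_index() hook when it
 * provides one, otherwise from the per-io buf_index assigned at fetch
 * time, and UBLK_AUTO_BUF_REG_FALLBACK is set when the queue runs in
 * auto_zc_fallback mode.
 */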
568 static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
569 				  struct io_uring_sqe *sqe,
570 				  unsigned short tag)
571 {
572 	struct ublk_auto_buf_reg buf = {};
573 
574 	if (q->tgt_ops->buf_index)
575 		buf.index = q->tgt_ops->buf_index(q, tag);
576 	else
577 		buf.index = q->ios[tag].buf_index;
578 
579 	if (ublk_queue_auto_zc_fallback(q))
580 		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
581 
582 	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
583 }
584 
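/*
 * Queue one ublk io command (FETCH_REQ, COMMIT_AND_FETCH_REQ or
 * NEED_GET_DATA) for this io, picked from the io's UBLKS_IO_NEED_*
 * flags. Returns 1 if an SQE was prepared, 0 if the io isn't ready to
 * be issued, and -1 if no SQE could be allocated.
 */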
585 int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
586 {
587 	struct ublk_queue *q = ublk_io_to_queue(io);
588 	struct ublksrv_io_cmd *cmd;
589 	struct io_uring_sqe *sqe[1];
590 	unsigned int cmd_op = 0;
591 	__u64 user_data;
592 
593 	/* only freed io can be issued */
594 	if (!(io->flags & UBLKS_IO_FREE))
595 		return 0;
596 
597 	/*
598 	 * We only issue when fetching, committing or getting data
599 	 * is needed.
600 	 */
601 	if (!(io->flags &
602 		(UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
603 		return 0;
604 
605 	if (io->flags & UBLKS_IO_NEED_GET_DATA)
606 		cmd_op = UBLK_U_IO_NEED_GET_DATA;
607 	else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP)
608 		cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
609 	else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
610 		cmd_op = UBLK_U_IO_FETCH_REQ;
611 
612 	if (io_uring_sq_space_left(&t->ring) < 1)
613 		io_uring_submit(&t->ring);
614 
615 	ublk_io_alloc_sqes(t, sqe, 1);
616 	if (!sqe[0]) {
617 		ublk_err("%s: run out of sqe. thread %u, tag %d\n",
618 				__func__, t->idx, io->tag);
619 		return -1;
620 	}
621 
622 	cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);
623 
624 	if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
625 		cmd->result = io->result;
626 
627 	/* These fields should be written once, never change */
628 	ublk_set_sqe_cmd_op(sqe[0], cmd_op);
629 	sqe[0]->fd		= 0;	/* dev->fds[0] */
630 	sqe[0]->opcode	= IORING_OP_URING_CMD;
631 	sqe[0]->flags	= IOSQE_FIXED_FILE;
632 	sqe[0]->rw_flags	= 0;
633 	cmd->tag	= io->tag;
634 	cmd->q_id	= q->q_id;
635 	if (!ublk_queue_no_buf(q))
636 		cmd->addr	= (__u64) (uintptr_t) io->buf_addr;
637 	else
638 		cmd->addr	= 0;
639 
640 	if (ublk_queue_use_auto_zc(q))
641 		ublk_set_auto_buf_reg(q, sqe[0], io->tag);
642 
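	/*
	 * user_data packs the tag, the ioctl nr of cmd_op and the queue id
	 * (see build_user_data() and the user_data_to_*() helpers) so that
	 * ublk_handle_cqe() can route the completion back to this io.
	 */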
643 	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
644 	io_uring_sqe_set_data64(sqe[0], user_data);
645 
646 	io->flags = 0;
647 
648 	t->cmd_inflight += 1;
649 
650 	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
651 			__func__, t->idx, q->q_id, io->tag, cmd_op,
652 			io->flags, !!(t->state & UBLKS_T_STOPPING));
653 	return 1;
654 }
655 
656 static void ublk_submit_fetch_commands(struct ublk_thread *t)
657 {
658 	struct ublk_queue *q;
659 	struct ublk_io *io;
660 	int i = 0, j = 0;
661 
662 	if (t->dev->per_io_tasks) {
663 		/*
664 		 * Lexicographically order all the (qid,tag) pairs, with
665 		 * qid taking priority (so (1,0) > (0,1)). Then make
666 		 * this thread the daemon for every Nth entry in this
667 		 * list (N is the number of threads), starting at this
668 		 * thread's index. This ensures that each queue is
669 		 * handled by as many ublk server threads as possible,
670 		 * so that load that is concentrated on one or a few
671 		 * queues can make use of all ublk server threads.
672 		 */
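		/*
		 * Example: with nr_hw_queues = 2, queue_depth = 2 and
		 * nthreads = 2, the flattened indexes map to (qid,tag) as
		 * 0->(0,0) 1->(0,1) 2->(1,0) 3->(1,1); thread 0 takes
		 * i = 0,2 -> (0,0),(1,0) and thread 1 takes i = 1,3 ->
		 * (0,1),(1,1), so both queues are served by both threads.
		 */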
673 		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
674 		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
675 		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
676 			int q_id = i / dinfo->queue_depth;
677 			int tag = i % dinfo->queue_depth;
678 			q = &t->dev->q[q_id];
679 			io = &q->ios[tag];
680 			io->buf_index = j++;
681 			ublk_queue_io_cmd(t, io);
682 		}
683 	} else {
684 		/*
685 		 * Service exclusively the queue whose q_id matches our
686 		 * thread index.
687 		 */
688 		struct ublk_queue *q = &t->dev->q[t->idx];
689 		for (i = 0; i < q->q_depth; i++) {
690 			io = &q->ios[i];
691 			io->buf_index = i;
692 			ublk_queue_io_cmd(t, io);
693 		}
694 	}
695 }
696 
697 static int ublk_thread_is_idle(struct ublk_thread *t)
698 {
699 	return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
700 }
701 
702 static int ublk_thread_is_done(struct ublk_thread *t)
703 {
704 	return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t);
705 }
706 
707 static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
708 					  struct ublk_queue *q,
709 					  struct io_uring_cqe *cqe)
710 {
711 	if (cqe->res < 0 && cqe->res != -EAGAIN)
712 		ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
713 			__func__, cqe->res, q->q_id,
714 			user_data_to_tag(cqe->user_data),
715 			user_data_to_op(cqe->user_data));
716 
717 	if (q->tgt_ops->tgt_io_done)
718 		q->tgt_ops->tgt_io_done(t, q, cqe);
719 }
720 
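/*
 * Handle the CQE of a ublk io command. UBLK_IO_RES_OK dispatches the tag
 * to the target's queue_io() hook, UBLK_IO_RES_NEED_GET_DATA re-issues
 * the command as NEED_GET_DATA, and any other result only marks the io
 * free. An ABORT result (or a completion seen while already stopping)
 * marks the thread as stopping and drops this io's FETCH flag so it is
 * not fetched again.
 */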
721 static void ublk_handle_uring_cmd(struct ublk_thread *t,
722 				  struct ublk_queue *q,
723 				  const struct io_uring_cqe *cqe)
724 {
725 	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
726 		!(t->state & UBLKS_T_STOPPING);
727 	unsigned tag = user_data_to_tag(cqe->user_data);
728 	struct ublk_io *io = &q->ios[tag];
729 
730 	if (!fetch) {
731 		t->state |= UBLKS_T_STOPPING;
732 		io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
733 	}
734 
735 	if (cqe->res == UBLK_IO_RES_OK) {
736 		assert(tag < q->q_depth);
737 		if (q->tgt_ops->queue_io)
738 			q->tgt_ops->queue_io(t, q, tag);
739 	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
740 		io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
741 		ublk_queue_io_cmd(t, io);
742 	} else {
743 		/*
744 		 * COMMIT_REQ will be completed immediately since no fetching
745 		 * piggyback is required.
746 		 *
747 		 * Marking IO_FREE only, then this io won't be issued since
748 	 * we only issue io with (UBLKS_IO_FREE | UBLKS_IO_NEED_*)
749 	 *
750 	 */
751 		io->flags = UBLKS_IO_FREE;
752 	}
753 }
754 
755 static void ublk_handle_cqe(struct ublk_thread *t,
756 		struct io_uring_cqe *cqe, void *data)
757 {
758 	struct ublk_dev *dev = t->dev;
759 	unsigned q_id = user_data_to_q_id(cqe->user_data);
760 	struct ublk_queue *q = &dev->q[q_id];
761 	unsigned cmd_op = user_data_to_op(cqe->user_data);
762 
763 	if (cqe->res < 0 && cqe->res != -ENODEV)
764 		ublk_err("%s: res %d userdata %llx queue state %x\n", __func__,
765 				cqe->res, cqe->user_data, q->flags);
766 
767 	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n",
768 			__func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data),
769 			cmd_op, is_target_io(cqe->user_data),
770 			user_data_to_tgt_data(cqe->user_data),
771 			(t->state & UBLKS_T_STOPPING));
772 
773 	/* Don't retrieve io in case of target io */
774 	if (is_target_io(cqe->user_data)) {
775 		ublksrv_handle_tgt_cqe(t, q, cqe);
776 		return;
777 	}
778 
779 	t->cmd_inflight--;
780 
781 	ublk_handle_uring_cmd(t, q, cqe);
782 }
783 
784 static int ublk_reap_events_uring(struct ublk_thread *t)
785 {
786 	struct io_uring_cqe *cqe;
787 	unsigned head;
788 	int count = 0;
789 
790 	io_uring_for_each_cqe(&t->ring, head, cqe) {
791 		ublk_handle_cqe(t, cqe, NULL);
792 		count += 1;
793 	}
794 	io_uring_cq_advance(&t->ring, count);
795 
796 	return count;
797 }
798 
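/*
 * One iteration of the io handler loop: submit pending SQEs, wait for
 * at least one CQE, then reap and handle everything that completed.
 * Returns -ENODEV once the thread is stopping and fully idle, which
 * ends the loop in ublk_io_handler_fn().
 */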
799 static int ublk_process_io(struct ublk_thread *t)
800 {
801 	int ret, reapped;
802 
803 	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
804 				t->dev->dev_info.dev_id,
805 				t->idx, io_uring_sq_ready(&t->ring),
806 				t->cmd_inflight,
807 				(t->state & UBLKS_T_STOPPING));
808 
809 	if (ublk_thread_is_done(t))
810 		return -ENODEV;
811 
812 	ret = io_uring_submit_and_wait(&t->ring, 1);
813 	reapped = ublk_reap_events_uring(t);
814 
815 	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
816 			ret, reapped, (t->state & UBLKS_T_STOPPING),
817 			(t->state & UBLKS_T_IDLE));
818 
819 	return reapped;
820 }
821 
822 static void ublk_thread_set_sched_affinity(const struct ublk_thread *t,
823 		cpu_set_t *cpuset)
824 {
825 	if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
826 		ublk_err("ublk dev %u thread %u set affinity failed\n",
827 				t->dev->dev_info.dev_id, t->idx);
828 }
829 
830 struct ublk_thread_info {
831 	struct ublk_dev 	*dev;
832 	unsigned		idx;
833 	sem_t 			*ready;
834 	cpu_set_t 		*affinity;
835 };
836 
837 static void *ublk_io_handler_fn(void *data)
838 {
839 	struct ublk_thread_info *info = data;
840 	struct ublk_thread *t = &info->dev->threads[info->idx];
841 	int dev_id = info->dev->dev_info.dev_id;
842 	int ret;
843 
844 	t->dev = info->dev;
845 	t->idx = info->idx;
846 
847 	ret = ublk_thread_init(t);
848 	if (ret) {
849 		ublk_err("ublk dev %d thread %u init failed\n",
850 				dev_id, t->idx);
851 		return NULL;
852 	}
853 	/* IO perf is sensitive to queue pthread affinity on NUMA machines */
854 	if (info->affinity)
855 		ublk_thread_set_sched_affinity(t, info->affinity);
856 	sem_post(info->ready);
857 
858 	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
859 			gettid(), dev_id, t->idx);
860 
861 	/* submit all io commands to ublk driver */
862 	ublk_submit_fetch_commands(t);
863 	do {
864 		if (ublk_process_io(t) < 0)
865 			break;
866 	} while (1);
867 
868 	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
869 		 gettid(), dev_id, t->idx);
870 	ublk_thread_deinit(t);
871 	return NULL;
872 }
873 
874 static void ublk_set_parameters(struct ublk_dev *dev)
875 {
876 	int ret;
877 
878 	ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
879 	if (ret)
880 		ublk_err("dev %d set basic parameter failed %d\n",
881 				dev->dev_info.dev_id, ret);
882 }
883 
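/*
 * Notify the parent process over the eventfd: the written value is
 * dev_id + 1 on success or ERROR_EVTFD_DEVID on failure, and the
 * per-queue state is copied into the shared-memory shadow_dev so the
 * parent can dump it from __cmd_dev_list().
 */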
884 static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
885 {
886 	uint64_t id;
887 	int evtfd = ctx->_evtfd;
888 
889 	if (evtfd < 0)
890 		return -EBADF;
891 
892 	if (dev_id >= 0)
893 		id = dev_id + 1;
894 	else
895 		id = ERROR_EVTFD_DEVID;
896 
897 	if (dev && ctx->shadow_dev)
898 		memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));
899 
900 	if (write(evtfd, &id, sizeof(id)) != sizeof(id))
901 		return -EINVAL;
902 
903 	close(evtfd);
904 	shmdt(ctx->shadow_dev);
905 
906 	return 0;
907 }
908 
909 
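/*
 * Per-device daemon body: map the queues, spawn the io handler threads
 * (pinned to the queue affinity reported by the driver when threads map
 * 1:1 to queues), then START_DEV (or END_USER_RECOVERY), and finally
 * wait for all threads to exit.
 */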
910 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
911 {
912 	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
913 	struct ublk_thread_info *tinfo;
914 	unsigned long long extra_flags = 0;
915 	cpu_set_t *affinity_buf;
916 	void *thread_ret;
917 	sem_t ready;
918 	int ret, i;
919 
920 	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
921 
922 	tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
923 	if (!tinfo)
924 		return -ENOMEM;
925 
926 	sem_init(&ready, 0, 0);
927 	ret = ublk_dev_prep(ctx, dev);
928 	if (ret)
929 		return ret;
930 
931 	ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
932 	if (ret)
933 		return ret;
934 
935 	if (ctx->auto_zc_fallback)
936 		extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
937 
938 	for (i = 0; i < dinfo->nr_hw_queues; i++) {
939 		dev->q[i].dev = dev;
940 		dev->q[i].q_id = i;
941 
942 		ret = ublk_queue_init(&dev->q[i], extra_flags);
943 		if (ret) {
944 			ublk_err("ublk dev %d queue %d init queue failed\n",
945 				 dinfo->dev_id, i);
946 			goto fail;
947 		}
948 	}
949 
950 	for (i = 0; i < dev->nthreads; i++) {
951 		tinfo[i].dev = dev;
952 		tinfo[i].idx = i;
953 		tinfo[i].ready = &ready;
954 
955 		/*
956 		 * If threads are not tied 1:1 to queues, setting thread
957 		 * affinity based on queue affinity makes little sense.
958 		 * However, thread CPU affinity has significant impact
959 		 * on performance, so to compare fairly, we'll still set
960 		 * thread CPU affinity based on queue affinity where
961 		 * possible.
962 		 */
963 		if (dev->nthreads == dinfo->nr_hw_queues)
964 			tinfo[i].affinity = &affinity_buf[i];
965 		pthread_create(&dev->threads[i].thread, NULL,
966 				ublk_io_handler_fn,
967 				&tinfo[i]);
968 	}
969 
970 	for (i = 0; i < dev->nthreads; i++)
971 		sem_wait(&ready);
972 	free(tinfo);
973 	free(affinity_buf);
974 
975 	/* everything is fine now, start us */
976 	if (ctx->recovery)
977 		ret = ublk_ctrl_end_user_recovery(dev, getpid());
978 	else {
979 		ublk_set_parameters(dev);
980 		ret = ublk_ctrl_start_dev(dev, getpid());
981 	}
982 	if (ret < 0) {
983 		ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
984 		goto fail;
985 	}
986 
987 	ublk_ctrl_get_info(dev);
988 	if (ctx->fg)
989 		ublk_ctrl_dump(dev);
990 	else
991 		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
992 
993 	/* wait until we are terminated */
994 	for (i = 0; i < dev->nthreads; i++)
995 		pthread_join(dev->threads[i].thread, &thread_ret);
996  fail:
997 	for (i = 0; i < dinfo->nr_hw_queues; i++)
998 		ublk_queue_deinit(&dev->q[i]);
999 	ublk_dev_unprep(dev);
1000 	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
1001 
1002 	return ret;
1003 }
1004 
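/*
 * Wait for an inotify event matching evt_mask on the given /dev node,
 * with a timeout in seconds. Used below to wait for /dev/ublkc<N> to be
 * closed when the io daemon shuts down.
 */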
1005 static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
1006 {
1007 #define EV_SIZE (sizeof(struct inotify_event))
1008 #define EV_BUF_LEN (128 * (EV_SIZE + 16))
1009 	struct pollfd pfd;
1010 	int fd, wd;
1011 	int ret = -EINVAL;
1012 	const char *dev_name = basename(path);
1013 
1014 	fd = inotify_init();
1015 	if (fd < 0) {
1016 		ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
1017 		return fd;
1018 	}
1019 
1020 	wd = inotify_add_watch(fd, "/dev", evt_mask);
1021 	if (wd == -1) {
1022 		ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
1023 		goto fail;
1024 	}
1025 
1026 	pfd.fd = fd;
1027 	pfd.events = POLLIN;
1028 	while (1) {
1029 		int i = 0;
1030 		char buffer[EV_BUF_LEN];
1031 		ret = poll(&pfd, 1, 1000 * timeout);
1032 
1033 		if (ret == -1) {
1034 			ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
1035 			goto rm_watch;
1036 		} else if (ret == 0) {
1037 			ublk_err("%s: poll inotify timeout\n", __func__);
1038 			ret = -ETIMEDOUT;
1039 			goto rm_watch;
1040 		}
1041 
1042 		ret = read(fd, buffer, EV_BUF_LEN);
1043 		if (ret < 0) {
1044 			ublk_err("%s: read inotify fd failed\n", __func__);
1045 			goto rm_watch;
1046 		}
1047 
1048 		while (i < ret) {
1049 			struct inotify_event *event = (struct inotify_event *)&buffer[i];
1050 
1051 			ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
1052 					__func__, event->mask, event->name);
1053 			if (event->mask & evt_mask) {
1054 				if (!strcmp(event->name, dev_name)) {
1055 					ret = 0;
1056 					goto rm_watch;
1057 				}
1058 			}
1059 			i += EV_SIZE + event->len;
1060 		}
1061 	}
1062 rm_watch:
1063 	inotify_rm_watch(fd, wd);
1064 fail:
1065 	close(fd);
1066 	return ret;
1067 }
1068 
1069 static int ublk_stop_io_daemon(const struct ublk_dev *dev)
1070 {
1071 	int daemon_pid = dev->dev_info.ublksrv_pid;
1072 	int dev_id = dev->dev_info.dev_id;
1073 	char ublkc[64];
1074 	int ret = 0;
1075 
1076 	if (daemon_pid < 0)
1077 		return 0;
1078 
1079 	/* daemon may be dead already */
1080 	if (kill(daemon_pid, 0) < 0)
1081 		goto wait;
1082 
1083 	snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);
1084 
1085 	/* ublk char device may be gone already */
1086 	if (access(ublkc, F_OK) != 0)
1087 		goto wait;
1088 
1089 	/* Wait until the ublk char device is closed, which happens when the daemon shuts down */
1090 	ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
1091 	/* double check, since it may have been closed before inotify started */
1092 	if (ret == -ETIMEDOUT)
1093 		ret = kill(daemon_pid, 0) < 0;
1094 wait:
1095 	waitpid(daemon_pid, NULL, 0);
1096 	ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
1097 			__func__, daemon_pid, dev_id, ret);
1098 
1099 	return ret;
1100 }
1101 
1102 static int __cmd_dev_add(const struct dev_ctx *ctx)
1103 {
1104 	unsigned nthreads = ctx->nthreads;
1105 	unsigned nr_queues = ctx->nr_hw_queues;
1106 	const char *tgt_type = ctx->tgt_type;
1107 	unsigned depth = ctx->queue_depth;
1108 	__u64 features;
1109 	const struct ublk_tgt_ops *ops;
1110 	struct ublksrv_ctrl_dev_info *info;
1111 	struct ublk_dev *dev = NULL;
1112 	int dev_id = ctx->dev_id;
1113 	int ret, i;
1114 
1115 	ops = ublk_find_tgt(tgt_type);
1116 	if (!ops) {
1117 		ublk_err("%s: no such tgt type, type %s\n",
1118 				__func__, tgt_type);
1119 		ret = -ENODEV;
1120 		goto fail;
1121 	}
1122 
1123 	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
1124 		ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
1125 				__func__, nr_queues, depth);
1126 		ret = -EINVAL;
1127 		goto fail;
1128 	}
1129 
1130 	/* default to 1:1 threads:queues if nthreads is unspecified */
1131 	if (!nthreads)
1132 		nthreads = nr_queues;
1133 
1134 	if (nthreads > UBLK_MAX_THREADS) {
1135 		ublk_err("%s: %u is too many threads (max %u)\n",
1136 				__func__, nthreads, UBLK_MAX_THREADS);
1137 		ret = -EINVAL;
1138 		goto fail;
1139 	}
1140 
1141 	if (nthreads != nr_queues && !ctx->per_io_tasks) {
1142 		ublk_err("%s: threads %u must be the same as queues %u if "
1143 			"not using per_io_tasks\n",
1144 			__func__, nthreads, nr_queues);
1145 		ret = -EINVAL;
1146 		goto fail;
1147 	}
1148 
1149 	dev = ublk_ctrl_init();
1150 	if (!dev) {
1151 		ublk_err("%s: can't alloc dev id %d, type %s\n",
1152 				__func__, dev_id, tgt_type);
1153 		ret = -ENOMEM;
1154 		goto fail;
1155 	}
1156 
1157 	/* bail out if the kernel doesn't support get_features */
1158 	ret = ublk_ctrl_get_features(dev, &features);
1159 	if (ret < 0) {
1160 		ret = -EINVAL;
1161 		goto fail;
1162 	}
1163 
1164 	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
1165 		ret = -ENOTSUP;
1166 		goto fail;
1167 	}
1168 
1169 	info = &dev->dev_info;
1170 	info->dev_id = ctx->dev_id;
1171 	info->nr_hw_queues = nr_queues;
1172 	info->queue_depth = depth;
1173 	info->flags = ctx->flags;
1174 	if ((features & UBLK_F_QUIESCE) &&
1175 			(info->flags & UBLK_F_USER_RECOVERY))
1176 		info->flags |= UBLK_F_QUIESCE;
1177 	dev->nthreads = nthreads;
1178 	dev->per_io_tasks = ctx->per_io_tasks;
1179 	dev->tgt.ops = ops;
1180 	dev->tgt.sq_depth = depth;
1181 	dev->tgt.cq_depth = depth;
1182 
1183 	for (i = 0; i < MAX_BACK_FILES; i++) {
1184 		if (ctx->files[i]) {
1185 			strcpy(dev->tgt.backing_file[i], ctx->files[i]);
1186 			dev->tgt.nr_backing_files++;
1187 		}
1188 	}
1189 
1190 	if (ctx->recovery)
1191 		ret = ublk_ctrl_start_user_recovery(dev);
1192 	else
1193 		ret = ublk_ctrl_add_dev(dev);
1194 	if (ret < 0) {
1195 		ublk_err("%s: can't add dev id %d, type %s ret %d\n",
1196 				__func__, dev_id, tgt_type, ret);
1197 		goto fail;
1198 	}
1199 
1200 	ret = ublk_start_daemon(ctx, dev);
1201 	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
1202 	if (ret < 0)
1203 		ublk_ctrl_del_dev(dev);
1204 
1205 fail:
1206 	if (ret < 0)
1207 		ublk_send_dev_event(ctx, dev, -1);
1208 	if (dev)
1209 		ublk_ctrl_deinit(dev);
1210 	return ret;
1211 }
1212 
1213 static int __cmd_dev_list(struct dev_ctx *ctx);
1214 
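/*
 * "add"/"recover" front end. Unless --foreground is given, double-fork
 * so the daemon detaches from the terminal, and hand the allocated dev
 * id back to the parent over an eventfd (plus queue state over SysV
 * shared memory) so the parent can print the device with
 * __cmd_dev_list() before exiting.
 */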
1215 static int cmd_dev_add(struct dev_ctx *ctx)
1216 {
1217 	int res;
1218 
1219 	if (ctx->fg)
1220 		goto run;
1221 
1222 	ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
1223 	if (ctx->_shmid < 0) {
1224 		ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
1225 		exit(-1);
1226 	}
1227 	ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
1228 	if (ctx->shadow_dev == (struct ublk_dev *)-1) {
1229 		ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
1230 		exit(-1);
1231 	}
1232 	ctx->_evtfd = eventfd(0, 0);
1233 	if (ctx->_evtfd < 0) {
1234 		ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
1235 		exit(-1);
1236 	}
1237 
1238 	res = fork();
1239 	if (res == 0) {
1240 		int res2;
1241 
1242 		setsid();
1243 		res2 = fork();
1244 		if (res2 == 0) {
1245 			/* prepare for detaching */
1246 			close(STDIN_FILENO);
1247 			close(STDOUT_FILENO);
1248 			close(STDERR_FILENO);
1249 run:
1250 			res = __cmd_dev_add(ctx);
1251 			return res;
1252 		} else {
1253 			/* detached from the foreground task */
1254 			exit(EXIT_SUCCESS);
1255 		}
1256 	} else if (res > 0) {
1257 		uint64_t id;
1258 		int exit_code = EXIT_FAILURE;
1259 
1260 		res = read(ctx->_evtfd, &id, sizeof(id));
1261 		close(ctx->_evtfd);
1262 		if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
1263 			ctx->dev_id = id - 1;
1264 			if (__cmd_dev_list(ctx) >= 0)
1265 				exit_code = EXIT_SUCCESS;
1266 		}
1267 		shmdt(ctx->shadow_dev);
1268 		shmctl(ctx->_shmid, IPC_RMID, NULL);
1269 		/* wait for child and detach from it */
1270 		wait(NULL);
1271 		if (exit_code == EXIT_FAILURE)
1272 			ublk_err("%s: command failed\n", __func__);
1273 		exit(exit_code);
1274 	} else {
1275 		exit(EXIT_FAILURE);
1276 	}
1277 }
1278 
1279 static int __cmd_dev_del(struct dev_ctx *ctx)
1280 {
1281 	int number = ctx->dev_id;
1282 	struct ublk_dev *dev;
1283 	int ret;
1284 
1285 	dev = ublk_ctrl_init();
1286 	dev->dev_info.dev_id = number;
1287 
1288 	ret = ublk_ctrl_get_info(dev);
1289 	if (ret < 0)
1290 		goto fail;
1291 
1292 	ret = ublk_ctrl_stop_dev(dev);
1293 	if (ret < 0)
1294 		ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);
1295 
1296 	ret = ublk_stop_io_daemon(dev);
1297 	if (ret < 0)
1298 		ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
1299 				__func__, dev->dev_info.ublksrv_pid, number, ret);
1300 	ublk_ctrl_del_dev(dev);
1301 fail:
1302 	ublk_ctrl_deinit(dev);
1303 
1304 	return (ret >= 0) ? 0 : ret;
1305 }
1306 
1307 static int cmd_dev_del(struct dev_ctx *ctx)
1308 {
1309 	int i;
1310 
1311 	if (ctx->dev_id >= 0 || !ctx->all)
1312 		return __cmd_dev_del(ctx);
1313 
1314 	for (i = 0; i < 255; i++) {
1315 		ctx->dev_id = i;
1316 		__cmd_dev_del(ctx);
1317 	}
1318 	return 0;
1319 }
1320 
1321 static int __cmd_dev_list(struct dev_ctx *ctx)
1322 {
1323 	struct ublk_dev *dev = ublk_ctrl_init();
1324 	int ret;
1325 
1326 	if (!dev)
1327 		return -ENODEV;
1328 
1329 	dev->dev_info.dev_id = ctx->dev_id;
1330 
1331 	ret = ublk_ctrl_get_info(dev);
1332 	if (ret < 0) {
1333 		if (ctx->logging)
1334 			ublk_err("%s: can't get dev info from %d: %d\n",
1335 					__func__, ctx->dev_id, ret);
1336 	} else {
1337 		if (ctx->shadow_dev)
1338 			memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));
1339 
1340 		ublk_ctrl_dump(dev);
1341 	}
1342 
1343 	ublk_ctrl_deinit(dev);
1344 
1345 	return ret;
1346 }
1347 
1348 static int cmd_dev_list(struct dev_ctx *ctx)
1349 {
1350 	int i;
1351 
1352 	if (ctx->dev_id >= 0 || !ctx->all)
1353 		return __cmd_dev_list(ctx);
1354 
1355 	ctx->logging = false;
1356 	for (i = 0; i < 255; i++) {
1357 		ctx->dev_id = i;
1358 		__cmd_dev_list(ctx);
1359 	}
1360 	return 0;
1361 }
1362 
1363 static int cmd_dev_get_features(void)
1364 {
1365 #define const_ilog2(x) (63 - __builtin_clzll(x))
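	/* const_ilog2() maps a single-bit feature flag to its bit index, e.g. const_ilog2(1ULL << 7) == 7 */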
1366 	static const char *feat_map[] = {
1367 		[const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY",
1368 		[const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK",
1369 		[const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA",
1370 		[const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY",
1371 		[const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE",
1372 		[const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV",
1373 		[const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE",
1374 		[const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY",
1375 		[const_ilog2(UBLK_F_ZONED)] = "ZONED",
1376 		[const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO",
1377 		[const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
1378 		[const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
1379 		[const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
1380 		[const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON",
1381 	};
1382 	struct ublk_dev *dev;
1383 	__u64 features = 0;
1384 	int ret;
1385 
1386 	dev = ublk_ctrl_init();
1387 	if (!dev) {
1388 		fprintf(stderr, "ublk_ctrl_init failed\n");
1389 		return -EOPNOTSUPP;
1390 	}
1391 
1392 	ret = ublk_ctrl_get_features(dev, &features);
1393 	if (!ret) {
1394 		int i;
1395 
1396 		printf("ublk_drv features: 0x%llx\n", features);
1397 
1398 		for (i = 0; i < sizeof(features) * 8; i++) {
1399 			const char *feat;
1400 
1401 			if (!((1ULL << i)  & features))
1402 				continue;
1403 			if (i < sizeof(feat_map) / sizeof(feat_map[0]))
1404 				feat = feat_map[i];
1405 			else
1406 				feat = "unknown";
1407 			printf("\t%-20s: 0x%llx\n", feat, 1ULL << i);
1408 		}
1409 	}
1410 
1411 	return ret;
1412 }
1413 
1414 static int cmd_dev_update_size(struct dev_ctx *ctx)
1415 {
1416 	struct ublk_dev *dev = ublk_ctrl_init();
1417 	struct ublk_params p;
1418 	int ret = -EINVAL;
1419 
1420 	if (!dev)
1421 		return -ENODEV;
1422 
1423 	if (ctx->dev_id < 0) {
1424 		fprintf(stderr, "device id isn't provided\n");
1425 		goto out;
1426 	}
1427 
1428 	dev->dev_info.dev_id = ctx->dev_id;
1429 	ret = ublk_ctrl_get_params(dev, &p);
1430 	if (ret < 0) {
1431 		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
1432 		goto out;
1433 	}
1434 
1435 	if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
1436 		ublk_err("size isn't aligned with logical block size\n");
1437 		ret = -EINVAL;
1438 		goto out;
1439 	}
1440 
1441 	ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
1442 out:
1443 	ublk_ctrl_deinit(dev);
1444 	return ret;
1445 }
1446 
1447 static int cmd_dev_quiesce(struct dev_ctx *ctx)
1448 {
1449 	struct ublk_dev *dev = ublk_ctrl_init();
1450 	int ret = -EINVAL;
1451 
1452 	if (!dev)
1453 		return -ENODEV;
1454 
1455 	if (ctx->dev_id < 0) {
1456 		fprintf(stderr, "device id isn't provided for quiesce\n");
1457 		goto out;
1458 	}
1459 	dev->dev_info.dev_id = ctx->dev_id;
1460 	ret = ublk_ctrl_quiesce_dev(dev, 10000);
1461 
1462 out:
1463 	ublk_ctrl_deinit(dev);
1464 	return ret;
1465 }
1466 
1467 static void __cmd_create_help(char *exe, bool recovery)
1468 {
1469 	int i;
1470 
1471 	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
1472 			exe, recovery ? "recover" : "add");
1473 	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
1474 	printf("\t[-e 0|1 ] [-i 0|1]\n");
1475 	printf("\t[--nthreads threads] [--per_io_tasks]\n");
1476 	printf("\t[target options] [backfile1] [backfile2] ...\n");
1477 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
1478 	printf("\tdefault: nthreads=nr_queues\n");
1479 
1480 	for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
1481 		const struct ublk_tgt_ops *ops = tgt_ops_list[i];
1482 
1483 		if (ops->usage)
1484 			ops->usage(ops);
1485 	}
1486 }
1487 
1488 static void cmd_add_help(char *exe)
1489 {
1490 	__cmd_create_help(exe, false);
1491 	printf("\n");
1492 }
1493 
1494 static void cmd_recover_help(char *exe)
1495 {
1496 	__cmd_create_help(exe, true);
1497 	printf("\tPlease provide exact command line for creating this device with real dev_id\n");
1498 	printf("\n");
1499 }
1500 
1501 static int cmd_dev_help(char *exe)
1502 {
1503 	cmd_add_help(exe);
1504 	cmd_recover_help(exe);
1505 
1506 	printf("%s del [-n dev_id] -a \n", exe);
1507 	printf("\t -a delete all devices -n delete specified device\n\n");
1508 	printf("%s list [-n dev_id] -a \n", exe);
1509 	printf("\t -a list all devices, -n list specified device, default -a \n\n");
1510 	printf("%s features\n", exe);
1511 	printf("%s update_size -n dev_id -s|--size size_in_bytes \n", exe);
1512 	printf("%s quiesce -n dev_id\n", exe);
1513 	return 0;
1514 }
1515 
1516 int main(int argc, char *argv[])
1517 {
1518 	static const struct option longopts[] = {
1519 		{ "all",		0,	NULL, 'a' },
1520 		{ "type",		1,	NULL, 't' },
1521 		{ "number",		1,	NULL, 'n' },
1522 		{ "queues",		1,	NULL, 'q' },
1523 		{ "depth",		1,	NULL, 'd' },
1524 		{ "debug_mask",		1,	NULL,  0  },
1525 		{ "quiet",		0,	NULL,  0  },
1526 		{ "zero_copy",          0,      NULL, 'z' },
1527 		{ "foreground",		0,	NULL,  0  },
1528 		{ "recovery", 		1,      NULL, 'r' },
1529 		{ "recovery_fail_io",	1,	NULL, 'e'},
1530 		{ "recovery_reissue",	1,	NULL, 'i'},
1531 		{ "get_data",		1,	NULL, 'g'},
1532 		{ "auto_zc",		0,	NULL,  0 },
1533 		{ "auto_zc_fallback", 	0,	NULL,  0 },
1534 		{ "size",		1,	NULL, 's'},
1535 		{ "nthreads",		1,	NULL,  0 },
1536 		{ "per_io_tasks",	0,	NULL,  0 },
1537 		{ 0, 0, 0, 0 }
1538 	};
1539 	const struct ublk_tgt_ops *ops = NULL;
1540 	int option_idx, opt;
1541 	const char *cmd = argv[1];
1542 	struct dev_ctx ctx = {
1543 		.queue_depth	=	128,
1544 		.nr_hw_queues	=	2,
1545 		.dev_id		=	-1,
1546 		.tgt_type	=	"unknown",
1547 	};
1548 	int ret = -EINVAL, i;
1549 	int tgt_argc = 1;
1550 	char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
1551 	int value;
1552 
1553 	if (argc == 1)
1554 		return ret;
1555 
1556 	opterr = 0;
1557 	optind = 2;
1558 	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gaz",
1559 				  longopts, &option_idx)) != -1) {
1560 		switch (opt) {
1561 		case 'a':
1562 			ctx.all = 1;
1563 			break;
1564 		case 'n':
1565 			ctx.dev_id = strtol(optarg, NULL, 10);
1566 			break;
1567 		case 't':
1568 			if (strlen(optarg) < sizeof(ctx.tgt_type))
1569 				strcpy(ctx.tgt_type, optarg);
1570 			break;
1571 		case 'q':
1572 			ctx.nr_hw_queues = strtol(optarg, NULL, 10);
1573 			break;
1574 		case 'd':
1575 			ctx.queue_depth = strtol(optarg, NULL, 10);
1576 			break;
1577 		case 'z':
1578 			ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY;
1579 			break;
1580 		case 'r':
1581 			value = strtol(optarg, NULL, 10);
1582 			if (value)
1583 				ctx.flags |= UBLK_F_USER_RECOVERY;
1584 			break;
1585 		case 'e':
1586 			value = strtol(optarg, NULL, 10);
1587 			if (value)
1588 				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
1589 			break;
1590 		case 'i':
1591 			value = strtol(optarg, NULL, 10);
1592 			if (value)
1593 				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
1594 			break;
1595 		case 'g':
1596 			ctx.flags |= UBLK_F_NEED_GET_DATA;
1597 			break;
1598 		case 's':
1599 			ctx.size = strtoull(optarg, NULL, 10);
1600 			break;
1601 		case 0:
1602 			if (!strcmp(longopts[option_idx].name, "debug_mask"))
1603 				ublk_dbg_mask = strtol(optarg, NULL, 16);
1604 			if (!strcmp(longopts[option_idx].name, "quiet"))
1605 				ublk_dbg_mask = 0;
1606 			if (!strcmp(longopts[option_idx].name, "foreground"))
1607 				ctx.fg = 1;
1608 			if (!strcmp(longopts[option_idx].name, "auto_zc"))
1609 				ctx.flags |= UBLK_F_AUTO_BUF_REG;
1610 			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
1611 				ctx.auto_zc_fallback = 1;
1612 			if (!strcmp(longopts[option_idx].name, "nthreads"))
1613 				ctx.nthreads = strtol(optarg, NULL, 10);
1614 			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
1615 				ctx.per_io_tasks = 1;
1616 			break;
1617 		case '?':
1618 			/*
1619 			 * every target option must have an argument
1620 			 */
1621 			if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') {
1622 				fprintf(stderr, "every target option requires argument: %s %s\n",
1623 						argv[optind - 1], argv[optind]);
1624 				exit(EXIT_FAILURE);
1625 			}
1626 
1627 			if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
1628 				tgt_argv[tgt_argc++] = argv[optind - 1];
1629 				tgt_argv[tgt_argc++] = argv[optind];
1630 			} else {
1631 				fprintf(stderr, "too many target options\n");
1632 				exit(EXIT_FAILURE);
1633 			}
1634 			optind += 1;
1635 			break;
1636 		}
1637 	}
1638 
1639 	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
1640 	if (ctx.auto_zc_fallback &&
1641 	    !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
1642 		    (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
1643 		ublk_err("%s: auto_zc_fallback requires both "
1644 				"F_AUTO_BUF_REG and F_SUPPORT_ZERO_COPY to be enabled\n",
1645 					__func__);
1646 		return -EINVAL;
1647 	}
1648 
1649 	i = optind;
1650 	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
1651 		ctx.files[ctx.nr_files++] = argv[i++];
1652 	}
1653 
1654 	ops = ublk_find_tgt(ctx.tgt_type);
1655 	if (ops && ops->parse_cmd_line) {
1656 		optind = 0;
1657 
1658 		tgt_argv[0] = ctx.tgt_type;
1659 		ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
1660 	}
1661 
1662 	if (!strcmp(cmd, "add"))
1663 		ret = cmd_dev_add(&ctx);
1664 	else if (!strcmp(cmd, "recover")) {
1665 		if (ctx.dev_id < 0) {
1666 			fprintf(stderr, "device id isn't provided for recovering\n");
1667 			ret = -EINVAL;
1668 		} else {
1669 			ctx.recovery = 1;
1670 			ret = cmd_dev_add(&ctx);
1671 		}
1672 	} else if (!strcmp(cmd, "del"))
1673 		ret = cmd_dev_del(&ctx);
1674 	else if (!strcmp(cmd, "list")) {
1675 		ctx.all = 1;
1676 		ret = cmd_dev_list(&ctx);
1677 	} else if (!strcmp(cmd, "help"))
1678 		ret = cmd_dev_help(argv[0]);
1679 	else if (!strcmp(cmd, "features"))
1680 		ret = cmd_dev_get_features();
1681 	else if (!strcmp(cmd, "update_size"))
1682 		ret = cmd_dev_update_size(&ctx);
1683 	else if (!strcmp(cmd, "quiesce"))
1684 		ret = cmd_dev_quiesce(&ctx);
1685 	else
1686 		cmd_dev_help(argv[0]);
1687 
1688 	return ret;
1689 }
1690