/* SPDX-License-Identifier: MIT */
/*
 * Description: uring_cmd based ublk
 */

#include "kublk.h"

#define MAX_NR_TGT_ARG 64

unsigned int ublk_dbg_mask = UBLK_LOG;
static const struct ublk_tgt_ops *tgt_ops_list[] = {
	&null_tgt_ops,
	&loop_tgt_ops,
	&stripe_tgt_ops,
	&fault_inject_tgt_ops,
};

static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
{
	int i;

	if (name == NULL)
		return NULL;

	for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
		if (strcmp(tgt_ops_list[i]->name, name) == 0)
			return tgt_ops_list[i];
	return NULL;
}

static inline int ublk_setup_ring(struct io_uring *r, int depth,
				  int cq_depth, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags | IORING_SETUP_CQSIZE;
	p.cq_entries = cq_depth;

	return io_uring_queue_init_params(depth, r, &p);
}

static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
			       struct io_uring_sqe *sqe,
			       struct ublk_ctrl_cmd_data *data)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);

	sqe->fd = dev->ctrl_fd;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->ioprio = 0;

	if (data->flags & CTRL_CMD_HAS_BUF) {
		cmd->addr = data->addr;
		cmd->len = data->len;
	}

	if (data->flags & CTRL_CMD_HAS_DATA)
		cmd->data[0] = data->data[0];

	cmd->dev_id = info->dev_id;
	cmd->queue_id = -1;

	ublk_set_sqe_cmd_op(sqe, data->cmd_op);

	io_uring_sqe_set_data(sqe, cmd);
}

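/*
 * Issue one control command on the per-device control ring and wait for
 * its completion; the command's CQE result is returned to the caller.
 */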
static int __ublk_ctrl_cmd(struct ublk_dev *dev,
			   struct ublk_ctrl_cmd_data *data)
{
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret = -EINVAL;

	sqe = io_uring_get_sqe(&dev->ring);
	if (!sqe) {
		ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
		return ret;
	}

	ublk_ctrl_init_cmd(dev, sqe, data);

	ret = io_uring_submit(&dev->ring);
	if (ret < 0) {
		ublk_err("uring submit ret %d\n", ret);
		return ret;
	}

	ret = io_uring_wait_cqe(&dev->ring, &cqe);
	if (ret < 0) {
		ublk_err("wait cqe: %s\n", strerror(-ret));
		return ret;
	}
	io_uring_cqe_seen(&dev->ring, cqe);

	return cqe->res;
}

static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_STOP_DEV,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_start_dev(struct ublk_dev *dev,
			       int daemon_pid)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_START_DEV,
		.flags = CTRL_CMD_HAS_DATA,
	};

	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_START_USER_RECOVERY,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_END_USER_RECOVERY,
		.flags = CTRL_CMD_HAS_DATA,
	};

	dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_add_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_ADD_DEV,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) &dev->dev_info,
		.len = sizeof(struct ublksrv_ctrl_dev_info),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_del_dev(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_DEL_DEV,
		.flags = 0,
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_info(struct ublk_dev *dev)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_DEV_INFO,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) &dev->dev_info,
		.len = sizeof(struct ublksrv_ctrl_dev_info),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_set_params(struct ublk_dev *dev,
				struct ublk_params *params)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_SET_PARAMS,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) params,
		.len = sizeof(*params),
	};
	params->len = sizeof(*params);
	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_params(struct ublk_dev *dev,
				struct ublk_params *params)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_PARAMS,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64)params,
		.len = sizeof(*params),
	};

	params->len = sizeof(*params);

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_get_features(struct ublk_dev *dev,
				  __u64 *features)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_FEATURES,
		.flags = CTRL_CMD_HAS_BUF,
		.addr = (__u64) (uintptr_t) features,
		.len = sizeof(*features),
	};

	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_update_size(struct ublk_dev *dev,
				 __u64 nr_sects)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_UPDATE_SIZE,
		.flags = CTRL_CMD_HAS_DATA,
	};

	data.data[0] = nr_sects;
	return __ublk_ctrl_cmd(dev, &data);
}

static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
				 unsigned int timeout_ms)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_QUIESCE_DEV,
		.flags = CTRL_CMD_HAS_DATA,
	};

	data.data[0] = timeout_ms;
	return __ublk_ctrl_cmd(dev, &data);
}

static const char *ublk_dev_state_desc(struct ublk_dev *dev)
{
	switch (dev->dev_info.state) {
	case UBLK_S_DEV_DEAD:
		return "DEAD";
	case UBLK_S_DEV_LIVE:
		return "LIVE";
	case UBLK_S_DEV_QUIESCED:
		return "QUIESCED";
	default:
		return "UNKNOWN";
	}
}

static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
{
	unsigned done = 0;
	int i;

	for (i = 0; i < CPU_SETSIZE; i++) {
		if (CPU_ISSET(i, set))
			done += snprintf(&buf[done], len - done, "%d ", i);
	}
}

static void ublk_adjust_affinity(cpu_set_t *set)
{
	int j, updated = 0;

	/*
	 * Just keep the first CPU for now.
	 *
	 * In the future, automatic affinity selection could be tried.
	 */
	for (j = 0; j < CPU_SETSIZE; j++) {
		if (CPU_ISSET(j, set)) {
			if (!updated) {
				updated = 1;
				continue;
			}
			CPU_CLR(j, set);
		}
	}
}

/* Caller must free the allocated buffer */
static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
{
	struct ublk_ctrl_cmd_data data = {
		.cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY,
		.flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
	};
	cpu_set_t *buf;
	int i, ret;

	buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
	if (!buf)
		return -ENOMEM;

	for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
		data.data[0] = i;
		data.len = sizeof(cpu_set_t);
		data.addr = (__u64)&buf[i];

		ret = __ublk_ctrl_cmd(ctrl_dev, &data);
		if (ret < 0) {
			free(buf);
			return ret;
		}
		ublk_adjust_affinity(&buf[i]);
	}

	*ptr_buf = buf;
	return 0;
}

static void ublk_ctrl_dump(struct ublk_dev *dev)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublk_params p;
	cpu_set_t *affinity;
	int ret;

	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		return;
	}

	ret = ublk_ctrl_get_affinity(dev, &affinity);
	if (ret < 0) {
		ublk_err("failed to get affinity %m\n");
		return;
	}

	ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
			info->dev_id, info->nr_hw_queues, info->queue_depth,
			1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
	ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
			info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
			ublk_dev_state_desc(dev));

	if (affinity) {
		char buf[512];
		int i;

		for (i = 0; i < info->nr_hw_queues; i++) {
			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
			printf("\tqueue %u: affinity(%s)\n",
					i, buf);
		}
		free(affinity);
	}

	fflush(stdout);
}

static void ublk_ctrl_deinit(struct ublk_dev *dev)
{
	close(dev->ctrl_fd);
	free(dev);
}

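/*
 * Open the ublk control device and set up the SQE128 control ring used
 * for issuing ublk control commands.
 */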
static struct ublk_dev *ublk_ctrl_init(void)
{
	struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
	struct ublksrv_ctrl_dev_info *info;
	int ret;

	if (!dev)
		return NULL;

	info = &dev->dev_info;
	dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
	if (dev->ctrl_fd < 0) {
		free(dev);
		return NULL;
	}

	info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;

	ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
			UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
	if (ret < 0) {
		ublk_err("queue_init: %s\n", strerror(-ret));
		free(dev);
		return NULL;
	}
	dev->nr_fds = 1;

	return dev;
}

static int __ublk_queue_cmd_buf_sz(unsigned depth)
{
	int size = depth * sizeof(struct ublksrv_io_desc);
	unsigned int page_sz = getpagesize();

	return round_up(size, page_sz);
}

static int ublk_queue_max_cmd_buf_sz(void)
{
	return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
}

static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
{
	return __ublk_queue_cmd_buf_sz(q->q_depth);
}

static void ublk_queue_deinit(struct ublk_queue *q)
{
	int i;
	int nr_ios = q->q_depth;

	if (q->io_cmd_buf)
		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));

	for (i = 0; i < nr_ios; i++)
		free(q->ios[i].buf_addr);
}

static void ublk_thread_deinit(struct ublk_thread *t)
{
	io_uring_unregister_buffers(&t->ring);

	io_uring_unregister_ring_fd(&t->ring);

	if (t->ring.ring_fd > 0) {
		io_uring_unregister_files(&t->ring);
		close(t->ring.ring_fd);
		t->ring.ring_fd = -1;
	}
}

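/*
 * Map the per-queue I/O descriptor buffer from the ublk char device and
 * allocate a per-tag I/O buffer unless the queue runs in a no-buffer mode
 * (zero copy / auto buffer registration).
 */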
static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
{
	struct ublk_dev *dev = q->dev;
	int depth = dev->dev_info.queue_depth;
	int i;
	int cmd_buf_size, io_buf_size;
	unsigned long off;

	q->tgt_ops = dev->tgt.ops;
	q->flags = 0;
	q->q_depth = depth;
	q->flags = dev->dev_info.flags;
	q->flags |= extra_flags;

	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
			MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
	if (q->io_cmd_buf == MAP_FAILED) {
		ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
				q->dev->dev_info.dev_id, q->q_id);
		goto fail;
	}

	io_buf_size = dev->dev_info.max_io_buf_bytes;
	for (i = 0; i < q->q_depth; i++) {
		q->ios[i].buf_addr = NULL;
		q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
		q->ios[i].tag = i;

		if (ublk_queue_no_buf(q))
			continue;

		if (posix_memalign((void **)&q->ios[i].buf_addr,
					getpagesize(), io_buf_size)) {
			ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
					dev->dev_info.dev_id, q->q_id, i);
			goto fail;
		}
	}

	return 0;
fail:
	ublk_queue_deinit(q);
	ublk_err("ublk dev %d queue %d failed\n",
			dev->dev_info.dev_id, q->q_id);
	return -ENOMEM;
}

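/*
 * Create the per-thread io_uring, register sparse fixed buffers when zero
 * copy or auto buffer registration is enabled, and register the ublk char
 * device fd as a fixed file.
 */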
static int ublk_thread_init(struct ublk_thread *t)
{
	struct ublk_dev *dev = t->dev;
	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
	int ret;

	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
			IORING_SETUP_COOP_TASKRUN |
			IORING_SETUP_SINGLE_ISSUER |
			IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0) {
		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
				dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;

		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
		ret = io_uring_register_buffers_sparse(
				&t->ring, max_nr_ios_per_thread);
		if (ret) {
			ublk_err("ublk dev %d thread %d register sparse buffers failed %d\n",
					dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	io_uring_register_ring_fd(&t->ring);

	ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
	if (ret) {
		ublk_err("ublk dev %d thread %d register files failed %d\n",
				t->dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	return 0;
fail:
	ublk_thread_deinit(t);
	ublk_err("ublk dev %d thread %d init failed\n",
			dev->dev_info.dev_id, t->idx);
	return -ENOMEM;
}

#define WAIT_USEC 100000
#define MAX_WAIT_USEC (3 * 1000000)
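/*
 * Wait for the ublk char device node to appear (it is normally created
 * asynchronously, e.g. by udev, after ADD_DEV), open it, and run the
 * target's init hook.
 */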
static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	int dev_id = dev->dev_info.dev_id;
	unsigned int wait_usec = 0;
	int ret = 0, fd = -1;
	char buf[64];

	snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);

	while (wait_usec < MAX_WAIT_USEC) {
		fd = open(buf, O_RDWR);
		if (fd >= 0)
			break;
		usleep(WAIT_USEC);
		wait_usec += WAIT_USEC;
	}
	if (fd < 0) {
		ublk_err("can't open %s %s\n", buf, strerror(errno));
		return -1;
	}

	dev->fds[0] = fd;
	if (dev->tgt.ops->init_tgt)
		ret = dev->tgt.ops->init_tgt(ctx, dev);
	if (ret)
		close(dev->fds[0]);
	return ret;
}

static void ublk_dev_unprep(struct ublk_dev *dev)
{
	if (dev->tgt.ops->deinit_tgt)
		dev->tgt.ops->deinit_tgt(dev);
	close(dev->fds[0]);
}

static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
				  struct io_uring_sqe *sqe,
				  unsigned short tag)
{
	struct ublk_auto_buf_reg buf = {};

	if (q->tgt_ops->buf_index)
		buf.index = q->tgt_ops->buf_index(q, tag);
	else
		buf.index = q->ios[tag].buf_index;

	if (ublk_queue_auto_zc_fallback(q))
		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;

	sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
}

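/*
 * Queue one ublk io command (FETCH_REQ, COMMIT_AND_FETCH_REQ or
 * NEED_GET_DATA) for the given io, chosen from its current flags.
 * Returns 1 if a command was queued, 0 if nothing needs to be issued,
 * and -1 if no SQE could be allocated.
 */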
int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
{
	struct ublk_queue *q = ublk_io_to_queue(io);
	struct ublksrv_io_cmd *cmd;
	struct io_uring_sqe *sqe[1];
	unsigned int cmd_op = 0;
	__u64 user_data;

	/* only freed io can be issued */
	if (!(io->flags & UBLKS_IO_FREE))
		return 0;

	/*
	 * Only issue the command when we need to fetch, commit or
	 * get data.
	 */
	if (!(io->flags &
		(UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
		return 0;

	if (io->flags & UBLKS_IO_NEED_GET_DATA)
		cmd_op = UBLK_U_IO_NEED_GET_DATA;
	else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP)
		cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
	else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
		cmd_op = UBLK_U_IO_FETCH_REQ;

	if (io_uring_sq_space_left(&t->ring) < 1)
		io_uring_submit(&t->ring);

	ublk_io_alloc_sqes(t, sqe, 1);
	if (!sqe[0]) {
		ublk_err("%s: run out of sqe. thread %u, tag %d\n",
				__func__, t->idx, io->tag);
		return -1;
	}

	cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);

	if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
		cmd->result = io->result;

	/* These fields should be written once, never change */
	ublk_set_sqe_cmd_op(sqe[0], cmd_op);
	sqe[0]->fd = 0;	/* dev->fds[0] */
	sqe[0]->opcode = IORING_OP_URING_CMD;
	sqe[0]->flags = IOSQE_FIXED_FILE;
	sqe[0]->rw_flags = 0;
	cmd->tag = io->tag;
	cmd->q_id = q->q_id;
	if (!ublk_queue_no_buf(q))
		cmd->addr = (__u64) (uintptr_t) io->buf_addr;
	else
		cmd->addr = 0;

	if (ublk_queue_use_auto_zc(q))
		ublk_set_auto_buf_reg(q, sqe[0], io->tag);

	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
	io_uring_sqe_set_data64(sqe[0], user_data);

	io->flags = 0;

	t->cmd_inflight += 1;

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
			__func__, t->idx, q->q_id, io->tag, cmd_op,
			io->flags, !!(t->state & UBLKS_T_STOPPING));
	return 1;
}

static void ublk_submit_fetch_commands(struct ublk_thread *t)
{
	struct ublk_queue *q;
	struct ublk_io *io;
	int i = 0, j = 0;

	if (t->dev->per_io_tasks) {
		/*
		 * Lexicographically order all the (qid,tag) pairs, with
		 * qid taking priority (so (1,0) > (0,1)). Then make
		 * this thread the daemon for every Nth entry in this
		 * list (N is the number of threads), starting at this
		 * thread's index. This ensures that each queue is
		 * handled by as many ublk server threads as possible,
		 * so that load that is concentrated on one or a few
		 * queues can make use of all ublk server threads.
		 */
		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;

		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
			int q_id = i / dinfo->queue_depth;
			int tag = i % dinfo->queue_depth;

			q = &t->dev->q[q_id];
			io = &q->ios[tag];
			io->buf_index = j++;
			ublk_queue_io_cmd(t, io);
		}
	} else {
		/*
		 * Service exclusively the queue whose q_id matches our
		 * thread index.
		 */
		struct ublk_queue *q = &t->dev->q[t->idx];

		for (i = 0; i < q->q_depth; i++) {
			io = &q->ios[i];
			io->buf_index = i;
			ublk_queue_io_cmd(t, io);
		}
	}
}

static int ublk_thread_is_idle(struct ublk_thread *t)
{
	return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
}

static int ublk_thread_is_done(struct ublk_thread *t)
{
	return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t);
}

static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
					  struct ublk_queue *q,
					  struct io_uring_cqe *cqe)
{
	if (cqe->res < 0 && cqe->res != -EAGAIN)
		ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
				__func__, cqe->res, q->q_id,
				user_data_to_tag(cqe->user_data),
				user_data_to_op(cqe->user_data));

	if (q->tgt_ops->tgt_io_done)
		q->tgt_ops->tgt_io_done(t, q, cqe);
}

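/*
 * Handle completion of a ublk io command: dispatch OK completions to the
 * target's ->queue_io(), re-issue NEED_GET_DATA requests, and otherwise
 * just mark the io free; on abort (or while stopping) fetching is stopped.
 */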
static void ublk_handle_uring_cmd(struct ublk_thread *t,
				  struct ublk_queue *q,
				  const struct io_uring_cqe *cqe)
{
	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
		!(t->state & UBLKS_T_STOPPING);
	unsigned tag = user_data_to_tag(cqe->user_data);
	struct ublk_io *io = &q->ios[tag];

	if (!fetch) {
		t->state |= UBLKS_T_STOPPING;
		io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
	}

	if (cqe->res == UBLK_IO_RES_OK) {
		assert(tag < q->q_depth);
		if (q->tgt_ops->queue_io)
			q->tgt_ops->queue_io(t, q, tag);
	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
		io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
		ublk_queue_io_cmd(t, io);
	} else {
		/*
		 * COMMIT_REQ will be completed immediately since no fetching
		 * piggyback is required.
		 *
		 * Marking IO_FREE only, then this io won't be issued since
		 * we only issue io with (UBLKS_IO_FREE | UBLKSRV_NEED_*)
		 */
		io->flags = UBLKS_IO_FREE;
	}
}

static void ublk_handle_cqe(struct ublk_thread *t,
			    struct io_uring_cqe *cqe, void *data)
{
	struct ublk_dev *dev = t->dev;
	unsigned q_id = user_data_to_q_id(cqe->user_data);
	struct ublk_queue *q = &dev->q[q_id];
	unsigned cmd_op = user_data_to_op(cqe->user_data);

	if (cqe->res < 0 && cqe->res != -ENODEV)
		ublk_err("%s: res %d userdata %llx queue state %x\n", __func__,
				cqe->res, cqe->user_data, q->flags);

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n",
			__func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data),
			cmd_op, is_target_io(cqe->user_data),
			user_data_to_tgt_data(cqe->user_data),
			(t->state & UBLKS_T_STOPPING));

	/* Don't retrieve io in case of target io */
	if (is_target_io(cqe->user_data)) {
		ublksrv_handle_tgt_cqe(t, q, cqe);
		return;
	}

	t->cmd_inflight--;

	ublk_handle_uring_cmd(t, q, cqe);
}

static int ublk_reap_events_uring(struct ublk_thread *t)
{
	struct io_uring_cqe *cqe;
	unsigned head;
	int count = 0;

	io_uring_for_each_cqe(&t->ring, head, cqe) {
		ublk_handle_cqe(t, cqe, NULL);
		count += 1;
	}
	io_uring_cq_advance(&t->ring, count);

	return count;
}

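/*
 * One iteration of the event loop: submit pending SQEs, wait for at least
 * one completion, then reap and handle all available CQEs. Returns a
 * negative value once the thread is done and should exit.
 */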
static int ublk_process_io(struct ublk_thread *t)
{
	int ret, reapped;

	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
			t->dev->dev_info.dev_id,
			t->idx, io_uring_sq_ready(&t->ring),
			t->cmd_inflight,
			(t->state & UBLKS_T_STOPPING));

	if (ublk_thread_is_done(t))
		return -ENODEV;

	ret = io_uring_submit_and_wait(&t->ring, 1);
	reapped = ublk_reap_events_uring(t);

	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
			ret, reapped, (t->state & UBLKS_T_STOPPING),
			(t->state & UBLKS_T_IDLE));

	return reapped;
}

static void ublk_thread_set_sched_affinity(const struct ublk_thread *t,
		cpu_set_t *cpuset)
{
	if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0)
		ublk_err("ublk dev %u thread %u set affinity failed\n",
				t->dev->dev_info.dev_id, t->idx);
}

struct ublk_thread_info {
	struct ublk_dev		*dev;
	unsigned		idx;
	sem_t			*ready;
	cpu_set_t		*affinity;
};

static void *ublk_io_handler_fn(void *data)
{
	struct ublk_thread_info *info = data;
	struct ublk_thread *t = &info->dev->threads[info->idx];
	int dev_id = info->dev->dev_info.dev_id;
	int ret;

	t->dev = info->dev;
	t->idx = info->idx;

	ret = ublk_thread_init(t);
	if (ret) {
		ublk_err("ublk dev %d thread %u init failed\n",
				dev_id, t->idx);
		return NULL;
	}
	/* IO performance is sensitive to queue/pthread affinity on NUMA machines */
	if (info->affinity)
		ublk_thread_set_sched_affinity(t, info->affinity);
	sem_post(info->ready);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
			gettid(), dev_id, t->idx);

	/* submit all io commands to ublk driver */
	ublk_submit_fetch_commands(t);
	do {
		if (ublk_process_io(t) < 0)
			break;
	} while (1);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u exiting\n",
			gettid(), dev_id, t->idx);
	ublk_thread_deinit(t);
	return NULL;
}

static void ublk_set_parameters(struct ublk_dev *dev)
{
	int ret;

	ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
	if (ret)
		ublk_err("dev %d set basic parameter failed %d\n",
				dev->dev_info.dev_id, ret);
}

static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
{
	uint64_t id;
	int evtfd = ctx->_evtfd;

	if (evtfd < 0)
		return -EBADF;

	if (dev_id >= 0)
		id = dev_id + 1;
	else
		id = ERROR_EVTFD_DEVID;

	if (dev && ctx->shadow_dev)
		memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));

	if (write(evtfd, &id, sizeof(id)) != sizeof(id))
		return -EINVAL;

	close(evtfd);
	shmdt(ctx->shadow_dev);

	return 0;
}

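/*
 * Bring the device up: prepare the target, initialize all queues, spawn
 * the io handler threads, start (or recover) the device, and then wait
 * for the threads to exit.
 */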
static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
	struct ublk_thread_info *tinfo;
	unsigned long long extra_flags = 0;
	cpu_set_t *affinity_buf;
	void *thread_ret;
	sem_t ready;
	int ret, i;

	ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);

	tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
	if (!tinfo)
		return -ENOMEM;

	sem_init(&ready, 0, 0);
	ret = ublk_dev_prep(ctx, dev);
	if (ret)
		return ret;

	ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
	if (ret)
		return ret;

	if (ctx->auto_zc_fallback)
		extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;

	for (i = 0; i < dinfo->nr_hw_queues; i++) {
		dev->q[i].dev = dev;
		dev->q[i].q_id = i;

		ret = ublk_queue_init(&dev->q[i], extra_flags);
		if (ret) {
			ublk_err("ublk dev %d queue %d init queue failed\n",
					dinfo->dev_id, i);
			goto fail;
		}
	}

	for (i = 0; i < dev->nthreads; i++) {
		tinfo[i].dev = dev;
		tinfo[i].idx = i;
		tinfo[i].ready = &ready;

		/*
		 * If threads are not tied 1:1 to queues, setting thread
		 * affinity based on queue affinity makes little sense.
		 * However, thread CPU affinity has significant impact
		 * on performance, so to compare fairly, we'll still set
		 * thread CPU affinity based on queue affinity where
		 * possible.
		 */
		if (dev->nthreads == dinfo->nr_hw_queues)
			tinfo[i].affinity = &affinity_buf[i];
		pthread_create(&dev->threads[i].thread, NULL,
				ublk_io_handler_fn,
				&tinfo[i]);
	}

	for (i = 0; i < dev->nthreads; i++)
		sem_wait(&ready);
	free(tinfo);
	free(affinity_buf);

	/* everything is fine now, start us */
	if (ctx->recovery)
		ret = ublk_ctrl_end_user_recovery(dev, getpid());
	else {
		ublk_set_parameters(dev);
		ret = ublk_ctrl_start_dev(dev, getpid());
	}
	if (ret < 0) {
		ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
		goto fail;
	}

	ublk_ctrl_get_info(dev);
	if (ctx->fg)
		ublk_ctrl_dump(dev);
	else
		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);

	/* wait until we are terminated */
	for (i = 0; i < dev->nthreads; i++)
		pthread_join(dev->threads[i].thread, &thread_ret);
fail:
	for (i = 0; i < dinfo->nr_hw_queues; i++)
		ublk_queue_deinit(&dev->q[i]);
	ublk_dev_unprep(dev);
	ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);

	return ret;
}

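/*
 * Use inotify on /dev to wait (timeout in seconds) until the given device
 * node triggers the requested event mask, e.g. IN_CLOSE when the daemon
 * closes the ublk char device.
 */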
static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
{
#define EV_SIZE (sizeof(struct inotify_event))
#define EV_BUF_LEN (128 * (EV_SIZE + 16))
	struct pollfd pfd;
	int fd, wd;
	int ret = -EINVAL;
	const char *dev_name = basename(path);

	fd = inotify_init();
	if (fd < 0) {
		ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
		return fd;
	}

	wd = inotify_add_watch(fd, "/dev", evt_mask);
	if (wd == -1) {
		ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
		goto fail;
	}

	pfd.fd = fd;
	pfd.events = POLLIN;
	while (1) {
		int i = 0;
		char buffer[EV_BUF_LEN];

		ret = poll(&pfd, 1, 1000 * timeout);
		if (ret == -1) {
			ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
			goto rm_watch;
		} else if (ret == 0) {
			ublk_err("%s: poll inotify timeout\n", __func__);
			ret = -ETIMEDOUT;
			goto rm_watch;
		}

		ret = read(fd, buffer, EV_BUF_LEN);
		if (ret < 0) {
			ublk_err("%s: read inotify fd failed\n", __func__);
			goto rm_watch;
		}

		while (i < ret) {
			struct inotify_event *event = (struct inotify_event *)&buffer[i];

			ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
					__func__, event->mask, event->name);
			if (event->mask & evt_mask) {
				if (!strcmp(event->name, dev_name)) {
					ret = 0;
					goto rm_watch;
				}
			}
			i += EV_SIZE + event->len;
		}
	}
rm_watch:
	inotify_rm_watch(fd, wd);
fail:
	close(fd);
	return ret;
}

static int ublk_stop_io_daemon(const struct ublk_dev *dev)
{
	int daemon_pid = dev->dev_info.ublksrv_pid;
	int dev_id = dev->dev_info.dev_id;
	char ublkc[64];
	int ret = 0;

	if (daemon_pid < 0)
		return 0;

	/* daemon may be dead already */
	if (kill(daemon_pid, 0) < 0)
		goto wait;

	snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);

	/* ublk char device may be gone already */
	if (access(ublkc, F_OK) != 0)
		goto wait;

	/* Wait until the ublk char device is closed, when the daemon is shut down */
	ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
	/* double check, since it may have been closed before inotify started */
	if (ret == -ETIMEDOUT)
		ret = kill(daemon_pid, 0) < 0;
wait:
	waitpid(daemon_pid, NULL, 0);
	ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
			__func__, daemon_pid, dev_id, ret);

	return ret;
}

static int __cmd_dev_add(const struct dev_ctx *ctx)
{
	unsigned nthreads = ctx->nthreads;
	unsigned nr_queues = ctx->nr_hw_queues;
	const char *tgt_type = ctx->tgt_type;
	unsigned depth = ctx->queue_depth;
	__u64 features;
	const struct ublk_tgt_ops *ops;
	struct ublksrv_ctrl_dev_info *info;
	struct ublk_dev *dev = NULL;
	int dev_id = ctx->dev_id;
	int ret, i;

	ops = ublk_find_tgt(tgt_type);
	if (!ops) {
		ublk_err("%s: no such tgt type, type %s\n",
				__func__, tgt_type);
		ret = -ENODEV;
		goto fail;
	}

	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
		ublk_err("%s: invalid nr_queues or depth, queues %u depth %u\n",
				__func__, nr_queues, depth);
		ret = -EINVAL;
		goto fail;
	}

	/* default to 1:1 threads:queues if nthreads is unspecified */
	if (!nthreads)
		nthreads = nr_queues;

	if (nthreads > UBLK_MAX_THREADS) {
		ublk_err("%s: %u is too many threads (max %u)\n",
				__func__, nthreads, UBLK_MAX_THREADS);
		ret = -EINVAL;
		goto fail;
	}

	if (nthreads != nr_queues && !ctx->per_io_tasks) {
		ublk_err("%s: thread count %u must match queue count %u when "
				"not using per_io_tasks\n",
				__func__, nthreads, nr_queues);
		ret = -EINVAL;
		goto fail;
	}

	dev = ublk_ctrl_init();
	if (!dev) {
		ublk_err("%s: can't alloc dev id %d, type %s\n",
				__func__, dev_id, tgt_type);
		ret = -ENOMEM;
		goto fail;
	}

	/* bail out if the kernel doesn't support GET_FEATURES */
	ret = ublk_ctrl_get_features(dev, &features);
	if (ret < 0) {
		ret = -EINVAL;
		goto fail;
	}

	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
		ret = -ENOTSUP;
		goto fail;
	}

	info = &dev->dev_info;
	info->dev_id = ctx->dev_id;
	info->nr_hw_queues = nr_queues;
	info->queue_depth = depth;
	info->flags = ctx->flags;
	if ((features & UBLK_F_QUIESCE) &&
			(info->flags & UBLK_F_USER_RECOVERY))
		info->flags |= UBLK_F_QUIESCE;
	dev->nthreads = nthreads;
	dev->per_io_tasks = ctx->per_io_tasks;
	dev->tgt.ops = ops;
	dev->tgt.sq_depth = depth;
	dev->tgt.cq_depth = depth;

	for (i = 0; i < MAX_BACK_FILES; i++) {
		if (ctx->files[i]) {
			strcpy(dev->tgt.backing_file[i], ctx->files[i]);
			dev->tgt.nr_backing_files++;
		}
	}

	if (ctx->recovery)
		ret = ublk_ctrl_start_user_recovery(dev);
	else
		ret = ublk_ctrl_add_dev(dev);
	if (ret < 0) {
		ublk_err("%s: can't add dev id %d, type %s ret %d\n",
				__func__, dev_id, tgt_type, ret);
		goto fail;
	}

	ret = ublk_start_daemon(ctx, dev);
	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
	if (ret < 0)
		ublk_ctrl_del_dev(dev);

fail:
	if (ret < 0)
		ublk_send_dev_event(ctx, dev, -1);
	if (dev)
		ublk_ctrl_deinit(dev);
	return ret;
}

static int __cmd_dev_list(struct dev_ctx *ctx);

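/*
 * Unless running in the foreground, daemonize via a double fork and hand
 * the new device id (or an error marker) back to the parent through an
 * eventfd; queue state is shared with the parent via SysV shared memory
 * so that the parent can dump the device like "list" does.
 */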
static int cmd_dev_add(struct dev_ctx *ctx)
{
	int res;

	if (ctx->fg)
		goto run;

	ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
	if (ctx->_shmid < 0) {
		ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
	if (ctx->shadow_dev == (struct ublk_dev *)-1) {
		ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->_evtfd = eventfd(0, 0);
	if (ctx->_evtfd < 0) {
		ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
		exit(-1);
	}

	res = fork();
	if (res == 0) {
		int res2;

		setsid();
		res2 = fork();
		if (res2 == 0) {
			/* prepare for detaching */
			close(STDIN_FILENO);
			close(STDOUT_FILENO);
			close(STDERR_FILENO);
run:
			res = __cmd_dev_add(ctx);
			return res;
		} else {
			/* detached from the foreground task */
			exit(EXIT_SUCCESS);
		}
	} else if (res > 0) {
		uint64_t id;
		int exit_code = EXIT_FAILURE;

		res = read(ctx->_evtfd, &id, sizeof(id));
		close(ctx->_evtfd);
		if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
			ctx->dev_id = id - 1;
			if (__cmd_dev_list(ctx) >= 0)
				exit_code = EXIT_SUCCESS;
		}
		shmdt(ctx->shadow_dev);
		shmctl(ctx->_shmid, IPC_RMID, NULL);
		/* wait for child and detach from it */
		wait(NULL);
		if (exit_code == EXIT_FAILURE)
			ublk_err("%s: command failed\n", __func__);
		exit(exit_code);
	} else {
		exit(EXIT_FAILURE);
	}
}

static int __cmd_dev_del(struct dev_ctx *ctx)
{
	int number = ctx->dev_id;
	struct ublk_dev *dev;
	int ret;

	dev = ublk_ctrl_init();
	if (!dev)
		return -ENODEV;
	dev->dev_info.dev_id = number;

	ret = ublk_ctrl_get_info(dev);
	if (ret < 0)
		goto fail;

	ret = ublk_ctrl_stop_dev(dev);
	if (ret < 0)
		ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);

	ret = ublk_stop_io_daemon(dev);
	if (ret < 0)
		ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
				__func__, dev->dev_info.ublksrv_pid, number, ret);
	ublk_ctrl_del_dev(dev);
fail:
	ublk_ctrl_deinit(dev);

	return (ret >= 0) ? 0 : ret;
}

static int cmd_dev_del(struct dev_ctx *ctx)
{
	int i;

	if (ctx->dev_id >= 0 || !ctx->all)
		return __cmd_dev_del(ctx);

	for (i = 0; i < 255; i++) {
		ctx->dev_id = i;
		__cmd_dev_del(ctx);
	}
	return 0;
}

static int __cmd_dev_list(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret;

	if (!dev)
		return -ENODEV;

	dev->dev_info.dev_id = ctx->dev_id;

	ret = ublk_ctrl_get_info(dev);
	if (ret < 0) {
		if (ctx->logging)
			ublk_err("%s: can't get dev info from %d: %d\n",
					__func__, ctx->dev_id, ret);
	} else {
		if (ctx->shadow_dev)
			memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));

		ublk_ctrl_dump(dev);
	}

	ublk_ctrl_deinit(dev);

	return ret;
}

static int cmd_dev_list(struct dev_ctx *ctx)
{
	int i;

	if (ctx->dev_id >= 0 || !ctx->all)
		return __cmd_dev_list(ctx);

	ctx->logging = false;
	for (i = 0; i < 255; i++) {
		ctx->dev_id = i;
		__cmd_dev_list(ctx);
	}
	return 0;
}

static int cmd_dev_get_features(void)
{
#define const_ilog2(x) (63 - __builtin_clzll(x))
	static const char *feat_map[] = {
		[const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY",
		[const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK",
		[const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA",
		[const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY",
		[const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE",
		[const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV",
		[const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE",
		[const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY",
		[const_ilog2(UBLK_F_ZONED)] = "ZONED",
		[const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO",
		[const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
		[const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
		[const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
		[const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON",
	};
	struct ublk_dev *dev;
	__u64 features = 0;
	int ret;

	dev = ublk_ctrl_init();
	if (!dev) {
		fprintf(stderr, "ublk_ctrl_init failed\n");
		return -EOPNOTSUPP;
	}

	ret = ublk_ctrl_get_features(dev, &features);
	if (!ret) {
		int i;

		printf("ublk_drv features: 0x%llx\n", features);

		for (i = 0; i < sizeof(features) * 8; i++) {
			const char *feat;

			if (!((1ULL << i) & features))
				continue;
			if (i < sizeof(feat_map) / sizeof(feat_map[0]))
				feat = feat_map[i];
			else
				feat = "unknown";
			printf("\t%-20s: 0x%llx\n", feat, 1ULL << i);
		}
	}

	return ret;
}

static int cmd_dev_update_size(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	struct ublk_params p;
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided\n");
		goto out;
	}

	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		goto out;
	}

	if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
		ublk_err("size isn't aligned with logical block size\n");
		ret = -EINVAL;
		goto out;
	}

	ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static int cmd_dev_quiesce(struct dev_ctx *ctx)
{
	struct ublk_dev *dev = ublk_ctrl_init();
	int ret = -EINVAL;

	if (!dev)
		return -ENODEV;

	if (ctx->dev_id < 0) {
		fprintf(stderr, "device id isn't provided for quiesce\n");
		goto out;
	}
	dev->dev_info.dev_id = ctx->dev_id;
	ret = ublk_ctrl_quiesce_dev(dev, 10000);

out:
	ublk_ctrl_deinit(dev);
	return ret;
}

static void __cmd_create_help(char *exe, bool recovery)
{
	int i;

	printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
			exe, recovery ? "recover" : "add");
	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
	printf("\t[-e 0|1 ] [-i 0|1]\n");
	printf("\t[--nthreads threads] [--per_io_tasks]\n");
	printf("\t[target options] [backfile1] [backfile2] ...\n");
	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
	printf("\tdefault: nthreads=nr_queues\n");

	for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) {
		const struct ublk_tgt_ops *ops = tgt_ops_list[i];

		if (ops->usage)
			ops->usage(ops);
	}
}

static void cmd_add_help(char *exe)
{
	__cmd_create_help(exe, false);
	printf("\n");
}

static void cmd_recover_help(char *exe)
{
	__cmd_create_help(exe, true);
	printf("\tPlease provide the exact command line used for creating this device, with the real dev_id\n");
	printf("\n");
}

static int cmd_dev_help(char *exe)
{
	cmd_add_help(exe);
	cmd_recover_help(exe);

	printf("%s del [-n dev_id] -a\n", exe);
	printf("\t -a delete all devices, -n delete specified device\n\n");
	printf("%s list [-n dev_id] -a\n", exe);
	printf("\t -a list all devices, -n list specified device, default -a\n\n");
	printf("%s features\n", exe);
	printf("%s update_size -n dev_id -s|--size size_in_bytes\n", exe);
	printf("%s quiesce -n dev_id\n", exe);
	return 0;
}

int main(int argc, char *argv[])
{
	static const struct option longopts[] = {
		{ "all",		0,	NULL, 'a' },
		{ "type",		1,	NULL, 't' },
		{ "number",		1,	NULL, 'n' },
		{ "queues",		1,	NULL, 'q' },
		{ "depth",		1,	NULL, 'd' },
		{ "debug_mask",		1,	NULL,  0  },
		{ "quiet",		0,	NULL,  0  },
		{ "zero_copy",		0,	NULL, 'z' },
		{ "foreground",		0,	NULL,  0  },
		{ "recovery",		1,	NULL, 'r' },
		{ "recovery_fail_io",	1,	NULL, 'e' },
		{ "recovery_reissue",	1,	NULL, 'i' },
		{ "get_data",		1,	NULL, 'g' },
		{ "auto_zc",		0,	NULL,  0  },
		{ "auto_zc_fallback",	0,	NULL,  0  },
		{ "size",		1,	NULL, 's' },
		{ "nthreads",		1,	NULL,  0  },
		{ "per_io_tasks",	0,	NULL,  0  },
		{ 0, 0, 0, 0 }
	};
	const struct ublk_tgt_ops *ops = NULL;
	int option_idx, opt;
	const char *cmd = argv[1];
	struct dev_ctx ctx = {
		.queue_depth	= 128,
		.nr_hw_queues	= 2,
		.dev_id		= -1,
		.tgt_type	= "unknown",
	};
	int ret = -EINVAL, i;
	int tgt_argc = 1;
	char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
	int value;

	if (argc == 1)
		return ret;

	opterr = 0;
	optind = 2;
	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gaz",
				  longopts, &option_idx)) != -1) {
		switch (opt) {
		case 'a':
			ctx.all = 1;
			break;
		case 'n':
			ctx.dev_id = strtol(optarg, NULL, 10);
			break;
		case 't':
			if (strlen(optarg) < sizeof(ctx.tgt_type))
				strcpy(ctx.tgt_type, optarg);
			break;
		case 'q':
			ctx.nr_hw_queues = strtol(optarg, NULL, 10);
			break;
		case 'd':
			ctx.queue_depth = strtol(optarg, NULL, 10);
			break;
		case 'z':
			ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY;
			break;
		case 'r':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY;
			break;
		case 'e':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
			break;
		case 'i':
			value = strtol(optarg, NULL, 10);
			if (value)
				ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
			break;
		case 'g':
			ctx.flags |= UBLK_F_NEED_GET_DATA;
			break;
		case 's':
			ctx.size = strtoull(optarg, NULL, 10);
			break;
		case 0:
			if (!strcmp(longopts[option_idx].name, "debug_mask"))
				ublk_dbg_mask = strtol(optarg, NULL, 16);
			if (!strcmp(longopts[option_idx].name, "quiet"))
				ublk_dbg_mask = 0;
			if (!strcmp(longopts[option_idx].name, "foreground"))
				ctx.fg = 1;
			if (!strcmp(longopts[option_idx].name, "auto_zc"))
				ctx.flags |= UBLK_F_AUTO_BUF_REG;
			if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
				ctx.auto_zc_fallback = 1;
			if (!strcmp(longopts[option_idx].name, "nthreads"))
				ctx.nthreads = strtol(optarg, NULL, 10);
			if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
				ctx.per_io_tasks = 1;
			break;
		case '?':
			/*
			 * every target option must carry an argument
			 */
			if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') {
				fprintf(stderr, "every target option requires an argument: %s %s\n",
						argv[optind - 1], argv[optind]);
				exit(EXIT_FAILURE);
			}

			if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
				tgt_argv[tgt_argc++] = argv[optind - 1];
				tgt_argv[tgt_argc++] = argv[optind];
			} else {
				fprintf(stderr, "too many target options\n");
				exit(EXIT_FAILURE);
			}
			optind += 1;
			break;
		}
	}

	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
	if (ctx.auto_zc_fallback &&
	    !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
	      (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
		ublk_err("%s: auto_zc_fallback is set but neither "
				"F_AUTO_BUF_REG nor F_SUPPORT_ZERO_COPY is enabled\n",
				__func__);
		return -EINVAL;
	}

	i = optind;
	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
		ctx.files[ctx.nr_files++] = argv[i++];
	}

	ops = ublk_find_tgt(ctx.tgt_type);
	if (ops && ops->parse_cmd_line) {
		optind = 0;

		tgt_argv[0] = ctx.tgt_type;
		ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
	}

	if (!strcmp(cmd, "add"))
		ret = cmd_dev_add(&ctx);
	else if (!strcmp(cmd, "recover")) {
		if (ctx.dev_id < 0) {
			fprintf(stderr, "device id isn't provided for recovering\n");
			ret = -EINVAL;
		} else {
			ctx.recovery = 1;
			ret = cmd_dev_add(&ctx);
		}
	} else if (!strcmp(cmd, "del"))
		ret = cmd_dev_del(&ctx);
	else if (!strcmp(cmd, "list")) {
		ctx.all = 1;
		ret = cmd_dev_list(&ctx);
	} else if (!strcmp(cmd, "help"))
		ret = cmd_dev_help(argv[0]);
	else if (!strcmp(cmd, "features"))
		ret = cmd_dev_get_features();
	else if (!strcmp(cmd, "update_size"))
		ret = cmd_dev_update_size(&ctx);
	else if (!strcmp(cmd, "quiesce"))
		ret = cmd_dev_quiesce(&ctx);
	else
		cmd_dev_help(argv[0]);

	return ret;
}