1 /* SPDX-License-Identifier: MIT */
2 /*
3 * Description: uring_cmd based ublk
4 */
5
6 #include <linux/fs.h>
7 #include <sys/un.h>
8 #include "kublk.h"
9
/* Max number of "--foo=bar" style arguments accepted per target */
#define MAX_NR_TGT_ARG 64

/* Runtime debug mask; only the plain-log level is enabled by default */
unsigned int ublk_dbg_mask = UBLK_LOG;
/* Built-in targets, looked up by name in ublk_find_tgt() */
static const struct ublk_tgt_ops *tgt_ops_list[] = {
	&null_tgt_ops,
	&loop_tgt_ops,
	&stripe_tgt_ops,
	&fault_inject_tgt_ops,
};
19
ublk_find_tgt(const char * name)20 static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
21 {
22 int i;
23
24 if (name == NULL)
25 return NULL;
26
27 for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
28 if (strcmp(tgt_ops_list[i]->name, name) == 0)
29 return tgt_ops_list[i];
30 return NULL;
31 }
32
ublk_setup_ring(struct io_uring * r,int depth,int cq_depth,unsigned flags)33 static inline int ublk_setup_ring(struct io_uring *r, int depth,
34 int cq_depth, unsigned flags)
35 {
36 struct io_uring_params p;
37
38 memset(&p, 0, sizeof(p));
39 p.flags = flags | IORING_SETUP_CQSIZE;
40 p.cq_entries = cq_depth;
41
42 return io_uring_queue_init_params(depth, r, &p);
43 }
44
ublk_ctrl_init_cmd(struct ublk_dev * dev,struct io_uring_sqe * sqe,struct ublk_ctrl_cmd_data * data)45 static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
46 struct io_uring_sqe *sqe,
47 struct ublk_ctrl_cmd_data *data)
48 {
49 struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
50 struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);
51
52 sqe->fd = dev->ctrl_fd;
53 sqe->opcode = IORING_OP_URING_CMD;
54 sqe->ioprio = 0;
55
56 if (data->flags & CTRL_CMD_HAS_BUF) {
57 cmd->addr = data->addr;
58 cmd->len = data->len;
59 }
60
61 if (data->flags & CTRL_CMD_HAS_DATA)
62 cmd->data[0] = data->data[0];
63
64 cmd->dev_id = info->dev_id;
65 cmd->queue_id = -1;
66
67 ublk_set_sqe_cmd_op(sqe, data->cmd_op);
68
69 io_uring_sqe_set_data(sqe, cmd);
70 }
71
__ublk_ctrl_cmd(struct ublk_dev * dev,struct ublk_ctrl_cmd_data * data)72 static int __ublk_ctrl_cmd(struct ublk_dev *dev,
73 struct ublk_ctrl_cmd_data *data)
74 {
75 struct io_uring_sqe *sqe;
76 struct io_uring_cqe *cqe;
77 int ret = -EINVAL;
78
79 sqe = io_uring_get_sqe(&dev->ring);
80 if (!sqe) {
81 ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
82 return ret;
83 }
84
85 ublk_ctrl_init_cmd(dev, sqe, data);
86
87 ret = io_uring_submit(&dev->ring);
88 if (ret < 0) {
89 ublk_err("uring submit ret %d\n", ret);
90 return ret;
91 }
92
93 ret = io_uring_wait_cqe(&dev->ring, &cqe);
94 if (ret < 0) {
95 ublk_err("wait cqe: %s\n", strerror(-ret));
96 return ret;
97 }
98 io_uring_cqe_seen(&dev->ring, cqe);
99
100 return cqe->res;
101 }
102
ublk_ctrl_stop_dev(struct ublk_dev * dev)103 static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
104 {
105 struct ublk_ctrl_cmd_data data = {
106 .cmd_op = UBLK_U_CMD_STOP_DEV,
107 };
108
109 return __ublk_ctrl_cmd(dev, &data);
110 }
111
ublk_ctrl_try_stop_dev(struct ublk_dev * dev)112 static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev)
113 {
114 struct ublk_ctrl_cmd_data data = {
115 .cmd_op = UBLK_U_CMD_TRY_STOP_DEV,
116 };
117
118 return __ublk_ctrl_cmd(dev, &data);
119 }
120
ublk_ctrl_start_dev(struct ublk_dev * dev,int daemon_pid)121 static int ublk_ctrl_start_dev(struct ublk_dev *dev,
122 int daemon_pid)
123 {
124 struct ublk_ctrl_cmd_data data = {
125 .cmd_op = UBLK_U_CMD_START_DEV,
126 .flags = CTRL_CMD_HAS_DATA,
127 };
128
129 dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
130
131 return __ublk_ctrl_cmd(dev, &data);
132 }
133
ublk_ctrl_start_user_recovery(struct ublk_dev * dev)134 static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
135 {
136 struct ublk_ctrl_cmd_data data = {
137 .cmd_op = UBLK_U_CMD_START_USER_RECOVERY,
138 };
139
140 return __ublk_ctrl_cmd(dev, &data);
141 }
142
ublk_ctrl_end_user_recovery(struct ublk_dev * dev,int daemon_pid)143 static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
144 {
145 struct ublk_ctrl_cmd_data data = {
146 .cmd_op = UBLK_U_CMD_END_USER_RECOVERY,
147 .flags = CTRL_CMD_HAS_DATA,
148 };
149
150 dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
151
152 return __ublk_ctrl_cmd(dev, &data);
153 }
154
ublk_ctrl_add_dev(struct ublk_dev * dev)155 static int ublk_ctrl_add_dev(struct ublk_dev *dev)
156 {
157 struct ublk_ctrl_cmd_data data = {
158 .cmd_op = UBLK_U_CMD_ADD_DEV,
159 .flags = CTRL_CMD_HAS_BUF,
160 .addr = (__u64) (uintptr_t) &dev->dev_info,
161 .len = sizeof(struct ublksrv_ctrl_dev_info),
162 };
163
164 return __ublk_ctrl_cmd(dev, &data);
165 }
166
ublk_ctrl_del_dev(struct ublk_dev * dev)167 static int ublk_ctrl_del_dev(struct ublk_dev *dev)
168 {
169 struct ublk_ctrl_cmd_data data = {
170 .cmd_op = UBLK_U_CMD_DEL_DEV,
171 .flags = 0,
172 };
173
174 return __ublk_ctrl_cmd(dev, &data);
175 }
176
ublk_ctrl_get_info(struct ublk_dev * dev)177 static int ublk_ctrl_get_info(struct ublk_dev *dev)
178 {
179 struct ublk_ctrl_cmd_data data = {
180 .cmd_op = UBLK_U_CMD_GET_DEV_INFO,
181 .flags = CTRL_CMD_HAS_BUF,
182 .addr = (__u64) (uintptr_t) &dev->dev_info,
183 .len = sizeof(struct ublksrv_ctrl_dev_info),
184 };
185
186 return __ublk_ctrl_cmd(dev, &data);
187 }
188
ublk_ctrl_set_params(struct ublk_dev * dev,struct ublk_params * params)189 static int ublk_ctrl_set_params(struct ublk_dev *dev,
190 struct ublk_params *params)
191 {
192 struct ublk_ctrl_cmd_data data = {
193 .cmd_op = UBLK_U_CMD_SET_PARAMS,
194 .flags = CTRL_CMD_HAS_BUF,
195 .addr = (__u64) (uintptr_t) params,
196 .len = sizeof(*params),
197 };
198 params->len = sizeof(*params);
199 return __ublk_ctrl_cmd(dev, &data);
200 }
201
ublk_ctrl_get_params(struct ublk_dev * dev,struct ublk_params * params)202 static int ublk_ctrl_get_params(struct ublk_dev *dev,
203 struct ublk_params *params)
204 {
205 struct ublk_ctrl_cmd_data data = {
206 .cmd_op = UBLK_U_CMD_GET_PARAMS,
207 .flags = CTRL_CMD_HAS_BUF,
208 .addr = (__u64)params,
209 .len = sizeof(*params),
210 };
211
212 params->len = sizeof(*params);
213
214 return __ublk_ctrl_cmd(dev, &data);
215 }
216
/* Query the driver's feature bitmask into *features */
static int ublk_ctrl_get_features(struct ublk_dev *dev,
		__u64 *features)
{
	struct ublk_ctrl_cmd_data cmd_data = {
		.cmd_op	= UBLK_U_CMD_GET_FEATURES,
		.flags	= CTRL_CMD_HAS_BUF,
		.addr	= (__u64) (uintptr_t) features,
		.len	= sizeof(*features),
	};

	return __ublk_ctrl_cmd(dev, &cmd_data);
}
229
/* Tell the driver the device capacity changed to @nr_sects sectors */
static int ublk_ctrl_update_size(struct ublk_dev *dev,
		__u64 nr_sects)
{
	struct ublk_ctrl_cmd_data cmd_data = {
		.cmd_op	= UBLK_U_CMD_UPDATE_SIZE,
		.flags	= CTRL_CMD_HAS_DATA,
		.data	= { nr_sects },
	};

	return __ublk_ctrl_cmd(dev, &cmd_data);
}
241
ublk_ctrl_quiesce_dev(struct ublk_dev * dev,unsigned int timeout_ms)242 static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
243 unsigned int timeout_ms)
244 {
245 struct ublk_ctrl_cmd_data data = {
246 .cmd_op = UBLK_U_CMD_QUIESCE_DEV,
247 .flags = CTRL_CMD_HAS_DATA,
248 };
249
250 data.data[0] = timeout_ms;
251 return __ublk_ctrl_cmd(dev, &data);
252 }
253
ublk_dev_state_desc(struct ublk_dev * dev)254 static const char *ublk_dev_state_desc(struct ublk_dev *dev)
255 {
256 switch (dev->dev_info.state) {
257 case UBLK_S_DEV_DEAD:
258 return "DEAD";
259 case UBLK_S_DEV_LIVE:
260 return "LIVE";
261 case UBLK_S_DEV_QUIESCED:
262 return "QUIESCED";
263 default:
264 return "UNKNOWN";
265 };
266 }
267
/*
 * Render the CPUs in @set as a space-separated decimal list into @buf.
 *
 * The original accumulated snprintf() return values without bounds
 * checking: once the output truncated, `done` could exceed `len`, making
 * `len - done` wrap around (unsigned) and `&buf[done]` point past the
 * buffer.  Stop at the first truncation instead, and always leave @buf
 * NUL-terminated (the old code left it uninitialized for an empty set).
 */
static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
{
	unsigned done = 0;
	int i;

	if (!len)
		return;
	buf[0] = '\0';

	for (i = 0; i < CPU_SETSIZE; i++) {
		int n;

		if (!CPU_ISSET(i, set))
			continue;

		n = snprintf(&buf[done], len - done, "%d ", i);
		if (n < 0 || (unsigned)n >= len - done)
			break;	/* output truncated; stop appending */
		done += n;
	}
}
278
/*
 * Restrict @set to its lowest-numbered CPU.
 *
 * Just keep the 1st CPU for now; auto affinity selection can be tried
 * in the future.
 */
static void ublk_adjust_affinity(cpu_set_t *set)
{
	int cpu, first = -1;

	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
		if (!CPU_ISSET(cpu, set))
			continue;

		if (first < 0)
			first = cpu;	/* keep the first CPU we find */
		else
			CPU_CLR(cpu, set);
	}
}
298
299 /* Caller must free the allocated buffer */
ublk_ctrl_get_affinity(struct ublk_dev * ctrl_dev,cpu_set_t ** ptr_buf)300 static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
301 {
302 struct ublk_ctrl_cmd_data data = {
303 .cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY,
304 .flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
305 };
306 cpu_set_t *buf;
307 int i, ret;
308
309 buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
310 if (!buf)
311 return -ENOMEM;
312
313 for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
314 data.data[0] = i;
315 data.len = sizeof(cpu_set_t);
316 data.addr = (__u64)&buf[i];
317
318 ret = __ublk_ctrl_cmd(ctrl_dev, &data);
319 if (ret < 0) {
320 free(buf);
321 return ret;
322 }
323 ublk_adjust_affinity(&buf[i]);
324 }
325
326 *ptr_buf = buf;
327 return 0;
328 }
329
/*
 * Print a summary of the device (geometry, daemon pid, state) plus the
 * per-queue CPU affinity to stdout.  Both the params and the affinity
 * are fetched fresh from the driver; on any fetch failure the dump is
 * aborted with an error message.
 */
static void ublk_ctrl_dump(struct ublk_dev *dev)
{
	struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
	struct ublk_params p;
	cpu_set_t *affinity;
	int ret;

	ret = ublk_ctrl_get_params(dev, &p);
	if (ret < 0) {
		ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
		return;
	}

	ret = ublk_ctrl_get_affinity(dev, &affinity);
	if (ret < 0) {
		ublk_err("failed to get affinity %m\n");
		return;
	}

	ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
			info->dev_id, info->nr_hw_queues, info->queue_depth,
			1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
	ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
			info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
			ublk_dev_state_desc(dev));

	/* affinity is non-NULL whenever ublk_ctrl_get_affinity() succeeded */
	if (affinity) {
		char buf[512];
		int i;

		for (i = 0; i < info->nr_hw_queues; i++) {
			ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
			printf("\tqueue %u: affinity(%s)\n",
					i, buf);
		}
		free(affinity);
	}

	fflush(stdout);
}
370
/* Release the control fd and the device object from ublk_ctrl_init() */
static void ublk_ctrl_deinit(struct ublk_dev *dev)
{
	close(dev->ctrl_fd);
	free(dev);
}
376
ublk_ctrl_init(void)377 static struct ublk_dev *ublk_ctrl_init(void)
378 {
379 struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
380 struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
381 int ret;
382
383 dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
384 if (dev->ctrl_fd < 0) {
385 free(dev);
386 return NULL;
387 }
388
389 info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;
390
391 ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
392 UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
393 if (ret < 0) {
394 ublk_err("queue_init: %s\n", strerror(-ret));
395 free(dev);
396 return NULL;
397 }
398 dev->nr_fds = 1;
399
400 return dev;
401 }
402
__ublk_queue_cmd_buf_sz(unsigned depth)403 static int __ublk_queue_cmd_buf_sz(unsigned depth)
404 {
405 int size = depth * sizeof(struct ublksrv_io_desc);
406 unsigned int page_sz = getpagesize();
407
408 return round_up(size, page_sz);
409 }
410
/* Worst-case command buffer size; used as the per-queue mmap stride */
static int ublk_queue_max_cmd_buf_sz(void)
{
	return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
}
415
/* Command buffer size for this queue's actual depth */
static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
{
	return __ublk_queue_cmd_buf_sz(q->q_depth);
}
420
ublk_queue_deinit(struct ublk_queue * q)421 static void ublk_queue_deinit(struct ublk_queue *q)
422 {
423 int i;
424 int nr_ios = q->q_depth;
425
426 if (q->io_cmd_buf)
427 munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
428
429 for (i = 0; i < nr_ios; i++) {
430 free(q->ios[i].buf_addr);
431 free(q->ios[i].integrity_buf);
432 }
433 }
434
/*
 * Tear down a ublk thread's io_uring state.
 *
 * Buffers, batch buffers and the registered ring fd are released before
 * the ring fd itself is closed; registered files are dropped last.
 */
static void ublk_thread_deinit(struct ublk_thread *t)
{
	io_uring_unregister_buffers(&t->ring);

	ublk_batch_free_buf(t);

	io_uring_unregister_ring_fd(&t->ring);

	if (t->ring.ring_fd > 0) {
		io_uring_unregister_files(&t->ring);
		close(t->ring.ring_fd);
		t->ring.ring_fd = -1;	/* mark the ring as closed */
	}
}
449
/*
 * Initialize one ublk queue: inherit flags from the device, mmap the
 * read-only io descriptor buffer exported by the driver, and allocate
 * per-io data buffers (and integrity buffers when the target uses
 * metadata).
 *
 * Returns 0 on success, -ENOMEM on any failure; partial state is
 * released via ublk_queue_deinit().
 */
static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags,
		__u8 metadata_size)
{
	struct ublk_dev *dev = q->dev;
	int depth = dev->dev_info.queue_depth;
	int i;
	int cmd_buf_size, io_buf_size, integrity_size;
	unsigned long off;

	pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE);
	q->tgt_ops = dev->tgt.ops;
	q->flags = 0;
	q->q_depth = depth;
	/* queue flags = device flags plus caller-supplied extras */
	q->flags = dev->dev_info.flags;
	q->flags |= extra_flags;
	q->metadata_size = metadata_size;

	/* Cache fd in queue for fast path access */
	q->ublk_fd = dev->fds[0];

	/* each queue's descriptor array lives at a fixed per-queue offset */
	cmd_buf_size = ublk_queue_cmd_buf_sz(q);
	off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
	q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
			MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
	if (q->io_cmd_buf == MAP_FAILED) {
		ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
				q->dev->dev_info.dev_id, q->q_id);
		goto fail;
	}

	io_buf_size = dev->dev_info.max_io_buf_bytes;
	integrity_size = ublk_integrity_len(q, io_buf_size);
	for (i = 0; i < q->q_depth; i++) {
		q->ios[i].buf_addr = NULL;
		/* every io starts free and needing its first FETCH_REQ */
		q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
		q->ios[i].tag = i;

		if (integrity_size) {
			q->ios[i].integrity_buf = malloc(integrity_size);
			if (!q->ios[i].integrity_buf) {
				ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n",
						dev->dev_info.dev_id, q->q_id, i,
						integrity_size);
				goto fail;
			}
		}


		/* zero-copy style queues don't need a local data buffer */
		if (ublk_queue_no_buf(q))
			continue;

		if (posix_memalign((void **)&q->ios[i].buf_addr,
					getpagesize(), io_buf_size)) {
			ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
					dev->dev_info.dev_id, q->q_id, i);
			goto fail;
		}
	}

	return 0;
 fail:
	ublk_queue_deinit(q);
	ublk_err("ublk dev %d queue %d failed\n",
			dev->dev_info.dev_id, q->q_id);
	return -ENOMEM;
}
516
/*
 * Initialize one server thread: create its io_uring, size and register
 * sparse fixed buffers (for zero-copy / auto buffer registration),
 * allocate batch buffers when BATCH_IO is enabled, and register the
 * device/backing fds as fixed files.
 *
 * Returns 0 on success, -ENOMEM on failure (partial state is undone via
 * ublk_thread_deinit()).
 */
static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
{
	struct ublk_dev *dev = t->dev;
	unsigned long long flags = dev->dev_info.flags | extra_flags;
	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
	int ret;

	/* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */
	if (ublk_dev_batch_io(dev))
		cq_depth += dev->dev_info.queue_depth * 2;

	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
			IORING_SETUP_COOP_TASKRUN |
			IORING_SETUP_SINGLE_ISSUER |
			IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0) {
		ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
				dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
		/* divide ios across threads, rounding up for the remainder */
		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);

		t->nr_bufs = max_nr_ios_per_thread;
	} else {
		t->nr_bufs = 0;
	}

	if (ublk_dev_batch_io(dev))
		ublk_batch_prepare(t);

	if (t->nr_bufs) {
		ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs);
		if (ret) {
			ublk_err("ublk dev %d thread %d register spare buffers failed %d\n",
					dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	if (ublk_dev_batch_io(dev)) {
		ret = ublk_batch_alloc_buf(t);
		if (ret) {
			ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n",
					dev->dev_info.dev_id, t->idx, ret);
			goto fail;
		}
	}

	io_uring_register_ring_fd(&t->ring);

	if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
		/* Register only backing files starting from index 1, exclude ublk control device */
		if (dev->nr_fds > 1) {
			ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
		} else {
			/* No backing files to register, skip file registration */
			ret = 0;
		}
	} else {
		ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
	}
	if (ret) {
		ublk_err("ublk dev %d thread %d register files failed %d\n",
				t->dev->dev_info.dev_id, t->idx, ret);
		goto fail;
	}

	return 0;
fail:
	ublk_thread_deinit(t);
	ublk_err("ublk dev %d thread %d init failed\n",
			dev->dev_info.dev_id, t->idx);
	return -ENOMEM;
}
595
#define WAIT_USEC 100000
#define MAX_WAIT_USEC (3 * 1000000)
/*
 * Open the per-device char node (UBLKC_DEV<N>) and run target init.
 *
 * The node may appear slightly after ADD_DEV (udev), so retry the open
 * for up to MAX_WAIT_USEC.  On target-init failure the cached fd is
 * closed again.  Returns 0 on success.
 */
static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
{
	int dev_id = dev->dev_info.dev_id;
	unsigned int wait_usec = 0;
	int ret = 0, fd = -1;
	char buf[64];

	snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);

	/* poll for the char device node to show up */
	while (wait_usec < MAX_WAIT_USEC) {
		fd = open(buf, O_RDWR);
		if (fd >= 0)
			break;
		usleep(WAIT_USEC);
		wait_usec += WAIT_USEC;
	}
	if (fd < 0) {
		ublk_err("can't open %s %s\n", buf, strerror(errno));
		return -1;
	}

	dev->fds[0] = fd;
	if (dev->tgt.ops->init_tgt)
		ret = dev->tgt.ops->init_tgt(ctx, dev);
	if (ret)
		close(dev->fds[0]);
	return ret;
}
626
ublk_dev_unprep(struct ublk_dev * dev)627 static void ublk_dev_unprep(struct ublk_dev *dev)
628 {
629 if (dev->tgt.ops->deinit_tgt)
630 dev->tgt.ops->deinit_tgt(dev);
631 close(dev->fds[0]);
632 }
633
ublk_set_auto_buf_reg(const struct ublk_thread * t,const struct ublk_queue * q,struct io_uring_sqe * sqe,unsigned short tag)634 static void ublk_set_auto_buf_reg(const struct ublk_thread *t,
635 const struct ublk_queue *q,
636 struct io_uring_sqe *sqe,
637 unsigned short tag)
638 {
639 struct ublk_auto_buf_reg buf = {};
640
641 if (q->tgt_ops->buf_index)
642 buf.index = q->tgt_ops->buf_index(t, q, tag);
643 else
644 buf.index = ublk_io_buf_idx(t, q, tag);
645
646 if (ublk_queue_auto_zc_fallback(q))
647 buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
648
649 sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
650 }
651
/* Copy in pieces to test the buffer offset logic */
#define UBLK_USER_COPY_LEN 2048

/*
 * USER_COPY mode: transfer io payload between the local buffer and the
 * ublk char device with pread()/pwrite() at the per-(queue,tag) offset.
 *
 * For a WRITE request the server pulls request data from the device
 * (pread); for a READ request it pushes the response (pwrite).  Nothing
 * happens unless the request's op matches @match_ublk_op.  Integrity
 * metadata, when present, is moved the same way at the offset tagged
 * with UBLKSRV_IO_INTEGRITY_FLAG.
 */
static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
{
	const struct ublk_queue *q = ublk_io_to_queue(io);
	const struct ublksrv_io_desc *iod = ublk_get_iod(q, io->tag);
	__u64 off = ublk_user_copy_offset(q->q_id, io->tag);
	__u8 ublk_op = ublksrv_get_op(iod);
	__u32 len = iod->nr_sectors << 9;
	void *addr = io->buf_addr;
	ssize_t copied;

	if (ublk_op != match_ublk_op)
		return;

	/* move data in UBLK_USER_COPY_LEN chunks to exercise offsets */
	while (len) {
		__u32 copy_len = min(len, UBLK_USER_COPY_LEN);

		if (ublk_op == UBLK_IO_OP_WRITE)
			copied = pread(q->ublk_fd, addr, copy_len, off);
		else if (ublk_op == UBLK_IO_OP_READ)
			copied = pwrite(q->ublk_fd, addr, copy_len, off);
		else
			assert(0);
		assert(copied == (ssize_t)copy_len);
		addr += copy_len;
		off += copy_len;
		len -= copy_len;
	}

	if (!(iod->op_flags & UBLK_IO_F_INTEGRITY))
		return;

	/* integrity buffer is transferred in a single call */
	len = ublk_integrity_len(q, iod->nr_sectors << 9);
	off = ublk_user_copy_offset(q->q_id, io->tag);
	off |= UBLKSRV_IO_INTEGRITY_FLAG;
	if (ublk_op == UBLK_IO_OP_WRITE)
		copied = pread(q->ublk_fd, io->integrity_buf, len, off);
	else if (ublk_op == UBLK_IO_OP_READ)
		copied = pwrite(q->ublk_fd, io->integrity_buf, len, off);
	else
		assert(0);
	assert(copied == (ssize_t)len);
}
697
ublk_queue_io_cmd(struct ublk_thread * t,struct ublk_io * io)698 int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
699 {
700 struct ublk_queue *q = ublk_io_to_queue(io);
701 struct ublksrv_io_cmd *cmd;
702 struct io_uring_sqe *sqe[1];
703 unsigned int cmd_op = 0;
704 __u64 user_data;
705
706 /* only freed io can be issued */
707 if (!(io->flags & UBLKS_IO_FREE))
708 return 0;
709
710 /*
711 * we issue because we need either fetching or committing or
712 * getting data
713 */
714 if (!(io->flags &
715 (UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
716 return 0;
717
718 if (io->flags & UBLKS_IO_NEED_GET_DATA)
719 cmd_op = UBLK_U_IO_NEED_GET_DATA;
720 else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) {
721 if (ublk_queue_use_user_copy(q))
722 ublk_user_copy(io, UBLK_IO_OP_READ);
723
724 cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
725 } else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
726 cmd_op = UBLK_U_IO_FETCH_REQ;
727
728 if (io_uring_sq_space_left(&t->ring) < 1)
729 io_uring_submit(&t->ring);
730
731 ublk_io_alloc_sqes(t, sqe, 1);
732 if (!sqe[0]) {
733 ublk_err("%s: run out of sqe. thread %u, tag %d\n",
734 __func__, t->idx, io->tag);
735 return -1;
736 }
737
738 cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);
739
740 if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
741 cmd->result = io->result;
742
743 /* These fields should be written once, never change */
744 ublk_set_sqe_cmd_op(sqe[0], cmd_op);
745 sqe[0]->fd = ublk_get_registered_fd(q, 0); /* dev->fds[0] */
746 sqe[0]->opcode = IORING_OP_URING_CMD;
747 if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
748 sqe[0]->flags = 0; /* Use raw FD, not fixed file */
749 else
750 sqe[0]->flags = IOSQE_FIXED_FILE;
751 sqe[0]->rw_flags = 0;
752 cmd->tag = io->tag;
753 cmd->q_id = q->q_id;
754 if (!ublk_queue_no_buf(q) && !ublk_queue_use_user_copy(q))
755 cmd->addr = (__u64) (uintptr_t) io->buf_addr;
756 else
757 cmd->addr = 0;
758
759 if (ublk_queue_use_auto_zc(q))
760 ublk_set_auto_buf_reg(t, q, sqe[0], io->tag);
761
762 user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
763 io_uring_sqe_set_data64(sqe[0], user_data);
764
765 io->flags = 0;
766
767 t->cmd_inflight += 1;
768
769 ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
770 __func__, t->idx, q->q_id, io->tag, cmd_op,
771 io->flags, !!(t->state & UBLKS_T_STOPPING));
772 return 1;
773 }
774
/*
 * Issue the initial FETCH_REQ command for every io slot this thread is
 * responsible for, assigning each io its fixed-buffer index first.
 */
static void ublk_submit_fetch_commands(struct ublk_thread *t)
{
	struct ublk_queue *q;
	struct ublk_io *io;
	int i = 0, j = 0;

	if (t->dev->per_io_tasks) {
		/*
		 * Lexicographically order all the (qid,tag) pairs, with
		 * qid taking priority (so (1,0) > (0,1)). Then make
		 * this thread the daemon for every Nth entry in this
		 * list (N is the number of threads), starting at this
		 * thread's index. This ensures that each queue is
		 * handled by as many ublk server threads as possible,
		 * so that load that is concentrated on one or a few
		 * queues can make use of all ublk server threads.
		 */
		const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
		int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
		for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
			int q_id = i / dinfo->queue_depth;
			int tag = i % dinfo->queue_depth;
			q = &t->dev->q[q_id];
			io = &q->ios[tag];
			/* buffer indices are dense per thread */
			io->buf_index = j++;
			if (q->tgt_ops->pre_fetch_io)
				q->tgt_ops->pre_fetch_io(t, q, tag, false);
			ublk_queue_io_cmd(t, io);
		}
	} else {
		/*
		 * Service exclusively the queue whose q_id matches our
		 * thread index.
		 */
		struct ublk_queue *q = &t->dev->q[t->idx];
		for (i = 0; i < q->q_depth; i++) {
			io = &q->ios[i];
			io->buf_index = i;
			if (q->tgt_ops->pre_fetch_io)
				q->tgt_ops->pre_fetch_io(t, q, i, false);
			ublk_queue_io_cmd(t, io);
		}
	}
}
819
ublk_thread_is_idle(struct ublk_thread * t)820 static int ublk_thread_is_idle(struct ublk_thread *t)
821 {
822 return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
823 }
824
ublk_thread_is_done(struct ublk_thread * t)825 static int ublk_thread_is_done(struct ublk_thread *t)
826 {
827 return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight;
828 }
829
ublksrv_handle_tgt_cqe(struct ublk_thread * t,struct ublk_queue * q,struct io_uring_cqe * cqe)830 static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
831 struct ublk_queue *q,
832 struct io_uring_cqe *cqe)
833 {
834 if (cqe->res < 0 && cqe->res != -EAGAIN)
835 ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
836 __func__, cqe->res, q->q_id,
837 user_data_to_tag(cqe->user_data),
838 user_data_to_op(cqe->user_data));
839
840 if (q->tgt_ops->tgt_io_done)
841 q->tgt_ops->tgt_io_done(t, q, cqe);
842 }
843
/*
 * Handle completion of a ublk io command.
 *
 * UBLK_IO_RES_OK: a new request arrived — dispatch it to the target
 * (after pulling WRITE payload in user-copy mode).
 * UBLK_IO_RES_NEED_GET_DATA: re-issue the command with a data buffer.
 * Anything else (including UBLK_IO_RES_ABORT) marks the thread stopping
 * and just frees the io slot.
 */
static void ublk_handle_uring_cmd(struct ublk_thread *t,
				  struct ublk_queue *q,
				  const struct io_uring_cqe *cqe)
{
	/* stop re-fetching once aborted or when shutdown is in progress */
	int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
		!(t->state & UBLKS_T_STOPPING);
	unsigned tag = user_data_to_tag(cqe->user_data);
	struct ublk_io *io = &q->ios[tag];

	t->cmd_inflight--;

	if (!fetch) {
		t->state |= UBLKS_T_STOPPING;
		io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
	}

	if (cqe->res == UBLK_IO_RES_OK) {
		ublk_assert(tag < q->q_depth);

		/* user-copy mode: pull the WRITE payload from the driver */
		if (ublk_queue_use_user_copy(q))
			ublk_user_copy(io, UBLK_IO_OP_WRITE);

		if (q->tgt_ops->queue_io)
			q->tgt_ops->queue_io(t, q, tag);
	} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
		io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
		ublk_queue_io_cmd(t, io);
	} else {
		/*
		 * COMMIT_REQ will be completed immediately since no fetching
		 * piggyback is required.
		 *
		 * Marking IO_FREE only, then this io won't be issued since
		 * we only issue io with (UBLKS_IO_FREE | UBLKSRV_NEED_*)
		 *
		 * */
		io->flags = UBLKS_IO_FREE;
	}
}
883
/*
 * Dispatch one CQE: target-io completions go to the target hook, ublk
 * io-command completions go to the batch or per-command handler.
 * @data is unused (kept for the generic handler signature).
 */
static void ublk_handle_cqe(struct ublk_thread *t,
		struct io_uring_cqe *cqe, void *data)
{
	struct ublk_dev *dev = t->dev;
	unsigned q_id = user_data_to_q_id(cqe->user_data);
	unsigned cmd_op = user_data_to_op(cqe->user_data);

	/* -ENODEV/-ENOBUFS are expected during shutdown/backpressure */
	if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS)
		ublk_err("%s: res %d userdata %llx thread state %x\n", __func__,
				cqe->res, cqe->user_data, t->state);

	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x "
			"data %lx target %d/%d) stopping %d\n",
			__func__, cqe->res, t->idx, q_id,
			user_data_to_tag(cqe->user_data),
			cmd_op, cqe->user_data, is_target_io(cqe->user_data),
			user_data_to_tgt_data(cqe->user_data),
			(t->state & UBLKS_T_STOPPING));

	/* Don't retrieve io in case of target io */
	if (is_target_io(cqe->user_data)) {
		ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe);
		return;
	}

	if (ublk_thread_batch_io(t))
		ublk_batch_compl_cmd(t, cqe);
	else
		ublk_handle_uring_cmd(t, &dev->q[q_id], cqe);
}
914
ublk_reap_events_uring(struct ublk_thread * t)915 static int ublk_reap_events_uring(struct ublk_thread *t)
916 {
917 struct io_uring_cqe *cqe;
918 unsigned head;
919 int count = 0;
920
921 io_uring_for_each_cqe(&t->ring, head, cqe) {
922 ublk_handle_cqe(t, cqe, NULL);
923 count += 1;
924 }
925 io_uring_cq_advance(&t->ring, count);
926
927 return count;
928 }
929
/*
 * One iteration of the thread's event loop: submit pending SQEs, wait
 * for at least one completion, and reap CQEs (bracketed by batch-commit
 * preparation when BATCH_IO is enabled).
 *
 * Returns the number of reaped events, or -ENODEV once the thread has
 * fully drained after a stop request.
 */
static int ublk_process_io(struct ublk_thread *t)
{
	int ret, reapped;

	ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
			t->dev->dev_info.dev_id,
			t->idx, io_uring_sq_ready(&t->ring),
			t->cmd_inflight,
			(t->state & UBLKS_T_STOPPING));

	if (ublk_thread_is_done(t))
		return -ENODEV;

	ret = io_uring_submit_and_wait(&t->ring, 1);
	if (ublk_thread_batch_io(t)) {
		/* commit buffer must be prepared before reaping completions */
		ublk_batch_prep_commit(t);
		reapped = ublk_reap_events_uring(t);
		ublk_batch_commit_io_cmds(t);
	} else {
		reapped = ublk_reap_events_uring(t);
	}

	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
			ret, reapped, (t->state & UBLKS_T_STOPPING),
			(t->state & UBLKS_T_IDLE));

	return reapped;
}
958
/* Bootstrap parameters handed to each server pthread */
struct ublk_thread_info {
	struct ublk_dev *dev;		/* owning device */
	pthread_t thread;		/* worker thread handle */
	unsigned idx;			/* thread index within the device */
	sem_t *ready;			/* posted once thread init completes */
	cpu_set_t *affinity;		/* optional CPU set to pin the thread to */
	unsigned long long extra_flags;	/* extra flags passed to ublk_thread_init() */
	unsigned char (*q_thread_map)[UBLK_MAX_QUEUES]; /* per-thread queue map, copied to t->q_map */
};
968
ublk_thread_set_sched_affinity(const struct ublk_thread_info * info)969 static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
970 {
971 if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0)
972 ublk_err("ublk dev %u thread %u set affinity failed",
973 info->dev->dev_info.dev_id, info->idx);
974 }
975
/*
 * BATCH_IO mode: prepare io commands for every queue mapped onto this
 * thread (per t->q_map) before fetching starts.
 */
static void ublk_batch_setup_queues(struct ublk_thread *t)
{
	int i;

	for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
		struct ublk_queue *q = &t->dev->q[i];
		int ret;

		/*
		 * Only prepare io commands in the mapped thread context,
		 * otherwise io command buffer index may not work as expected
		 */
		if (t->q_map[i] == 0)
			continue;

		if (q->tgt_ops->pre_fetch_io)
			q->tgt_ops->pre_fetch_io(t, q, 0, true);

		ret = ublk_batch_queue_prep_io_cmds(t, q);
		ublk_assert(ret >= 0);
	}
}
998
/*
 * Body of a server thread: initialize its ring state, signal readiness,
 * prime the initial fetch commands, then loop in ublk_process_io()
 * until the thread drains after a stop request.
 *
 * noinline — presumably to keep struct ublk_thread's large stack frame
 * out of the pthread entry function; TODO confirm.
 */
static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
{
	struct ublk_thread t = {
		.dev = info->dev,
		.idx = info->idx,
	};
	int dev_id = info->dev->dev_info.dev_id;
	int ret;

	/* Copy per-thread queue mapping into thread-local variable */
	if (info->q_thread_map)
		memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map));

	ret = ublk_thread_init(&t, info->extra_flags);
	if (ret) {
		ublk_err("ublk dev %d thread %u init failed\n",
				dev_id, t.idx);
		return ret;
	}
	/* unblock the parent waiting for all threads to come up */
	sem_post(info->ready);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
			gettid(), dev_id, t.idx);

	if (!ublk_thread_batch_io(&t)) {
		/* submit all io commands to ublk driver */
		ublk_submit_fetch_commands(&t);
	} else {
		ublk_batch_setup_queues(&t);
		ublk_batch_start_fetch(&t);
	}

	do {
		if (ublk_process_io(&t) < 0)
			break;
	} while (1);

	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
			gettid(), dev_id, t.idx);
	ublk_thread_deinit(&t);
	return 0;
}
1041
ublk_io_handler_fn(void * data)1042 static void *ublk_io_handler_fn(void *data)
1043 {
1044 struct ublk_thread_info *info = data;
1045
1046 /*
1047 * IO perf is sensitive with queue pthread affinity on NUMA machine
1048 *
1049 * Set sched_affinity at beginning, so following allocated memory/pages
1050 * could be CPU/NUMA aware.
1051 */
1052 if (info->affinity)
1053 ublk_thread_set_sched_affinity(info);
1054
1055 __ublk_io_handler_fn(info);
1056
1057 return NULL;
1058 }
1059
ublk_set_parameters(struct ublk_dev * dev)1060 static void ublk_set_parameters(struct ublk_dev *dev)
1061 {
1062 int ret;
1063
1064 ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
1065 if (ret)
1066 ublk_err("dev %d set basic parameter failed %d\n",
1067 dev->dev_info.dev_id, ret);
1068 }
1069
/*
 * Signal the waiting process over the eventfd that device setup
 * finished: writes dev_id + 1 on success or ERROR_EVTFD_DEVID on
 * failure, copies queue state into the shared shadow_dev when present,
 * then closes the eventfd and detaches the shared memory segment.
 */
static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
{
	uint64_t id;
	int evtfd = ctx->_evtfd;

	if (evtfd < 0)
		return -EBADF;

	/* 0 is a valid dev_id, so the success value is shifted by one */
	if (dev_id >= 0)
		id = dev_id + 1;
	else
		id = ERROR_EVTFD_DEVID;

	/* expose final queue state to the reader via shared memory */
	if (dev && ctx->shadow_dev)
		memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));

	if (write(evtfd, &id, sizeof(id)) != sizeof(id))
		return -EINVAL;

	close(evtfd);
	shmdt(ctx->shadow_dev);

	return 0;
}
1094
1095
1096 /*
1097 * Shared memory registration socket listener.
1098 *
1099 * The parent daemon context listens on a per-device unix socket at
1100 * /run/ublk/ublkb<dev_id>.sock for shared memory registration requests
1101 * from clients. Clients send a memfd via SCM_RIGHTS; the server
1102 * registers it with the kernel, mmaps it, and returns the assigned index.
1103 */
1104 #define UBLK_SHMEM_SOCK_DIR "/run/ublk"
1105
1106 /* defined in kublk.h, shared with file_backed.c (loop target) */
1107 struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
1108 int shmem_count;
1109
/* Build the per-device registration socket path: /run/ublk/ublkb<dev_id>.sock */
static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len)
{
	snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id);
}
1114
ublk_shmem_sock_create(int dev_id)1115 static int ublk_shmem_sock_create(int dev_id)
1116 {
1117 struct sockaddr_un addr = { .sun_family = AF_UNIX };
1118 char path[108];
1119 int fd;
1120
1121 mkdir(UBLK_SHMEM_SOCK_DIR, 0755);
1122 ublk_shmem_sock_path(dev_id, path, sizeof(path));
1123 unlink(path);
1124
1125 fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
1126 if (fd < 0)
1127 return -1;
1128
1129 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
1130 if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
1131 close(fd);
1132 return -1;
1133 }
1134
1135 listen(fd, 4);
1136 ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path);
1137 return fd;
1138 }
1139
/* Close the listener (if open) and remove its filesystem entry. */
static void ublk_shmem_sock_destroy(int dev_id, int sock_fd)
{
	char sock_path[108];

	if (sock_fd >= 0)
		close(sock_fd);

	ublk_shmem_sock_path(dev_id, sock_path, sizeof(sock_path));
	unlink(sock_path);
}
1149
/*
 * Receive a memfd from a client via SCM_RIGHTS.
 * Returns the received descriptor (owned by the caller), or -1 on any
 * protocol error.
 */
static int ublk_shmem_recv_fd(int client_fd)
{
	char buf[1];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	union {
		char cmsg_buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.cmsg_buf,
		.msg_controllen = sizeof(u.cmsg_buf),
	};
	struct cmsghdr *cmsg;
	int fd;

	if (recvmsg(client_fd, &msg, 0) <= 0)
		return -1;

	/* fix: reject truncated control data - a half-delivered SCM_RIGHTS */
	if (msg.msg_flags & MSG_CTRUNC)
		return -1;

	cmsg = CMSG_FIRSTHDR(&msg);
	/* fix: also validate cmsg_len so CMSG_DATA() really carries one int */
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
		return -1;

	/* copy out rather than deref: no alignment assumption on CMSG_DATA() */
	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}
1177
1178 /* Register a shared memory buffer: store fd, mmap it, return index */
ublk_shmem_register(int shmem_fd)1179 static int ublk_shmem_register(int shmem_fd)
1180 {
1181 off_t size;
1182 void *base;
1183 int idx;
1184
1185 if (shmem_count >= UBLK_BUF_MAX)
1186 return -1;
1187
1188 size = lseek(shmem_fd, 0, SEEK_END);
1189 if (size <= 0)
1190 return -1;
1191
1192 base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
1193 shmem_fd, 0);
1194 if (base == MAP_FAILED)
1195 return -1;
1196
1197 idx = shmem_count++;
1198 shmem_table[idx].fd = shmem_fd;
1199 shmem_table[idx].mmap_base = base;
1200 shmem_table[idx].size = size;
1201
1202 ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n",
1203 idx, shmem_fd, (size_t)size);
1204 return idx;
1205 }
1206
ublk_shmem_unregister_all(void)1207 static void ublk_shmem_unregister_all(void)
1208 {
1209 int i;
1210
1211 for (i = 0; i < shmem_count; i++) {
1212 if (shmem_table[i].mmap_base) {
1213 munmap(shmem_table[i].mmap_base,
1214 shmem_table[i].size);
1215 close(shmem_table[i].fd);
1216 shmem_table[i].mmap_base = NULL;
1217 }
1218 }
1219 shmem_count = 0;
1220 }
1221
/* Register the VA range [addr, addr+size) with the kernel (UBLK_U_CMD_REG_BUF). */
static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size,
			     __u32 flags)
{
	struct ublk_shmem_buf_reg buf_reg;
	struct ublk_ctrl_cmd_data data;

	memset(&buf_reg, 0, sizeof(buf_reg));
	buf_reg.addr = (unsigned long)addr;
	buf_reg.len = size;
	buf_reg.flags = flags;

	/* the registration descriptor itself travels as the command buffer */
	memset(&data, 0, sizeof(data));
	data.cmd_op = UBLK_U_CMD_REG_BUF;
	data.flags = CTRL_CMD_HAS_BUF;
	data.addr = (unsigned long)&buf_reg;
	data.len = sizeof(buf_reg);

	return __ublk_ctrl_cmd(dev, &data);
}
1239
1240 /*
1241 * Handle one client connection: receive memfd, mmap it, register
1242 * the VA range with kernel, send back the assigned index.
1243 */
ublk_shmem_handle_client(int sock_fd,struct ublk_dev * dev)1244 static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev)
1245 {
1246 int client_fd, memfd, idx, ret;
1247 int32_t reply;
1248 off_t size;
1249 void *base;
1250
1251 client_fd = accept(sock_fd, NULL, NULL);
1252 if (client_fd < 0)
1253 return;
1254
1255 memfd = ublk_shmem_recv_fd(client_fd);
1256 if (memfd < 0) {
1257 reply = -1;
1258 goto out;
1259 }
1260
1261 /* mmap the memfd in server address space */
1262 size = lseek(memfd, 0, SEEK_END);
1263 if (size <= 0) {
1264 reply = -1;
1265 close(memfd);
1266 goto out;
1267 }
1268 base = mmap(NULL, size, PROT_READ | PROT_WRITE,
1269 MAP_SHARED | MAP_POPULATE, memfd, 0);
1270 if (base == MAP_FAILED) {
1271 reply = -1;
1272 close(memfd);
1273 goto out;
1274 }
1275
1276 /* Register server's VA range with kernel for PFN matching */
1277 ret = ublk_ctrl_reg_buf(dev, base, size, 0);
1278 if (ret < 0) {
1279 ublk_dbg(UBLK_DBG_DEV,
1280 "shmem_zc: kernel reg failed %d\n", ret);
1281 munmap(base, size);
1282 close(memfd);
1283 reply = ret;
1284 goto out;
1285 }
1286
1287 /* Store in table for I/O handling */
1288 idx = ublk_shmem_register(memfd);
1289 if (idx >= 0) {
1290 shmem_table[idx].mmap_base = base;
1291 shmem_table[idx].size = size;
1292 }
1293 reply = idx;
1294 out:
1295 send(client_fd, &reply, sizeof(reply), 0);
1296 close(client_fd);
1297 }
1298
/* Arguments handed to the shmem registration listener thread. */
struct shmem_listener_info {
	int dev_id;		/* ublk device id; names the unix socket */
	int stop_efd;	/* eventfd to signal listener to stop */
	int sock_fd;	/* listener socket fd (output) */
	struct ublk_dev *dev;	/* device client buffers are registered against */
};
1305
1306 /*
1307 * Socket listener thread: runs in the parent daemon context alongside
1308 * the I/O threads. Accepts shared memory registration requests from
1309 * clients via SCM_RIGHTS. Exits when stop_efd is signaled.
1310 */
ublk_shmem_listener_fn(void * data)1311 static void *ublk_shmem_listener_fn(void *data)
1312 {
1313 struct shmem_listener_info *info = data;
1314 struct pollfd pfds[2];
1315
1316 info->sock_fd = ublk_shmem_sock_create(info->dev_id);
1317 if (info->sock_fd < 0)
1318 return NULL;
1319
1320 pfds[0].fd = info->sock_fd;
1321 pfds[0].events = POLLIN;
1322 pfds[1].fd = info->stop_efd;
1323 pfds[1].events = POLLIN;
1324
1325 while (1) {
1326 int ret = poll(pfds, 2, -1);
1327
1328 if (ret < 0)
1329 break;
1330
1331 /* Stop signal from parent */
1332 if (pfds[1].revents & POLLIN)
1333 break;
1334
1335 /* Client connection */
1336 if (pfds[0].revents & POLLIN)
1337 ublk_shmem_handle_client(info->sock_fd, info->dev);
1338 }
1339
1340 return NULL;
1341 }
1342
ublk_shmem_htlb_setup(const struct dev_ctx * ctx,struct ublk_dev * dev)1343 static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx,
1344 struct ublk_dev *dev)
1345 {
1346 int fd, idx, ret;
1347 struct stat st;
1348 void *base;
1349
1350 fd = open(ctx->htlb_path, O_RDWR);
1351 if (fd < 0) {
1352 ublk_err("htlb: can't open %s\n", ctx->htlb_path);
1353 return -errno;
1354 }
1355
1356 if (fstat(fd, &st) < 0 || st.st_size <= 0) {
1357 ublk_err("htlb: invalid file size\n");
1358 close(fd);
1359 return -EINVAL;
1360 }
1361
1362 base = mmap(NULL, st.st_size,
1363 ctx->rdonly_shmem_buf ? PROT_READ : PROT_READ | PROT_WRITE,
1364 MAP_SHARED | MAP_POPULATE, fd, 0);
1365 if (base == MAP_FAILED) {
1366 ublk_err("htlb: mmap failed\n");
1367 close(fd);
1368 return -ENOMEM;
1369 }
1370
1371 ret = ublk_ctrl_reg_buf(dev, base, st.st_size,
1372 ctx->rdonly_shmem_buf ? UBLK_SHMEM_BUF_READ_ONLY : 0);
1373 if (ret < 0) {
1374 ublk_err("htlb: reg_buf failed: %d\n", ret);
1375 munmap(base, st.st_size);
1376 close(fd);
1377 return ret;
1378 }
1379
1380 if (shmem_count >= UBLK_BUF_MAX) {
1381 munmap(base, st.st_size);
1382 close(fd);
1383 return -ENOMEM;
1384 }
1385
1386 idx = shmem_count++;
1387 shmem_table[idx].fd = fd;
1388 shmem_table[idx].mmap_base = base;
1389 shmem_table[idx].size = st.st_size;
1390
1391 ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n",
1392 idx, (size_t)st.st_size);
1393 return 0;
1394 }
1395
ublk_start_daemon(const struct dev_ctx * ctx,struct ublk_dev * dev)1396 static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
1397 {
1398 const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
1399 struct shmem_listener_info linfo = {};
1400 struct ublk_thread_info *tinfo;
1401 unsigned long long extra_flags = 0;
1402 cpu_set_t *affinity_buf;
1403 unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
1404 uint64_t stop_val = 1;
1405 pthread_t listener;
1406 void *thread_ret;
1407 sem_t ready;
1408 int ret, i;
1409
1410 ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
1411
1412 tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
1413 if (!tinfo)
1414 return -ENOMEM;
1415
1416 sem_init(&ready, 0, 0);
1417 ret = ublk_dev_prep(ctx, dev);
1418 if (ret)
1419 return ret;
1420
1421 ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
1422 if (ret)
1423 return ret;
1424
1425 if (ublk_dev_batch_io(dev)) {
1426 q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map));
1427 if (!q_thread_map) {
1428 ret = -ENOMEM;
1429 goto fail;
1430 }
1431 ublk_batch_setup_map(q_thread_map, dev->nthreads,
1432 dinfo->nr_hw_queues);
1433 }
1434
1435 if (ctx->auto_zc_fallback)
1436 extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
1437 if (ctx->no_ublk_fixed_fd)
1438 extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;
1439
1440 for (i = 0; i < dinfo->nr_hw_queues; i++) {
1441 dev->q[i].dev = dev;
1442 dev->q[i].q_id = i;
1443
1444 ret = ublk_queue_init(&dev->q[i], extra_flags,
1445 ctx->metadata_size);
1446 if (ret) {
1447 ublk_err("ublk dev %d queue %d init queue failed\n",
1448 dinfo->dev_id, i);
1449 goto fail;
1450 }
1451 }
1452
1453 for (i = 0; i < dev->nthreads; i++) {
1454 tinfo[i].dev = dev;
1455 tinfo[i].idx = i;
1456 tinfo[i].ready = &ready;
1457 tinfo[i].extra_flags = extra_flags;
1458 tinfo[i].q_thread_map = q_thread_map;
1459
1460 /*
1461 * If threads are not tied 1:1 to queues, setting thread
1462 * affinity based on queue affinity makes little sense.
1463 * However, thread CPU affinity has significant impact
1464 * on performance, so to compare fairly, we'll still set
1465 * thread CPU affinity based on queue affinity where
1466 * possible.
1467 */
1468 if (dev->nthreads == dinfo->nr_hw_queues)
1469 tinfo[i].affinity = &affinity_buf[i];
1470 pthread_create(&tinfo[i].thread, NULL,
1471 ublk_io_handler_fn,
1472 &tinfo[i]);
1473 }
1474
1475 for (i = 0; i < dev->nthreads; i++)
1476 sem_wait(&ready);
1477 free(affinity_buf);
1478 free(q_thread_map);
1479
1480 /* everything is fine now, start us */
1481 if (ctx->recovery)
1482 ret = ublk_ctrl_end_user_recovery(dev, getpid());
1483 else {
1484 ublk_set_parameters(dev);
1485 ret = ublk_ctrl_start_dev(dev, getpid());
1486 }
1487 if (ret < 0) {
1488 ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
1489 /* stop device so that inflight uring_cmd can be cancelled */
1490 ublk_ctrl_stop_dev(dev);
1491 goto fail_start;
1492 }
1493
1494 if (ctx->htlb_path) {
1495 ret = ublk_shmem_htlb_setup(ctx, dev);
1496 if (ret < 0) {
1497 ublk_err("htlb setup failed: %d\n", ret);
1498 ublk_ctrl_stop_dev(dev);
1499 goto fail_start;
1500 }
1501 }
1502
1503 ublk_ctrl_get_info(dev);
1504 if (ctx->fg)
1505 ublk_ctrl_dump(dev);
1506 else
1507 ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
1508 fail_start:
1509 /*
1510 * Wait for I/O threads to exit. While waiting, a listener
1511 * thread accepts shared memory registration requests from
1512 * clients via a per-device unix socket (SCM_RIGHTS fd passing).
1513 */
1514 linfo.dev_id = dinfo->dev_id;
1515 linfo.dev = dev;
1516 linfo.stop_efd = eventfd(0, 0);
1517 if (linfo.stop_efd >= 0)
1518 pthread_create(&listener, NULL,
1519 ublk_shmem_listener_fn, &linfo);
1520
1521 for (i = 0; i < (int)dev->nthreads; i++)
1522 pthread_join(tinfo[i].thread, &thread_ret);
1523
1524 /* Signal listener thread to stop and wait for it */
1525 if (linfo.stop_efd >= 0) {
1526 write(linfo.stop_efd, &stop_val, sizeof(stop_val));
1527 pthread_join(listener, NULL);
1528 close(linfo.stop_efd);
1529 ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd);
1530 }
1531 ublk_shmem_unregister_all();
1532 free(tinfo);
1533 fail:
1534 for (i = 0; i < dinfo->nr_hw_queues; i++)
1535 ublk_queue_deinit(&dev->q[i]);
1536 ublk_dev_unprep(dev);
1537 ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
1538
1539 return ret;
1540 }
1541
wait_ublk_dev(const char * path,int evt_mask,unsigned timeout)1542 static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
1543 {
1544 #define EV_SIZE (sizeof(struct inotify_event))
1545 #define EV_BUF_LEN (128 * (EV_SIZE + 16))
1546 struct pollfd pfd;
1547 int fd, wd;
1548 int ret = -EINVAL;
1549 const char *dev_name = basename(path);
1550
1551 fd = inotify_init();
1552 if (fd < 0) {
1553 ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
1554 return fd;
1555 }
1556
1557 wd = inotify_add_watch(fd, "/dev", evt_mask);
1558 if (wd == -1) {
1559 ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
1560 goto fail;
1561 }
1562
1563 pfd.fd = fd;
1564 pfd.events = POLL_IN;
1565 while (1) {
1566 int i = 0;
1567 char buffer[EV_BUF_LEN];
1568 ret = poll(&pfd, 1, 1000 * timeout);
1569
1570 if (ret == -1) {
1571 ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
1572 goto rm_watch;
1573 } else if (ret == 0) {
1574 ublk_err("%s: poll inotify timeout\n", __func__);
1575 ret = -ETIMEDOUT;
1576 goto rm_watch;
1577 }
1578
1579 ret = read(fd, buffer, EV_BUF_LEN);
1580 if (ret < 0) {
1581 ublk_err("%s: read inotify fd failed\n", __func__);
1582 goto rm_watch;
1583 }
1584
1585 while (i < ret) {
1586 struct inotify_event *event = (struct inotify_event *)&buffer[i];
1587
1588 ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
1589 __func__, event->mask, event->name);
1590 if (event->mask & evt_mask) {
1591 if (!strcmp(event->name, dev_name)) {
1592 ret = 0;
1593 goto rm_watch;
1594 }
1595 }
1596 i += EV_SIZE + event->len;
1597 }
1598 }
1599 rm_watch:
1600 inotify_rm_watch(fd, wd);
1601 fail:
1602 close(fd);
1603 return ret;
1604 }
1605
ublk_stop_io_daemon(const struct ublk_dev * dev)1606 static int ublk_stop_io_daemon(const struct ublk_dev *dev)
1607 {
1608 int daemon_pid = dev->dev_info.ublksrv_pid;
1609 int dev_id = dev->dev_info.dev_id;
1610 char ublkc[64];
1611 int ret = 0;
1612
1613 if (daemon_pid < 0)
1614 return 0;
1615
1616 /* daemon may be dead already */
1617 if (kill(daemon_pid, 0) < 0)
1618 goto wait;
1619
1620 snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);
1621
1622 /* ublk char device may be gone already */
1623 if (access(ublkc, F_OK) != 0)
1624 goto wait;
1625
1626 /* Wait until ublk char device is closed, when the daemon is shutdown */
1627 ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
1628 /* double check and since it may be closed before starting inotify */
1629 if (ret == -ETIMEDOUT)
1630 ret = kill(daemon_pid, 0) < 0;
1631 wait:
1632 waitpid(daemon_pid, NULL, 0);
1633 ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
1634 __func__, daemon_pid, dev_id, ret);
1635
1636 return ret;
1637 }
1638
/*
 * Create (or recover) one ublk device and run its IO daemon until it
 * exits.  Runs in the detached grandchild when daemonized, or directly
 * with --foreground.  Returns a negative errno on failure; on failure
 * the parent is notified via the eventfd so it does not block forever.
 */
static int __cmd_dev_add(const struct dev_ctx *ctx)
{
	unsigned nthreads = ctx->nthreads;
	unsigned nr_queues = ctx->nr_hw_queues;
	const char *tgt_type = ctx->tgt_type;
	unsigned depth = ctx->queue_depth;
	__u64 features;
	const struct ublk_tgt_ops *ops;
	struct ublksrv_ctrl_dev_info *info;
	struct ublk_dev *dev = NULL;
	int dev_id = ctx->dev_id;
	int ret, i;

	ops = ublk_find_tgt(tgt_type);
	if (!ops) {
		ublk_err("%s: no such tgt type, type %s\n",
			 __func__, tgt_type);
		ret = -ENODEV;
		goto fail;
	}

	if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
		ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
			 __func__, nr_queues, depth);
		ret = -EINVAL;
		goto fail;
	}

	/* default to 1:1 threads:queues if nthreads is unspecified */
	if (!nthreads)
		nthreads = nr_queues;

	if (nthreads > UBLK_MAX_THREADS) {
		ublk_err("%s: %u is too many threads (max %u)\n",
			 __func__, nthreads, UBLK_MAX_THREADS);
		ret = -EINVAL;
		goto fail;
	}

	/*
	 * Decoupled thread:queue counts are only accepted with
	 * per-io-tasks dispatch or batch IO.
	 */
	if (nthreads != nr_queues && (!ctx->per_io_tasks &&
			!(ctx->flags & UBLK_F_BATCH_IO))) {
		ublk_err("%s: threads %u must be same as queues %u if "
			 "not using per_io_tasks\n",
			 __func__, nthreads, nr_queues);
		ret = -EINVAL;
		goto fail;
	}

	dev = ublk_ctrl_init();
	if (!dev) {
		ublk_err("%s: can't alloc dev id %d, type %s\n",
			 __func__, dev_id, tgt_type);
		ret = -ENOMEM;
		goto fail;
	}

	/* kernel doesn't support get_features */
	ret = ublk_ctrl_get_features(dev, &features);
	if (ret < 0) {
		ret = -EINVAL;
		goto fail;
	}

	/* this tool always issues ioctl-encoded control commands */
	if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
		ret = -ENOTSUP;
		goto fail;
	}

	info = &dev->dev_info;
	info->dev_id = ctx->dev_id;
	info->nr_hw_queues = nr_queues;
	info->queue_depth = depth;
	info->flags = ctx->flags;
	/* prefer quiesce-based recovery when both sides support it */
	if ((features & UBLK_F_QUIESCE) &&
	    (info->flags & UBLK_F_USER_RECOVERY))
		info->flags |= UBLK_F_QUIESCE;
	dev->nthreads = nthreads;
	dev->per_io_tasks = ctx->per_io_tasks;
	dev->tgt.ops = ops;
	dev->tgt.sq_depth = depth;
	dev->tgt.cq_depth = depth;

	for (i = 0; i < MAX_BACK_FILES; i++) {
		if (ctx->files[i]) {
			/* NOTE(review): unbounded strcpy - assumes
			 * backing_file[] can hold any path; verify size */
			strcpy(dev->tgt.backing_file[i], ctx->files[i]);
			dev->tgt.nr_backing_files++;
		}
	}

	if (ctx->recovery)
		ret = ublk_ctrl_start_user_recovery(dev);
	else
		ret = ublk_ctrl_add_dev(dev);
	if (ret < 0) {
		ublk_err("%s: can't add dev id %d, type %s ret %d\n",
			 __func__, dev_id, tgt_type, ret);
		goto fail;
	}

	/* blocks until all IO threads exit */
	ret = ublk_start_daemon(ctx, dev);
	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
	if (ret < 0)
		ublk_ctrl_del_dev(dev);

fail:
	/* tell the waiting parent about the failure (dev may be NULL) */
	if (ret < 0)
		ublk_send_dev_event(ctx, dev, -1);
	if (dev)
		ublk_ctrl_deinit(dev);
	return ret;
}
1750
1751 static int __cmd_dev_list(struct dev_ctx *ctx);
1752
/*
 * Front-end for "add": daemonize via double fork unless --foreground,
 * then run __cmd_dev_add() in the detached grandchild.  The parent
 * waits on an eventfd for the daemon to report the allocated device id
 * (or ERROR_EVTFD_DEVID) and then lists the new device.
 */
static int cmd_dev_add(struct dev_ctx *ctx)
{
	int res;

	/* foreground: skip daemonization and all the IPC setup */
	if (ctx->fg)
		goto run;

	/* shared memory through which the daemon publishes queue state */
	ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
	if (ctx->_shmid < 0) {
		ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
		exit(-1);
	}
	ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
	if (ctx->shadow_dev == (struct ublk_dev *)-1) {
		ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
		exit(-1);
	}
	/* eventfd on which the daemon reports the device id (or error) */
	ctx->_evtfd = eventfd(0, 0);
	if (ctx->_evtfd < 0) {
		ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
		exit(-1);
	}

	res = fork();
	if (res == 0) {
		int res2;

		/* first child: new session so the daemon has no controlling tty */
		setsid();
		res2 = fork();
		if (res2 == 0) {
			/* prepare for detaching */
			close(STDIN_FILENO);
			close(STDOUT_FILENO);
			close(STDERR_FILENO);
run:
			/* grandchild (or foreground caller): run the device */
			res = __cmd_dev_add(ctx);
			return res;
		} else {
			/* detached from the foreground task */
			exit(EXIT_SUCCESS);
		}
	} else if (res > 0) {
		uint64_t id;
		int exit_code = EXIT_FAILURE;

		/* parent: block until the daemon reports success or failure */
		res = read(ctx->_evtfd, &id, sizeof(id));
		close(ctx->_evtfd);
		if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
			/* id is biased by 1 so device 0 differs from "no id" */
			ctx->dev_id = id - 1;
			if (__cmd_dev_list(ctx) >= 0)
				exit_code = EXIT_SUCCESS;
		}
		shmdt(ctx->shadow_dev);
		shmctl(ctx->_shmid, IPC_RMID, NULL);
		/* wait for child and detach from it */
		wait(NULL);
		if (exit_code == EXIT_FAILURE)
			ublk_err("%s: command failed\n", __func__);
		exit(exit_code);
	} else {
		exit(EXIT_FAILURE);
	}
}
1816
__cmd_dev_del(struct dev_ctx * ctx)1817 static int __cmd_dev_del(struct dev_ctx *ctx)
1818 {
1819 int number = ctx->dev_id;
1820 struct ublk_dev *dev;
1821 int ret;
1822
1823 dev = ublk_ctrl_init();
1824 dev->dev_info.dev_id = number;
1825
1826 ret = ublk_ctrl_get_info(dev);
1827 if (ret < 0)
1828 goto fail;
1829
1830 ret = ublk_ctrl_stop_dev(dev);
1831 if (ret < 0)
1832 ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);
1833
1834 ret = ublk_stop_io_daemon(dev);
1835 if (ret < 0)
1836 ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
1837 __func__, dev->dev_info.ublksrv_pid, number, ret);
1838 ublk_ctrl_del_dev(dev);
1839 fail:
1840 ublk_ctrl_deinit(dev);
1841
1842 return (ret >= 0) ? 0 : ret;
1843 }
1844
cmd_dev_del(struct dev_ctx * ctx)1845 static int cmd_dev_del(struct dev_ctx *ctx)
1846 {
1847 int i;
1848
1849 if (ctx->dev_id >= 0 || !ctx->all)
1850 return __cmd_dev_del(ctx);
1851
1852 for (i = 0; i < 255; i++) {
1853 ctx->dev_id = i;
1854 __cmd_dev_del(ctx);
1855 }
1856 return 0;
1857 }
1858
cmd_dev_stop(struct dev_ctx * ctx)1859 static int cmd_dev_stop(struct dev_ctx *ctx)
1860 {
1861 int number = ctx->dev_id;
1862 struct ublk_dev *dev;
1863 int ret;
1864
1865 if (number < 0) {
1866 ublk_err("%s: device id is required\n", __func__);
1867 return -EINVAL;
1868 }
1869
1870 dev = ublk_ctrl_init();
1871 dev->dev_info.dev_id = number;
1872
1873 ret = ublk_ctrl_get_info(dev);
1874 if (ret < 0)
1875 goto fail;
1876
1877 if (ctx->safe_stop) {
1878 ret = ublk_ctrl_try_stop_dev(dev);
1879 if (ret < 0)
1880 ublk_err("%s: try_stop dev %d failed ret %d\n",
1881 __func__, number, ret);
1882 } else {
1883 ret = ublk_ctrl_stop_dev(dev);
1884 if (ret < 0)
1885 ublk_err("%s: stop dev %d failed ret %d\n",
1886 __func__, number, ret);
1887 }
1888
1889 fail:
1890 ublk_ctrl_deinit(dev);
1891
1892 return ret;
1893 }
1894
__cmd_dev_list(struct dev_ctx * ctx)1895 static int __cmd_dev_list(struct dev_ctx *ctx)
1896 {
1897 struct ublk_dev *dev = ublk_ctrl_init();
1898 int ret;
1899
1900 if (!dev)
1901 return -ENODEV;
1902
1903 dev->dev_info.dev_id = ctx->dev_id;
1904
1905 ret = ublk_ctrl_get_info(dev);
1906 if (ret < 0) {
1907 if (ctx->logging)
1908 ublk_err("%s: can't get dev info from %d: %d\n",
1909 __func__, ctx->dev_id, ret);
1910 } else {
1911 if (ctx->shadow_dev)
1912 memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));
1913
1914 ublk_ctrl_dump(dev);
1915 }
1916
1917 ublk_ctrl_deinit(dev);
1918
1919 return ret;
1920 }
1921
cmd_dev_list(struct dev_ctx * ctx)1922 static int cmd_dev_list(struct dev_ctx *ctx)
1923 {
1924 int i;
1925
1926 if (ctx->dev_id >= 0 || !ctx->all)
1927 return __cmd_dev_list(ctx);
1928
1929 ctx->logging = false;
1930 for (i = 0; i < 255; i++) {
1931 ctx->dev_id = i;
1932 __cmd_dev_list(ctx);
1933 }
1934 return 0;
1935 }
1936
cmd_dev_get_features(void)1937 static int cmd_dev_get_features(void)
1938 {
1939 #define const_ilog2(x) (63 - __builtin_clzll(x))
1940 #define FEAT_NAME(f) [const_ilog2(f)] = #f
1941 static const char *feat_map[] = {
1942 FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
1943 FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
1944 FEAT_NAME(UBLK_F_NEED_GET_DATA),
1945 FEAT_NAME(UBLK_F_USER_RECOVERY),
1946 FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE),
1947 FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV),
1948 FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE),
1949 FEAT_NAME(UBLK_F_USER_COPY),
1950 FEAT_NAME(UBLK_F_ZONED),
1951 FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO),
1952 FEAT_NAME(UBLK_F_UPDATE_SIZE),
1953 FEAT_NAME(UBLK_F_AUTO_BUF_REG),
1954 FEAT_NAME(UBLK_F_QUIESCE),
1955 FEAT_NAME(UBLK_F_PER_IO_DAEMON),
1956 FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
1957 FEAT_NAME(UBLK_F_INTEGRITY),
1958 FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
1959 FEAT_NAME(UBLK_F_BATCH_IO),
1960 FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
1961 FEAT_NAME(UBLK_F_SHMEM_ZC),
1962 };
1963 struct ublk_dev *dev;
1964 __u64 features = 0;
1965 int ret;
1966
1967 dev = ublk_ctrl_init();
1968 if (!dev) {
1969 fprintf(stderr, "ublksrv_ctrl_init failed id\n");
1970 return -EOPNOTSUPP;
1971 }
1972
1973 ret = ublk_ctrl_get_features(dev, &features);
1974 if (!ret) {
1975 int i;
1976
1977 printf("ublk_drv features: 0x%llx\n", features);
1978
1979 for (i = 0; i < sizeof(features) * 8; i++) {
1980 const char *feat;
1981
1982 if (!((1ULL << i) & features))
1983 continue;
1984 if (i < ARRAY_SIZE(feat_map))
1985 feat = feat_map[i];
1986 else
1987 feat = "unknown";
1988 printf("0x%-16llx: %s\n", 1ULL << i, feat);
1989 }
1990 }
1991
1992 return ret;
1993 }
1994
cmd_dev_update_size(struct dev_ctx * ctx)1995 static int cmd_dev_update_size(struct dev_ctx *ctx)
1996 {
1997 struct ublk_dev *dev = ublk_ctrl_init();
1998 struct ublk_params p;
1999 int ret = -EINVAL;
2000
2001 if (!dev)
2002 return -ENODEV;
2003
2004 if (ctx->dev_id < 0) {
2005 fprintf(stderr, "device id isn't provided\n");
2006 goto out;
2007 }
2008
2009 dev->dev_info.dev_id = ctx->dev_id;
2010 ret = ublk_ctrl_get_params(dev, &p);
2011 if (ret < 0) {
2012 ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
2013 goto out;
2014 }
2015
2016 if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
2017 ublk_err("size isn't aligned with logical block size\n");
2018 ret = -EINVAL;
2019 goto out;
2020 }
2021
2022 ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
2023 out:
2024 ublk_ctrl_deinit(dev);
2025 return ret;
2026 }
2027
cmd_dev_quiesce(struct dev_ctx * ctx)2028 static int cmd_dev_quiesce(struct dev_ctx *ctx)
2029 {
2030 struct ublk_dev *dev = ublk_ctrl_init();
2031 int ret = -EINVAL;
2032
2033 if (!dev)
2034 return -ENODEV;
2035
2036 if (ctx->dev_id < 0) {
2037 fprintf(stderr, "device id isn't provided for quiesce\n");
2038 goto out;
2039 }
2040 dev->dev_info.dev_id = ctx->dev_id;
2041 ret = ublk_ctrl_quiesce_dev(dev, 10000);
2042
2043 out:
2044 ublk_ctrl_deinit(dev);
2045 return ret;
2046 }
2047
__cmd_create_help(char * exe,bool recovery)2048 static void __cmd_create_help(char *exe, bool recovery)
2049 {
2050 int i;
2051
2052 printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
2053 exe, recovery ? "recover" : "add");
2054 printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
2055 printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n");
2056 printf("\t[--nthreads threads] [--per_io_tasks]\n");
2057 printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
2058 "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
2059 printf("\t[--batch|-b] [--no_auto_part_scan]\n");
2060 printf("\t[target options] [backfile1] [backfile2] ...\n");
2061 printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
2062 printf("\tdefault: nthreads=nr_queues");
2063
2064 for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
2065 const struct ublk_tgt_ops *ops = tgt_ops_list[i];
2066
2067 if (ops->usage)
2068 ops->usage(ops);
2069 }
2070 }
2071
cmd_add_help(char * exe)2072 static void cmd_add_help(char *exe)
2073 {
2074 __cmd_create_help(exe, false);
2075 printf("\n");
2076 }
2077
cmd_recover_help(char * exe)2078 static void cmd_recover_help(char *exe)
2079 {
2080 __cmd_create_help(exe, true);
2081 printf("\tPlease provide exact command line for creating this device with real dev_id\n");
2082 printf("\n");
2083 }
2084
/* Top-level usage: print a summary for every subcommand.  Always returns 0. */
static int cmd_dev_help(char *exe)
{
	cmd_add_help(exe);
	cmd_recover_help(exe);

	printf("%s del [-n dev_id] -a \n", exe);
	fputs("\t -a delete all devices -n delete specified device\n\n", stdout);
	printf("%s stop -n dev_id [--safe]\n", exe);
	fputs("\t --safe only stop if device has no active openers\n\n", stdout);
	printf("%s list [-n dev_id] -a \n", exe);
	fputs("\t -a list all devices, -n list specified device, default -a \n\n", stdout);
	printf("%s features\n", exe);
	printf("%s update_size -n dev_id -s|--size size_in_bytes \n", exe);
	printf("%s quiesce -n dev_id\n", exe);
	return 0;
}
2101
main(int argc,char * argv[])2102 int main(int argc, char *argv[])
2103 {
2104 static const struct option longopts[] = {
2105 { "all", 0, NULL, 'a' },
2106 { "type", 1, NULL, 't' },
2107 { "number", 1, NULL, 'n' },
2108 { "queues", 1, NULL, 'q' },
2109 { "depth", 1, NULL, 'd' },
2110 { "debug_mask", 1, NULL, 0 },
2111 { "quiet", 0, NULL, 0 },
2112 { "zero_copy", 0, NULL, 'z' },
2113 { "foreground", 0, NULL, 0 },
2114 { "recovery", 1, NULL, 'r' },
2115 { "recovery_fail_io", 1, NULL, 'e'},
2116 { "recovery_reissue", 1, NULL, 'i'},
2117 { "get_data", 1, NULL, 'g'},
2118 { "auto_zc", 0, NULL, 0 },
2119 { "auto_zc_fallback", 0, NULL, 0 },
2120 { "user_copy", 0, NULL, 'u'},
2121 { "size", 1, NULL, 's'},
2122 { "nthreads", 1, NULL, 0 },
2123 { "per_io_tasks", 0, NULL, 0 },
2124 { "no_ublk_fixed_fd", 0, NULL, 0 },
2125 { "integrity_capable", 0, NULL, 0 },
2126 { "integrity_reftag", 0, NULL, 0 },
2127 { "metadata_size", 1, NULL, 0 },
2128 { "pi_offset", 1, NULL, 0 },
2129 { "csum_type", 1, NULL, 0 },
2130 { "tag_size", 1, NULL, 0 },
2131 { "safe", 0, NULL, 0 },
2132 { "batch", 0, NULL, 'b'},
2133 { "no_auto_part_scan", 0, NULL, 0 },
2134 { "shmem_zc", 0, NULL, 0 },
2135 { "htlb", 1, NULL, 0 },
2136 { "rdonly_shmem_buf", 0, NULL, 0 },
2137 { 0, 0, 0, 0 }
2138 };
2139 const struct ublk_tgt_ops *ops = NULL;
2140 int option_idx, opt;
2141 const char *cmd = argv[1];
2142 struct dev_ctx ctx = {
2143 ._evtfd = -1,
2144 .queue_depth = 128,
2145 .nr_hw_queues = 2,
2146 .dev_id = -1,
2147 .tgt_type = "unknown",
2148 .csum_type = LBMD_PI_CSUM_NONE,
2149 };
2150 int ret = -EINVAL, i;
2151 int tgt_argc = 1;
2152 char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
2153 int value;
2154
2155 if (argc == 1)
2156 return ret;
2157
2158 opterr = 0;
2159 optind = 2;
2160 while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub",
2161 longopts, &option_idx)) != -1) {
2162 switch (opt) {
2163 case 'a':
2164 ctx.all = 1;
2165 break;
2166 case 'b':
2167 ctx.flags |= UBLK_F_BATCH_IO;
2168 break;
2169 case 'n':
2170 ctx.dev_id = strtol(optarg, NULL, 10);
2171 break;
2172 case 't':
2173 if (strlen(optarg) < sizeof(ctx.tgt_type))
2174 strcpy(ctx.tgt_type, optarg);
2175 break;
2176 case 'q':
2177 ctx.nr_hw_queues = strtol(optarg, NULL, 10);
2178 break;
2179 case 'd':
2180 ctx.queue_depth = strtol(optarg, NULL, 10);
2181 break;
2182 case 'z':
2183 ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY;
2184 break;
2185 case 'r':
2186 value = strtol(optarg, NULL, 10);
2187 if (value)
2188 ctx.flags |= UBLK_F_USER_RECOVERY;
2189 break;
2190 case 'e':
2191 value = strtol(optarg, NULL, 10);
2192 if (value)
2193 ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
2194 break;
2195 case 'i':
2196 value = strtol(optarg, NULL, 10);
2197 if (value)
2198 ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
2199 break;
2200 case 'g':
2201 ctx.flags |= UBLK_F_NEED_GET_DATA;
2202 break;
2203 case 'u':
2204 ctx.flags |= UBLK_F_USER_COPY;
2205 break;
2206 case 's':
2207 ctx.size = strtoull(optarg, NULL, 10);
2208 break;
2209 case 0:
2210 if (!strcmp(longopts[option_idx].name, "debug_mask"))
2211 ublk_dbg_mask = strtol(optarg, NULL, 16);
2212 if (!strcmp(longopts[option_idx].name, "quiet"))
2213 ublk_dbg_mask = 0;
2214 if (!strcmp(longopts[option_idx].name, "foreground"))
2215 ctx.fg = 1;
2216 if (!strcmp(longopts[option_idx].name, "auto_zc"))
2217 ctx.flags |= UBLK_F_AUTO_BUF_REG;
2218 if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
2219 ctx.auto_zc_fallback = 1;
2220 if (!strcmp(longopts[option_idx].name, "nthreads"))
2221 ctx.nthreads = strtol(optarg, NULL, 10);
2222 if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
2223 ctx.per_io_tasks = 1;
2224 if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
2225 ctx.no_ublk_fixed_fd = 1;
2226 if (!strcmp(longopts[option_idx].name, "integrity_capable"))
2227 ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY;
2228 if (!strcmp(longopts[option_idx].name, "integrity_reftag"))
2229 ctx.integrity_flags |= LBMD_PI_CAP_REFTAG;
2230 if (!strcmp(longopts[option_idx].name, "metadata_size"))
2231 ctx.metadata_size = strtoul(optarg, NULL, 0);
2232 if (!strcmp(longopts[option_idx].name, "pi_offset"))
2233 ctx.pi_offset = strtoul(optarg, NULL, 0);
2234 if (!strcmp(longopts[option_idx].name, "csum_type")) {
2235 if (!strcmp(optarg, "ip")) {
2236 ctx.csum_type = LBMD_PI_CSUM_IP;
2237 } else if (!strcmp(optarg, "t10dif")) {
2238 ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF;
2239 } else if (!strcmp(optarg, "nvme")) {
2240 ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME;
2241 } else {
2242 ublk_err("invalid csum_type: %s\n", optarg);
2243 return -EINVAL;
2244 }
2245 }
2246 if (!strcmp(longopts[option_idx].name, "tag_size"))
2247 ctx.tag_size = strtoul(optarg, NULL, 0);
2248 if (!strcmp(longopts[option_idx].name, "safe"))
2249 ctx.safe_stop = 1;
2250 if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
2251 ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
2252 if (!strcmp(longopts[option_idx].name, "shmem_zc"))
2253 ctx.flags |= UBLK_F_SHMEM_ZC;
2254 if (!strcmp(longopts[option_idx].name, "htlb"))
2255 ctx.htlb_path = strdup(optarg);
2256 if (!strcmp(longopts[option_idx].name, "rdonly_shmem_buf"))
2257 ctx.rdonly_shmem_buf = 1;
2258 break;
2259 case '?':
2260 /*
2261 * target requires every option must have argument
2262 */
2263 if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') {
2264 fprintf(stderr, "every target option requires argument: %s %s\n",
2265 argv[optind - 1], argv[optind]);
2266 exit(EXIT_FAILURE);
2267 }
2268
2269 if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
2270 tgt_argv[tgt_argc++] = argv[optind - 1];
2271 tgt_argv[tgt_argc++] = argv[optind];
2272 } else {
2273 fprintf(stderr, "too many target options\n");
2274 exit(EXIT_FAILURE);
2275 }
2276 optind += 1;
2277 break;
2278 }
2279 }
2280
2281 if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) {
2282 ublk_err("per_io_task and F_BATCH_IO conflict\n");
2283 return -EINVAL;
2284 }
2285
2286 /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
2287 if (ctx.auto_zc_fallback &&
2288 !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
2289 (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
2290 ublk_err("%s: auto_zc_fallback is set but neither "
2291 "F_AUTO_BUF_REG nor F_SUPPORT_ZERO_COPY is enabled\n",
2292 __func__);
2293 return -EINVAL;
2294 }
2295
2296 if (!!(ctx.flags & UBLK_F_NEED_GET_DATA) +
2297 !!(ctx.flags & UBLK_F_USER_COPY) +
2298 (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY && !ctx.auto_zc_fallback) +
2299 (ctx.flags & UBLK_F_AUTO_BUF_REG && !ctx.auto_zc_fallback) +
2300 ctx.auto_zc_fallback > 1) {
2301 fprintf(stderr, "too many data copy modes specified\n");
2302 return -EINVAL;
2303 }
2304
2305 if (ctx.metadata_size) {
2306 if (!(ctx.flags & UBLK_F_USER_COPY)) {
2307 ublk_err("integrity requires user_copy\n");
2308 return -EINVAL;
2309 }
2310
2311 ctx.flags |= UBLK_F_INTEGRITY;
2312 } else if (ctx.integrity_flags ||
2313 ctx.pi_offset ||
2314 ctx.csum_type != LBMD_PI_CSUM_NONE ||
2315 ctx.tag_size) {
2316 ublk_err("integrity parameters require metadata_size\n");
2317 return -EINVAL;
2318 }
2319
2320 if ((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
2321 (ctx.flags & UBLK_F_BATCH_IO) &&
2322 (ctx.nthreads > ctx.nr_hw_queues)) {
2323 ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n");
2324 return -EINVAL;
2325 }
2326
2327 i = optind;
2328 while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
2329 ctx.files[ctx.nr_files++] = argv[i++];
2330 }
2331
2332 ops = ublk_find_tgt(ctx.tgt_type);
2333 if (ops && ops->parse_cmd_line) {
2334 optind = 0;
2335
2336 tgt_argv[0] = ctx.tgt_type;
2337 ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
2338 }
2339
2340 if (!strcmp(cmd, "add"))
2341 ret = cmd_dev_add(&ctx);
2342 else if (!strcmp(cmd, "recover")) {
2343 if (ctx.dev_id < 0) {
2344 fprintf(stderr, "device id isn't provided for recovering\n");
2345 ret = -EINVAL;
2346 } else {
2347 ctx.recovery = 1;
2348 ret = cmd_dev_add(&ctx);
2349 }
2350 } else if (!strcmp(cmd, "del"))
2351 ret = cmd_dev_del(&ctx);
2352 else if (!strcmp(cmd, "stop"))
2353 ret = cmd_dev_stop(&ctx);
2354 else if (!strcmp(cmd, "list")) {
2355 ctx.all = 1;
2356 ret = cmd_dev_list(&ctx);
2357 } else if (!strcmp(cmd, "help"))
2358 ret = cmd_dev_help(argv[0]);
2359 else if (!strcmp(cmd, "features"))
2360 ret = cmd_dev_get_features();
2361 else if (!strcmp(cmd, "update_size"))
2362 ret = cmd_dev_update_size(&ctx);
2363 else if (!strcmp(cmd, "quiesce"))
2364 ret = cmd_dev_quiesce(&ctx);
2365 else
2366 cmd_dev_help(argv[0]);
2367
2368 return ret;
2369 }
2370