1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * FUSE: Filesystem in Userspace 4 * Copyright (c) 2023-2024 DataDirect Networks. 5 */ 6 7 #include "fuse_i.h" 8 #include "dev_uring_i.h" 9 #include "fuse_dev_i.h" 10 11 #include <linux/fs.h> 12 #include <linux/io_uring/cmd.h> 13 14 static bool __read_mostly enable_uring; 15 module_param(enable_uring, bool, 0644); 16 MODULE_PARM_DESC(enable_uring, 17 "Enable userspace communication through io-uring"); 18 19 #define FUSE_URING_IOV_SEGS 2 /* header and payload */ 20 21 22 bool fuse_uring_enabled(void) 23 { 24 return enable_uring; 25 } 26 27 struct fuse_uring_pdu { 28 struct fuse_ring_ent *ent; 29 }; 30 31 static const struct fuse_iqueue_ops fuse_io_uring_ops; 32 33 static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, 34 struct fuse_ring_ent *ring_ent) 35 { 36 struct fuse_uring_pdu *pdu = 37 io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); 38 39 pdu->ent = ring_ent; 40 } 41 42 static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) 43 { 44 struct fuse_uring_pdu *pdu = 45 io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); 46 47 return pdu->ent; 48 } 49 50 static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) 51 { 52 struct fuse_ring *ring = queue->ring; 53 struct fuse_conn *fc = ring->fc; 54 55 lockdep_assert_held(&queue->lock); 56 lockdep_assert_held(&fc->bg_lock); 57 58 /* 59 * Allow one bg request per queue, ignoring global fc limits. 60 * This prevents a single queue from consuming all resources and 61 * eliminates the need for remote queue wake-ups when global 62 * limits are met but this queue has no more waiting requests. 63 */ 64 while ((fc->active_background < fc->max_background || 65 !queue->active_background) && 66 (!list_empty(&queue->fuse_req_bg_queue))) { 67 struct fuse_req *req; 68 69 req = list_first_entry(&queue->fuse_req_bg_queue, 70 struct fuse_req, list); 71 fc->active_background++; 72 queue->active_background++; 73 74 list_move_tail(&req->list, &queue->fuse_req_queue); 75 } 76 } 77 78 static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, 79 int error) 80 { 81 struct fuse_ring_queue *queue = ent->queue; 82 struct fuse_ring *ring = queue->ring; 83 struct fuse_conn *fc = ring->fc; 84 85 lockdep_assert_not_held(&queue->lock); 86 spin_lock(&queue->lock); 87 ent->fuse_req = NULL; 88 if (test_bit(FR_BACKGROUND, &req->flags)) { 89 queue->active_background--; 90 spin_lock(&fc->bg_lock); 91 fuse_uring_flush_bg(queue); 92 spin_unlock(&fc->bg_lock); 93 } 94 95 spin_unlock(&queue->lock); 96 97 if (error) 98 req->out.h.error = error; 99 100 clear_bit(FR_SENT, &req->flags); 101 fuse_request_end(req); 102 } 103 104 /* Abort all list queued request on the given ring queue */ 105 static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) 106 { 107 struct fuse_req *req; 108 LIST_HEAD(req_list); 109 110 spin_lock(&queue->lock); 111 list_for_each_entry(req, &queue->fuse_req_queue, list) 112 clear_bit(FR_PENDING, &req->flags); 113 list_splice_init(&queue->fuse_req_queue, &req_list); 114 spin_unlock(&queue->lock); 115 116 /* must not hold queue lock to avoid order issues with fi->lock */ 117 fuse_dev_end_requests(&req_list); 118 } 119 120 void fuse_uring_abort_end_requests(struct fuse_ring *ring) 121 { 122 int qid; 123 struct fuse_ring_queue *queue; 124 struct fuse_conn *fc = ring->fc; 125 126 for (qid = 0; qid < ring->nr_queues; qid++) { 127 queue = READ_ONCE(ring->queues[qid]); 128 if (!queue) 129 continue; 130 131 queue->stopped = true; 132 133 WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); 134 spin_lock(&queue->lock); 135 spin_lock(&fc->bg_lock); 136 fuse_uring_flush_bg(queue); 137 spin_unlock(&fc->bg_lock); 138 spin_unlock(&queue->lock); 139 fuse_uring_abort_end_queue_requests(queue); 140 } 141 } 142 143 static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) 144 { 145 struct fuse_ring_ent *ent; 146 struct fuse_req *req; 147 148 ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); 149 if (!ent) 150 return false; 151 152 req = ent->fuse_req; 153 154 return time_is_before_jiffies(req->create_time + 155 fc->timeout.req_timeout); 156 } 157 158 bool fuse_uring_request_expired(struct fuse_conn *fc) 159 { 160 struct fuse_ring *ring = fc->ring; 161 struct fuse_ring_queue *queue; 162 int qid; 163 164 if (!ring) 165 return false; 166 167 for (qid = 0; qid < ring->nr_queues; qid++) { 168 queue = READ_ONCE(ring->queues[qid]); 169 if (!queue) 170 continue; 171 172 spin_lock(&queue->lock); 173 if (fuse_request_expired(fc, &queue->fuse_req_queue) || 174 fuse_request_expired(fc, &queue->fuse_req_bg_queue) || 175 ent_list_request_expired(fc, &queue->ent_w_req_queue) || 176 ent_list_request_expired(fc, &queue->ent_in_userspace)) { 177 spin_unlock(&queue->lock); 178 return true; 179 } 180 spin_unlock(&queue->lock); 181 } 182 183 return false; 184 } 185 186 void fuse_uring_destruct(struct fuse_conn *fc) 187 { 188 struct fuse_ring *ring = fc->ring; 189 int qid; 190 191 if (!ring) 192 return; 193 194 for (qid = 0; qid < ring->nr_queues; qid++) { 195 struct fuse_ring_queue *queue = ring->queues[qid]; 196 struct fuse_ring_ent *ent, *next; 197 198 if (!queue) 199 continue; 200 201 WARN_ON(!list_empty(&queue->ent_avail_queue)); 202 WARN_ON(!list_empty(&queue->ent_w_req_queue)); 203 WARN_ON(!list_empty(&queue->ent_commit_queue)); 204 WARN_ON(!list_empty(&queue->ent_in_userspace)); 205 206 list_for_each_entry_safe(ent, next, &queue->ent_released, 207 list) { 208 list_del_init(&ent->list); 209 kfree(ent); 210 } 211 212 kfree(queue->fpq.processing); 213 kfree(queue); 214 ring->queues[qid] = NULL; 215 } 216 217 kfree(ring->queues); 218 kfree(ring); 219 fc->ring = NULL; 220 } 221 222 /* 223 * Basic ring setup for this connection based on the provided configuration 224 */ 225 static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) 226 { 227 struct fuse_ring *ring; 228 size_t nr_queues = num_possible_cpus(); 229 struct fuse_ring *res = NULL; 230 size_t max_payload_size; 231 232 ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); 233 if (!ring) 234 return NULL; 235 236 ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), 237 GFP_KERNEL_ACCOUNT); 238 if (!ring->queues) 239 goto out_err; 240 241 max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); 242 max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); 243 244 spin_lock(&fc->lock); 245 if (fc->ring) { 246 /* race, another thread created the ring in the meantime */ 247 spin_unlock(&fc->lock); 248 res = fc->ring; 249 goto out_err; 250 } 251 252 init_waitqueue_head(&ring->stop_waitq); 253 254 ring->nr_queues = nr_queues; 255 ring->fc = fc; 256 ring->max_payload_sz = max_payload_size; 257 smp_store_release(&fc->ring, ring); 258 259 spin_unlock(&fc->lock); 260 return ring; 261 262 out_err: 263 kfree(ring->queues); 264 kfree(ring); 265 return res; 266 } 267 268 static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, 269 int qid) 270 { 271 struct fuse_conn *fc = ring->fc; 272 struct fuse_ring_queue *queue; 273 struct list_head *pq; 274 275 queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); 276 if (!queue) 277 return NULL; 278 pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); 279 if (!pq) { 280 kfree(queue); 281 return NULL; 282 } 283 284 queue->qid = qid; 285 queue->ring = ring; 286 spin_lock_init(&queue->lock); 287 288 INIT_LIST_HEAD(&queue->ent_avail_queue); 289 INIT_LIST_HEAD(&queue->ent_commit_queue); 290 INIT_LIST_HEAD(&queue->ent_w_req_queue); 291 INIT_LIST_HEAD(&queue->ent_in_userspace); 292 INIT_LIST_HEAD(&queue->fuse_req_queue); 293 INIT_LIST_HEAD(&queue->fuse_req_bg_queue); 294 INIT_LIST_HEAD(&queue->ent_released); 295 296 queue->fpq.processing = pq; 297 fuse_pqueue_init(&queue->fpq); 298 299 spin_lock(&fc->lock); 300 if (ring->queues[qid]) { 301 spin_unlock(&fc->lock); 302 kfree(queue->fpq.processing); 303 kfree(queue); 304 return ring->queues[qid]; 305 } 306 307 /* 308 * write_once and lock as the caller mostly doesn't take the lock at all 309 */ 310 WRITE_ONCE(ring->queues[qid], queue); 311 spin_unlock(&fc->lock); 312 313 return queue; 314 } 315 316 static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) 317 { 318 clear_bit(FR_SENT, &req->flags); 319 req->out.h.error = -ECONNABORTED; 320 fuse_request_end(req); 321 } 322 323 /* 324 * Release a request/entry on connection tear down 325 */ 326 static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) 327 { 328 struct fuse_req *req; 329 struct io_uring_cmd *cmd; 330 331 struct fuse_ring_queue *queue = ent->queue; 332 333 spin_lock(&queue->lock); 334 cmd = ent->cmd; 335 ent->cmd = NULL; 336 req = ent->fuse_req; 337 ent->fuse_req = NULL; 338 if (req) { 339 /* remove entry from queue->fpq->processing */ 340 list_del_init(&req->list); 341 } 342 343 /* 344 * The entry must not be freed immediately, due to access of direct 345 * pointer access of entries through IO_URING_F_CANCEL - there is a risk 346 * of race between daemon termination (which triggers IO_URING_F_CANCEL 347 * and accesses entries without checking the list state first 348 */ 349 list_move(&ent->list, &queue->ent_released); 350 ent->state = FRRS_RELEASED; 351 spin_unlock(&queue->lock); 352 353 if (cmd) 354 io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); 355 356 if (req) 357 fuse_uring_stop_fuse_req_end(req); 358 } 359 360 static void fuse_uring_stop_list_entries(struct list_head *head, 361 struct fuse_ring_queue *queue, 362 enum fuse_ring_req_state exp_state) 363 { 364 struct fuse_ring *ring = queue->ring; 365 struct fuse_ring_ent *ent, *next; 366 ssize_t queue_refs = SSIZE_MAX; 367 LIST_HEAD(to_teardown); 368 369 spin_lock(&queue->lock); 370 list_for_each_entry_safe(ent, next, head, list) { 371 if (ent->state != exp_state) { 372 pr_warn("entry teardown qid=%d state=%d expected=%d", 373 queue->qid, ent->state, exp_state); 374 continue; 375 } 376 377 ent->state = FRRS_TEARDOWN; 378 list_move(&ent->list, &to_teardown); 379 } 380 spin_unlock(&queue->lock); 381 382 /* no queue lock to avoid lock order issues */ 383 list_for_each_entry_safe(ent, next, &to_teardown, list) { 384 fuse_uring_entry_teardown(ent); 385 queue_refs = atomic_dec_return(&ring->queue_refs); 386 WARN_ON_ONCE(queue_refs < 0); 387 } 388 } 389 390 static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) 391 { 392 fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, 393 FRRS_USERSPACE); 394 fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, 395 FRRS_AVAILABLE); 396 } 397 398 /* 399 * Log state debug info 400 */ 401 static void fuse_uring_log_ent_state(struct fuse_ring *ring) 402 { 403 int qid; 404 struct fuse_ring_ent *ent; 405 406 for (qid = 0; qid < ring->nr_queues; qid++) { 407 struct fuse_ring_queue *queue = ring->queues[qid]; 408 409 if (!queue) 410 continue; 411 412 spin_lock(&queue->lock); 413 /* 414 * Log entries from the intermediate queue, the other queues 415 * should be empty 416 */ 417 list_for_each_entry(ent, &queue->ent_w_req_queue, list) { 418 pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", 419 ring, qid, ent, ent->state); 420 } 421 list_for_each_entry(ent, &queue->ent_commit_queue, list) { 422 pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", 423 ring, qid, ent, ent->state); 424 } 425 spin_unlock(&queue->lock); 426 } 427 ring->stop_debug_log = 1; 428 } 429 430 static void fuse_uring_async_stop_queues(struct work_struct *work) 431 { 432 int qid; 433 struct fuse_ring *ring = 434 container_of(work, struct fuse_ring, async_teardown_work.work); 435 436 /* XXX code dup */ 437 for (qid = 0; qid < ring->nr_queues; qid++) { 438 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 439 440 if (!queue) 441 continue; 442 443 fuse_uring_teardown_entries(queue); 444 } 445 446 /* 447 * Some ring entries might be in the middle of IO operations, 448 * i.e. in process to get handled by file_operations::uring_cmd 449 * or on the way to userspace - we could handle that with conditions in 450 * run time code, but easier/cleaner to have an async tear down handler 451 * If there are still queue references left 452 */ 453 if (atomic_read(&ring->queue_refs) > 0) { 454 if (time_after(jiffies, 455 ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) 456 fuse_uring_log_ent_state(ring); 457 458 schedule_delayed_work(&ring->async_teardown_work, 459 FUSE_URING_TEARDOWN_INTERVAL); 460 } else { 461 wake_up_all(&ring->stop_waitq); 462 } 463 } 464 465 /* 466 * Stop the ring queues 467 */ 468 void fuse_uring_stop_queues(struct fuse_ring *ring) 469 { 470 int qid; 471 472 for (qid = 0; qid < ring->nr_queues; qid++) { 473 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 474 475 if (!queue) 476 continue; 477 478 fuse_uring_teardown_entries(queue); 479 } 480 481 if (atomic_read(&ring->queue_refs) > 0) { 482 ring->teardown_time = jiffies; 483 INIT_DELAYED_WORK(&ring->async_teardown_work, 484 fuse_uring_async_stop_queues); 485 schedule_delayed_work(&ring->async_teardown_work, 486 FUSE_URING_TEARDOWN_INTERVAL); 487 } else { 488 wake_up_all(&ring->stop_waitq); 489 } 490 } 491 492 /* 493 * Handle IO_URING_F_CANCEL, typically should come on daemon termination. 494 * 495 * Releasing the last entry should trigger fuse_dev_release() if 496 * the daemon was terminated 497 */ 498 static void fuse_uring_cancel(struct io_uring_cmd *cmd, 499 unsigned int issue_flags) 500 { 501 struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); 502 struct fuse_ring_queue *queue; 503 bool need_cmd_done = false; 504 505 /* 506 * direct access on ent - it must not be destructed as long as 507 * IO_URING_F_CANCEL might come up 508 */ 509 queue = ent->queue; 510 spin_lock(&queue->lock); 511 if (ent->state == FRRS_AVAILABLE) { 512 ent->state = FRRS_USERSPACE; 513 list_move_tail(&ent->list, &queue->ent_in_userspace); 514 need_cmd_done = true; 515 ent->cmd = NULL; 516 } 517 spin_unlock(&queue->lock); 518 519 if (need_cmd_done) { 520 /* no queue lock to avoid lock order issues */ 521 io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); 522 } 523 } 524 525 static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, 526 struct fuse_ring_ent *ring_ent) 527 { 528 uring_cmd_set_ring_ent(cmd, ring_ent); 529 io_uring_cmd_mark_cancelable(cmd, issue_flags); 530 } 531 532 /* 533 * Checks for errors and stores it into the request 534 */ 535 static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, 536 struct fuse_req *req, 537 struct fuse_conn *fc) 538 { 539 int err; 540 541 err = -EINVAL; 542 if (oh->unique == 0) { 543 /* Not supported through io-uring yet */ 544 pr_warn_once("notify through fuse-io-uring not supported\n"); 545 goto err; 546 } 547 548 if (oh->error <= -ERESTARTSYS || oh->error > 0) 549 goto err; 550 551 if (oh->error) { 552 err = oh->error; 553 goto err; 554 } 555 556 err = -ENOENT; 557 if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { 558 pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", 559 req->in.h.unique, 560 oh->unique & ~FUSE_INT_REQ_BIT); 561 goto err; 562 } 563 564 /* 565 * Is it an interrupt reply ID? 566 * XXX: Not supported through fuse-io-uring yet, it should not even 567 * find the request - should not happen. 568 */ 569 WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); 570 571 err = 0; 572 err: 573 return err; 574 } 575 576 static int fuse_uring_copy_from_ring(struct fuse_ring *ring, 577 struct fuse_req *req, 578 struct fuse_ring_ent *ent) 579 { 580 struct fuse_copy_state cs; 581 struct fuse_args *args = req->args; 582 struct iov_iter iter; 583 int err; 584 struct fuse_uring_ent_in_out ring_in_out; 585 586 err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, 587 sizeof(ring_in_out)); 588 if (err) 589 return -EFAULT; 590 591 err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, 592 &iter); 593 if (err) 594 return err; 595 596 fuse_copy_init(&cs, false, &iter); 597 cs.is_uring = true; 598 cs.req = req; 599 600 return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); 601 } 602 603 /* 604 * Copy data from the req to the ring buffer 605 */ 606 static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, 607 struct fuse_ring_ent *ent) 608 { 609 struct fuse_copy_state cs; 610 struct fuse_args *args = req->args; 611 struct fuse_in_arg *in_args = args->in_args; 612 int num_args = args->in_numargs; 613 int err; 614 struct iov_iter iter; 615 struct fuse_uring_ent_in_out ent_in_out = { 616 .flags = 0, 617 .commit_id = req->in.h.unique, 618 }; 619 620 err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); 621 if (err) { 622 pr_info_ratelimited("fuse: Import of user buffer failed\n"); 623 return err; 624 } 625 626 fuse_copy_init(&cs, true, &iter); 627 cs.is_uring = true; 628 cs.req = req; 629 630 if (num_args > 0) { 631 /* 632 * Expectation is that the first argument is the per op header. 633 * Some op code have that as zero size. 634 */ 635 if (args->in_args[0].size > 0) { 636 err = copy_to_user(&ent->headers->op_in, in_args->value, 637 in_args->size); 638 if (err) { 639 pr_info_ratelimited( 640 "Copying the header failed.\n"); 641 return -EFAULT; 642 } 643 } 644 in_args++; 645 num_args--; 646 } 647 648 /* copy the payload */ 649 err = fuse_copy_args(&cs, num_args, args->in_pages, 650 (struct fuse_arg *)in_args, 0); 651 if (err) { 652 pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); 653 return err; 654 } 655 656 ent_in_out.payload_sz = cs.ring.copied_sz; 657 err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, 658 sizeof(ent_in_out)); 659 return err ? -EFAULT : 0; 660 } 661 662 static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, 663 struct fuse_req *req) 664 { 665 struct fuse_ring_queue *queue = ent->queue; 666 struct fuse_ring *ring = queue->ring; 667 int err; 668 669 err = -EIO; 670 if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { 671 pr_err("qid=%d ring-req=%p invalid state %d on send\n", 672 queue->qid, ent, ent->state); 673 return err; 674 } 675 676 err = -EINVAL; 677 if (WARN_ON(req->in.h.unique == 0)) 678 return err; 679 680 /* copy the request */ 681 err = fuse_uring_args_to_ring(ring, req, ent); 682 if (unlikely(err)) { 683 pr_info_ratelimited("Copy to ring failed: %d\n", err); 684 return err; 685 } 686 687 /* copy fuse_in_header */ 688 err = copy_to_user(&ent->headers->in_out, &req->in.h, 689 sizeof(req->in.h)); 690 if (err) { 691 err = -EFAULT; 692 return err; 693 } 694 695 return 0; 696 } 697 698 static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, 699 struct fuse_req *req) 700 { 701 int err; 702 703 err = fuse_uring_copy_to_ring(ent, req); 704 if (!err) 705 set_bit(FR_SENT, &req->flags); 706 else 707 fuse_uring_req_end(ent, req, err); 708 709 return err; 710 } 711 712 /* 713 * Write data to the ring buffer and send the request to userspace, 714 * userspace will read it 715 * This is comparable with classical read(/dev/fuse) 716 */ 717 static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, 718 struct fuse_req *req, 719 unsigned int issue_flags) 720 { 721 struct fuse_ring_queue *queue = ent->queue; 722 int err; 723 struct io_uring_cmd *cmd; 724 725 err = fuse_uring_prepare_send(ent, req); 726 if (err) 727 return err; 728 729 spin_lock(&queue->lock); 730 cmd = ent->cmd; 731 ent->cmd = NULL; 732 ent->state = FRRS_USERSPACE; 733 list_move_tail(&ent->list, &queue->ent_in_userspace); 734 spin_unlock(&queue->lock); 735 736 io_uring_cmd_done(cmd, 0, 0, issue_flags); 737 return 0; 738 } 739 740 /* 741 * Make a ring entry available for fuse_req assignment 742 */ 743 static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, 744 struct fuse_ring_queue *queue) 745 { 746 WARN_ON_ONCE(!ent->cmd); 747 list_move(&ent->list, &queue->ent_avail_queue); 748 ent->state = FRRS_AVAILABLE; 749 } 750 751 /* Used to find the request on SQE commit */ 752 static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, 753 struct fuse_req *req) 754 { 755 struct fuse_ring_queue *queue = ent->queue; 756 struct fuse_pqueue *fpq = &queue->fpq; 757 unsigned int hash; 758 759 req->ring_entry = ent; 760 hash = fuse_req_hash(req->in.h.unique); 761 list_move_tail(&req->list, &fpq->processing[hash]); 762 } 763 764 /* 765 * Assign a fuse queue entry to the given entry 766 */ 767 static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, 768 struct fuse_req *req) 769 { 770 struct fuse_ring_queue *queue = ent->queue; 771 772 lockdep_assert_held(&queue->lock); 773 774 if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && 775 ent->state != FRRS_COMMIT)) { 776 pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, 777 ent->state); 778 } 779 780 clear_bit(FR_PENDING, &req->flags); 781 ent->fuse_req = req; 782 ent->state = FRRS_FUSE_REQ; 783 list_move_tail(&ent->list, &queue->ent_w_req_queue); 784 fuse_uring_add_to_pq(ent, req); 785 } 786 787 /* Fetch the next fuse request if available */ 788 static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) 789 __must_hold(&queue->lock) 790 { 791 struct fuse_req *req; 792 struct fuse_ring_queue *queue = ent->queue; 793 struct list_head *req_queue = &queue->fuse_req_queue; 794 795 lockdep_assert_held(&queue->lock); 796 797 /* get and assign the next entry while it is still holding the lock */ 798 req = list_first_entry_or_null(req_queue, struct fuse_req, list); 799 if (req) 800 fuse_uring_add_req_to_ring_ent(ent, req); 801 802 return req; 803 } 804 805 /* 806 * Read data from the ring buffer, which user space has written to 807 * This is comparible with handling of classical write(/dev/fuse). 808 * Also make the ring request available again for new fuse requests. 809 */ 810 static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, 811 unsigned int issue_flags) 812 { 813 struct fuse_ring *ring = ent->queue->ring; 814 struct fuse_conn *fc = ring->fc; 815 ssize_t err = 0; 816 817 err = copy_from_user(&req->out.h, &ent->headers->in_out, 818 sizeof(req->out.h)); 819 if (err) { 820 req->out.h.error = -EFAULT; 821 goto out; 822 } 823 824 err = fuse_uring_out_header_has_err(&req->out.h, req, fc); 825 if (err) { 826 /* req->out.h.error already set */ 827 goto out; 828 } 829 830 err = fuse_uring_copy_from_ring(ring, req, ent); 831 out: 832 fuse_uring_req_end(ent, req, err); 833 } 834 835 /* 836 * Get the next fuse req and send it 837 */ 838 static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, 839 struct fuse_ring_queue *queue, 840 unsigned int issue_flags) 841 { 842 int err; 843 struct fuse_req *req; 844 845 retry: 846 spin_lock(&queue->lock); 847 fuse_uring_ent_avail(ent, queue); 848 req = fuse_uring_ent_assign_req(ent); 849 spin_unlock(&queue->lock); 850 851 if (req) { 852 err = fuse_uring_send_next_to_ring(ent, req, issue_flags); 853 if (err) 854 goto retry; 855 } 856 } 857 858 static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) 859 { 860 struct fuse_ring_queue *queue = ent->queue; 861 862 lockdep_assert_held(&queue->lock); 863 864 if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) 865 return -EIO; 866 867 ent->state = FRRS_COMMIT; 868 list_move(&ent->list, &queue->ent_commit_queue); 869 870 return 0; 871 } 872 873 /* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ 874 static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, 875 struct fuse_conn *fc) 876 { 877 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); 878 struct fuse_ring_ent *ent; 879 int err; 880 struct fuse_ring *ring = fc->ring; 881 struct fuse_ring_queue *queue; 882 uint64_t commit_id = READ_ONCE(cmd_req->commit_id); 883 unsigned int qid = READ_ONCE(cmd_req->qid); 884 struct fuse_pqueue *fpq; 885 struct fuse_req *req; 886 887 err = -ENOTCONN; 888 if (!ring) 889 return err; 890 891 if (qid >= ring->nr_queues) 892 return -EINVAL; 893 894 queue = ring->queues[qid]; 895 if (!queue) 896 return err; 897 fpq = &queue->fpq; 898 899 if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) 900 return err; 901 902 spin_lock(&queue->lock); 903 /* Find a request based on the unique ID of the fuse request 904 * This should get revised, as it needs a hash calculation and list 905 * search. And full struct fuse_pqueue is needed (memory overhead). 906 * As well as the link from req to ring_ent. 907 */ 908 req = fuse_request_find(fpq, commit_id); 909 err = -ENOENT; 910 if (!req) { 911 pr_info("qid=%d commit_id %llu not found\n", queue->qid, 912 commit_id); 913 spin_unlock(&queue->lock); 914 return err; 915 } 916 list_del_init(&req->list); 917 ent = req->ring_entry; 918 req->ring_entry = NULL; 919 920 err = fuse_ring_ent_set_commit(ent); 921 if (err != 0) { 922 pr_info_ratelimited("qid=%d commit_id %llu state %d", 923 queue->qid, commit_id, ent->state); 924 spin_unlock(&queue->lock); 925 req->out.h.error = err; 926 clear_bit(FR_SENT, &req->flags); 927 fuse_request_end(req); 928 return err; 929 } 930 931 ent->cmd = cmd; 932 spin_unlock(&queue->lock); 933 934 /* without the queue lock, as other locks are taken */ 935 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 936 fuse_uring_commit(ent, req, issue_flags); 937 938 /* 939 * Fetching the next request is absolutely required as queued 940 * fuse requests would otherwise not get processed - committing 941 * and fetching is done in one step vs legacy fuse, which has separated 942 * read (fetch request) and write (commit result). 943 */ 944 fuse_uring_next_fuse_req(ent, queue, issue_flags); 945 return 0; 946 } 947 948 static bool is_ring_ready(struct fuse_ring *ring, int current_qid) 949 { 950 int qid; 951 struct fuse_ring_queue *queue; 952 bool ready = true; 953 954 for (qid = 0; qid < ring->nr_queues && ready; qid++) { 955 if (current_qid == qid) 956 continue; 957 958 queue = ring->queues[qid]; 959 if (!queue) { 960 ready = false; 961 break; 962 } 963 964 spin_lock(&queue->lock); 965 if (list_empty(&queue->ent_avail_queue)) 966 ready = false; 967 spin_unlock(&queue->lock); 968 } 969 970 return ready; 971 } 972 973 /* 974 * fuse_uring_req_fetch command handling 975 */ 976 static void fuse_uring_do_register(struct fuse_ring_ent *ent, 977 struct io_uring_cmd *cmd, 978 unsigned int issue_flags) 979 { 980 struct fuse_ring_queue *queue = ent->queue; 981 struct fuse_ring *ring = queue->ring; 982 struct fuse_conn *fc = ring->fc; 983 struct fuse_iqueue *fiq = &fc->iq; 984 985 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 986 987 spin_lock(&queue->lock); 988 ent->cmd = cmd; 989 fuse_uring_ent_avail(ent, queue); 990 spin_unlock(&queue->lock); 991 992 if (!ring->ready) { 993 bool ready = is_ring_ready(ring, queue->qid); 994 995 if (ready) { 996 WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); 997 WRITE_ONCE(ring->ready, true); 998 wake_up_all(&fc->blocked_waitq); 999 } 1000 } 1001 } 1002 1003 /* 1004 * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] 1005 * the payload 1006 */ 1007 static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, 1008 struct iovec iov[FUSE_URING_IOV_SEGS]) 1009 { 1010 struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1011 struct iov_iter iter; 1012 ssize_t ret; 1013 1014 if (sqe->len != FUSE_URING_IOV_SEGS) 1015 return -EINVAL; 1016 1017 /* 1018 * Direction for buffer access will actually be READ and WRITE, 1019 * using write for the import should include READ access as well. 1020 */ 1021 ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, 1022 FUSE_URING_IOV_SEGS, &iov, &iter); 1023 if (ret < 0) 1024 return ret; 1025 1026 return 0; 1027 } 1028 1029 static struct fuse_ring_ent * 1030 fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, 1031 struct fuse_ring_queue *queue) 1032 { 1033 struct fuse_ring *ring = queue->ring; 1034 struct fuse_ring_ent *ent; 1035 size_t payload_size; 1036 struct iovec iov[FUSE_URING_IOV_SEGS]; 1037 int err; 1038 1039 err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); 1040 if (err) { 1041 pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", 1042 err); 1043 return ERR_PTR(err); 1044 } 1045 1046 err = -EINVAL; 1047 if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { 1048 pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); 1049 return ERR_PTR(err); 1050 } 1051 1052 payload_size = iov[1].iov_len; 1053 if (payload_size < ring->max_payload_sz) { 1054 pr_info_ratelimited("Invalid req payload len %zu\n", 1055 payload_size); 1056 return ERR_PTR(err); 1057 } 1058 1059 err = -ENOMEM; 1060 ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); 1061 if (!ent) 1062 return ERR_PTR(err); 1063 1064 INIT_LIST_HEAD(&ent->list); 1065 1066 ent->queue = queue; 1067 ent->headers = iov[0].iov_base; 1068 ent->payload = iov[1].iov_base; 1069 1070 atomic_inc(&ring->queue_refs); 1071 return ent; 1072 } 1073 1074 /* 1075 * Register header and payload buffer with the kernel and puts the 1076 * entry as "ready to get fuse requests" on the queue 1077 */ 1078 static int fuse_uring_register(struct io_uring_cmd *cmd, 1079 unsigned int issue_flags, struct fuse_conn *fc) 1080 { 1081 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); 1082 struct fuse_ring *ring = smp_load_acquire(&fc->ring); 1083 struct fuse_ring_queue *queue; 1084 struct fuse_ring_ent *ent; 1085 int err; 1086 unsigned int qid = READ_ONCE(cmd_req->qid); 1087 1088 err = -ENOMEM; 1089 if (!ring) { 1090 ring = fuse_uring_create(fc); 1091 if (!ring) 1092 return err; 1093 } 1094 1095 if (qid >= ring->nr_queues) { 1096 pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); 1097 return -EINVAL; 1098 } 1099 1100 queue = ring->queues[qid]; 1101 if (!queue) { 1102 queue = fuse_uring_create_queue(ring, qid); 1103 if (!queue) 1104 return err; 1105 } 1106 1107 /* 1108 * The created queue above does not need to be destructed in 1109 * case of entry errors below, will be done at ring destruction time. 1110 */ 1111 1112 ent = fuse_uring_create_ring_ent(cmd, queue); 1113 if (IS_ERR(ent)) 1114 return PTR_ERR(ent); 1115 1116 fuse_uring_do_register(ent, cmd, issue_flags); 1117 1118 return 0; 1119 } 1120 1121 /* 1122 * Entry function from io_uring to handle the given passthrough command 1123 * (op code IORING_OP_URING_CMD) 1124 */ 1125 int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) 1126 { 1127 struct fuse_dev *fud; 1128 struct fuse_conn *fc; 1129 u32 cmd_op = cmd->cmd_op; 1130 int err; 1131 1132 if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { 1133 fuse_uring_cancel(cmd, issue_flags); 1134 return 0; 1135 } 1136 1137 /* This extra SQE size holds struct fuse_uring_cmd_req */ 1138 if (!(issue_flags & IO_URING_F_SQE128)) 1139 return -EINVAL; 1140 1141 fud = fuse_get_dev(cmd->file); 1142 if (!fud) { 1143 pr_info_ratelimited("No fuse device found\n"); 1144 return -ENOTCONN; 1145 } 1146 fc = fud->fc; 1147 1148 /* Once a connection has io-uring enabled on it, it can't be disabled */ 1149 if (!enable_uring && !fc->io_uring) { 1150 pr_info_ratelimited("fuse-io-uring is disabled\n"); 1151 return -EOPNOTSUPP; 1152 } 1153 1154 if (fc->aborted) 1155 return -ECONNABORTED; 1156 if (!fc->connected) 1157 return -ENOTCONN; 1158 1159 /* 1160 * fuse_uring_register() needs the ring to be initialized, 1161 * we need to know the max payload size 1162 */ 1163 if (!fc->initialized) 1164 return -EAGAIN; 1165 1166 switch (cmd_op) { 1167 case FUSE_IO_URING_CMD_REGISTER: 1168 err = fuse_uring_register(cmd, issue_flags, fc); 1169 if (err) { 1170 pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", 1171 err); 1172 fc->io_uring = 0; 1173 wake_up_all(&fc->blocked_waitq); 1174 return err; 1175 } 1176 break; 1177 case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: 1178 err = fuse_uring_commit_fetch(cmd, issue_flags, fc); 1179 if (err) { 1180 pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", 1181 err); 1182 return err; 1183 } 1184 break; 1185 default: 1186 return -EINVAL; 1187 } 1188 1189 return -EIOCBQUEUED; 1190 } 1191 1192 static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, 1193 ssize_t ret, unsigned int issue_flags) 1194 { 1195 struct fuse_ring_queue *queue = ent->queue; 1196 1197 spin_lock(&queue->lock); 1198 ent->state = FRRS_USERSPACE; 1199 list_move_tail(&ent->list, &queue->ent_in_userspace); 1200 ent->cmd = NULL; 1201 spin_unlock(&queue->lock); 1202 1203 io_uring_cmd_done(cmd, ret, 0, issue_flags); 1204 } 1205 1206 /* 1207 * This prepares and sends the ring request in fuse-uring task context. 1208 * User buffers are not mapped yet - the application does not have permission 1209 * to write to it - this has to be executed in ring task context. 1210 */ 1211 static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, 1212 unsigned int issue_flags) 1213 { 1214 struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); 1215 struct fuse_ring_queue *queue = ent->queue; 1216 int err; 1217 1218 if (!(issue_flags & IO_URING_F_TASK_DEAD)) { 1219 err = fuse_uring_prepare_send(ent, ent->fuse_req); 1220 if (err) { 1221 fuse_uring_next_fuse_req(ent, queue, issue_flags); 1222 return; 1223 } 1224 } else { 1225 err = -ECANCELED; 1226 } 1227 1228 fuse_uring_send(ent, cmd, err, issue_flags); 1229 } 1230 1231 static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) 1232 { 1233 unsigned int qid; 1234 struct fuse_ring_queue *queue; 1235 1236 qid = task_cpu(current); 1237 1238 if (WARN_ONCE(qid >= ring->nr_queues, 1239 "Core number (%u) exceeds nr queues (%zu)\n", qid, 1240 ring->nr_queues)) 1241 qid = 0; 1242 1243 queue = ring->queues[qid]; 1244 WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); 1245 1246 return queue; 1247 } 1248 1249 static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) 1250 { 1251 struct io_uring_cmd *cmd = ent->cmd; 1252 1253 uring_cmd_set_ring_ent(cmd, ent); 1254 io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); 1255 } 1256 1257 /* queue a fuse request and send it if a ring entry is available */ 1258 void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) 1259 { 1260 struct fuse_conn *fc = req->fm->fc; 1261 struct fuse_ring *ring = fc->ring; 1262 struct fuse_ring_queue *queue; 1263 struct fuse_ring_ent *ent = NULL; 1264 int err; 1265 1266 err = -EINVAL; 1267 queue = fuse_uring_task_to_queue(ring); 1268 if (!queue) 1269 goto err; 1270 1271 if (req->in.h.opcode != FUSE_NOTIFY_REPLY) 1272 req->in.h.unique = fuse_get_unique(fiq); 1273 1274 spin_lock(&queue->lock); 1275 err = -ENOTCONN; 1276 if (unlikely(queue->stopped)) 1277 goto err_unlock; 1278 1279 set_bit(FR_URING, &req->flags); 1280 req->ring_queue = queue; 1281 ent = list_first_entry_or_null(&queue->ent_avail_queue, 1282 struct fuse_ring_ent, list); 1283 if (ent) 1284 fuse_uring_add_req_to_ring_ent(ent, req); 1285 else 1286 list_add_tail(&req->list, &queue->fuse_req_queue); 1287 spin_unlock(&queue->lock); 1288 1289 if (ent) 1290 fuse_uring_dispatch_ent(ent); 1291 1292 return; 1293 1294 err_unlock: 1295 spin_unlock(&queue->lock); 1296 err: 1297 req->out.h.error = err; 1298 clear_bit(FR_PENDING, &req->flags); 1299 fuse_request_end(req); 1300 } 1301 1302 bool fuse_uring_queue_bq_req(struct fuse_req *req) 1303 { 1304 struct fuse_conn *fc = req->fm->fc; 1305 struct fuse_ring *ring = fc->ring; 1306 struct fuse_ring_queue *queue; 1307 struct fuse_ring_ent *ent = NULL; 1308 1309 queue = fuse_uring_task_to_queue(ring); 1310 if (!queue) 1311 return false; 1312 1313 spin_lock(&queue->lock); 1314 if (unlikely(queue->stopped)) { 1315 spin_unlock(&queue->lock); 1316 return false; 1317 } 1318 1319 set_bit(FR_URING, &req->flags); 1320 req->ring_queue = queue; 1321 list_add_tail(&req->list, &queue->fuse_req_bg_queue); 1322 1323 ent = list_first_entry_or_null(&queue->ent_avail_queue, 1324 struct fuse_ring_ent, list); 1325 spin_lock(&fc->bg_lock); 1326 fc->num_background++; 1327 if (fc->num_background == fc->max_background) 1328 fc->blocked = 1; 1329 fuse_uring_flush_bg(queue); 1330 spin_unlock(&fc->bg_lock); 1331 1332 /* 1333 * Due to bg_queue flush limits there might be other bg requests 1334 * in the queue that need to be handled first. Or no further req 1335 * might be available. 1336 */ 1337 req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, 1338 list); 1339 if (ent && req) { 1340 fuse_uring_add_req_to_ring_ent(ent, req); 1341 spin_unlock(&queue->lock); 1342 1343 fuse_uring_dispatch_ent(ent); 1344 } else { 1345 spin_unlock(&queue->lock); 1346 } 1347 1348 return true; 1349 } 1350 1351 bool fuse_uring_remove_pending_req(struct fuse_req *req) 1352 { 1353 struct fuse_ring_queue *queue = req->ring_queue; 1354 1355 return fuse_remove_pending_req(req, &queue->lock); 1356 } 1357 1358 static const struct fuse_iqueue_ops fuse_io_uring_ops = { 1359 /* should be send over io-uring as enhancement */ 1360 .send_forget = fuse_dev_queue_forget, 1361 1362 /* 1363 * could be send over io-uring, but interrupts should be rare, 1364 * no need to make the code complex 1365 */ 1366 .send_interrupt = fuse_dev_queue_interrupt, 1367 .send_req = fuse_uring_queue_fuse_req, 1368 }; 1369