1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * FUSE: Filesystem in Userspace 4 * Copyright (c) 2023-2024 DataDirect Networks. 5 */ 6 7 #include "fuse_i.h" 8 #include "dev_uring_i.h" 9 #include "fuse_dev_i.h" 10 11 #include <linux/fs.h> 12 #include <linux/io_uring/cmd.h> 13 14 static bool __read_mostly enable_uring; 15 module_param(enable_uring, bool, 0644); 16 MODULE_PARM_DESC(enable_uring, 17 "Enable userspace communication through io-uring"); 18 19 #define FUSE_URING_IOV_SEGS 2 /* header and payload */ 20 21 22 bool fuse_uring_enabled(void) 23 { 24 return enable_uring; 25 } 26 27 struct fuse_uring_pdu { 28 struct fuse_ring_ent *ent; 29 }; 30 31 static const struct fuse_iqueue_ops fuse_io_uring_ops; 32 33 static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, 34 struct fuse_ring_ent *ring_ent) 35 { 36 struct fuse_uring_pdu *pdu = 37 io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); 38 39 pdu->ent = ring_ent; 40 } 41 42 static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) 43 { 44 struct fuse_uring_pdu *pdu = 45 io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); 46 47 return pdu->ent; 48 } 49 50 static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) 51 { 52 struct fuse_ring *ring = queue->ring; 53 struct fuse_conn *fc = ring->fc; 54 55 lockdep_assert_held(&queue->lock); 56 lockdep_assert_held(&fc->bg_lock); 57 58 /* 59 * Allow one bg request per queue, ignoring global fc limits. 60 * This prevents a single queue from consuming all resources and 61 * eliminates the need for remote queue wake-ups when global 62 * limits are met but this queue has no more waiting requests. 63 */ 64 while ((fc->active_background < fc->max_background || 65 !queue->active_background) && 66 (!list_empty(&queue->fuse_req_bg_queue))) { 67 struct fuse_req *req; 68 69 req = list_first_entry(&queue->fuse_req_bg_queue, 70 struct fuse_req, list); 71 fc->active_background++; 72 queue->active_background++; 73 74 list_move_tail(&req->list, &queue->fuse_req_queue); 75 } 76 } 77 78 static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, 79 int error) 80 { 81 struct fuse_ring_queue *queue = ent->queue; 82 struct fuse_ring *ring = queue->ring; 83 struct fuse_conn *fc = ring->fc; 84 85 lockdep_assert_not_held(&queue->lock); 86 spin_lock(&queue->lock); 87 ent->fuse_req = NULL; 88 if (test_bit(FR_BACKGROUND, &req->flags)) { 89 queue->active_background--; 90 spin_lock(&fc->bg_lock); 91 fuse_uring_flush_bg(queue); 92 spin_unlock(&fc->bg_lock); 93 } 94 95 spin_unlock(&queue->lock); 96 97 if (error) 98 req->out.h.error = error; 99 100 clear_bit(FR_SENT, &req->flags); 101 fuse_request_end(req); 102 } 103 104 /* Abort all list queued request on the given ring queue */ 105 static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) 106 { 107 struct fuse_req *req; 108 LIST_HEAD(req_list); 109 110 spin_lock(&queue->lock); 111 list_for_each_entry(req, &queue->fuse_req_queue, list) 112 clear_bit(FR_PENDING, &req->flags); 113 list_splice_init(&queue->fuse_req_queue, &req_list); 114 spin_unlock(&queue->lock); 115 116 /* must not hold queue lock to avoid order issues with fi->lock */ 117 fuse_dev_end_requests(&req_list); 118 } 119 120 void fuse_uring_abort_end_requests(struct fuse_ring *ring) 121 { 122 int qid; 123 struct fuse_ring_queue *queue; 124 struct fuse_conn *fc = ring->fc; 125 126 for (qid = 0; qid < ring->nr_queues; qid++) { 127 queue = READ_ONCE(ring->queues[qid]); 128 if (!queue) 129 continue; 130 131 queue->stopped = true; 132 133 WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); 134 spin_lock(&queue->lock); 135 spin_lock(&fc->bg_lock); 136 fuse_uring_flush_bg(queue); 137 spin_unlock(&fc->bg_lock); 138 spin_unlock(&queue->lock); 139 fuse_uring_abort_end_queue_requests(queue); 140 } 141 } 142 143 bool fuse_uring_request_expired(struct fuse_conn *fc) 144 { 145 struct fuse_ring *ring = fc->ring; 146 struct fuse_ring_queue *queue; 147 int qid; 148 149 if (!ring) 150 return false; 151 152 for (qid = 0; qid < ring->nr_queues; qid++) { 153 queue = READ_ONCE(ring->queues[qid]); 154 if (!queue) 155 continue; 156 157 spin_lock(&queue->lock); 158 if (fuse_request_expired(fc, &queue->fuse_req_queue) || 159 fuse_request_expired(fc, &queue->fuse_req_bg_queue) || 160 fuse_fpq_processing_expired(fc, queue->fpq.processing)) { 161 spin_unlock(&queue->lock); 162 return true; 163 } 164 spin_unlock(&queue->lock); 165 } 166 167 return false; 168 } 169 170 void fuse_uring_destruct(struct fuse_conn *fc) 171 { 172 struct fuse_ring *ring = fc->ring; 173 int qid; 174 175 if (!ring) 176 return; 177 178 for (qid = 0; qid < ring->nr_queues; qid++) { 179 struct fuse_ring_queue *queue = ring->queues[qid]; 180 struct fuse_ring_ent *ent, *next; 181 182 if (!queue) 183 continue; 184 185 WARN_ON(!list_empty(&queue->ent_avail_queue)); 186 WARN_ON(!list_empty(&queue->ent_w_req_queue)); 187 WARN_ON(!list_empty(&queue->ent_commit_queue)); 188 WARN_ON(!list_empty(&queue->ent_in_userspace)); 189 190 list_for_each_entry_safe(ent, next, &queue->ent_released, 191 list) { 192 list_del_init(&ent->list); 193 kfree(ent); 194 } 195 196 kfree(queue->fpq.processing); 197 kfree(queue); 198 ring->queues[qid] = NULL; 199 } 200 201 kfree(ring->queues); 202 kfree(ring); 203 fc->ring = NULL; 204 } 205 206 /* 207 * Basic ring setup for this connection based on the provided configuration 208 */ 209 static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) 210 { 211 struct fuse_ring *ring; 212 size_t nr_queues = num_possible_cpus(); 213 struct fuse_ring *res = NULL; 214 size_t max_payload_size; 215 216 ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); 217 if (!ring) 218 return NULL; 219 220 ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), 221 GFP_KERNEL_ACCOUNT); 222 if (!ring->queues) 223 goto out_err; 224 225 max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); 226 max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); 227 228 spin_lock(&fc->lock); 229 if (fc->ring) { 230 /* race, another thread created the ring in the meantime */ 231 spin_unlock(&fc->lock); 232 res = fc->ring; 233 goto out_err; 234 } 235 236 init_waitqueue_head(&ring->stop_waitq); 237 238 ring->nr_queues = nr_queues; 239 ring->fc = fc; 240 ring->max_payload_sz = max_payload_size; 241 smp_store_release(&fc->ring, ring); 242 243 spin_unlock(&fc->lock); 244 return ring; 245 246 out_err: 247 kfree(ring->queues); 248 kfree(ring); 249 return res; 250 } 251 252 static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, 253 int qid) 254 { 255 struct fuse_conn *fc = ring->fc; 256 struct fuse_ring_queue *queue; 257 struct list_head *pq; 258 259 queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); 260 if (!queue) 261 return NULL; 262 pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); 263 if (!pq) { 264 kfree(queue); 265 return NULL; 266 } 267 268 queue->qid = qid; 269 queue->ring = ring; 270 spin_lock_init(&queue->lock); 271 272 INIT_LIST_HEAD(&queue->ent_avail_queue); 273 INIT_LIST_HEAD(&queue->ent_commit_queue); 274 INIT_LIST_HEAD(&queue->ent_w_req_queue); 275 INIT_LIST_HEAD(&queue->ent_in_userspace); 276 INIT_LIST_HEAD(&queue->fuse_req_queue); 277 INIT_LIST_HEAD(&queue->fuse_req_bg_queue); 278 INIT_LIST_HEAD(&queue->ent_released); 279 280 queue->fpq.processing = pq; 281 fuse_pqueue_init(&queue->fpq); 282 283 spin_lock(&fc->lock); 284 if (ring->queues[qid]) { 285 spin_unlock(&fc->lock); 286 kfree(queue->fpq.processing); 287 kfree(queue); 288 return ring->queues[qid]; 289 } 290 291 /* 292 * write_once and lock as the caller mostly doesn't take the lock at all 293 */ 294 WRITE_ONCE(ring->queues[qid], queue); 295 spin_unlock(&fc->lock); 296 297 return queue; 298 } 299 300 static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) 301 { 302 clear_bit(FR_SENT, &req->flags); 303 req->out.h.error = -ECONNABORTED; 304 fuse_request_end(req); 305 } 306 307 /* 308 * Release a request/entry on connection tear down 309 */ 310 static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) 311 { 312 struct fuse_req *req; 313 struct io_uring_cmd *cmd; 314 315 struct fuse_ring_queue *queue = ent->queue; 316 317 spin_lock(&queue->lock); 318 cmd = ent->cmd; 319 ent->cmd = NULL; 320 req = ent->fuse_req; 321 ent->fuse_req = NULL; 322 if (req) { 323 /* remove entry from queue->fpq->processing */ 324 list_del_init(&req->list); 325 } 326 327 /* 328 * The entry must not be freed immediately, due to access of direct 329 * pointer access of entries through IO_URING_F_CANCEL - there is a risk 330 * of race between daemon termination (which triggers IO_URING_F_CANCEL 331 * and accesses entries without checking the list state first 332 */ 333 list_move(&ent->list, &queue->ent_released); 334 ent->state = FRRS_RELEASED; 335 spin_unlock(&queue->lock); 336 337 if (cmd) 338 io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); 339 340 if (req) 341 fuse_uring_stop_fuse_req_end(req); 342 } 343 344 static void fuse_uring_stop_list_entries(struct list_head *head, 345 struct fuse_ring_queue *queue, 346 enum fuse_ring_req_state exp_state) 347 { 348 struct fuse_ring *ring = queue->ring; 349 struct fuse_ring_ent *ent, *next; 350 ssize_t queue_refs = SSIZE_MAX; 351 LIST_HEAD(to_teardown); 352 353 spin_lock(&queue->lock); 354 list_for_each_entry_safe(ent, next, head, list) { 355 if (ent->state != exp_state) { 356 pr_warn("entry teardown qid=%d state=%d expected=%d", 357 queue->qid, ent->state, exp_state); 358 continue; 359 } 360 361 ent->state = FRRS_TEARDOWN; 362 list_move(&ent->list, &to_teardown); 363 } 364 spin_unlock(&queue->lock); 365 366 /* no queue lock to avoid lock order issues */ 367 list_for_each_entry_safe(ent, next, &to_teardown, list) { 368 fuse_uring_entry_teardown(ent); 369 queue_refs = atomic_dec_return(&ring->queue_refs); 370 WARN_ON_ONCE(queue_refs < 0); 371 } 372 } 373 374 static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) 375 { 376 fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, 377 FRRS_USERSPACE); 378 fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, 379 FRRS_AVAILABLE); 380 } 381 382 /* 383 * Log state debug info 384 */ 385 static void fuse_uring_log_ent_state(struct fuse_ring *ring) 386 { 387 int qid; 388 struct fuse_ring_ent *ent; 389 390 for (qid = 0; qid < ring->nr_queues; qid++) { 391 struct fuse_ring_queue *queue = ring->queues[qid]; 392 393 if (!queue) 394 continue; 395 396 spin_lock(&queue->lock); 397 /* 398 * Log entries from the intermediate queue, the other queues 399 * should be empty 400 */ 401 list_for_each_entry(ent, &queue->ent_w_req_queue, list) { 402 pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", 403 ring, qid, ent, ent->state); 404 } 405 list_for_each_entry(ent, &queue->ent_commit_queue, list) { 406 pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", 407 ring, qid, ent, ent->state); 408 } 409 spin_unlock(&queue->lock); 410 } 411 ring->stop_debug_log = 1; 412 } 413 414 static void fuse_uring_async_stop_queues(struct work_struct *work) 415 { 416 int qid; 417 struct fuse_ring *ring = 418 container_of(work, struct fuse_ring, async_teardown_work.work); 419 420 /* XXX code dup */ 421 for (qid = 0; qid < ring->nr_queues; qid++) { 422 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 423 424 if (!queue) 425 continue; 426 427 fuse_uring_teardown_entries(queue); 428 } 429 430 /* 431 * Some ring entries might be in the middle of IO operations, 432 * i.e. in process to get handled by file_operations::uring_cmd 433 * or on the way to userspace - we could handle that with conditions in 434 * run time code, but easier/cleaner to have an async tear down handler 435 * If there are still queue references left 436 */ 437 if (atomic_read(&ring->queue_refs) > 0) { 438 if (time_after(jiffies, 439 ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) 440 fuse_uring_log_ent_state(ring); 441 442 schedule_delayed_work(&ring->async_teardown_work, 443 FUSE_URING_TEARDOWN_INTERVAL); 444 } else { 445 wake_up_all(&ring->stop_waitq); 446 } 447 } 448 449 /* 450 * Stop the ring queues 451 */ 452 void fuse_uring_stop_queues(struct fuse_ring *ring) 453 { 454 int qid; 455 456 for (qid = 0; qid < ring->nr_queues; qid++) { 457 struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); 458 459 if (!queue) 460 continue; 461 462 fuse_uring_teardown_entries(queue); 463 } 464 465 if (atomic_read(&ring->queue_refs) > 0) { 466 ring->teardown_time = jiffies; 467 INIT_DELAYED_WORK(&ring->async_teardown_work, 468 fuse_uring_async_stop_queues); 469 schedule_delayed_work(&ring->async_teardown_work, 470 FUSE_URING_TEARDOWN_INTERVAL); 471 } else { 472 wake_up_all(&ring->stop_waitq); 473 } 474 } 475 476 /* 477 * Handle IO_URING_F_CANCEL, typically should come on daemon termination. 478 * 479 * Releasing the last entry should trigger fuse_dev_release() if 480 * the daemon was terminated 481 */ 482 static void fuse_uring_cancel(struct io_uring_cmd *cmd, 483 unsigned int issue_flags) 484 { 485 struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); 486 struct fuse_ring_queue *queue; 487 bool need_cmd_done = false; 488 489 /* 490 * direct access on ent - it must not be destructed as long as 491 * IO_URING_F_CANCEL might come up 492 */ 493 queue = ent->queue; 494 spin_lock(&queue->lock); 495 if (ent->state == FRRS_AVAILABLE) { 496 ent->state = FRRS_USERSPACE; 497 list_move(&ent->list, &queue->ent_in_userspace); 498 need_cmd_done = true; 499 ent->cmd = NULL; 500 } 501 spin_unlock(&queue->lock); 502 503 if (need_cmd_done) { 504 /* no queue lock to avoid lock order issues */ 505 io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); 506 } 507 } 508 509 static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, 510 struct fuse_ring_ent *ring_ent) 511 { 512 uring_cmd_set_ring_ent(cmd, ring_ent); 513 io_uring_cmd_mark_cancelable(cmd, issue_flags); 514 } 515 516 /* 517 * Checks for errors and stores it into the request 518 */ 519 static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, 520 struct fuse_req *req, 521 struct fuse_conn *fc) 522 { 523 int err; 524 525 err = -EINVAL; 526 if (oh->unique == 0) { 527 /* Not supported through io-uring yet */ 528 pr_warn_once("notify through fuse-io-uring not supported\n"); 529 goto err; 530 } 531 532 if (oh->error <= -ERESTARTSYS || oh->error > 0) 533 goto err; 534 535 if (oh->error) { 536 err = oh->error; 537 goto err; 538 } 539 540 err = -ENOENT; 541 if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { 542 pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", 543 req->in.h.unique, 544 oh->unique & ~FUSE_INT_REQ_BIT); 545 goto err; 546 } 547 548 /* 549 * Is it an interrupt reply ID? 550 * XXX: Not supported through fuse-io-uring yet, it should not even 551 * find the request - should not happen. 552 */ 553 WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); 554 555 err = 0; 556 err: 557 return err; 558 } 559 560 static int fuse_uring_copy_from_ring(struct fuse_ring *ring, 561 struct fuse_req *req, 562 struct fuse_ring_ent *ent) 563 { 564 struct fuse_copy_state cs; 565 struct fuse_args *args = req->args; 566 struct iov_iter iter; 567 int err; 568 struct fuse_uring_ent_in_out ring_in_out; 569 570 err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, 571 sizeof(ring_in_out)); 572 if (err) 573 return -EFAULT; 574 575 err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, 576 &iter); 577 if (err) 578 return err; 579 580 fuse_copy_init(&cs, 0, &iter); 581 cs.is_uring = 1; 582 cs.req = req; 583 584 return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); 585 } 586 587 /* 588 * Copy data from the req to the ring buffer 589 */ 590 static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, 591 struct fuse_ring_ent *ent) 592 { 593 struct fuse_copy_state cs; 594 struct fuse_args *args = req->args; 595 struct fuse_in_arg *in_args = args->in_args; 596 int num_args = args->in_numargs; 597 int err; 598 struct iov_iter iter; 599 struct fuse_uring_ent_in_out ent_in_out = { 600 .flags = 0, 601 .commit_id = req->in.h.unique, 602 }; 603 604 err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); 605 if (err) { 606 pr_info_ratelimited("fuse: Import of user buffer failed\n"); 607 return err; 608 } 609 610 fuse_copy_init(&cs, 1, &iter); 611 cs.is_uring = 1; 612 cs.req = req; 613 614 if (num_args > 0) { 615 /* 616 * Expectation is that the first argument is the per op header. 617 * Some op code have that as zero size. 618 */ 619 if (args->in_args[0].size > 0) { 620 err = copy_to_user(&ent->headers->op_in, in_args->value, 621 in_args->size); 622 if (err) { 623 pr_info_ratelimited( 624 "Copying the header failed.\n"); 625 return -EFAULT; 626 } 627 } 628 in_args++; 629 num_args--; 630 } 631 632 /* copy the payload */ 633 err = fuse_copy_args(&cs, num_args, args->in_pages, 634 (struct fuse_arg *)in_args, 0); 635 if (err) { 636 pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); 637 return err; 638 } 639 640 ent_in_out.payload_sz = cs.ring.copied_sz; 641 err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, 642 sizeof(ent_in_out)); 643 return err ? -EFAULT : 0; 644 } 645 646 static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, 647 struct fuse_req *req) 648 { 649 struct fuse_ring_queue *queue = ent->queue; 650 struct fuse_ring *ring = queue->ring; 651 int err; 652 653 err = -EIO; 654 if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { 655 pr_err("qid=%d ring-req=%p invalid state %d on send\n", 656 queue->qid, ent, ent->state); 657 return err; 658 } 659 660 err = -EINVAL; 661 if (WARN_ON(req->in.h.unique == 0)) 662 return err; 663 664 /* copy the request */ 665 err = fuse_uring_args_to_ring(ring, req, ent); 666 if (unlikely(err)) { 667 pr_info_ratelimited("Copy to ring failed: %d\n", err); 668 return err; 669 } 670 671 /* copy fuse_in_header */ 672 err = copy_to_user(&ent->headers->in_out, &req->in.h, 673 sizeof(req->in.h)); 674 if (err) { 675 err = -EFAULT; 676 return err; 677 } 678 679 return 0; 680 } 681 682 static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, 683 struct fuse_req *req) 684 { 685 int err; 686 687 err = fuse_uring_copy_to_ring(ent, req); 688 if (!err) 689 set_bit(FR_SENT, &req->flags); 690 else 691 fuse_uring_req_end(ent, req, err); 692 693 return err; 694 } 695 696 /* 697 * Write data to the ring buffer and send the request to userspace, 698 * userspace will read it 699 * This is comparable with classical read(/dev/fuse) 700 */ 701 static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, 702 struct fuse_req *req, 703 unsigned int issue_flags) 704 { 705 struct fuse_ring_queue *queue = ent->queue; 706 int err; 707 struct io_uring_cmd *cmd; 708 709 err = fuse_uring_prepare_send(ent, req); 710 if (err) 711 return err; 712 713 spin_lock(&queue->lock); 714 cmd = ent->cmd; 715 ent->cmd = NULL; 716 ent->state = FRRS_USERSPACE; 717 list_move(&ent->list, &queue->ent_in_userspace); 718 spin_unlock(&queue->lock); 719 720 io_uring_cmd_done(cmd, 0, 0, issue_flags); 721 return 0; 722 } 723 724 /* 725 * Make a ring entry available for fuse_req assignment 726 */ 727 static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, 728 struct fuse_ring_queue *queue) 729 { 730 WARN_ON_ONCE(!ent->cmd); 731 list_move(&ent->list, &queue->ent_avail_queue); 732 ent->state = FRRS_AVAILABLE; 733 } 734 735 /* Used to find the request on SQE commit */ 736 static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, 737 struct fuse_req *req) 738 { 739 struct fuse_ring_queue *queue = ent->queue; 740 struct fuse_pqueue *fpq = &queue->fpq; 741 unsigned int hash; 742 743 req->ring_entry = ent; 744 hash = fuse_req_hash(req->in.h.unique); 745 list_move_tail(&req->list, &fpq->processing[hash]); 746 } 747 748 /* 749 * Assign a fuse queue entry to the given entry 750 */ 751 static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, 752 struct fuse_req *req) 753 { 754 struct fuse_ring_queue *queue = ent->queue; 755 756 lockdep_assert_held(&queue->lock); 757 758 if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && 759 ent->state != FRRS_COMMIT)) { 760 pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, 761 ent->state); 762 } 763 764 clear_bit(FR_PENDING, &req->flags); 765 ent->fuse_req = req; 766 ent->state = FRRS_FUSE_REQ; 767 list_move(&ent->list, &queue->ent_w_req_queue); 768 fuse_uring_add_to_pq(ent, req); 769 } 770 771 /* Fetch the next fuse request if available */ 772 static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) 773 __must_hold(&queue->lock) 774 { 775 struct fuse_req *req; 776 struct fuse_ring_queue *queue = ent->queue; 777 struct list_head *req_queue = &queue->fuse_req_queue; 778 779 lockdep_assert_held(&queue->lock); 780 781 /* get and assign the next entry while it is still holding the lock */ 782 req = list_first_entry_or_null(req_queue, struct fuse_req, list); 783 if (req) 784 fuse_uring_add_req_to_ring_ent(ent, req); 785 786 return req; 787 } 788 789 /* 790 * Read data from the ring buffer, which user space has written to 791 * This is comparible with handling of classical write(/dev/fuse). 792 * Also make the ring request available again for new fuse requests. 793 */ 794 static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, 795 unsigned int issue_flags) 796 { 797 struct fuse_ring *ring = ent->queue->ring; 798 struct fuse_conn *fc = ring->fc; 799 ssize_t err = 0; 800 801 err = copy_from_user(&req->out.h, &ent->headers->in_out, 802 sizeof(req->out.h)); 803 if (err) { 804 req->out.h.error = -EFAULT; 805 goto out; 806 } 807 808 err = fuse_uring_out_header_has_err(&req->out.h, req, fc); 809 if (err) { 810 /* req->out.h.error already set */ 811 goto out; 812 } 813 814 err = fuse_uring_copy_from_ring(ring, req, ent); 815 out: 816 fuse_uring_req_end(ent, req, err); 817 } 818 819 /* 820 * Get the next fuse req and send it 821 */ 822 static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, 823 struct fuse_ring_queue *queue, 824 unsigned int issue_flags) 825 { 826 int err; 827 struct fuse_req *req; 828 829 retry: 830 spin_lock(&queue->lock); 831 fuse_uring_ent_avail(ent, queue); 832 req = fuse_uring_ent_assign_req(ent); 833 spin_unlock(&queue->lock); 834 835 if (req) { 836 err = fuse_uring_send_next_to_ring(ent, req, issue_flags); 837 if (err) 838 goto retry; 839 } 840 } 841 842 static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) 843 { 844 struct fuse_ring_queue *queue = ent->queue; 845 846 lockdep_assert_held(&queue->lock); 847 848 if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) 849 return -EIO; 850 851 ent->state = FRRS_COMMIT; 852 list_move(&ent->list, &queue->ent_commit_queue); 853 854 return 0; 855 } 856 857 /* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ 858 static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, 859 struct fuse_conn *fc) 860 { 861 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); 862 struct fuse_ring_ent *ent; 863 int err; 864 struct fuse_ring *ring = fc->ring; 865 struct fuse_ring_queue *queue; 866 uint64_t commit_id = READ_ONCE(cmd_req->commit_id); 867 unsigned int qid = READ_ONCE(cmd_req->qid); 868 struct fuse_pqueue *fpq; 869 struct fuse_req *req; 870 871 err = -ENOTCONN; 872 if (!ring) 873 return err; 874 875 if (qid >= ring->nr_queues) 876 return -EINVAL; 877 878 queue = ring->queues[qid]; 879 if (!queue) 880 return err; 881 fpq = &queue->fpq; 882 883 if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) 884 return err; 885 886 spin_lock(&queue->lock); 887 /* Find a request based on the unique ID of the fuse request 888 * This should get revised, as it needs a hash calculation and list 889 * search. And full struct fuse_pqueue is needed (memory overhead). 890 * As well as the link from req to ring_ent. 891 */ 892 req = fuse_request_find(fpq, commit_id); 893 err = -ENOENT; 894 if (!req) { 895 pr_info("qid=%d commit_id %llu not found\n", queue->qid, 896 commit_id); 897 spin_unlock(&queue->lock); 898 return err; 899 } 900 list_del_init(&req->list); 901 ent = req->ring_entry; 902 req->ring_entry = NULL; 903 904 err = fuse_ring_ent_set_commit(ent); 905 if (err != 0) { 906 pr_info_ratelimited("qid=%d commit_id %llu state %d", 907 queue->qid, commit_id, ent->state); 908 spin_unlock(&queue->lock); 909 req->out.h.error = err; 910 clear_bit(FR_SENT, &req->flags); 911 fuse_request_end(req); 912 return err; 913 } 914 915 ent->cmd = cmd; 916 spin_unlock(&queue->lock); 917 918 /* without the queue lock, as other locks are taken */ 919 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 920 fuse_uring_commit(ent, req, issue_flags); 921 922 /* 923 * Fetching the next request is absolutely required as queued 924 * fuse requests would otherwise not get processed - committing 925 * and fetching is done in one step vs legacy fuse, which has separated 926 * read (fetch request) and write (commit result). 927 */ 928 fuse_uring_next_fuse_req(ent, queue, issue_flags); 929 return 0; 930 } 931 932 static bool is_ring_ready(struct fuse_ring *ring, int current_qid) 933 { 934 int qid; 935 struct fuse_ring_queue *queue; 936 bool ready = true; 937 938 for (qid = 0; qid < ring->nr_queues && ready; qid++) { 939 if (current_qid == qid) 940 continue; 941 942 queue = ring->queues[qid]; 943 if (!queue) { 944 ready = false; 945 break; 946 } 947 948 spin_lock(&queue->lock); 949 if (list_empty(&queue->ent_avail_queue)) 950 ready = false; 951 spin_unlock(&queue->lock); 952 } 953 954 return ready; 955 } 956 957 /* 958 * fuse_uring_req_fetch command handling 959 */ 960 static void fuse_uring_do_register(struct fuse_ring_ent *ent, 961 struct io_uring_cmd *cmd, 962 unsigned int issue_flags) 963 { 964 struct fuse_ring_queue *queue = ent->queue; 965 struct fuse_ring *ring = queue->ring; 966 struct fuse_conn *fc = ring->fc; 967 struct fuse_iqueue *fiq = &fc->iq; 968 969 fuse_uring_prepare_cancel(cmd, issue_flags, ent); 970 971 spin_lock(&queue->lock); 972 ent->cmd = cmd; 973 fuse_uring_ent_avail(ent, queue); 974 spin_unlock(&queue->lock); 975 976 if (!ring->ready) { 977 bool ready = is_ring_ready(ring, queue->qid); 978 979 if (ready) { 980 WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); 981 WRITE_ONCE(ring->ready, true); 982 wake_up_all(&fc->blocked_waitq); 983 } 984 } 985 } 986 987 /* 988 * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] 989 * the payload 990 */ 991 static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, 992 struct iovec iov[FUSE_URING_IOV_SEGS]) 993 { 994 struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); 995 struct iov_iter iter; 996 ssize_t ret; 997 998 if (sqe->len != FUSE_URING_IOV_SEGS) 999 return -EINVAL; 1000 1001 /* 1002 * Direction for buffer access will actually be READ and WRITE, 1003 * using write for the import should include READ access as well. 1004 */ 1005 ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, 1006 FUSE_URING_IOV_SEGS, &iov, &iter); 1007 if (ret < 0) 1008 return ret; 1009 1010 return 0; 1011 } 1012 1013 static struct fuse_ring_ent * 1014 fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, 1015 struct fuse_ring_queue *queue) 1016 { 1017 struct fuse_ring *ring = queue->ring; 1018 struct fuse_ring_ent *ent; 1019 size_t payload_size; 1020 struct iovec iov[FUSE_URING_IOV_SEGS]; 1021 int err; 1022 1023 err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); 1024 if (err) { 1025 pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", 1026 err); 1027 return ERR_PTR(err); 1028 } 1029 1030 err = -EINVAL; 1031 if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { 1032 pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); 1033 return ERR_PTR(err); 1034 } 1035 1036 payload_size = iov[1].iov_len; 1037 if (payload_size < ring->max_payload_sz) { 1038 pr_info_ratelimited("Invalid req payload len %zu\n", 1039 payload_size); 1040 return ERR_PTR(err); 1041 } 1042 1043 err = -ENOMEM; 1044 ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); 1045 if (!ent) 1046 return ERR_PTR(err); 1047 1048 INIT_LIST_HEAD(&ent->list); 1049 1050 ent->queue = queue; 1051 ent->headers = iov[0].iov_base; 1052 ent->payload = iov[1].iov_base; 1053 1054 atomic_inc(&ring->queue_refs); 1055 return ent; 1056 } 1057 1058 /* 1059 * Register header and payload buffer with the kernel and puts the 1060 * entry as "ready to get fuse requests" on the queue 1061 */ 1062 static int fuse_uring_register(struct io_uring_cmd *cmd, 1063 unsigned int issue_flags, struct fuse_conn *fc) 1064 { 1065 const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); 1066 struct fuse_ring *ring = smp_load_acquire(&fc->ring); 1067 struct fuse_ring_queue *queue; 1068 struct fuse_ring_ent *ent; 1069 int err; 1070 unsigned int qid = READ_ONCE(cmd_req->qid); 1071 1072 err = -ENOMEM; 1073 if (!ring) { 1074 ring = fuse_uring_create(fc); 1075 if (!ring) 1076 return err; 1077 } 1078 1079 if (qid >= ring->nr_queues) { 1080 pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); 1081 return -EINVAL; 1082 } 1083 1084 queue = ring->queues[qid]; 1085 if (!queue) { 1086 queue = fuse_uring_create_queue(ring, qid); 1087 if (!queue) 1088 return err; 1089 } 1090 1091 /* 1092 * The created queue above does not need to be destructed in 1093 * case of entry errors below, will be done at ring destruction time. 1094 */ 1095 1096 ent = fuse_uring_create_ring_ent(cmd, queue); 1097 if (IS_ERR(ent)) 1098 return PTR_ERR(ent); 1099 1100 fuse_uring_do_register(ent, cmd, issue_flags); 1101 1102 return 0; 1103 } 1104 1105 /* 1106 * Entry function from io_uring to handle the given passthrough command 1107 * (op code IORING_OP_URING_CMD) 1108 */ 1109 int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) 1110 { 1111 struct fuse_dev *fud; 1112 struct fuse_conn *fc; 1113 u32 cmd_op = cmd->cmd_op; 1114 int err; 1115 1116 if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { 1117 fuse_uring_cancel(cmd, issue_flags); 1118 return 0; 1119 } 1120 1121 /* This extra SQE size holds struct fuse_uring_cmd_req */ 1122 if (!(issue_flags & IO_URING_F_SQE128)) 1123 return -EINVAL; 1124 1125 fud = fuse_get_dev(cmd->file); 1126 if (!fud) { 1127 pr_info_ratelimited("No fuse device found\n"); 1128 return -ENOTCONN; 1129 } 1130 fc = fud->fc; 1131 1132 /* Once a connection has io-uring enabled on it, it can't be disabled */ 1133 if (!enable_uring && !fc->io_uring) { 1134 pr_info_ratelimited("fuse-io-uring is disabled\n"); 1135 return -EOPNOTSUPP; 1136 } 1137 1138 if (fc->aborted) 1139 return -ECONNABORTED; 1140 if (!fc->connected) 1141 return -ENOTCONN; 1142 1143 /* 1144 * fuse_uring_register() needs the ring to be initialized, 1145 * we need to know the max payload size 1146 */ 1147 if (!fc->initialized) 1148 return -EAGAIN; 1149 1150 switch (cmd_op) { 1151 case FUSE_IO_URING_CMD_REGISTER: 1152 err = fuse_uring_register(cmd, issue_flags, fc); 1153 if (err) { 1154 pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", 1155 err); 1156 fc->io_uring = 0; 1157 wake_up_all(&fc->blocked_waitq); 1158 return err; 1159 } 1160 break; 1161 case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: 1162 err = fuse_uring_commit_fetch(cmd, issue_flags, fc); 1163 if (err) { 1164 pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", 1165 err); 1166 return err; 1167 } 1168 break; 1169 default: 1170 return -EINVAL; 1171 } 1172 1173 return -EIOCBQUEUED; 1174 } 1175 1176 static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, 1177 ssize_t ret, unsigned int issue_flags) 1178 { 1179 struct fuse_ring_queue *queue = ent->queue; 1180 1181 spin_lock(&queue->lock); 1182 ent->state = FRRS_USERSPACE; 1183 list_move(&ent->list, &queue->ent_in_userspace); 1184 ent->cmd = NULL; 1185 spin_unlock(&queue->lock); 1186 1187 io_uring_cmd_done(cmd, ret, 0, issue_flags); 1188 } 1189 1190 /* 1191 * This prepares and sends the ring request in fuse-uring task context. 1192 * User buffers are not mapped yet - the application does not have permission 1193 * to write to it - this has to be executed in ring task context. 1194 */ 1195 static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, 1196 unsigned int issue_flags) 1197 { 1198 struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); 1199 struct fuse_ring_queue *queue = ent->queue; 1200 int err; 1201 1202 if (!(issue_flags & IO_URING_F_TASK_DEAD)) { 1203 err = fuse_uring_prepare_send(ent, ent->fuse_req); 1204 if (err) { 1205 fuse_uring_next_fuse_req(ent, queue, issue_flags); 1206 return; 1207 } 1208 } else { 1209 err = -ECANCELED; 1210 } 1211 1212 fuse_uring_send(ent, cmd, err, issue_flags); 1213 } 1214 1215 static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) 1216 { 1217 unsigned int qid; 1218 struct fuse_ring_queue *queue; 1219 1220 qid = task_cpu(current); 1221 1222 if (WARN_ONCE(qid >= ring->nr_queues, 1223 "Core number (%u) exceeds nr queues (%zu)\n", qid, 1224 ring->nr_queues)) 1225 qid = 0; 1226 1227 queue = ring->queues[qid]; 1228 WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); 1229 1230 return queue; 1231 } 1232 1233 static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) 1234 { 1235 struct io_uring_cmd *cmd = ent->cmd; 1236 1237 uring_cmd_set_ring_ent(cmd, ent); 1238 io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); 1239 } 1240 1241 /* queue a fuse request and send it if a ring entry is available */ 1242 void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) 1243 { 1244 struct fuse_conn *fc = req->fm->fc; 1245 struct fuse_ring *ring = fc->ring; 1246 struct fuse_ring_queue *queue; 1247 struct fuse_ring_ent *ent = NULL; 1248 int err; 1249 1250 err = -EINVAL; 1251 queue = fuse_uring_task_to_queue(ring); 1252 if (!queue) 1253 goto err; 1254 1255 if (req->in.h.opcode != FUSE_NOTIFY_REPLY) 1256 req->in.h.unique = fuse_get_unique(fiq); 1257 1258 spin_lock(&queue->lock); 1259 err = -ENOTCONN; 1260 if (unlikely(queue->stopped)) 1261 goto err_unlock; 1262 1263 set_bit(FR_URING, &req->flags); 1264 req->ring_queue = queue; 1265 ent = list_first_entry_or_null(&queue->ent_avail_queue, 1266 struct fuse_ring_ent, list); 1267 if (ent) 1268 fuse_uring_add_req_to_ring_ent(ent, req); 1269 else 1270 list_add_tail(&req->list, &queue->fuse_req_queue); 1271 spin_unlock(&queue->lock); 1272 1273 if (ent) 1274 fuse_uring_dispatch_ent(ent); 1275 1276 return; 1277 1278 err_unlock: 1279 spin_unlock(&queue->lock); 1280 err: 1281 req->out.h.error = err; 1282 clear_bit(FR_PENDING, &req->flags); 1283 fuse_request_end(req); 1284 } 1285 1286 bool fuse_uring_queue_bq_req(struct fuse_req *req) 1287 { 1288 struct fuse_conn *fc = req->fm->fc; 1289 struct fuse_ring *ring = fc->ring; 1290 struct fuse_ring_queue *queue; 1291 struct fuse_ring_ent *ent = NULL; 1292 1293 queue = fuse_uring_task_to_queue(ring); 1294 if (!queue) 1295 return false; 1296 1297 spin_lock(&queue->lock); 1298 if (unlikely(queue->stopped)) { 1299 spin_unlock(&queue->lock); 1300 return false; 1301 } 1302 1303 set_bit(FR_URING, &req->flags); 1304 req->ring_queue = queue; 1305 list_add_tail(&req->list, &queue->fuse_req_bg_queue); 1306 1307 ent = list_first_entry_or_null(&queue->ent_avail_queue, 1308 struct fuse_ring_ent, list); 1309 spin_lock(&fc->bg_lock); 1310 fc->num_background++; 1311 if (fc->num_background == fc->max_background) 1312 fc->blocked = 1; 1313 fuse_uring_flush_bg(queue); 1314 spin_unlock(&fc->bg_lock); 1315 1316 /* 1317 * Due to bg_queue flush limits there might be other bg requests 1318 * in the queue that need to be handled first. Or no further req 1319 * might be available. 1320 */ 1321 req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, 1322 list); 1323 if (ent && req) { 1324 fuse_uring_add_req_to_ring_ent(ent, req); 1325 spin_unlock(&queue->lock); 1326 1327 fuse_uring_dispatch_ent(ent); 1328 } else { 1329 spin_unlock(&queue->lock); 1330 } 1331 1332 return true; 1333 } 1334 1335 bool fuse_uring_remove_pending_req(struct fuse_req *req) 1336 { 1337 struct fuse_ring_queue *queue = req->ring_queue; 1338 1339 return fuse_remove_pending_req(req, &queue->lock); 1340 } 1341 1342 static const struct fuse_iqueue_ops fuse_io_uring_ops = { 1343 /* should be send over io-uring as enhancement */ 1344 .send_forget = fuse_dev_queue_forget, 1345 1346 /* 1347 * could be send over io-uring, but interrupts should be rare, 1348 * no need to make the code complex 1349 */ 1350 .send_interrupt = fuse_dev_queue_interrupt, 1351 .send_req = fuse_uring_queue_fuse_req, 1352 }; 1353