1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Network block device - make block devices work over TCP
4 *
5 * Note that you can not swap over this thing, yet. Seems to work but
6 * deadlocks sometimes - you can not swap over TCP in general.
7 *
8 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
9 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
10 *
11 * (part of code stolen from loop.c)
12 */
13
14 #define pr_fmt(fmt) "nbd: " fmt
15
16 #include <linux/major.h>
17
18 #include <linux/blkdev.h>
19 #include <linux/module.h>
20 #include <linux/init.h>
21 #include <linux/sched.h>
22 #include <linux/sched/mm.h>
23 #include <linux/fs.h>
24 #include <linux/bio.h>
25 #include <linux/stat.h>
26 #include <linux/errno.h>
27 #include <linux/file.h>
28 #include <linux/ioctl.h>
29 #include <linux/mutex.h>
30 #include <linux/compiler.h>
31 #include <linux/completion.h>
32 #include <linux/err.h>
33 #include <linux/kernel.h>
34 #include <linux/slab.h>
35 #include <net/sock.h>
36 #include <linux/net.h>
37 #include <linux/kthread.h>
38 #include <linux/types.h>
39 #include <linux/debugfs.h>
40 #include <linux/blk-mq.h>
41
42 #include <linux/uaccess.h>
43 #include <asm/types.h>
44
45 #include <linux/nbd.h>
46 #include <linux/nbd-netlink.h>
47 #include <net/genetlink.h>
48
49 #define CREATE_TRACE_POINTS
50 #include <trace/events/nbd.h>
51
52 static DEFINE_IDR(nbd_index_idr);
53 static DEFINE_MUTEX(nbd_index_mutex);
54 static struct workqueue_struct *nbd_del_wq;
55 static struct cred *nbd_cred;
56 static int nbd_total_devices = 0;
57
58 struct nbd_sock {
59 struct socket *sock;
60 struct mutex tx_lock;
61 struct request *pending;
62 int sent;
63 bool dead;
64 int fallback_index;
65 int cookie;
66 struct work_struct work;
67 };
68
69 struct recv_thread_args {
70 struct work_struct work;
71 struct nbd_device *nbd;
72 struct nbd_sock *nsock;
73 int index;
74 };
75
76 struct link_dead_args {
77 struct work_struct work;
78 int index;
79 };
80
81 #define NBD_RT_TIMEDOUT 0
82 #define NBD_RT_DISCONNECT_REQUESTED 1
83 #define NBD_RT_DISCONNECTED 2
84 #define NBD_RT_HAS_PID_FILE 3
85 #define NBD_RT_HAS_CONFIG_REF 4
86 #define NBD_RT_BOUND 5
87 #define NBD_RT_DISCONNECT_ON_CLOSE 6
88 #define NBD_RT_HAS_BACKEND_FILE 7
89
90 #define NBD_DESTROY_ON_DISCONNECT 0
91 #define NBD_DISCONNECT_REQUESTED 1
92
93 struct nbd_config {
94 u32 flags;
95 unsigned long runtime_flags;
96 u64 dead_conn_timeout;
97
98 struct nbd_sock **socks;
99 int num_connections;
100 atomic_t live_connections;
101 wait_queue_head_t conn_wait;
102
103 atomic_t recv_threads;
104 wait_queue_head_t recv_wq;
105 unsigned int blksize_bits;
106 loff_t bytesize;
107 #if IS_ENABLED(CONFIG_DEBUG_FS)
108 struct dentry *dbg_dir;
109 #endif
110 };
111
112 static inline unsigned int nbd_blksize(struct nbd_config *config)
113 {
114 return 1u << config->blksize_bits;
115 }
116
117 struct nbd_device {
118 struct blk_mq_tag_set tag_set;
119
120 int index;
121 refcount_t config_refs;
122 refcount_t refs;
123 struct nbd_config *config;
124 struct mutex config_lock;
125 struct gendisk *disk;
126 struct workqueue_struct *recv_workq;
127 struct work_struct remove_work;
128
129 struct list_head list;
130 struct task_struct *task_setup;
131
132 unsigned long flags;
133 pid_t pid; /* pid of nbd-client, if attached */
134
135 char *backend;
136 };
137
138 #define NBD_CMD_REQUEUED 1
139 /*
140 * This flag will be set if nbd_queue_rq() succeeds, and will be checked and
141 * cleared in completion. Both setting and clearing of the flag are protected
142 * by cmd->lock.
143 */
144 #define NBD_CMD_INFLIGHT 2
145
146 /* Only part of the request header or data payload has been sent successfully */
147 #define NBD_CMD_PARTIAL_SEND 3
148
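/*
 * Illustrative lifecycle of the per-command flags above (a sketch of what
 * the code below already does, not additional driver logic):
 *
 *	nbd_send_cmd() succeeds            -> set NBD_CMD_INFLIGHT
 *	reply received / queue cleared     -> clear NBD_CMD_INFLIGHT, complete
 *	timeout or dead socket             -> nbd_requeue_cmd(): clear INFLIGHT,
 *	                                      set NBD_CMD_REQUEUED
 *	sendmsg interrupted mid-request    -> nbd_sched_pending_work(): set
 *	                                      NBD_CMD_PARTIAL_SEND, finish later
 *	                                      from nbd_pending_cmd_work()
 *
 * All of these transitions happen under cmd->lock.
 */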
149 struct nbd_cmd {
150 struct nbd_device *nbd;
151 struct mutex lock;
152 int index;
153 int cookie;
154 int retries;
155 blk_status_t status;
156 unsigned long flags;
157 u32 cmd_cookie;
158 };
159
160 #if IS_ENABLED(CONFIG_DEBUG_FS)
161 static struct dentry *nbd_dbg_dir;
162 #endif
163
164 #define nbd_name(nbd) ((nbd)->disk->disk_name)
165
166 #define NBD_DEF_BLKSIZE_BITS 10
167
168 static unsigned int nbds_max = 16;
169 static int max_part = 16;
170 static int part_shift;
171
172 static int nbd_dev_dbg_init(struct nbd_device *nbd);
173 static void nbd_dev_dbg_close(struct nbd_device *nbd);
174 static void nbd_config_put(struct nbd_device *nbd);
175 static void nbd_connect_reply(struct genl_info *info, int index);
176 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
177 static void nbd_dead_link_work(struct work_struct *work);
178 static void nbd_disconnect_and_put(struct nbd_device *nbd);
179
180 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
181 {
182 return disk_to_dev(nbd->disk);
183 }
184
185 static void nbd_requeue_cmd(struct nbd_cmd *cmd)
186 {
187 struct request *req = blk_mq_rq_from_pdu(cmd);
188
189 lockdep_assert_held(&cmd->lock);
190
191 /*
192 * Clear INFLIGHT flag so that this cmd won't be completed in
193 * normal completion path
194 *
195 * INFLIGHT flag will be set when the cmd is queued to nbd next
196 * time.
197 */
198 __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
199
200 if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
201 blk_mq_requeue_request(req, true);
202 }
203
204 #define NBD_COOKIE_BITS 32
205
206 static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
207 {
208 struct request *req = blk_mq_rq_from_pdu(cmd);
209 u32 tag = blk_mq_unique_tag(req);
210 u64 cookie = cmd->cmd_cookie;
211
212 return (cookie << NBD_COOKIE_BITS) | tag;
213 }
214
215 static u32 nbd_handle_to_tag(u64 handle)
216 {
217 return (u32)handle;
218 }
219
220 static u32 nbd_handle_to_cookie(u64 handle)
221 {
222 return (u32)(handle >> NBD_COOKIE_BITS);
223 }
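
/*
 * Handle layout sketch (illustrative, derived from the helpers above): the
 * 64-bit cookie sent on the wire packs the per-command reissue counter in
 * the top 32 bits and the unique blk-mq tag in the bottom 32 bits:
 *
 *	handle = ((u64)cmd->cmd_cookie << NBD_COOKIE_BITS) | blk_mq_unique_tag(req);
 *	nbd_handle_to_tag(handle)    == blk_mq_unique_tag(req);
 *	nbd_handle_to_cookie(handle) == cmd->cmd_cookie;
 *
 * A reply whose cookie half no longer matches cmd->cmd_cookie is treated as
 * stale and rejected in nbd_handle_reply().
 */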
224
225 static const char *nbdcmd_to_ascii(int cmd)
226 {
227 switch (cmd) {
228 case NBD_CMD_READ: return "read";
229 case NBD_CMD_WRITE: return "write";
230 case NBD_CMD_DISC: return "disconnect";
231 case NBD_CMD_FLUSH: return "flush";
232 case NBD_CMD_TRIM: return "trim/discard";
233 }
234 return "invalid";
235 }
236
237 static ssize_t pid_show(struct device *dev,
238 struct device_attribute *attr, char *buf)
239 {
240 struct gendisk *disk = dev_to_disk(dev);
241 struct nbd_device *nbd = disk->private_data;
242
243 return sprintf(buf, "%d\n", nbd->pid);
244 }
245
246 static const struct device_attribute pid_attr = {
247 .attr = { .name = "pid", .mode = 0444},
248 .show = pid_show,
249 };
250
251 static ssize_t backend_show(struct device *dev,
252 struct device_attribute *attr, char *buf)
253 {
254 struct gendisk *disk = dev_to_disk(dev);
255 struct nbd_device *nbd = disk->private_data;
256
257 return sprintf(buf, "%s\n", nbd->backend ?: "");
258 }
259
260 static const struct device_attribute backend_attr = {
261 .attr = { .name = "backend", .mode = 0444},
262 .show = backend_show,
263 };
264
265 static void nbd_dev_remove(struct nbd_device *nbd)
266 {
267 struct gendisk *disk = nbd->disk;
268
269 del_gendisk(disk);
270 blk_mq_free_tag_set(&nbd->tag_set);
271
272 /*
273 * Remove from idr after del_gendisk() completes, so if the same ID is
274 * reused, the following add_disk() will succeed.
275 */
276 mutex_lock(&nbd_index_mutex);
277 idr_remove(&nbd_index_idr, nbd->index);
278 mutex_unlock(&nbd_index_mutex);
279 destroy_workqueue(nbd->recv_workq);
280 put_disk(disk);
281 }
282
283 static void nbd_dev_remove_work(struct work_struct *work)
284 {
285 nbd_dev_remove(container_of(work, struct nbd_device, remove_work));
286 }
287
288 static void nbd_put(struct nbd_device *nbd)
289 {
290 if (!refcount_dec_and_test(&nbd->refs))
291 return;
292
293 /* Call del_gendisk() asynchronously to prevent deadlock */
294 if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
295 queue_work(nbd_del_wq, &nbd->remove_work);
296 else
297 nbd_dev_remove(nbd);
298 }
299
300 static int nbd_disconnected(struct nbd_config *config)
301 {
302 return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
303 test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
304 }
305
306 static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
307 int notify)
308 {
309 if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
310 struct link_dead_args *args;
311 args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
312 if (args) {
313 INIT_WORK(&args->work, nbd_dead_link_work);
314 args->index = nbd->index;
315 queue_work(system_percpu_wq, &args->work);
316 }
317 }
318 if (!nsock->dead) {
319 kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
320 if (atomic_dec_return(&nbd->config->live_connections) == 0) {
321 if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
322 &nbd->config->runtime_flags)) {
323 set_bit(NBD_RT_DISCONNECTED,
324 &nbd->config->runtime_flags);
325 dev_info(nbd_to_dev(nbd),
326 "Disconnected due to user request.\n");
327 }
328 }
329 }
330 nsock->dead = true;
331 nsock->pending = NULL;
332 nsock->sent = 0;
333 }
334
335 static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize)
336 {
337 struct queue_limits lim;
338 int error;
339
340 if (!blksize)
341 blksize = 1u << NBD_DEF_BLKSIZE_BITS;
342
343 if (blk_validate_block_size(blksize))
344 return -EINVAL;
345
346 if (bytesize < 0)
347 return -EINVAL;
348
349 nbd->config->bytesize = bytesize;
350 nbd->config->blksize_bits = __ffs(blksize);
351
352 if (!nbd->pid)
353 return 0;
354
355 lim = queue_limits_start_update(nbd->disk->queue);
356 if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
357 lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT;
358 else
359 lim.max_hw_discard_sectors = 0;
360 if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) {
361 lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
362 } else if (nbd->config->flags & NBD_FLAG_SEND_FUA) {
363 lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
364 } else {
365 lim.features |= BLK_FEAT_WRITE_CACHE;
366 lim.features &= ~BLK_FEAT_FUA;
367 }
368 if (nbd->config->flags & NBD_FLAG_ROTATIONAL)
369 lim.features |= BLK_FEAT_ROTATIONAL;
370 if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES)
371 lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT;
372
373 lim.logical_block_size = blksize;
374 lim.physical_block_size = blksize;
375 error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim);
376 if (error)
377 return error;
378
379 if (max_part)
380 set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
381 if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
382 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
383 return 0;
384 }
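
/*
 * Worked example for nbd_set_size() (illustrative numbers only): a 1 GiB
 * export with a 4096-byte block size ends up as
 *
 *	config->bytesize     = 1073741824;
 *	config->blksize_bits = __ffs(4096);	// 12
 *	lim.logical_block_size = lim.physical_block_size = 4096;
 *	set_capacity_and_notify(disk, 1073741824 >> 9);	// 2097152 sectors
 *
 * With no block size given, the default is 1 << NBD_DEF_BLKSIZE_BITS (1024
 * bytes). The queue limits are only updated once an nbd client is attached
 * (nbd->pid != 0); before that, only the config values are recorded.
 */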
385
386 static void nbd_complete_rq(struct request *req)
387 {
388 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
389
390 dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
391 cmd->status ? "failed" : "done");
392
393 blk_mq_end_request(req, cmd->status);
394 }
395
396 /*
397 * Forcibly shut down the sockets, causing all listeners to error out
398 */
399 static void sock_shutdown(struct nbd_device *nbd)
400 {
401 struct nbd_config *config = nbd->config;
402 int i;
403
404 if (config->num_connections == 0)
405 return;
406 if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
407 return;
408
409 for (i = 0; i < config->num_connections; i++) {
410 struct nbd_sock *nsock = config->socks[i];
411 mutex_lock(&nsock->tx_lock);
412 nbd_mark_nsock_dead(nbd, nsock, 0);
413 mutex_unlock(&nsock->tx_lock);
414 }
415 dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
416 }
417
418 static u32 req_to_nbd_cmd_type(struct request *req)
419 {
420 switch (req_op(req)) {
421 case REQ_OP_DISCARD:
422 return NBD_CMD_TRIM;
423 case REQ_OP_FLUSH:
424 return NBD_CMD_FLUSH;
425 case REQ_OP_WRITE:
426 return NBD_CMD_WRITE;
427 case REQ_OP_READ:
428 return NBD_CMD_READ;
429 case REQ_OP_WRITE_ZEROES:
430 return NBD_CMD_WRITE_ZEROES;
431 default:
432 return U32_MAX;
433 }
434 }
435
436 static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd)
437 {
438 if (refcount_inc_not_zero(&nbd->config_refs)) {
439 /*
440 * Add smp_mb__after_atomic() to ensure that reading nbd->config_refs
441 * and reading nbd->config are ordered. Its pair is the barrier in
442 * nbd_alloc_and_init_config(), which prevents nbd->config_refs from
443 * being observed as set before nbd->config.
444 */
445 smp_mb__after_atomic();
446 return nbd->config;
447 }
448
449 return NULL;
450 }
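
/*
 * Barrier pairing summary (both sides live in this file; this is only a
 * sketch of the ordering argument):
 *
 *	nbd_alloc_and_init_config()	nbd_get_config_unlocked()
 *	---------------------------	-------------------------
 *	nbd->config = config;		refcount_inc_not_zero(&config_refs);
 *	smp_mb__before_atomic();	smp_mb__after_atomic();
 *	refcount_set(&config_refs, 1);	return nbd->config;
 *
 * A reader that observes a non-zero refcount is therefore guaranteed to
 * also observe the nbd->config assignment.
 */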
451
452 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
453 {
454 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
455 struct nbd_device *nbd = cmd->nbd;
456 struct nbd_config *config;
457
458 if (!mutex_trylock(&cmd->lock))
459 return BLK_EH_RESET_TIMER;
460
461 /* partial send is handled in nbd_sock's work function */
462 if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) {
463 mutex_unlock(&cmd->lock);
464 return BLK_EH_RESET_TIMER;
465 }
466
467 if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
468 mutex_unlock(&cmd->lock);
469 return BLK_EH_DONE;
470 }
471
472 config = nbd_get_config_unlocked(nbd);
473 if (!config) {
474 cmd->status = BLK_STS_TIMEOUT;
475 __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
476 mutex_unlock(&cmd->lock);
477 goto done;
478 }
479
480 if (config->num_connections > 1 ||
481 (config->num_connections == 1 && nbd->tag_set.timeout)) {
482 dev_err_ratelimited(nbd_to_dev(nbd),
483 "Connection timed out, retrying (%d/%d alive)\n",
484 atomic_read(&config->live_connections),
485 config->num_connections);
486 /*
487 * Hooray, we have more connections, requeue this IO, the submit
488 * path will put it on a real connection. Or, if only one
489 * connection is configured, the submit path will wait until
490 * a new connection is configured or until the dead timeout expires.
491 */
492 if (config->socks) {
493 if (cmd->index < config->num_connections) {
494 struct nbd_sock *nsock =
495 config->socks[cmd->index];
496 mutex_lock(&nsock->tx_lock);
497 /* We can have multiple outstanding requests, so
498 * we don't want to mark the nsock dead if we've
499 * already reconnected with a new socket, so
500 * only mark it dead if it's the same socket we
501 * were sent out on.
502 */
503 if (cmd->cookie == nsock->cookie)
504 nbd_mark_nsock_dead(nbd, nsock, 1);
505 mutex_unlock(&nsock->tx_lock);
506 }
507 nbd_requeue_cmd(cmd);
508 mutex_unlock(&cmd->lock);
509 nbd_config_put(nbd);
510 return BLK_EH_DONE;
511 }
512 }
513
514 if (!nbd->tag_set.timeout) {
515 /*
516 * Userspace sets timeout=0 to disable socket disconnection,
517 * so just warn and reset the timer.
518 */
519 struct nbd_sock *nsock = config->socks[cmd->index];
520 cmd->retries++;
521 dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
522 req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
523 (unsigned long long)blk_rq_pos(req) << 9,
524 blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
525
526 mutex_lock(&nsock->tx_lock);
527 if (cmd->cookie != nsock->cookie) {
528 nbd_requeue_cmd(cmd);
529 mutex_unlock(&nsock->tx_lock);
530 mutex_unlock(&cmd->lock);
531 nbd_config_put(nbd);
532 return BLK_EH_DONE;
533 }
534 mutex_unlock(&nsock->tx_lock);
535 mutex_unlock(&cmd->lock);
536 nbd_config_put(nbd);
537 return BLK_EH_RESET_TIMER;
538 }
539
540 dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
541 set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
542 cmd->status = BLK_STS_IOERR;
543 __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
544 mutex_unlock(&cmd->lock);
545 sock_shutdown(nbd);
546 nbd_config_put(nbd);
547 done:
548 blk_mq_complete_request(req);
549 return BLK_EH_DONE;
550 }
551
552 static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
553 struct iov_iter *iter, int msg_flags, int *sent)
554 {
555 int result;
556 struct msghdr msg = {};
557 unsigned int noreclaim_flag;
558 const struct cred *old_cred;
559
560 if (unlikely(!sock)) {
561 dev_err_ratelimited(disk_to_dev(nbd->disk),
562 "Attempted %s on closed socket in sock_xmit\n",
563 (send ? "send" : "recv"));
564 return -EINVAL;
565 }
566
567 old_cred = override_creds(nbd_cred);
568
569 msg.msg_iter = *iter;
570
571 noreclaim_flag = memalloc_noreclaim_save();
572 do {
573 sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
574 sock->sk->sk_use_task_frag = false;
575 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
576
577 if (send)
578 result = sock_sendmsg(sock, &msg);
579 else
580 result = sock_recvmsg(sock, &msg, msg.msg_flags);
581
582 if (result <= 0) {
583 if (result == 0)
584 result = -EPIPE; /* short read */
585 break;
586 }
587 if (sent)
588 *sent += result;
589 } while (msg_data_left(&msg));
590
591 memalloc_noreclaim_restore(noreclaim_flag);
592
593 revert_creds(old_cred);
594
595 return result;
596 }
597
598 /*
599 * Send or receive a packet. Return a positive value on success and a
600 * negative value on failure; never return 0.
601 */
602 static int sock_xmit(struct nbd_device *nbd, int index, int send,
603 struct iov_iter *iter, int msg_flags, int *sent)
604 {
605 struct nbd_config *config = nbd->config;
606 struct socket *sock = config->socks[index]->sock;
607
608 return __sock_xmit(nbd, sock, send, iter, msg_flags, sent);
609 }
610
611 /*
612 * Different settings for sk->sk_sndtimeo can result in different return values
613 * if there is a signal pending when we enter sendmsg, because reasons?
614 */
615 static inline int was_interrupted(int result)
616 {
617 return result == -ERESTARTSYS || result == -EINTR;
618 }
619
620 /*
621 * We've already sent the header or part of the data payload, so we have
622 * no choice but to mark the request pending and finish it from work.
623 *
624 * We also have to return BLK_STS_OK to the block core; otherwise this same
625 * request may be re-dispatched with a different tag, while our header has
626 * already gone out with the old tag, which would confuse reply handling.
627 */
628 static void nbd_sched_pending_work(struct nbd_device *nbd,
629 struct nbd_sock *nsock,
630 struct nbd_cmd *cmd, int sent)
631 {
632 struct request *req = blk_mq_rq_from_pdu(cmd);
633
634 /* pending work should be scheduled only once */
635 WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags));
636
637 nsock->pending = req;
638 nsock->sent = sent;
639 set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
640 refcount_inc(&nbd->config_refs);
641 schedule_work(&nsock->work);
642 }
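
/*
 * Partial-send flow in short (a sketch of the interaction between
 * nbd_send_cmd() and nbd_pending_cmd_work(), not extra logic):
 *
 *	nbd_send_cmd()
 *	  sock_xmit() interrupted after 'sent' bytes
 *	    -> nbd_sched_pending_work(): record nsock->pending/nsock->sent,
 *	       set NBD_CMD_PARTIAL_SEND, take a config ref, schedule nsock->work
 *	nbd_pending_cmd_work()
 *	  -> re-enters nbd_send_cmd(), which skips the bytes already sent
 *	     (iov_iter_advance()/'skip') and retries until nsock->pending is
 *	     cleared or the request deadline is reached.
 */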
643
644 /*
645 * Returns BLK_STS_RESOURCE if the caller should retry after a delay.
646 * Returns BLK_STS_IOERR if sending failed.
647 */
648 static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
649 int index)
650 {
651 struct request *req = blk_mq_rq_from_pdu(cmd);
652 struct nbd_config *config = nbd->config;
653 struct nbd_sock *nsock = config->socks[index];
654 int result;
655 struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
656 struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
657 struct iov_iter from;
658 struct bio *bio;
659 u64 handle;
660 u32 type;
661 u32 nbd_cmd_flags = 0;
662 int sent = nsock->sent, skip = 0;
663
664 lockdep_assert_held(&cmd->lock);
665 lockdep_assert_held(&nsock->tx_lock);
666
667 iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
668
669 type = req_to_nbd_cmd_type(req);
670 if (type == U32_MAX)
671 return BLK_STS_IOERR;
672
673 if (rq_data_dir(req) == WRITE &&
674 (config->flags & NBD_FLAG_READ_ONLY)) {
675 dev_err_ratelimited(disk_to_dev(nbd->disk),
676 "Write on read-only\n");
677 return BLK_STS_IOERR;
678 }
679
680 if (req->cmd_flags & REQ_FUA)
681 nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
682 if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES))
683 nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE;
684
685 /* We did a partial send previously, and we at least sent the whole
686 * request struct, so just go and send the rest of the pages in the
687 * request.
688 */
689 if (sent) {
690 if (sent >= sizeof(request)) {
691 skip = sent - sizeof(request);
692
693 /* initialize handle for tracing purposes */
694 handle = nbd_cmd_handle(cmd);
695
696 goto send_pages;
697 }
698 iov_iter_advance(&from, sent);
699 } else {
700 cmd->cmd_cookie++;
701 }
702 cmd->index = index;
703 cmd->cookie = nsock->cookie;
704 cmd->retries = 0;
705 request.type = htonl(type | nbd_cmd_flags);
706 if (type != NBD_CMD_FLUSH) {
707 request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
708 request.len = htonl(blk_rq_bytes(req));
709 }
710 handle = nbd_cmd_handle(cmd);
711 request.cookie = cpu_to_be64(handle);
712
713 trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
714
715 dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
716 req, nbdcmd_to_ascii(type),
717 (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
718 result = sock_xmit(nbd, index, 1, &from,
719 (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
720 trace_nbd_header_sent(req, handle);
721 if (result < 0) {
722 if (was_interrupted(result)) {
723 /* If we haven't sent anything we can just return BUSY,
724 * however if we have sent something we need to make
725 * sure we only allow this req to be sent until we are
726 * completely done.
727 */
728 if (sent) {
729 nbd_sched_pending_work(nbd, nsock, cmd, sent);
730 return BLK_STS_OK;
731 }
732 set_bit(NBD_CMD_REQUEUED, &cmd->flags);
733 return BLK_STS_RESOURCE;
734 }
735 dev_err_ratelimited(disk_to_dev(nbd->disk),
736 "Send control failed (result %d)\n", result);
737 goto requeue;
738 }
739 send_pages:
740 if (type != NBD_CMD_WRITE)
741 goto out;
742
743 bio = req->bio;
744 while (bio) {
745 struct bio *next = bio->bi_next;
746 struct bvec_iter iter;
747 struct bio_vec bvec;
748
749 bio_for_each_segment(bvec, bio, iter) {
750 bool is_last = !next && bio_iter_last(bvec, iter);
751 int flags = is_last ? 0 : MSG_MORE;
752
753 dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
754 req, bvec.bv_len);
755 iov_iter_bvec(&from, ITER_SOURCE, &bvec, 1, bvec.bv_len);
756 if (skip) {
757 if (skip >= iov_iter_count(&from)) {
758 skip -= iov_iter_count(&from);
759 continue;
760 }
761 iov_iter_advance(&from, skip);
762 skip = 0;
763 }
764 result = sock_xmit(nbd, index, 1, &from, flags, &sent);
765 if (result < 0) {
766 if (was_interrupted(result)) {
767 nbd_sched_pending_work(nbd, nsock, cmd, sent);
768 return BLK_STS_OK;
769 }
770 dev_err(disk_to_dev(nbd->disk),
771 "Send data failed (result %d)\n",
772 result);
773 goto requeue;
774 }
775 /*
776 * The completion might already have come in,
777 * so break for the last one instead of letting
778 * the iterator do it. This prevents use-after-free
779 * of the bio.
780 */
781 if (is_last)
782 break;
783 }
784 bio = next;
785 }
786 out:
787 trace_nbd_payload_sent(req, handle);
788 nsock->pending = NULL;
789 nsock->sent = 0;
790 __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
791 return BLK_STS_OK;
792
793 requeue:
794 /*
795 * Can't requeue in case we are dealing with a partial send.
796 *
797 * We must run from the pending work function.
798 */
799 if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))
800 return BLK_STS_OK;
801
802 /* retry on a different socket */
803 dev_err_ratelimited(disk_to_dev(nbd->disk),
804 "Request send failed, requeueing\n");
805 nbd_mark_nsock_dead(nbd, nsock, 1);
806 nbd_requeue_cmd(cmd);
807 return BLK_STS_OK;
808 }
809
810 /* handle partial sending */
811 static void nbd_pending_cmd_work(struct work_struct *work)
812 {
813 struct nbd_sock *nsock = container_of(work, struct nbd_sock, work);
814 struct request *req = nsock->pending;
815 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
816 struct nbd_device *nbd = cmd->nbd;
817 unsigned long deadline = READ_ONCE(req->deadline);
818 unsigned int wait_ms = 2;
819
820 mutex_lock(&cmd->lock);
821
822 WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags));
823 if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)))
824 goto out;
825
826 mutex_lock(&nsock->tx_lock);
827 while (true) {
828 nbd_send_cmd(nbd, cmd, cmd->index);
829 if (!nsock->pending)
830 break;
831
832 /* don't bother timeout handler for partial sending */
833 if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) {
834 cmd->status = BLK_STS_IOERR;
835 blk_mq_complete_request(req);
836 break;
837 }
838 msleep(wait_ms);
839 wait_ms *= 2;
840 }
841 mutex_unlock(&nsock->tx_lock);
842 clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
843 out:
844 mutex_unlock(&cmd->lock);
845 nbd_config_put(nbd);
846 }
847
848 static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock,
849 struct nbd_reply *reply)
850 {
851 struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)};
852 struct iov_iter to;
853 int result;
854
855 reply->magic = 0;
856 iov_iter_kvec(&to, ITER_DEST, &iov, 1, sizeof(*reply));
857 result = __sock_xmit(nbd, sock, 0, &to, MSG_WAITALL, NULL);
858 if (result < 0) {
859 if (!nbd_disconnected(nbd->config))
860 dev_err(disk_to_dev(nbd->disk),
861 "Receive control failed (result %d)\n", result);
862 return result;
863 }
864
865 if (ntohl(reply->magic) != NBD_REPLY_MAGIC) {
866 dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
867 (unsigned long)ntohl(reply->magic));
868 return -EPROTO;
869 }
870
871 return 0;
872 }
873
874 /* NULL returned = something went wrong, inform userspace */
875 static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
876 struct nbd_reply *reply)
877 {
878 int result;
879 struct nbd_cmd *cmd;
880 struct request *req = NULL;
881 u64 handle;
882 u16 hwq;
883 u32 tag;
884 int ret = 0;
885
886 handle = be64_to_cpu(reply->cookie);
887 tag = nbd_handle_to_tag(handle);
888 hwq = blk_mq_unique_tag_to_hwq(tag);
889 if (hwq < nbd->tag_set.nr_hw_queues)
890 req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
891 blk_mq_unique_tag_to_tag(tag));
892 if (!req || !blk_mq_request_started(req)) {
893 dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
894 tag, req);
895 return ERR_PTR(-ENOENT);
896 }
897 trace_nbd_header_received(req, handle);
898 cmd = blk_mq_rq_to_pdu(req);
899
900 mutex_lock(&cmd->lock);
901 if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
902 dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
903 tag, cmd->status, cmd->flags);
904 ret = -ENOENT;
905 goto out;
906 }
907 if (cmd->index != index) {
908 dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)",
909 tag, index, cmd->index);
910 ret = -ENOENT;
911 goto out;
912 }
913 if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
914 dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
915 req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
916 ret = -ENOENT;
917 goto out;
918 }
919 if (cmd->status != BLK_STS_OK) {
920 dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
921 req);
922 ret = -ENOENT;
923 goto out;
924 }
925 if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
926 dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
927 req);
928 ret = -ENOENT;
929 goto out;
930 }
931 if (ntohl(reply->error)) {
932 dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
933 ntohl(reply->error));
934 cmd->status = BLK_STS_IOERR;
935 goto out;
936 }
937
938 dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
939 if (rq_data_dir(req) != WRITE) {
940 struct req_iterator iter;
941 struct bio_vec bvec;
942 struct iov_iter to;
943
944 rq_for_each_segment(bvec, req, iter) {
945 iov_iter_bvec(&to, ITER_DEST, &bvec, 1, bvec.bv_len);
946 result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
947 if (result < 0) {
948 dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
949 result);
950 /*
951 * If we've disconnected, we need to make sure we
952 * complete this request, otherwise error out
953 * and let the timeout stuff handle resubmitting
954 * this request onto another connection.
955 */
956 if (nbd_disconnected(nbd->config)) {
957 cmd->status = BLK_STS_IOERR;
958 goto out;
959 }
960 ret = -EIO;
961 goto out;
962 }
963 dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
964 req, bvec.bv_len);
965 }
966 }
967 out:
968 trace_nbd_payload_received(req, handle);
969 mutex_unlock(&cmd->lock);
970 return ret ? ERR_PTR(ret) : cmd;
971 }
972
973 static void recv_work(struct work_struct *work)
974 {
975 struct recv_thread_args *args = container_of(work,
976 struct recv_thread_args,
977 work);
978 struct nbd_device *nbd = args->nbd;
979 struct nbd_config *config = nbd->config;
980 struct request_queue *q = nbd->disk->queue;
981 struct nbd_sock *nsock = args->nsock;
982 struct nbd_cmd *cmd;
983 struct request *rq;
984
985 while (1) {
986 struct nbd_reply reply;
987
988 if (nbd_read_reply(nbd, nsock->sock, &reply))
989 break;
990
991 /*
992 * Grab .q_usage_counter so the request pool won't go away; then no
993 * request use-after-free is possible during nbd_handle_reply().
994 * If the queue is frozen there won't be any inflight requests, so we
995 * don't need to handle the incoming garbage message.
996 */
997 if (!percpu_ref_tryget(&q->q_usage_counter)) {
998 dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n",
999 __func__);
1000 break;
1001 }
1002
1003 cmd = nbd_handle_reply(nbd, args->index, &reply);
1004 if (IS_ERR(cmd)) {
1005 percpu_ref_put(&q->q_usage_counter);
1006 break;
1007 }
1008
1009 rq = blk_mq_rq_from_pdu(cmd);
1010 if (likely(!blk_should_fake_timeout(rq->q))) {
1011 bool complete;
1012
1013 mutex_lock(&cmd->lock);
1014 complete = __test_and_clear_bit(NBD_CMD_INFLIGHT,
1015 &cmd->flags);
1016 mutex_unlock(&cmd->lock);
1017 if (complete)
1018 blk_mq_complete_request(rq);
1019 }
1020 percpu_ref_put(&q->q_usage_counter);
1021 }
1022
1023 mutex_lock(&nsock->tx_lock);
1024 nbd_mark_nsock_dead(nbd, nsock, 1);
1025 mutex_unlock(&nsock->tx_lock);
1026
1027 nbd_config_put(nbd);
1028 atomic_dec(&config->recv_threads);
1029 wake_up(&config->recv_wq);
1030 kfree(args);
1031 }
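
/*
 * Receive-path summary (describing recv_work() above): each iteration reads
 * one reply header, takes a q_usage_counter reference so the request pool
 * cannot go away, matches the reply back to its request by cookie and tag in
 * nbd_handle_reply() (which also drains any read payload), and completes the
 * request if NBD_CMD_INFLIGHT was still set. Any error breaks the loop and
 * the connection is marked dead.
 */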
1032
1033 static bool nbd_clear_req(struct request *req, void *data)
1034 {
1035 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
1036
1037 /* don't abort a completed request */
1038 if (blk_mq_request_completed(req))
1039 return true;
1040
1041 mutex_lock(&cmd->lock);
1042 if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
1043 mutex_unlock(&cmd->lock);
1044 return true;
1045 }
1046 cmd->status = BLK_STS_IOERR;
1047 mutex_unlock(&cmd->lock);
1048
1049 blk_mq_complete_request(req);
1050 return true;
1051 }
1052
1053 static void nbd_clear_que(struct nbd_device *nbd)
1054 {
1055 blk_mq_quiesce_queue(nbd->disk->queue);
1056 blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
1057 blk_mq_unquiesce_queue(nbd->disk->queue);
1058 dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
1059 }
1060
1061 static int find_fallback(struct nbd_device *nbd, int index)
1062 {
1063 struct nbd_config *config = nbd->config;
1064 int new_index = -1;
1065 struct nbd_sock *nsock = config->socks[index];
1066 int fallback = nsock->fallback_index;
1067
1068 if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
1069 return new_index;
1070
1071 if (config->num_connections <= 1) {
1072 dev_err_ratelimited(disk_to_dev(nbd->disk),
1073 "Dead connection, failed to find a fallback\n");
1074 return new_index;
1075 }
1076
1077 if (fallback >= 0 && fallback < config->num_connections &&
1078 !config->socks[fallback]->dead)
1079 return fallback;
1080
1081 if (nsock->fallback_index < 0 ||
1082 nsock->fallback_index >= config->num_connections ||
1083 config->socks[nsock->fallback_index]->dead) {
1084 int i;
1085 for (i = 0; i < config->num_connections; i++) {
1086 if (i == index)
1087 continue;
1088 if (!config->socks[i]->dead) {
1089 new_index = i;
1090 break;
1091 }
1092 }
1093 nsock->fallback_index = new_index;
1094 if (new_index < 0) {
1095 dev_err_ratelimited(disk_to_dev(nbd->disk),
1096 "Dead connection, failed to find a fallback\n");
1097 return new_index;
1098 }
1099 }
1100 new_index = nsock->fallback_index;
1101 return new_index;
1102 }
1103
1104 static int wait_for_reconnect(struct nbd_device *nbd)
1105 {
1106 struct nbd_config *config = nbd->config;
1107 if (!config->dead_conn_timeout)
1108 return 0;
1109
1110 if (!wait_event_timeout(config->conn_wait,
1111 test_bit(NBD_RT_DISCONNECTED,
1112 &config->runtime_flags) ||
1113 atomic_read(&config->live_connections) > 0,
1114 config->dead_conn_timeout))
1115 return 0;
1116
1117 return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
1118 }
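
/*
 * Submission-path recovery sketch (summarising nbd_handle_cmd() below):
 *
 *	nsock->dead?
 *	  -> find_fallback(): pick another live connection, if any
 *	  -> none left: wait_for_reconnect() for up to dead_conn_timeout
 *	       reconnected -> retry on the original index
 *	       still dead  -> sock_shutdown() and fail with BLK_STS_IOERR
 */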
1119
1120 static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index)
1121 {
1122 struct request *req = blk_mq_rq_from_pdu(cmd);
1123 struct nbd_device *nbd = cmd->nbd;
1124 struct nbd_config *config;
1125 struct nbd_sock *nsock;
1126 blk_status_t ret;
1127
1128 lockdep_assert_held(&cmd->lock);
1129
1130 config = nbd_get_config_unlocked(nbd);
1131 if (!config) {
1132 dev_err_ratelimited(disk_to_dev(nbd->disk),
1133 "Socks array is empty\n");
1134 return BLK_STS_IOERR;
1135 }
1136
1137 if (index >= config->num_connections) {
1138 dev_err_ratelimited(disk_to_dev(nbd->disk),
1139 "Attempted send on invalid socket\n");
1140 nbd_config_put(nbd);
1141 return BLK_STS_IOERR;
1142 }
1143 cmd->status = BLK_STS_OK;
1144 again:
1145 nsock = config->socks[index];
1146 mutex_lock(&nsock->tx_lock);
1147 if (nsock->dead) {
1148 int old_index = index;
1149 index = find_fallback(nbd, index);
1150 mutex_unlock(&nsock->tx_lock);
1151 if (index < 0) {
1152 if (wait_for_reconnect(nbd)) {
1153 index = old_index;
1154 goto again;
1155 }
1156 /* All the sockets should already be down at this point,
1157 * we just want to make sure that DISCONNECTED is set so
1158 * any requests that come in that were queued waiting
1159 * for the reconnect timer don't trigger the timer again
1160 * and instead just error out.
1161 */
1162 sock_shutdown(nbd);
1163 nbd_config_put(nbd);
1164 return BLK_STS_IOERR;
1165 }
1166 goto again;
1167 }
1168
1169 /* Handle the case that we have a pending request that was partially
1170 * transmitted that _has_ to be serviced first. We need to call requeue
1171 * here so that it gets put _after_ the request that is already on the
1172 * dispatch list.
1173 */
1174 blk_mq_start_request(req);
1175 if (unlikely(nsock->pending && nsock->pending != req)) {
1176 nbd_requeue_cmd(cmd);
1177 ret = BLK_STS_OK;
1178 goto out;
1179 }
1180 ret = nbd_send_cmd(nbd, cmd, index);
1181 out:
1182 mutex_unlock(&nsock->tx_lock);
1183 nbd_config_put(nbd);
1184 return ret;
1185 }
1186
1187 static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
1188 const struct blk_mq_queue_data *bd)
1189 {
1190 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
1191 blk_status_t ret;
1192
1193 /*
1194 * Since we look at the bio's to send the request over the network we
1195 * need to make sure the completion work doesn't mark this request done
1196 * before we are done doing our send. This keeps us from dereferencing
1197 * freed data if we have particularly fast completions (ie we get the
1198 * completion before we exit sock_xmit on the last bvec) or in the case
1199 * that the server is misbehaving (or there was an error) before we're
1200 * done sending everything over the wire.
1201 */
1202 mutex_lock(&cmd->lock);
1203 clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
1204
1205 /* We can be called directly from the user space process, which means we
1206 * could possibly have signals pending so our sendmsg will fail. In
1207 * this case we need to return that we are busy, otherwise error out as
1208 * appropriate.
1209 */
1210 ret = nbd_handle_cmd(cmd, hctx->queue_num);
1211 mutex_unlock(&cmd->lock);
1212
1213 return ret;
1214 }
1215
1216 static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
1217 int *err)
1218 {
1219 struct socket *sock;
1220
1221 *err = 0;
1222 sock = sockfd_lookup(fd, err);
1223 if (!sock)
1224 return NULL;
1225
1226 if (!sk_is_tcp(sock->sk) &&
1227 !sk_is_stream_unix(sock->sk)) {
1228 dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n");
1229 *err = -EINVAL;
1230 sockfd_put(sock);
1231 return NULL;
1232 }
1233
1234 if (sock->ops->shutdown == sock_no_shutdown) {
1235 dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
1236 *err = -EINVAL;
1237 sockfd_put(sock);
1238 return NULL;
1239 }
1240
1241 return sock;
1242 }
1243
1244 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
1245 bool netlink)
1246 {
1247 struct nbd_config *config = nbd->config;
1248 struct socket *sock;
1249 struct nbd_sock **socks;
1250 struct nbd_sock *nsock;
1251 unsigned int memflags;
1252 int err;
1253
1254 /* Arg will be cast to int, check it to avoid overflow */
1255 if (arg > INT_MAX)
1256 return -EINVAL;
1257 sock = nbd_get_socket(nbd, arg, &err);
1258 if (!sock)
1259 return err;
1260
1261 /*
1262 * We need to make sure we don't get any errant requests while we're
1263 * reallocating the ->socks array.
1264 */
1265 memflags = blk_mq_freeze_queue(nbd->disk->queue);
1266
1267 if (!netlink && !nbd->task_setup &&
1268 !test_bit(NBD_RT_BOUND, &config->runtime_flags))
1269 nbd->task_setup = current;
1270
1271 if (!netlink &&
1272 (nbd->task_setup != current ||
1273 test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
1274 dev_err(disk_to_dev(nbd->disk),
1275 "Device being setup by another task");
1276 err = -EBUSY;
1277 goto put_socket;
1278 }
1279
1280 nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
1281 if (!nsock) {
1282 err = -ENOMEM;
1283 goto put_socket;
1284 }
1285
1286 socks = krealloc(config->socks, (config->num_connections + 1) *
1287 sizeof(struct nbd_sock *), GFP_KERNEL);
1288 if (!socks) {
1289 kfree(nsock);
1290 err = -ENOMEM;
1291 goto put_socket;
1292 }
1293
1294 config->socks = socks;
1295
1296 nsock->fallback_index = -1;
1297 nsock->dead = false;
1298 mutex_init(&nsock->tx_lock);
1299 nsock->sock = sock;
1300 nsock->pending = NULL;
1301 nsock->sent = 0;
1302 nsock->cookie = 0;
1303 INIT_WORK(&nsock->work, nbd_pending_cmd_work);
1304 socks[config->num_connections++] = nsock;
1305 atomic_inc(&config->live_connections);
1306 blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
1307
1308 return 0;
1309
1310 put_socket:
1311 blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
1312 sockfd_put(sock);
1313 return err;
1314 }
1315
1316 static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
1317 {
1318 struct nbd_config *config = nbd->config;
1319 struct socket *sock, *old;
1320 struct recv_thread_args *args;
1321 int i;
1322 int err;
1323
1324 sock = nbd_get_socket(nbd, arg, &err);
1325 if (!sock)
1326 return err;
1327
1328 args = kzalloc(sizeof(*args), GFP_KERNEL);
1329 if (!args) {
1330 sockfd_put(sock);
1331 return -ENOMEM;
1332 }
1333
1334 for (i = 0; i < config->num_connections; i++) {
1335 struct nbd_sock *nsock = config->socks[i];
1336
1337 if (!nsock->dead)
1338 continue;
1339
1340 mutex_lock(&nsock->tx_lock);
1341 if (!nsock->dead) {
1342 mutex_unlock(&nsock->tx_lock);
1343 continue;
1344 }
1345 sk_set_memalloc(sock->sk);
1346 if (nbd->tag_set.timeout)
1347 sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
1348 atomic_inc(&config->recv_threads);
1349 refcount_inc(&nbd->config_refs);
1350 old = nsock->sock;
1351 nsock->fallback_index = -1;
1352 nsock->sock = sock;
1353 nsock->dead = false;
1354 INIT_WORK(&args->work, recv_work);
1355 args->index = i;
1356 args->nbd = nbd;
1357 args->nsock = nsock;
1358 nsock->cookie++;
1359 mutex_unlock(&nsock->tx_lock);
1360 sockfd_put(old);
1361
1362 clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
1363
1364 /* We take the tx_lock in an error path in recv_work, so we
1365 * need to queue_work outside of the tx_lock.
1366 */
1367 queue_work(nbd->recv_workq, &args->work);
1368
1369 atomic_inc(&config->live_connections);
1370 wake_up(&config->conn_wait);
1371 return 0;
1372 }
1373 sockfd_put(sock);
1374 kfree(args);
1375 return -ENOSPC;
1376 }
1377
1378 static void nbd_bdev_reset(struct nbd_device *nbd)
1379 {
1380 if (disk_openers(nbd->disk) > 1)
1381 return;
1382 set_capacity(nbd->disk, 0);
1383 }
1384
1385 static void nbd_parse_flags(struct nbd_device *nbd)
1386 {
1387 if (nbd->config->flags & NBD_FLAG_READ_ONLY)
1388 set_disk_ro(nbd->disk, true);
1389 else
1390 set_disk_ro(nbd->disk, false);
1391 }
1392
1393 static void send_disconnects(struct nbd_device *nbd)
1394 {
1395 struct nbd_config *config = nbd->config;
1396 struct nbd_request request = {
1397 .magic = htonl(NBD_REQUEST_MAGIC),
1398 .type = htonl(NBD_CMD_DISC),
1399 };
1400 struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1401 struct iov_iter from;
1402 int i, ret;
1403
1404 for (i = 0; i < config->num_connections; i++) {
1405 struct nbd_sock *nsock = config->socks[i];
1406
1407 iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
1408 mutex_lock(&nsock->tx_lock);
1409 ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1410 if (ret < 0)
1411 dev_err(disk_to_dev(nbd->disk),
1412 "Send disconnect failed %d\n", ret);
1413 mutex_unlock(&nsock->tx_lock);
1414 }
1415 }
1416
1417 static int nbd_disconnect(struct nbd_device *nbd)
1418 {
1419 struct nbd_config *config = nbd->config;
1420
1421 dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1422 set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
1423 set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
1424 send_disconnects(nbd);
1425 return 0;
1426 }
1427
1428 static void nbd_clear_sock(struct nbd_device *nbd)
1429 {
1430 sock_shutdown(nbd);
1431 nbd_clear_que(nbd);
1432 nbd->task_setup = NULL;
1433 }
1434
1435 static void nbd_config_put(struct nbd_device *nbd)
1436 {
1437 if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1438 &nbd->config_lock)) {
1439 struct nbd_config *config = nbd->config;
1440 nbd_dev_dbg_close(nbd);
1441 invalidate_disk(nbd->disk);
1442 if (nbd->config->bytesize)
1443 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
1444 if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
1445 &config->runtime_flags))
1446 device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1447 nbd->pid = 0;
1448 if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE,
1449 &config->runtime_flags)) {
1450 device_remove_file(disk_to_dev(nbd->disk), &backend_attr);
1451 kfree(nbd->backend);
1452 nbd->backend = NULL;
1453 }
1454 nbd_clear_sock(nbd);
1455 if (config->num_connections) {
1456 int i;
1457 for (i = 0; i < config->num_connections; i++) {
1458 sockfd_put(config->socks[i]->sock);
1459 kfree(config->socks[i]);
1460 }
1461 kfree(config->socks);
1462 }
1463 kfree(nbd->config);
1464 nbd->config = NULL;
1465
1466 nbd->tag_set.timeout = 0;
1467
1468 mutex_unlock(&nbd->config_lock);
1469 nbd_put(nbd);
1470 module_put(THIS_MODULE);
1471 }
1472 }
1473
1474 static int nbd_start_device(struct nbd_device *nbd)
1475 {
1476 struct nbd_config *config = nbd->config;
1477 int num_connections = config->num_connections;
1478 int error = 0, i;
1479
1480 if (nbd->pid)
1481 return -EBUSY;
1482 if (!config->socks)
1483 return -EINVAL;
1484 if (num_connections > 1 &&
1485 !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1486 dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1487 return -EINVAL;
1488 }
1489
1490 retry:
1491 mutex_unlock(&nbd->config_lock);
1492 blk_mq_update_nr_hw_queues(&nbd->tag_set, num_connections);
1493 mutex_lock(&nbd->config_lock);
1494
1495 /* if another code path updated nr_hw_queues, retry until it succeeds */
1496 if (num_connections != config->num_connections) {
1497 num_connections = config->num_connections;
1498 goto retry;
1499 }
1500
1501 nbd->pid = task_pid_nr(current);
1502
1503 nbd_parse_flags(nbd);
1504
1505 error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1506 if (error) {
1507 dev_err(disk_to_dev(nbd->disk), "device_create_file failed for pid!\n");
1508 return error;
1509 }
1510 set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
1511
1512 nbd_dev_dbg_init(nbd);
1513 for (i = 0; i < num_connections; i++) {
1514 struct recv_thread_args *args;
1515
1516 args = kzalloc(sizeof(*args), GFP_KERNEL);
1517 if (!args) {
1518 sock_shutdown(nbd);
1519 /*
1520 * If num_connections is m (m > 2) and the first n kzallocs
1521 * (1 < n < m) succeeded but allocation n + 1 failed, we still
1522 * have n recv threads running. Flush the workqueue here to
1523 * prevent those recv threads from dropping the last config
1524 * ref and trying to destroy the workqueue from inside the
1525 * workqueue.
1526 */
1527 if (i)
1528 flush_workqueue(nbd->recv_workq);
1529 return -ENOMEM;
1530 }
1531 sk_set_memalloc(config->socks[i]->sock->sk);
1532 if (nbd->tag_set.timeout)
1533 config->socks[i]->sock->sk->sk_sndtimeo =
1534 nbd->tag_set.timeout;
1535 atomic_inc(&config->recv_threads);
1536 refcount_inc(&nbd->config_refs);
1537 INIT_WORK(&args->work, recv_work);
1538 args->nbd = nbd;
1539 args->nsock = config->socks[i];
1540 args->index = i;
1541 queue_work(nbd->recv_workq, &args->work);
1542 }
1543 return nbd_set_size(nbd, config->bytesize, nbd_blksize(config));
1544 }
1545
1546 static int nbd_start_device_ioctl(struct nbd_device *nbd)
1547 {
1548 struct nbd_config *config = nbd->config;
1549 int ret;
1550
1551 ret = nbd_start_device(nbd);
1552 if (ret)
1553 return ret;
1554
1555 if (max_part)
1556 set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
1557 mutex_unlock(&nbd->config_lock);
1558 ret = wait_event_interruptible(config->recv_wq,
1559 atomic_read(&config->recv_threads) == 0);
1560 if (ret) {
1561 sock_shutdown(nbd);
1562 nbd_clear_que(nbd);
1563 }
1564
1565 flush_workqueue(nbd->recv_workq);
1566 mutex_lock(&nbd->config_lock);
1567 nbd_bdev_reset(nbd);
1568 /* user requested, ignore socket errors */
1569 if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
1570 ret = 0;
1571 if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
1572 ret = -ETIMEDOUT;
1573 return ret;
1574 }
1575
1576 static void nbd_clear_sock_ioctl(struct nbd_device *nbd)
1577 {
1578 nbd_clear_sock(nbd);
1579 disk_force_media_change(nbd->disk);
1580 nbd_bdev_reset(nbd);
1581 if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
1582 &nbd->config->runtime_flags))
1583 nbd_config_put(nbd);
1584 }
1585
1586 static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
1587 {
1588 nbd->tag_set.timeout = timeout * HZ;
1589 if (timeout)
1590 blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1591 else
1592 blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
1593 }
1594
1595 /* Must be called with config_lock held */
1596 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1597 unsigned int cmd, unsigned long arg)
1598 {
1599 struct nbd_config *config = nbd->config;
1600 loff_t bytesize;
1601
1602 switch (cmd) {
1603 case NBD_DISCONNECT:
1604 return nbd_disconnect(nbd);
1605 case NBD_CLEAR_SOCK:
1606 nbd_clear_sock_ioctl(nbd);
1607 return 0;
1608 case NBD_SET_SOCK:
1609 return nbd_add_socket(nbd, arg, false);
1610 case NBD_SET_BLKSIZE:
1611 return nbd_set_size(nbd, config->bytesize, arg);
1612 case NBD_SET_SIZE:
1613 return nbd_set_size(nbd, arg, nbd_blksize(config));
1614 case NBD_SET_SIZE_BLOCKS:
1615 if (check_shl_overflow(arg, config->blksize_bits, &bytesize))
1616 return -EINVAL;
1617 return nbd_set_size(nbd, bytesize, nbd_blksize(config));
1618 case NBD_SET_TIMEOUT:
1619 nbd_set_cmd_timeout(nbd, arg);
1620 return 0;
1621
1622 case NBD_SET_FLAGS:
1623 config->flags = arg;
1624 return 0;
1625 case NBD_DO_IT:
1626 return nbd_start_device_ioctl(nbd);
1627 case NBD_CLEAR_QUE:
1628 /*
1629 * This is for compatibility only. The queue is always cleared
1630 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1631 */
1632 return 0;
1633 case NBD_PRINT_DEBUG:
1634 /*
1635 * For compatibility only, we no longer keep a list of
1636 * outstanding requests.
1637 */
1638 return 0;
1639 }
1640 return -ENOTTY;
1641 }
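
/*
 * Roughly what a minimal ioctl-based setup from userspace looks like (a
 * hedged sketch, not the canonical nbd-client implementation; error handling
 * and the NBD network handshake are omitted, and the values are only
 * illustrative):
 *
 *	int dev = open("/dev/nbd0", O_RDWR);
 *	ioctl(dev, NBD_SET_BLKSIZE, 4096);
 *	ioctl(dev, NBD_SET_SIZE_BLOCKS, nr_blocks);
 *	ioctl(dev, NBD_SET_FLAGS, flags_from_server);
 *	ioctl(dev, NBD_SET_SOCK, connected_socket_fd);
 *	ioctl(dev, NBD_DO_IT);			// blocks until disconnect
 *	ioctl(dev, NBD_CLEAR_SOCK, 0);
 *
 * Devices configured over netlink must keep using the genetlink interface;
 * nbd_ioctl() below rejects everything except NBD_DISCONNECT and
 * NBD_CLEAR_SOCK for them.
 */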
1642
1643 static int nbd_ioctl(struct block_device *bdev, blk_mode_t mode,
1644 unsigned int cmd, unsigned long arg)
1645 {
1646 struct nbd_device *nbd = bdev->bd_disk->private_data;
1647 struct nbd_config *config = nbd->config;
1648 int error = -EINVAL;
1649
1650 if (!capable(CAP_SYS_ADMIN))
1651 return -EPERM;
1652
1653 /* The block layer will pass back some non-nbd ioctls in case we have
1654 * special handling for them, but we don't, so just return an error.
1655 */
1656 if (_IOC_TYPE(cmd) != 0xab)
1657 return -EINVAL;
1658
1659 mutex_lock(&nbd->config_lock);
1660
1661 /* Don't allow ioctl operations on a nbd device that was created with
1662 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1663 */
1664 if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
1665 (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1666 error = __nbd_ioctl(bdev, nbd, cmd, arg);
1667 else
1668 dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1669 mutex_unlock(&nbd->config_lock);
1670 return error;
1671 }
1672
1673 static int nbd_alloc_and_init_config(struct nbd_device *nbd)
1674 {
1675 struct nbd_config *config;
1676
1677 if (WARN_ON(nbd->config))
1678 return -EINVAL;
1679
1680 if (!try_module_get(THIS_MODULE))
1681 return -ENODEV;
1682
1683 config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1684 if (!config) {
1685 module_put(THIS_MODULE);
1686 return -ENOMEM;
1687 }
1688
1689 atomic_set(&config->recv_threads, 0);
1690 init_waitqueue_head(&config->recv_wq);
1691 init_waitqueue_head(&config->conn_wait);
1692 config->blksize_bits = NBD_DEF_BLKSIZE_BITS;
1693 atomic_set(&config->live_connections, 0);
1694
1695 nbd->config = config;
1696 /*
1697 * Order the nbd->config assignment before refcount_set(&nbd->config_refs, 1);
1698 * its pair is the barrier in nbd_get_config_unlocked(), so
1699 * nbd_get_config_unlocked() won't see nbd->config as NULL after
1700 * refcount_inc_not_zero() succeeds.
1701 */
1702 smp_mb__before_atomic();
1703 refcount_set(&nbd->config_refs, 1);
1704
1705 return 0;
1706 }
1707
1708 static int nbd_open(struct gendisk *disk, blk_mode_t mode)
1709 {
1710 struct nbd_device *nbd;
1711 struct nbd_config *config;
1712 int ret = 0;
1713
1714 mutex_lock(&nbd_index_mutex);
1715 nbd = disk->private_data;
1716 if (!nbd) {
1717 ret = -ENXIO;
1718 goto out;
1719 }
1720 if (!refcount_inc_not_zero(&nbd->refs)) {
1721 ret = -ENXIO;
1722 goto out;
1723 }
1724
1725 config = nbd_get_config_unlocked(nbd);
1726 if (!config) {
1727 mutex_lock(&nbd->config_lock);
1728 if (refcount_inc_not_zero(&nbd->config_refs)) {
1729 mutex_unlock(&nbd->config_lock);
1730 goto out;
1731 }
1732 ret = nbd_alloc_and_init_config(nbd);
1733 if (ret) {
1734 mutex_unlock(&nbd->config_lock);
1735 goto out;
1736 }
1737
1738 refcount_inc(&nbd->refs);
1739 mutex_unlock(&nbd->config_lock);
1740 if (max_part)
1741 set_bit(GD_NEED_PART_SCAN, &disk->state);
1742 } else if (nbd_disconnected(config)) {
1743 if (max_part)
1744 set_bit(GD_NEED_PART_SCAN, &disk->state);
1745 }
1746 out:
1747 mutex_unlock(&nbd_index_mutex);
1748 return ret;
1749 }
1750
1751 static void nbd_release(struct gendisk *disk)
1752 {
1753 struct nbd_device *nbd = disk->private_data;
1754
1755 if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
1756 disk_openers(disk) == 0)
1757 nbd_disconnect_and_put(nbd);
1758
1759 nbd_config_put(nbd);
1760 nbd_put(nbd);
1761 }
1762
1763 static void nbd_free_disk(struct gendisk *disk)
1764 {
1765 struct nbd_device *nbd = disk->private_data;
1766
1767 kfree(nbd);
1768 }
1769
1770 static const struct block_device_operations nbd_fops =
1771 {
1772 .owner = THIS_MODULE,
1773 .open = nbd_open,
1774 .release = nbd_release,
1775 .ioctl = nbd_ioctl,
1776 .compat_ioctl = nbd_ioctl,
1777 .free_disk = nbd_free_disk,
1778 };
1779
1780 #if IS_ENABLED(CONFIG_DEBUG_FS)
1781
1782 static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1783 {
1784 struct nbd_device *nbd = s->private;
1785
1786 if (nbd->pid)
1787 seq_printf(s, "recv: %d\n", nbd->pid);
1788
1789 return 0;
1790 }
1791
1792 DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
1793
1794 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1795 {
1796 struct nbd_device *nbd = s->private;
1797 u32 flags = nbd->config->flags;
1798
1799 seq_printf(s, "Hex: 0x%08x\n\n", flags);
1800
1801 seq_puts(s, "Known flags:\n");
1802
1803 if (flags & NBD_FLAG_HAS_FLAGS)
1804 seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1805 if (flags & NBD_FLAG_READ_ONLY)
1806 seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1807 if (flags & NBD_FLAG_SEND_FLUSH)
1808 seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1809 if (flags & NBD_FLAG_SEND_FUA)
1810 seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1811 if (flags & NBD_FLAG_SEND_TRIM)
1812 seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1813 if (flags & NBD_FLAG_SEND_WRITE_ZEROES)
1814 seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n");
1815 if (flags & NBD_FLAG_ROTATIONAL)
1816 seq_puts(s, "NBD_FLAG_ROTATIONAL\n");
1817
1818 return 0;
1819 }
1820
1821 DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
1822
1823 static int nbd_dev_dbg_init(struct nbd_device *nbd)
1824 {
1825 struct dentry *dir;
1826 struct nbd_config *config = nbd->config;
1827
1828 if (!nbd_dbg_dir)
1829 return -EIO;
1830
1831 dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1832 if (IS_ERR(dir)) {
1833 dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1834 nbd_name(nbd));
1835 return -EIO;
1836 }
1837 config->dbg_dir = dir;
1838
1839 debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
1840 debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1841 debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1842 debugfs_create_u32("blocksize_bits", 0444, dir, &config->blksize_bits);
1843 debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
1844
1845 return 0;
1846 }
1847
1848 static void nbd_dev_dbg_close(struct nbd_device *nbd)
1849 {
1850 debugfs_remove_recursive(nbd->config->dbg_dir);
1851 }
1852
1853 static int nbd_dbg_init(void)
1854 {
1855 struct dentry *dbg_dir;
1856
1857 dbg_dir = debugfs_create_dir("nbd", NULL);
1858 if (IS_ERR(dbg_dir))
1859 return -EIO;
1860
1861 nbd_dbg_dir = dbg_dir;
1862
1863 return 0;
1864 }
1865
1866 static void nbd_dbg_close(void)
1867 {
1868 debugfs_remove_recursive(nbd_dbg_dir);
1869 }
1870
1871 #else /* IS_ENABLED(CONFIG_DEBUG_FS) */
1872
1873 static int nbd_dev_dbg_init(struct nbd_device *nbd)
1874 {
1875 return 0;
1876 }
1877
1878 static void nbd_dev_dbg_close(struct nbd_device *nbd)
1879 {
1880 }
1881
1882 static int nbd_dbg_init(void)
1883 {
1884 return 0;
1885 }
1886
1887 static void nbd_dbg_close(void)
1888 {
1889 }
1890
1891 #endif
1892
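/*
 * Called by blk-mq once for every request in the tag set when the tag
 * set is allocated: the driver-private nbd_cmd that lives in the
 * request PDU is tied back to its nbd_device and its per-command lock
 * is initialized here, rather than on every I/O.
 */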
1893 static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1894 unsigned int hctx_idx, unsigned int numa_node)
1895 {
1896 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1897 cmd->nbd = set->driver_data;
1898 cmd->flags = 0;
1899 mutex_init(&cmd->lock);
1900 return 0;
1901 }
1902
1903 static const struct blk_mq_ops nbd_mq_ops = {
1904 .queue_rq = nbd_queue_rq,
1905 .complete = nbd_complete_rq,
1906 .init_request = nbd_init_request,
1907 .timeout = nbd_xmit_timeout,
1908 };
1909
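/*
 * Allocate and publish one nbd device: set up a single-hw-queue,
 * depth-128 blocking tag set (request submission may sleep on socket
 * I/O), reserve a slot in nbd_index_idr (a specific index if requested,
 * otherwise the first free one), allocate the gendisk and the
 * per-device recv workqueue, and only after add_disk() succeeds set
 * nbd->refs to the caller-supplied count so other threads can start
 * using the device.
 */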
1910 static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
1911 {
1912 struct queue_limits lim = {
1913 .max_hw_sectors = 65536,
1914 .io_opt = 256 << SECTOR_SHIFT,
1915 .max_segments = USHRT_MAX,
1916 .max_segment_size = UINT_MAX,
1917 };
1918 struct nbd_device *nbd;
1919 struct gendisk *disk;
1920 int err = -ENOMEM;
1921
1922 nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1923 if (!nbd)
1924 goto out;
1925
1926 nbd->tag_set.ops = &nbd_mq_ops;
1927 nbd->tag_set.nr_hw_queues = 1;
1928 nbd->tag_set.queue_depth = 128;
1929 nbd->tag_set.numa_node = NUMA_NO_NODE;
1930 nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1931 nbd->tag_set.flags = BLK_MQ_F_BLOCKING;
1932 nbd->tag_set.driver_data = nbd;
1933 INIT_WORK(&nbd->remove_work, nbd_dev_remove_work);
1934 nbd->backend = NULL;
1935
1936 err = blk_mq_alloc_tag_set(&nbd->tag_set);
1937 if (err)
1938 goto out_free_nbd;
1939
1940 mutex_lock(&nbd_index_mutex);
1941 if (index >= 0) {
1942 err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1943 GFP_KERNEL);
1944 if (err == -ENOSPC)
1945 err = -EEXIST;
1946 } else {
1947 err = idr_alloc(&nbd_index_idr, nbd, 0,
1948 (MINORMASK >> part_shift) + 1, GFP_KERNEL);
1949 if (err >= 0)
1950 index = err;
1951 }
1952 nbd->index = index;
1953 mutex_unlock(&nbd_index_mutex);
1954 if (err < 0)
1955 goto out_free_tags;
1956
1957 disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL);
1958 if (IS_ERR(disk)) {
1959 err = PTR_ERR(disk);
1960 goto out_free_idr;
1961 }
1962 nbd->disk = disk;
1963
1964 nbd->recv_workq = alloc_workqueue("nbd%d-recv",
1965 WQ_MEM_RECLAIM | WQ_HIGHPRI |
1966 WQ_UNBOUND, 0, nbd->index);
1967 if (!nbd->recv_workq) {
1968 dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
1969 err = -ENOMEM;
1970 goto out_err_disk;
1971 }
1972
1973 mutex_init(&nbd->config_lock);
1974 refcount_set(&nbd->config_refs, 0);
1975 /*
1976 * Start out with zero references to keep other threads from using
1977 * this device until it is fully initialized.
1978 */
1979 refcount_set(&nbd->refs, 0);
1980 INIT_LIST_HEAD(&nbd->list);
1981 disk->major = NBD_MAJOR;
1982 disk->first_minor = index << part_shift;
1983 disk->minors = 1 << part_shift;
1984 disk->fops = &nbd_fops;
1985 disk->private_data = nbd;
1986 sprintf(disk->disk_name, "nbd%d", index);
1987 err = add_disk(disk);
1988 if (err)
1989 goto out_free_work;
1990
1991 /*
1992 * Now publish the device.
1993 */
1994 refcount_set(&nbd->refs, refs);
1995 nbd_total_devices++;
1996 return nbd;
1997
1998 out_free_work:
1999 destroy_workqueue(nbd->recv_workq);
2000 out_err_disk:
2001 put_disk(disk);
2002 out_free_idr:
2003 mutex_lock(&nbd_index_mutex);
2004 idr_remove(&nbd_index_idr, index);
2005 mutex_unlock(&nbd_index_mutex);
2006 out_free_tags:
2007 blk_mq_free_tag_set(&nbd->tag_set);
2008 out_free_nbd:
2009 kfree(nbd);
2010 out:
2011 return ERR_PTR(err);
2012 }
2013
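/*
 * Find a device that currently has no configuration and is not marked
 * for destroy-on-disconnect, and take a reference on it.  Used by the
 * netlink connect path when no explicit index was given.  Must be
 * called with nbd_index_mutex held.
 */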
2014 static struct nbd_device *nbd_find_get_unused(void)
2015 {
2016 struct nbd_device *nbd;
2017 int id;
2018
2019 lockdep_assert_held(&nbd_index_mutex);
2020
2021 idr_for_each_entry(&nbd_index_idr, nbd, id) {
2022 if (refcount_read(&nbd->config_refs) ||
2023 test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
2024 continue;
2025 if (refcount_inc_not_zero(&nbd->refs))
2026 return nbd;
2027 }
2028
2029 return NULL;
2030 }
2031
2032 /* Netlink interface. */
2033 static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
2034 [NBD_ATTR_INDEX] = { .type = NLA_U32 },
2035 [NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 },
2036 [NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 },
2037 [NBD_ATTR_TIMEOUT] = { .type = NLA_U64 },
2038 [NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 },
2039 [NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 },
2040 [NBD_ATTR_SOCKETS] = { .type = NLA_NESTED},
2041 [NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 },
2042 [NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED},
2043 [NBD_ATTR_BACKEND_IDENTIFIER] = { .type = NLA_STRING},
2044 };
2045
2046 static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
2047 [NBD_SOCK_FD] = { .type = NLA_U32 },
2048 };
2049
2050 /* We don't use this right now since we don't parse the incoming list, but we
2051 * still want it here so userspace knows what to expect.
2052 */
2053 static const struct nla_policy __attribute__((unused))
2054 nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
2055 [NBD_DEVICE_INDEX] = { .type = NLA_U32 },
2056 [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 },
2057 };
2058
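/*
 * Apply NBD_ATTR_SIZE_BYTES / NBD_ATTR_BLOCK_SIZE_BYTES from a netlink
 * request, falling back to the current config values for whichever
 * attribute is absent; the device is only resized if something actually
 * changed.
 */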
2059 static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
2060 {
2061 struct nbd_config *config = nbd->config;
2062 u64 bsize = nbd_blksize(config);
2063 u64 bytes = config->bytesize;
2064
2065 if (info->attrs[NBD_ATTR_SIZE_BYTES])
2066 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
2067
2068 if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES])
2069 bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
2070
2071 if (bytes != config->bytesize || bsize != nbd_blksize(config))
2072 return nbd_set_size(nbd, bytes, bsize);
2073 return 0;
2074 }
2075
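/*
 * NBD_CMD_CONNECT handler: find or create the requested device, attach
 * the socket fds carried in NBD_ATTR_SOCKETS, apply the size, timeout
 * and flag attributes, and start the device.  For reference, a minimal
 * userspace request could be built roughly like the sketch below
 * (illustrative only, assuming libnl-genl-3; "size" and "sock_fd" are
 * placeholders and all error handling is omitted):
 *
 *	struct nl_sock *sk = nl_socket_alloc();
 *	genl_connect(sk);
 *	int family = genl_ctrl_resolve(sk, NBD_GENL_FAMILY_NAME);
 *
 *	struct nl_msg *msg = nlmsg_alloc();
 *	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
 *		    NBD_CMD_CONNECT, NBD_GENL_VERSION);
 *	nla_put_u64(msg, NBD_ATTR_SIZE_BYTES, size);
 *	nla_put_u64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, 4096);
 *
 *	struct nlattr *socks = nla_nest_start(msg, NBD_ATTR_SOCKETS);
 *	struct nlattr *item = nla_nest_start(msg, NBD_SOCK_ITEM);
 *	nla_put_u32(msg, NBD_SOCK_FD, sock_fd);
 *	nla_nest_end(msg, item);
 *	nla_nest_end(msg, socks);
 *	nl_send_auto(sk, msg);
 *
 * Omitting NBD_ATTR_INDEX asks the driver to pick (or create) a free
 * device; the chosen index is sent back via nbd_connect_reply().
 */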
2076 static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
2077 {
2078 struct nbd_device *nbd;
2079 struct nbd_config *config;
2080 int index = -1;
2081 int ret;
2082 bool put_dev = false;
2083
2084 if (!netlink_capable(skb, CAP_SYS_ADMIN))
2085 return -EPERM;
2086
2087 if (info->attrs[NBD_ATTR_INDEX]) {
2088 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2089
2090 /*
2091 * A first_minor that is too large can cause duplicate creation of
2092 * sysfs files/links, since index << part_shift might overflow, or
2093 * exceed the 20 bits that MKDEV() expects first_minor to fit in.
2094 */
2095 if (index < 0 || index > MINORMASK >> part_shift) {
2096 pr_err("illegal input index %d\n", index);
2097 return -EINVAL;
2098 }
2099 }
2100 if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SOCKETS)) {
2101 pr_err("must specify at least one socket\n");
2102 return -EINVAL;
2103 }
2104 if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SIZE_BYTES)) {
2105 pr_err("must specify a size in bytes for the device\n");
2106 return -EINVAL;
2107 }
2108 again:
2109 mutex_lock(&nbd_index_mutex);
2110 if (index == -1) {
2111 nbd = nbd_find_get_unused();
2112 } else {
2113 nbd = idr_find(&nbd_index_idr, index);
2114 if (nbd) {
2115 if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
2116 test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
2117 !refcount_inc_not_zero(&nbd->refs)) {
2118 mutex_unlock(&nbd_index_mutex);
2119 pr_err("device at index %d is going down\n",
2120 index);
2121 return -EINVAL;
2122 }
2123 }
2124 }
2125 mutex_unlock(&nbd_index_mutex);
2126
2127 if (!nbd) {
2128 nbd = nbd_dev_add(index, 2);
2129 if (IS_ERR(nbd)) {
2130 pr_err("failed to add new device\n");
2131 return PTR_ERR(nbd);
2132 }
2133 }
2134
2135 mutex_lock(&nbd->config_lock);
2136 if (refcount_read(&nbd->config_refs)) {
2137 mutex_unlock(&nbd->config_lock);
2138 nbd_put(nbd);
2139 if (index == -1)
2140 goto again;
2141 pr_err("nbd%d already in use\n", index);
2142 return -EBUSY;
2143 }
2144
2145 ret = nbd_alloc_and_init_config(nbd);
2146 if (ret) {
2147 mutex_unlock(&nbd->config_lock);
2148 nbd_put(nbd);
2149 pr_err("couldn't allocate config\n");
2150 return ret;
2151 }
2152
2153 config = nbd->config;
2154 set_bit(NBD_RT_BOUND, &config->runtime_flags);
2155 ret = nbd_genl_size_set(info, nbd);
2156 if (ret)
2157 goto out;
2158
2159 if (info->attrs[NBD_ATTR_TIMEOUT])
2160 nbd_set_cmd_timeout(nbd,
2161 nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
2162 if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
2163 config->dead_conn_timeout =
2164 nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
2165 config->dead_conn_timeout *= HZ;
2166 }
2167 if (info->attrs[NBD_ATTR_SERVER_FLAGS])
2168 config->flags =
2169 nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
2170 if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
2171 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
2172 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
2173 /*
2174 * We have 1 ref to keep the device around, and then 1
2175 * ref for our current operation here, which will be
2176 * inherited by the config. If we already have
2177 * DESTROY_ON_DISCONNECT set, then we know that extra
2178 * ref is not already held, so we don't need the
2179 * put_dev.
2180 */
2181 if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
2182 &nbd->flags))
2183 put_dev = true;
2184 } else {
2185 if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
2186 &nbd->flags))
2187 refcount_inc(&nbd->refs);
2188 }
2189 if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
2190 set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2191 &config->runtime_flags);
2192 }
2193 }
2194
2195 if (info->attrs[NBD_ATTR_SOCKETS]) {
2196 struct nlattr *attr;
2197 int rem, fd;
2198
2199 nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
2200 rem) {
2201 struct nlattr *socks[NBD_SOCK_MAX+1];
2202
2203 if (nla_type(attr) != NBD_SOCK_ITEM) {
2204 pr_err("socks must be embedded in a SOCK_ITEM attr\n");
2205 ret = -EINVAL;
2206 goto out;
2207 }
2208 ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
2209 attr,
2210 nbd_sock_policy,
2211 info->extack);
2212 if (ret != 0) {
2213 pr_err("error processing sock list\n");
2214 ret = -EINVAL;
2215 goto out;
2216 }
2217 if (!socks[NBD_SOCK_FD])
2218 continue;
2219 fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2220 ret = nbd_add_socket(nbd, fd, true);
2221 if (ret)
2222 goto out;
2223 }
2224 }
2225
2226 if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
2227 nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
2228 GFP_KERNEL);
2229 if (!nbd->backend) {
2230 ret = -ENOMEM;
2231 goto out;
2232 }
2233 }
2234 ret = device_create_file(disk_to_dev(nbd->disk), &backend_attr);
2235 if (ret) {
2236 dev_err(disk_to_dev(nbd->disk),
2237 "device_create_file failed for backend!\n");
2238 goto out;
2239 }
2240 set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags);
2241
2242 ret = nbd_start_device(nbd);
2243 out:
2244 mutex_unlock(&nbd->config_lock);
2245 if (!ret) {
2246 set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
2247 refcount_inc(&nbd->config_refs);
2248 nbd_connect_reply(info, nbd->index);
2249 }
2250 nbd_config_put(nbd);
2251 if (put_dev)
2252 nbd_put(nbd);
2253 return ret;
2254 }
2255
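/*
 * Tear down a running device: request a disconnect from the server,
 * shut the sockets down, wait for the recv workers to finish so the
 * queue can be cleared safely, and drop the extra config reference
 * taken at connect time if it is still held.
 */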
2256 static void nbd_disconnect_and_put(struct nbd_device *nbd)
2257 {
2258 mutex_lock(&nbd->config_lock);
2259 nbd_disconnect(nbd);
2260 sock_shutdown(nbd);
2261 wake_up(&nbd->config->conn_wait);
2262 /*
2263 * Make sure the recv thread has finished so that we can safely call
2264 * nbd_clear_que() to cancel the inflight I/Os.
2265 */
2266 flush_workqueue(nbd->recv_workq);
2267 nbd_clear_que(nbd);
2268 nbd->task_setup = NULL;
2269 clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags);
2270 mutex_unlock(&nbd->config_lock);
2271
2272 if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
2273 &nbd->config->runtime_flags))
2274 nbd_config_put(nbd);
2275 }
2276
2277 static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
2278 {
2279 struct nbd_device *nbd;
2280 int index;
2281
2282 if (!netlink_capable(skb, CAP_SYS_ADMIN))
2283 return -EPERM;
2284
2285 if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) {
2286 pr_err("must specify an index to disconnect\n");
2287 return -EINVAL;
2288 }
2289 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2290 mutex_lock(&nbd_index_mutex);
2291 nbd = idr_find(&nbd_index_idr, index);
2292 if (!nbd) {
2293 mutex_unlock(&nbd_index_mutex);
2294 pr_err("couldn't find device at index %d\n", index);
2295 return -EINVAL;
2296 }
2297 if (!refcount_inc_not_zero(&nbd->refs)) {
2298 mutex_unlock(&nbd_index_mutex);
2299 pr_err("device at index %d is going down\n", index);
2300 return -EINVAL;
2301 }
2302 mutex_unlock(&nbd_index_mutex);
2303 if (!refcount_inc_not_zero(&nbd->config_refs))
2304 goto put_nbd;
2305 nbd_disconnect_and_put(nbd);
2306 nbd_config_put(nbd);
2307 put_nbd:
2308 nbd_put(nbd);
2309 return 0;
2310 }
2311
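/*
 * NBD_CMD_RECONFIGURE handler: for a device that is already bound and
 * running, update the timeouts and client flags and feed in replacement
 * sockets for dead links via nbd_reconnect_socket().  If the device was
 * connected with a backend identifier, the same identifier must be
 * supplied again, so a reconfigure cannot be applied to a device that
 * has since been reused for a different backend.
 */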
2312 static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
2313 {
2314 struct nbd_device *nbd = NULL;
2315 struct nbd_config *config;
2316 int index;
2317 int ret = 0;
2318 bool put_dev = false;
2319
2320 if (!netlink_capable(skb, CAP_SYS_ADMIN))
2321 return -EPERM;
2322
2323 if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) {
2324 pr_err("must specify a device to reconfigure\n");
2325 return -EINVAL;
2326 }
2327 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2328 mutex_lock(&nbd_index_mutex);
2329 nbd = idr_find(&nbd_index_idr, index);
2330 if (!nbd) {
2331 mutex_unlock(&nbd_index_mutex);
2332 pr_err("couldn't find a device at index %d\n", index);
2333 return -EINVAL;
2334 }
2335 if (nbd->backend) {
2336 if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
2337 if (nla_strcmp(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
2338 nbd->backend)) {
2339 mutex_unlock(&nbd_index_mutex);
2340 dev_err(nbd_to_dev(nbd),
2341 "backend image doesn't match with %s\n",
2342 nbd->backend);
2343 return -EINVAL;
2344 }
2345 } else {
2346 mutex_unlock(&nbd_index_mutex);
2347 dev_err(nbd_to_dev(nbd), "must specify backend\n");
2348 return -EINVAL;
2349 }
2350 }
2351 if (!refcount_inc_not_zero(&nbd->refs)) {
2352 mutex_unlock(&nbd_index_mutex);
2353 pr_err("device at index %d is going down\n", index);
2354 return -EINVAL;
2355 }
2356 mutex_unlock(&nbd_index_mutex);
2357
2358 config = nbd_get_config_unlocked(nbd);
2359 if (!config) {
2360 dev_err(nbd_to_dev(nbd),
2361 "not configured, cannot reconfigure\n");
2362 nbd_put(nbd);
2363 return -EINVAL;
2364 }
2365
2366 mutex_lock(&nbd->config_lock);
2367 if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
2368 !nbd->pid) {
2369 dev_err(nbd_to_dev(nbd),
2370 "not configured, cannot reconfigure\n");
2371 ret = -EINVAL;
2372 goto out;
2373 }
2374
2375 ret = nbd_genl_size_set(info, nbd);
2376 if (ret)
2377 goto out;
2378
2379 if (info->attrs[NBD_ATTR_TIMEOUT])
2380 nbd_set_cmd_timeout(nbd,
2381 nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
2382 if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
2383 config->dead_conn_timeout =
2384 nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
2385 config->dead_conn_timeout *= HZ;
2386 }
2387 if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
2388 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
2389 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
2390 if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
2391 &nbd->flags))
2392 put_dev = true;
2393 } else {
2394 if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
2395 &nbd->flags))
2396 refcount_inc(&nbd->refs);
2397 }
2398
2399 if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
2400 set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2401 &config->runtime_flags);
2402 } else {
2403 clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2404 &config->runtime_flags);
2405 }
2406 }
2407
2408 if (info->attrs[NBD_ATTR_SOCKETS]) {
2409 struct nlattr *attr;
2410 int rem, fd;
2411
2412 nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
2413 rem) {
2414 struct nlattr *socks[NBD_SOCK_MAX+1];
2415
2416 if (nla_type(attr) != NBD_SOCK_ITEM) {
2417 pr_err("socks must be embedded in a SOCK_ITEM attr\n");
2418 ret = -EINVAL;
2419 goto out;
2420 }
2421 ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
2422 attr,
2423 nbd_sock_policy,
2424 info->extack);
2425 if (ret != 0) {
2426 pr_err("error processing sock list\n");
2427 ret = -EINVAL;
2428 goto out;
2429 }
2430 if (!socks[NBD_SOCK_FD])
2431 continue;
2432 fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2433 ret = nbd_reconnect_socket(nbd, fd);
2434 if (ret) {
2435 if (ret == -ENOSPC)
2436 ret = 0;
2437 goto out;
2438 }
2439 dev_info(nbd_to_dev(nbd), "reconnected socket\n");
2440 }
2441 }
2442 out:
2443 mutex_unlock(&nbd->config_lock);
2444 nbd_config_put(nbd);
2445 nbd_put(nbd);
2446 if (put_dev)
2447 nbd_put(nbd);
2448 return ret;
2449 }
2450
2451 static const struct genl_small_ops nbd_connect_genl_ops[] = {
2452 {
2453 .cmd = NBD_CMD_CONNECT,
2454 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2455 .doit = nbd_genl_connect,
2456 },
2457 {
2458 .cmd = NBD_CMD_DISCONNECT,
2459 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2460 .doit = nbd_genl_disconnect,
2461 },
2462 {
2463 .cmd = NBD_CMD_RECONFIGURE,
2464 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2465 .doit = nbd_genl_reconfigure,
2466 },
2467 {
2468 .cmd = NBD_CMD_STATUS,
2469 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2470 .doit = nbd_genl_status,
2471 },
2472 };
2473
2474 static const struct genl_multicast_group nbd_mcast_grps[] = {
2475 { .name = NBD_GENL_MCAST_GROUP_NAME, },
2476 };
2477
2478 static struct genl_family nbd_genl_family __ro_after_init = {
2479 .hdrsize = 0,
2480 .name = NBD_GENL_FAMILY_NAME,
2481 .version = NBD_GENL_VERSION,
2482 .module = THIS_MODULE,
2483 .small_ops = nbd_connect_genl_ops,
2484 .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops),
2485 .resv_start_op = NBD_CMD_STATUS + 1,
2486 .maxattr = NBD_ATTR_MAX,
2487 .netnsok = 1,
2488 .policy = nbd_attr_policy,
2489 .mcgrps = nbd_mcast_grps,
2490 .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
2491 };
2492 MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME);
2493
2494 static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
2495 {
2496 struct nlattr *dev_opt;
2497 u8 connected = 0;
2498 int ret;
2499
2500 /* This is a little racy, but for status it's ok. We
2501 * don't take a ref here because in the index == -1
2502 * case we would have to drop that ref under the
2503 * nbd_index_mutex, which could deadlock if we are
2504 * configured to remove ourselves once we're
2505 * disconnected.
2506 */
2507 if (refcount_read(&nbd->config_refs))
2508 connected = 1;
2509 dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
2510 if (!dev_opt)
2511 return -EMSGSIZE;
2512 ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
2513 if (ret)
2514 return -EMSGSIZE;
2515 ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
2516 connected);
2517 if (ret)
2518 return -EMSGSIZE;
2519 nla_nest_end(reply, dev_opt);
2520 return 0;
2521 }
2522
2523 static int status_cb(int id, void *ptr, void *data)
2524 {
2525 struct nbd_device *nbd = ptr;
2526 return populate_nbd_status(nbd, (struct sk_buff *)data);
2527 }
2528
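/*
 * NBD_CMD_STATUS handler: reply with an NBD_ATTR_DEVICE_LIST nest that
 * contains one NBD_DEVICE_ITEM (index plus connected flag) per device,
 * or just the one device if NBD_ATTR_INDEX was supplied.
 */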
2529 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2530 {
2531 struct nlattr *dev_list;
2532 struct sk_buff *reply;
2533 void *reply_head;
2534 size_t msg_size;
2535 int index = -1;
2536 int ret = -ENOMEM;
2537
2538 if (info->attrs[NBD_ATTR_INDEX])
2539 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2540
2541 mutex_lock(&nbd_index_mutex);
2542
2543 msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2544 nla_attr_size(sizeof(u8)));
2545 msg_size *= (index == -1) ? nbd_total_devices : 1;
2546
2547 reply = genlmsg_new(msg_size, GFP_KERNEL);
2548 if (!reply)
2549 goto out;
2550 reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2551 NBD_CMD_STATUS);
2552 if (!reply_head) {
2553 nlmsg_free(reply);
2554 goto out;
2555 }
2556
2557 dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
2558 if (!dev_list) {
2559 nlmsg_free(reply);
2560 ret = -EMSGSIZE;
2561 goto out;
2562 }
2563
2564 if (index == -1) {
2565 ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2566 if (ret) {
2567 nlmsg_free(reply);
2568 goto out;
2569 }
2570 } else {
2571 struct nbd_device *nbd;
2572 nbd = idr_find(&nbd_index_idr, index);
2573 if (nbd) {
2574 ret = populate_nbd_status(nbd, reply);
2575 if (ret) {
2576 nlmsg_free(reply);
2577 goto out;
2578 }
2579 }
2580 }
2581 nla_nest_end(reply, dev_list);
2582 genlmsg_end(reply, reply_head);
2583 ret = genlmsg_reply(reply, info);
2584 out:
2585 mutex_unlock(&nbd_index_mutex);
2586 return ret;
2587 }
2588
2589 static void nbd_connect_reply(struct genl_info *info, int index)
2590 {
2591 struct sk_buff *skb;
2592 void *msg_head;
2593 int ret;
2594
2595 skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2596 if (!skb)
2597 return;
2598 msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2599 NBD_CMD_CONNECT);
2600 if (!msg_head) {
2601 nlmsg_free(skb);
2602 return;
2603 }
2604 ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2605 if (ret) {
2606 nlmsg_free(skb);
2607 return;
2608 }
2609 genlmsg_end(skb, msg_head);
2610 genlmsg_reply(skb, info);
2611 }
2612
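/*
 * Broadcast NBD_CMD_LINK_DEAD with the device index on the driver's
 * multicast group (NBD_GENL_MCAST_GROUP_NAME) so that a userspace
 * daemon can notice the dead connection and supply a replacement
 * socket via NBD_CMD_RECONFIGURE.
 */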
2613 static void nbd_mcast_index(int index)
2614 {
2615 struct sk_buff *skb;
2616 void *msg_head;
2617 int ret;
2618
2619 skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2620 if (!skb)
2621 return;
2622 msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2623 NBD_CMD_LINK_DEAD);
2624 if (!msg_head) {
2625 nlmsg_free(skb);
2626 return;
2627 }
2628 ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2629 if (ret) {
2630 nlmsg_free(skb);
2631 return;
2632 }
2633 genlmsg_end(skb, msg_head);
2634 genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2635 }
2636
2637 static void nbd_dead_link_work(struct work_struct *work)
2638 {
2639 struct link_dead_args *args = container_of(work, struct link_dead_args,
2640 work);
2641 nbd_mcast_index(args->index);
2642 kfree(args);
2643 }
2644
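/*
 * Module init: validate max_part and derive part_shift from it,
 * register the NBD major, create the deferred-removal workqueue and the
 * kernel cred, register the netlink family, set up debugfs, and
 * pre-create nbds_max legacy device nodes.  For example (illustrative;
 * the parameters are the module parameters declared at the end of this
 * file):
 *
 *	modprobe nbd nbds_max=4
 *
 * pre-creates nbd0..nbd3; additional devices can still be created on
 * demand through NBD_CMD_CONNECT.
 */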
2645 static int __init nbd_init(void)
2646 {
2647 int i;
2648
2649 BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2650
2651 if (max_part < 0) {
2652 pr_err("max_part must be >= 0\n");
2653 return -EINVAL;
2654 }
2655
2656 part_shift = 0;
2657 if (max_part > 0) {
2658 part_shift = fls(max_part);
2659
2660 /*
2661 * Adjust max_part according to part_shift as it is exported
2662 * to user space so that users can know the maximum number of
2663 * partitions the kernel should be able to manage.
2664 *
2665 * Note that -1 is required because partition 0 is reserved
2666 * for the whole disk.
2667 */
2668 max_part = (1UL << part_shift) - 1;
2669 }
2670
2671 if ((1UL << part_shift) > DISK_MAX_PARTS)
2672 return -EINVAL;
2673
2674 if (nbds_max > 1UL << (MINORBITS - part_shift))
2675 return -EINVAL;
2676
2677 if (register_blkdev(NBD_MAJOR, "nbd"))
2678 return -EIO;
2679
2680 nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0);
2681 if (!nbd_del_wq) {
2682 unregister_blkdev(NBD_MAJOR, "nbd");
2683 return -ENOMEM;
2684 }
2685
2686 nbd_cred = prepare_kernel_cred(&init_task);
2687 if (!nbd_cred) {
2688 destroy_workqueue(nbd_del_wq);
2689 unregister_blkdev(NBD_MAJOR, "nbd");
2690 return -ENOMEM;
2691 }
2692
2693 if (genl_register_family(&nbd_genl_family)) {
2694 put_cred(nbd_cred);
2695 destroy_workqueue(nbd_del_wq);
2696 unregister_blkdev(NBD_MAJOR, "nbd");
2697 return -EINVAL;
2698 }
2699 nbd_dbg_init();
2700
2701 for (i = 0; i < nbds_max; i++)
2702 nbd_dev_add(i, 1);
2703 return 0;
2704 }
2705
2706 static int nbd_exit_cb(int id, void *ptr, void *data)
2707 {
2708 struct list_head *list = (struct list_head *)data;
2709 struct nbd_device *nbd = ptr;
2710
2711 /* Skip any nbd device that is being removed asynchronously */
2712 if (refcount_read(&nbd->refs))
2713 list_add_tail(&nbd->list, list);
2714
2715 return 0;
2716 }
2717
2718 static void __exit nbd_cleanup(void)
2719 {
2720 struct nbd_device *nbd;
2721 LIST_HEAD(del_list);
2722
2723 /*
2724 * Unregister the netlink interface prior to waiting
2725 * for the completion of netlink commands.
2726 */
2727 genl_unregister_family(&nbd_genl_family);
2728
2729 nbd_dbg_close();
2730
2731 mutex_lock(&nbd_index_mutex);
2732 idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2733 mutex_unlock(&nbd_index_mutex);
2734
2735 while (!list_empty(&del_list)) {
2736 nbd = list_first_entry(&del_list, struct nbd_device, list);
2737 list_del_init(&nbd->list);
2738 if (refcount_read(&nbd->config_refs))
2739 pr_err("possibly leaking nbd_config (ref %d)\n",
2740 refcount_read(&nbd->config_refs));
2741 if (refcount_read(&nbd->refs) != 1)
2742 pr_err("possibly leaking a device\n");
2743 nbd_put(nbd);
2744 }
2745
2746 /* Also wait for nbd_dev_remove_work() to complete */
2747 destroy_workqueue(nbd_del_wq);
2748
2749 put_cred(nbd_cred);
2750 idr_destroy(&nbd_index_idr);
2751 unregister_blkdev(NBD_MAJOR, "nbd");
2752 }
2753
2754 module_init(nbd_init);
2755 module_exit(nbd_cleanup);
2756
2757 MODULE_DESCRIPTION("Network Block Device");
2758 MODULE_LICENSE("GPL");
2759
2760 module_param(nbds_max, int, 0444);
2761 MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2762 module_param(max_part, int, 0444);
2763 MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
2764