xref: /kvmtool/virtio/net.c (revision 8b27bcff44fd4adaa1466e3198e4222816eefa06)
1 #include "kvm/virtio-pci-dev.h"
2 #include "kvm/virtio-net.h"
3 #include "kvm/virtio.h"
4 #include "kvm/mutex.h"
5 #include "kvm/util.h"
6 #include "kvm/kvm.h"
7 #include "kvm/irq.h"
8 #include "kvm/uip.h"
9 #include "kvm/guest_compat.h"
10 #include "kvm/iovec.h"
11 #include "kvm/strbuf.h"
12 
13 #include <linux/vhost.h>
14 #include <linux/virtio_net.h>
15 #include <linux/if_tun.h>
16 #include <linux/types.h>
17 
18 #include <arpa/inet.h>
19 #include <net/if.h>
20 
21 #include <unistd.h>
22 #include <fcntl.h>
23 
24 #include <sys/socket.h>
25 #include <sys/ioctl.h>
26 #include <sys/types.h>
27 #include <sys/wait.h>
28 #include <sys/eventfd.h>
29 
/* Ring size used for every virtqueue of the device (see init_vq/get_size_vq). */
#define VIRTIO_NET_QUEUE_SIZE		256
/* Upper bound on the number of RX/TX queue pairs a device may expose. */
#define VIRTIO_NET_NUM_QUEUES		8
32 
33 struct net_dev;
34 
35 struct net_dev_operations {
36 	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
37 	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
38 };
39 
/* Per-virtqueue state; one entry of net_dev::queues, filled in by init_vq(). */
struct net_dev_queue {
	/* Virtqueue index within the device (set to the vq number in init_vq) */
	int				id;
	/* Back pointer to the owning device */
	struct net_dev			*ndev;
	struct virt_queue		vq;
	/* Worker thread (rx, tx or ctrl) servicing this queue */
	pthread_t			thread;
	/* lock/cond: used by virtio_net_handle_callback() to wake the worker */
	struct mutex			lock;
	pthread_cond_t			cond;
	/* gsi/irqfd: recorded by notify_vq_gsi() when vhost is in use, released in exit_vq() */
	int				gsi;
	int				irqfd;
};
50 
/* State of one virtio-net device instance. */
struct net_dev {
	struct mutex			mutex;
	struct virtio_device		vdev;
	/* Link in the file-scope ndevs list */
	struct list_head		list;

	/* Indexed by virtqueue number; the control queue sits at queue_pairs * 2 */
	struct net_dev_queue		queues[VIRTIO_NET_NUM_QUEUES * 2 + 1];
	/* Device config space handed out by get_config() */
	struct virtio_net_config	config;
	/* Number of RX/TX pairs, clamped to 1..VIRTIO_NET_NUM_QUEUES at init */
	u32				queue_pairs;

	/* /dev/vhost-net descriptor; 0 is treated as "vhost not in use" */
	int				vhost_fd;
	int				tap_fd;
	char				tap_name[IFNAMSIZ];
	/* True when the TAP device accepted TUN_F_UFO (see virtio_net__tap_create) */
	bool				tap_ufo;

	/* NET_MODE_TAP or NET_MODE_USER, copied from params->mode */
	int				mode;

	/* State for the user-mode (uip) backend */
	struct uip_info			info;
	/* Backend rx/tx hooks: &tap_ops or &uip_ops */
	struct net_dev_operations	*ops;
	struct kvm			*kvm;

	/* Parameters this device was created from (not owned by the device) */
	struct virtio_net_params	*params;
};
73 
/* Every device created by virtio_net__init_one(); torn down in virtio_net__exit(). */
static LIST_HEAD(ndevs);
/* Id of the compat message; -1 until it is registered in virtio_net__init_one(). */
static int compat_id = -1;

/* Payload capacity of the RX bounce buffer used by virtio_net_rx_thread(). */
#define MAX_PACKET_SIZE 65550
78 
79 static bool has_virtio_feature(struct net_dev *ndev, u32 feature)
80 {
81 	return ndev->vdev.features & (1 << feature);
82 }
83 
/*
 * Worker thread for one receive virtqueue.
 *
 * @p is the struct net_dev_queue this thread services. Each packet is first
 * read from the backend (ndev->ops->rx) into an on-stack bounce buffer and
 * then copied into one or more guest buffer chains. The thread only leaves
 * its loop when the backend rx hook reports an error.
 */
static void *virtio_net_rx_thread(void *p)
{
	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
	struct net_dev_queue *queue = p;
	struct virt_queue *vq = &queue->vq;
	struct net_dev *ndev = queue->ndev;
	struct kvm *kvm;
	u16 out, in;
	u16 head;
	int len, copied;

	kvm__set_thread_name("virtio-net-rx");

	kvm = ndev->kvm;
	while (1) {
		/* Sleep until virtio_net_handle_callback() signals this queue */
		mutex_lock(&queue->lock);
		if (!virt_queue__available(vq))
			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
		mutex_unlock(&queue->lock);

		while (virt_queue__available(vq)) {
			/* Bounce buffer: room for the largest packet plus the mergeable-rx header */
			unsigned char buffer[MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)];
			struct iovec dummy_iov = {
				.iov_base = buffer,
				.iov_len  = sizeof(buffer),
			};
			struct virtio_net_hdr_mrg_rxbuf *hdr;
			u16 num_buffers;

			/* Fetch one packet from the backend before touching the ring */
			len = ndev->ops->rx(&dummy_iov, 1, ndev);
			if (len < 0) {
				pr_warning("%s: rx on vq %u failed (%d), exiting thread\n",
						__func__, queue->id, len);
				goto out_err;
			}

			copied = num_buffers = 0;
			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
			/* The header lives at the start of the first guest buffer */
			hdr = iov[0].iov_base;
			/*
			 * Spread the packet over as many guest buffer chains as it
			 * takes; each chain gets its own used element.
			 */
			while (copied < len) {
				size_t iovsize = min_t(size_t, len - copied, iov_size(iov, in));

				memcpy_toiovec(iov, buffer + copied, iovsize);
				copied += iovsize;
				virt_queue__set_used_elem_no_update(vq, head, iovsize, num_buffers++);
				if (copied == len)
					break;
				/* Packet not finished: poll until the guest posts another buffer */
				while (!virt_queue__available(vq))
					sleep(0);
				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
			}

			/* Only the mergeable-rx header layout carries num_buffers */
			if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
				hdr->num_buffers = virtio_host_to_guest_u16(vq, num_buffers);

			/* NOTE(review): presumably publishes the num_buffers used elements at once - confirm */
			virt_queue__used_idx_advance(vq, num_buffers);

			/* We should interrupt guest right now, otherwise latency is huge. */
			if (virtio_queue__should_signal(vq))
				ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
		}
	}

out_err:
	pthread_exit(NULL);
	return NULL;

}
152 
153 static void *virtio_net_tx_thread(void *p)
154 {
155 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
156 	struct net_dev_queue *queue = p;
157 	struct virt_queue *vq = &queue->vq;
158 	struct net_dev *ndev = queue->ndev;
159 	struct kvm *kvm;
160 	u16 out, in;
161 	u16 head;
162 	int len;
163 
164 	kvm__set_thread_name("virtio-net-tx");
165 
166 	kvm = ndev->kvm;
167 
168 	while (1) {
169 		mutex_lock(&queue->lock);
170 		if (!virt_queue__available(vq))
171 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
172 		mutex_unlock(&queue->lock);
173 
174 		while (virt_queue__available(vq)) {
175 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
176 			len = ndev->ops->tx(iov, out, ndev);
177 			if (len < 0) {
178 				pr_warning("%s: tx on vq %u failed (%d)\n",
179 						__func__, queue->id, errno);
180 				goto out_err;
181 			}
182 
183 			virt_queue__set_used_elem(vq, head, len);
184 		}
185 
186 		if (virtio_queue__should_signal(vq))
187 			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
188 	}
189 
190 out_err:
191 	pthread_exit(NULL);
192 	return NULL;
193 }
194 
195 static virtio_net_ctrl_ack virtio_net_handle_mq(struct kvm* kvm, struct net_dev *ndev, struct virtio_net_ctrl_hdr *ctrl)
196 {
197 	/* Not much to do here */
198 	return VIRTIO_NET_OK;
199 }
200 
201 static void *virtio_net_ctrl_thread(void *p)
202 {
203 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
204 	struct net_dev_queue *queue = p;
205 	struct virt_queue *vq = &queue->vq;
206 	struct net_dev *ndev = queue->ndev;
207 	u16 out, in, head;
208 	struct kvm *kvm = ndev->kvm;
209 	struct virtio_net_ctrl_hdr *ctrl;
210 	virtio_net_ctrl_ack *ack;
211 
212 	kvm__set_thread_name("virtio-net-ctrl");
213 
214 	while (1) {
215 		mutex_lock(&queue->lock);
216 		if (!virt_queue__available(vq))
217 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
218 		mutex_unlock(&queue->lock);
219 
220 		while (virt_queue__available(vq)) {
221 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
222 			ctrl = iov[0].iov_base;
223 			ack = iov[out].iov_base;
224 
225 			switch (ctrl->class) {
226 			case VIRTIO_NET_CTRL_MQ:
227 				*ack = virtio_net_handle_mq(kvm, ndev, ctrl);
228 				break;
229 			default:
230 				*ack = VIRTIO_NET_ERR;
231 				break;
232 			}
233 			virt_queue__set_used_elem(vq, head, iov[out].iov_len);
234 		}
235 
236 		if (virtio_queue__should_signal(vq))
237 			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
238 	}
239 
240 	pthread_exit(NULL);
241 
242 	return NULL;
243 }
244 
245 static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue)
246 {
247 	struct net_dev_queue *net_queue = &ndev->queues[queue];
248 
249 	if ((u32)queue >= (ndev->queue_pairs * 2 + 1)) {
250 		pr_warning("Unknown queue index %u", queue);
251 		return;
252 	}
253 
254 	mutex_lock(&net_queue->lock);
255 	pthread_cond_signal(&net_queue->cond);
256 	mutex_unlock(&net_queue->lock);
257 }
258 
259 static int virtio_net_request_tap(struct net_dev *ndev, struct ifreq *ifr,
260 				  const char *tapname)
261 {
262 	int ret;
263 
264 	memset(ifr, 0, sizeof(*ifr));
265 	ifr->ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
266 	if (tapname)
267 		strlcpy(ifr->ifr_name, tapname, sizeof(ifr->ifr_name));
268 
269 	ret = ioctl(ndev->tap_fd, TUNSETIFF, ifr);
270 
271 	if (ret >= 0)
272 		strlcpy(ndev->tap_name, ifr->ifr_name, sizeof(ndev->tap_name));
273 	return ret;
274 }
275 
/*
 * Run @script with @tap_name as its only argument and wait for it.
 *
 * Returns 0 when the script ran, and -1 when the child could not be
 * created, could not be waited for, or exited with a non-zero status.
 * A script terminated by a signal is still reported as 0, as before.
 */
static int virtio_net_exec_script(const char* script, const char *tap_name)
{
	pid_t pid;
	int status;

	pid = fork();
	if (pid < 0) {
		/* Without this check waitpid(-1, ...) would reap an arbitrary child */
		pr_warning("Fail to setup tap by %s", script);
		return -1;
	}

	if (pid == 0) {
		execl(script, script, tap_name, (char *)NULL);
		/* Only reached when exec failed */
		_exit(1);
	}

	/* Retry when interrupted by a signal; any other failure leaves status unset */
	while (waitpid(pid, &status, 0) < 0) {
		if (errno == EINTR)
			continue;
		pr_warning("Fail to setup tap by %s", script);
		return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
		pr_warning("Fail to setup tap by %s", script);
		return -1;
	}

	return 0;
}
294 
295 static bool virtio_net__tap_init(struct net_dev *ndev)
296 {
297 	int sock = socket(AF_INET, SOCK_STREAM, 0);
298 	int hdr_len;
299 	struct sockaddr_in sin = {0};
300 	struct ifreq ifr;
301 	const struct virtio_net_params *params = ndev->params;
302 	bool skipconf = !!params->tapif;
303 
304 	hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
305 			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
306 			sizeof(struct virtio_net_hdr);
307 	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
308 		pr_warning("Config tap device TUNSETVNETHDRSZ error");
309 
310 	if (strcmp(params->script, "none")) {
311 		if (virtio_net_exec_script(params->script, ndev->tap_name) < 0)
312 			goto fail;
313 	} else if (!skipconf) {
314 		memset(&ifr, 0, sizeof(ifr));
315 		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
316 		sin.sin_addr.s_addr = inet_addr(params->host_ip);
317 		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
318 		ifr.ifr_addr.sa_family = AF_INET;
319 		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
320 			pr_warning("Could not set ip address on tap device");
321 			goto fail;
322 		}
323 	}
324 
325 	if (!skipconf) {
326 		memset(&ifr, 0, sizeof(ifr));
327 		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
328 		ioctl(sock, SIOCGIFFLAGS, &ifr);
329 		ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
330 		if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
331 			pr_warning("Could not bring tap device up");
332 	}
333 
334 	close(sock);
335 
336 	return 1;
337 
338 fail:
339 	if (sock >= 0)
340 		close(sock);
341 	if (ndev->tap_fd >= 0)
342 		close(ndev->tap_fd);
343 
344 	return 0;
345 }
346 
347 static void virtio_net__tap_exit(struct net_dev *ndev)
348 {
349 	int sock;
350 	struct ifreq ifr;
351 
352 	if (ndev->params->tapif)
353 		return;
354 
355 	sock = socket(AF_INET, SOCK_STREAM, 0);
356 	strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
357 	ioctl(sock, SIOCGIFFLAGS, &ifr);
358 	ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING);
359 	if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0)
360 		pr_warning("Count not bring tap device down");
361 	close(sock);
362 }
363 
364 static bool virtio_net__tap_create(struct net_dev *ndev)
365 {
366 	int offload;
367 	struct ifreq ifr;
368 	const struct virtio_net_params *params = ndev->params;
369 	bool macvtap = (!!params->tapif) && (params->tapif[0] == '/');
370 
371 	/* Did the user already gave us the FD? */
372 	if (params->fd)
373 		ndev->tap_fd = params->fd;
374 	else {
375 		const char *tap_file = "/dev/net/tun";
376 
377 		/* Did the user ask us to use macvtap? */
378 		if (macvtap)
379 			tap_file = params->tapif;
380 
381 		ndev->tap_fd = open(tap_file, O_RDWR);
382 		if (ndev->tap_fd < 0) {
383 			pr_warning("Unable to open %s", tap_file);
384 			return 0;
385 		}
386 	}
387 
388 	if (!macvtap &&
389 	    virtio_net_request_tap(ndev, &ifr, params->tapif) < 0) {
390 		pr_warning("Config tap device error. Are you root?");
391 		goto fail;
392 	}
393 
394 	/*
395 	 * The UFO support had been removed from kernel in commit:
396 	 * ID: fb652fdfe83710da0ca13448a41b7ed027d0a984
397 	 * https://www.spinics.net/lists/netdev/msg443562.html
398 	 * In oder to support the older kernels without this commit,
399 	 * we set the TUN_F_UFO to offload by default to test the status of
400 	 * UFO kernel support.
401 	 */
402 	ndev->tap_ufo = true;
403 	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
404 	if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
405 		/*
406 		 * Is this failure caused by kernel remove the UFO support?
407 		 * Try TUNSETOFFLOAD without TUN_F_UFO.
408 		 */
409 		offload &= ~TUN_F_UFO;
410 		if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
411 			pr_warning("Config tap device TUNSETOFFLOAD error");
412 			goto fail;
413 		}
414 		ndev->tap_ufo = false;
415 	}
416 
417 	return 1;
418 
419 fail:
420 	if ((ndev->tap_fd >= 0) || (!params->fd) )
421 		close(ndev->tap_fd);
422 
423 	return 0;
424 }
425 
426 static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
427 {
428 	return writev(ndev->tap_fd, iov, out);
429 }
430 
431 static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
432 {
433 	return readv(ndev->tap_fd, iov, in);
434 }
435 
436 static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
437 {
438 	return uip_tx(iov, out, &ndev->info);
439 }
440 
441 static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
442 {
443 	return uip_rx(iov, in, &ndev->info);
444 }
445 
/* Backend hooks selected when the device runs in NET_MODE_TAP. */
static struct net_dev_operations tap_ops = {
	.rx	= tap_ops_rx,
	.tx	= tap_ops_tx,
};
450 
/* Backend hooks selected for every mode other than NET_MODE_TAP (user-mode uip stack). */
static struct net_dev_operations uip_ops = {
	.rx	= uip_ops_rx,
	.tx	= uip_ops_tx,
};
455 
456 static u8 *get_config(struct kvm *kvm, void *dev)
457 {
458 	struct net_dev *ndev = dev;
459 
460 	return ((u8 *)(&ndev->config));
461 }
462 
463 static size_t get_config_size(struct kvm *kvm, void *dev)
464 {
465 	struct net_dev *ndev = dev;
466 
467 	return sizeof(ndev->config);
468 }
469 
470 static u32 get_host_features(struct kvm *kvm, void *dev)
471 {
472 	u32 features;
473 	struct net_dev *ndev = dev;
474 
475 	features = 1UL << VIRTIO_NET_F_MAC
476 		| 1UL << VIRTIO_NET_F_CSUM
477 		| 1UL << VIRTIO_NET_F_HOST_TSO4
478 		| 1UL << VIRTIO_NET_F_HOST_TSO6
479 		| 1UL << VIRTIO_NET_F_GUEST_TSO4
480 		| 1UL << VIRTIO_NET_F_GUEST_TSO6
481 		| 1UL << VIRTIO_RING_F_EVENT_IDX
482 		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
483 		| 1UL << VIRTIO_NET_F_CTRL_VQ
484 		| 1UL << VIRTIO_NET_F_MRG_RXBUF
485 		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
486 
487 	/*
488 	 * The UFO feature for host and guest only can be enabled when the
489 	 * kernel has TAP UFO support.
490 	 */
491 	if (ndev->tap_ufo)
492 		features |= (1UL << VIRTIO_NET_F_HOST_UFO
493 				| 1UL << VIRTIO_NET_F_GUEST_UFO);
494 
495 	return features;
496 }
497 
498 static int virtio_net__vhost_set_features(struct net_dev *ndev)
499 {
500 	u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX;
501 	u64 vhost_features;
502 
503 	if (ioctl(ndev->vhost_fd, VHOST_GET_FEATURES, &vhost_features) != 0)
504 		die_perror("VHOST_GET_FEATURES failed");
505 
506 	/* make sure both side support mergable rx buffers */
507 	if (vhost_features & 1UL << VIRTIO_NET_F_MRG_RXBUF &&
508 			has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
509 		features |= 1UL << VIRTIO_NET_F_MRG_RXBUF;
510 
511 	return ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features);
512 }
513 
514 static void virtio_net_start(struct net_dev *ndev)
515 {
516 	if (ndev->mode == NET_MODE_TAP) {
517 		if (!virtio_net__tap_init(ndev))
518 			die_perror("TAP device initialized failed because");
519 
520 		if (ndev->vhost_fd &&
521 				virtio_net__vhost_set_features(ndev) != 0)
522 			die_perror("VHOST_SET_FEATURES failed");
523 	} else {
524 		ndev->info.vnet_hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
525 						sizeof(struct virtio_net_hdr_mrg_rxbuf) :
526 						sizeof(struct virtio_net_hdr);
527 		uip_init(&ndev->info);
528 	}
529 }
530 
531 static void virtio_net_stop(struct net_dev *ndev)
532 {
533 	/* Undo whatever start() did */
534 	if (ndev->mode == NET_MODE_TAP)
535 		virtio_net__tap_exit(ndev);
536 	else
537 		uip_exit(&ndev->info);
538 }
539 
540 static void virtio_net_update_endian(struct net_dev *ndev)
541 {
542 	struct virtio_net_config *conf = &ndev->config;
543 
544 	conf->status = virtio_host_to_guest_u16(&ndev->vdev,
545 						VIRTIO_NET_S_LINK_UP);
546 	conf->max_virtqueue_pairs = virtio_host_to_guest_u16(&ndev->vdev,
547 							     ndev->queue_pairs);
548 
549 	/* Let TAP know about vnet header endianness */
550 	if (ndev->mode == NET_MODE_TAP &&
551 	    ndev->vdev.endian != VIRTIO_ENDIAN_HOST) {
552 		int enable_val = 1, disable_val = 0;
553 		int enable_req, disable_req;
554 
555 		if (ndev->vdev.endian == VIRTIO_ENDIAN_LE) {
556 			enable_req = TUNSETVNETLE;
557 			disable_req = TUNSETVNETBE;
558 		} else {
559 			enable_req = TUNSETVNETBE;
560 			disable_req = TUNSETVNETLE;
561 		}
562 
563 		ioctl(ndev->tap_fd, disable_req, &disable_val);
564 		if (ioctl(ndev->tap_fd, enable_req, &enable_val) < 0)
565 			pr_err("Config tap device TUNSETVNETLE/BE error");
566 	}
567 }
568 
569 static void notify_status(struct kvm *kvm, void *dev, u32 status)
570 {
571 	struct net_dev *ndev = dev;
572 
573 	if (status & VIRTIO__STATUS_CONFIG)
574 		virtio_net_update_endian(ndev);
575 
576 	if (status & VIRTIO__STATUS_START)
577 		virtio_net_start(dev);
578 	else if (status & VIRTIO__STATUS_STOP)
579 		virtio_net_stop(dev);
580 }
581 
582 static bool is_ctrl_vq(struct net_dev *ndev, u32 vq)
583 {
584 	return vq == (u32)(ndev->queue_pairs * 2);
585 }
586 
587 static int init_vq(struct kvm *kvm, void *dev, u32 vq)
588 {
589 	struct vhost_vring_state state = { .index = vq };
590 	struct net_dev_queue *net_queue;
591 	struct vhost_vring_addr addr;
592 	struct net_dev *ndev = dev;
593 	struct virt_queue *queue;
594 	int r;
595 
596 	compat__remove_message(compat_id);
597 
598 	net_queue	= &ndev->queues[vq];
599 	net_queue->id	= vq;
600 	net_queue->ndev	= ndev;
601 	queue		= &net_queue->vq;
602 	virtio_init_device_vq(kvm, &ndev->vdev, queue, VIRTIO_NET_QUEUE_SIZE);
603 
604 	mutex_init(&net_queue->lock);
605 	pthread_cond_init(&net_queue->cond, NULL);
606 	if (is_ctrl_vq(ndev, vq)) {
607 		pthread_create(&net_queue->thread, NULL, virtio_net_ctrl_thread,
608 			       net_queue);
609 
610 		return 0;
611 	} else if (ndev->vhost_fd == 0 ) {
612 		if (vq & 1)
613 			pthread_create(&net_queue->thread, NULL,
614 				       virtio_net_tx_thread, net_queue);
615 		else
616 			pthread_create(&net_queue->thread, NULL,
617 				       virtio_net_rx_thread, net_queue);
618 
619 		return 0;
620 	}
621 
622 	if (queue->endian != VIRTIO_ENDIAN_HOST)
623 		die_perror("VHOST requires the same endianness in guest and host");
624 
625 	state.num = queue->vring.num;
626 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state);
627 	if (r < 0)
628 		die_perror("VHOST_SET_VRING_NUM failed");
629 	state.num = 0;
630 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state);
631 	if (r < 0)
632 		die_perror("VHOST_SET_VRING_BASE failed");
633 
634 	addr = (struct vhost_vring_addr) {
635 		.index = vq,
636 		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
637 		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
638 		.used_user_addr = (u64)(unsigned long)queue->vring.used,
639 	};
640 
641 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
642 	if (r < 0)
643 		die_perror("VHOST_SET_VRING_ADDR failed");
644 
645 	return 0;
646 }
647 
648 static void exit_vq(struct kvm *kvm, void *dev, u32 vq)
649 {
650 	struct net_dev *ndev = dev;
651 	struct net_dev_queue *queue = &ndev->queues[vq];
652 
653 	if (!is_ctrl_vq(ndev, vq) && queue->gsi) {
654 		irq__del_irqfd(kvm, queue->gsi, queue->irqfd);
655 		close(queue->irqfd);
656 		queue->gsi = queue->irqfd = 0;
657 	}
658 
659 	/*
660 	 * TODO: vhost reset owner. It's the only way to cleanly stop vhost, but
661 	 * we can't restart it at the moment.
662 	 */
663 	if (ndev->vhost_fd && !is_ctrl_vq(ndev, vq)) {
664 		pr_warning("Cannot reset VHOST queue");
665 		ioctl(ndev->vhost_fd, VHOST_RESET_OWNER);
666 		return;
667 	}
668 
669 	/*
670 	 * Threads are waiting on cancellation points (readv or
671 	 * pthread_cond_wait) and should stop gracefully.
672 	 */
673 	pthread_cancel(queue->thread);
674 	pthread_join(queue->thread, NULL);
675 }
676 
677 static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
678 {
679 	struct net_dev *ndev = dev;
680 	struct net_dev_queue *queue = &ndev->queues[vq];
681 	struct vhost_vring_file file;
682 	int r;
683 
684 	if (ndev->vhost_fd == 0)
685 		return;
686 
687 	file = (struct vhost_vring_file) {
688 		.index	= vq,
689 		.fd	= eventfd(0, 0),
690 	};
691 
692 	r = irq__add_irqfd(kvm, gsi, file.fd, -1);
693 	if (r < 0)
694 		die_perror("KVM_IRQFD failed");
695 
696 	queue->irqfd = file.fd;
697 	queue->gsi = gsi;
698 
699 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file);
700 	if (r < 0)
701 		die_perror("VHOST_SET_VRING_CALL failed");
702 	file.fd = ndev->tap_fd;
703 	r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file);
704 	if (r != 0)
705 		die("VHOST_NET_SET_BACKEND failed %d", errno);
706 
707 }
708 
709 static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
710 {
711 	struct net_dev *ndev = dev;
712 	struct vhost_vring_file file = {
713 		.index	= vq,
714 		.fd	= efd,
715 	};
716 	int r;
717 
718 	if (ndev->vhost_fd == 0 || is_ctrl_vq(ndev, vq))
719 		return;
720 
721 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file);
722 	if (r < 0)
723 		die_perror("VHOST_SET_VRING_KICK failed");
724 }
725 
726 static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
727 {
728 	struct net_dev *ndev = dev;
729 
730 	virtio_net_handle_callback(kvm, ndev, vq);
731 
732 	return 0;
733 }
734 
735 static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
736 {
737 	struct net_dev *ndev = dev;
738 
739 	return &ndev->queues[vq].vq;
740 }
741 
742 static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
743 {
744 	/* FIXME: dynamic */
745 	return VIRTIO_NET_QUEUE_SIZE;
746 }
747 
748 static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
749 {
750 	/* FIXME: dynamic */
751 	return size;
752 }
753 
754 static unsigned int get_vq_count(struct kvm *kvm, void *dev)
755 {
756 	struct net_dev *ndev = dev;
757 
758 	return ndev->queue_pairs * 2 + 1;
759 }
760 
/*
 * Template of the virtio callbacks implemented by this file.
 * virtio_net__init_one() copies it into a per-device allocation before
 * handing it to virtio_init().
 */
static struct virtio_ops net_dev_virtio_ops = {
	.get_config		= get_config,
	.get_config_size	= get_config_size,
	.get_host_features	= get_host_features,
	.get_vq_count		= get_vq_count,
	.init_vq		= init_vq,
	.exit_vq		= exit_vq,
	.get_vq			= get_vq,
	.get_size_vq		= get_size_vq,
	.set_size_vq		= set_size_vq,
	.notify_vq		= notify_vq,
	.notify_vq_gsi		= notify_vq_gsi,
	.notify_vq_eventfd	= notify_vq_eventfd,
	.notify_status		= notify_status,
};
776 
777 static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev)
778 {
779 	struct kvm_mem_bank *bank;
780 	struct vhost_memory *mem;
781 	int r, i;
782 
783 	ndev->vhost_fd = open("/dev/vhost-net", O_RDWR);
784 	if (ndev->vhost_fd < 0)
785 		die_perror("Failed openning vhost-net device");
786 
787 	mem = calloc(1, sizeof(*mem) + kvm->mem_slots * sizeof(struct vhost_memory_region));
788 	if (mem == NULL)
789 		die("Failed allocating memory for vhost memory map");
790 
791 	i = 0;
792 	list_for_each_entry(bank, &kvm->mem_banks, list) {
793 		mem->regions[i] = (struct vhost_memory_region) {
794 			.guest_phys_addr = bank->guest_phys_addr,
795 			.memory_size	 = bank->size,
796 			.userspace_addr	 = (unsigned long)bank->host_addr,
797 		};
798 		i++;
799 	}
800 	mem->nregions = i;
801 
802 	r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER);
803 	if (r != 0)
804 		die_perror("VHOST_SET_OWNER failed");
805 
806 	r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
807 	if (r != 0)
808 		die_perror("VHOST_SET_MEM_TABLE failed");
809 
810 	ndev->vdev.use_vhost = true;
811 
812 	free(mem);
813 }
814 
/*
 * Parse a colon-separated hexadecimal MAC address from @str into the six
 * bytes at @mac. Bytes after the first field that fails to parse are left
 * untouched.
 */
static inline void str_to_mac(const char *str, char *mac)
{
	char *b0 = mac, *b1 = mac + 1, *b2 = mac + 2;
	char *b3 = mac + 3, *b4 = mac + 4, *b5 = mac + 5;

	sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", b0, b1, b2, b3, b4, b5);
}
820 static int set_net_param(struct kvm *kvm, struct virtio_net_params *p,
821 			const char *param, const char *val)
822 {
823 	if (strcmp(param, "guest_mac") == 0) {
824 		str_to_mac(val, p->guest_mac);
825 	} else if (strcmp(param, "mode") == 0) {
826 		if (!strncmp(val, "user", 4)) {
827 			int i;
828 
829 			for (i = 0; i < kvm->cfg.num_net_devices; i++)
830 				if (kvm->cfg.net_params[i].mode == NET_MODE_USER)
831 					die("Only one usermode network device allowed at a time");
832 			p->mode = NET_MODE_USER;
833 		} else if (!strncmp(val, "tap", 3)) {
834 			p->mode = NET_MODE_TAP;
835 		} else if (!strncmp(val, "none", 4)) {
836 			kvm->cfg.no_net = 1;
837 			return -1;
838 		} else
839 			die("Unknown network mode %s, please use user, tap or none", kvm->cfg.network);
840 	} else if (strcmp(param, "script") == 0) {
841 		p->script = strdup(val);
842 	} else if (strcmp(param, "downscript") == 0) {
843 		p->downscript = strdup(val);
844 	} else if (strcmp(param, "guest_ip") == 0) {
845 		p->guest_ip = strdup(val);
846 	} else if (strcmp(param, "host_ip") == 0) {
847 		p->host_ip = strdup(val);
848 	} else if (strcmp(param, "trans") == 0) {
849 		p->trans = strdup(val);
850 	} else if (strcmp(param, "tapif") == 0) {
851 		p->tapif = strdup(val);
852 	} else if (strcmp(param, "vhost") == 0) {
853 		p->vhost = atoi(val);
854 	} else if (strcmp(param, "fd") == 0) {
855 		p->fd = atoi(val);
856 	} else if (strcmp(param, "mq") == 0) {
857 		p->mq = atoi(val);
858 	} else
859 		die("Unknown network parameter %s", param);
860 
861 	return 0;
862 }
863 
864 int netdev_parser(const struct option *opt, const char *arg, int unset)
865 {
866 	struct virtio_net_params p;
867 	char *buf = NULL, *cmd = NULL, *cur = NULL;
868 	bool on_cmd = true;
869 	struct kvm *kvm = opt->ptr;
870 
871 	if (arg) {
872 		buf = strdup(arg);
873 		if (buf == NULL)
874 			die("Failed allocating new net buffer");
875 		cur = strtok(buf, ",=");
876 	}
877 
878 	p = (struct virtio_net_params) {
879 		.guest_ip	= DEFAULT_GUEST_ADDR,
880 		.host_ip	= DEFAULT_HOST_ADDR,
881 		.script		= DEFAULT_SCRIPT,
882 		.downscript	= DEFAULT_SCRIPT,
883 		.mode		= NET_MODE_TAP,
884 	};
885 
886 	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
887 	p.guest_mac[5] += kvm->cfg.num_net_devices;
888 
889 	while (cur) {
890 		if (on_cmd) {
891 			cmd = cur;
892 		} else {
893 			if (set_net_param(kvm, &p, cmd, cur) < 0)
894 				goto done;
895 		}
896 		on_cmd = !on_cmd;
897 
898 		cur = strtok(NULL, ",=");
899 	};
900 
901 	kvm->cfg.num_net_devices++;
902 
903 	kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params));
904 	if (kvm->cfg.net_params == NULL)
905 		die("Failed adding new network device");
906 
907 	kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p;
908 
909 done:
910 	free(buf);
911 	return 0;
912 }
913 
914 static int virtio_net__init_one(struct virtio_net_params *params)
915 {
916 	int i, r;
917 	struct net_dev *ndev;
918 	struct virtio_ops *ops;
919 	enum virtio_trans trans = VIRTIO_DEFAULT_TRANS(params->kvm);
920 
921 	ndev = calloc(1, sizeof(struct net_dev));
922 	if (ndev == NULL)
923 		return -ENOMEM;
924 
925 	list_add_tail(&ndev->list, &ndevs);
926 
927 	ops = malloc(sizeof(*ops));
928 	if (ops == NULL)
929 		return -ENOMEM;
930 
931 	ndev->kvm = params->kvm;
932 	ndev->params = params;
933 
934 	mutex_init(&ndev->mutex);
935 	ndev->queue_pairs = max(1, min(VIRTIO_NET_NUM_QUEUES, params->mq));
936 
937 	for (i = 0 ; i < 6 ; i++) {
938 		ndev->config.mac[i]		= params->guest_mac[i];
939 		ndev->info.guest_mac.addr[i]	= params->guest_mac[i];
940 		ndev->info.host_mac.addr[i]	= params->host_mac[i];
941 	}
942 
943 	ndev->mode = params->mode;
944 	if (ndev->mode == NET_MODE_TAP) {
945 		ndev->ops = &tap_ops;
946 		if (!virtio_net__tap_create(ndev))
947 			die_perror("You have requested a TAP device, but creation of one has failed because");
948 	} else {
949 		ndev->info.host_ip		= ntohl(inet_addr(params->host_ip));
950 		ndev->info.guest_ip		= ntohl(inet_addr(params->guest_ip));
951 		ndev->info.guest_netmask	= ntohl(inet_addr("255.255.255.0"));
952 		ndev->info.buf_nr		= 20,
953 		ndev->ops = &uip_ops;
954 		uip_static_init(&ndev->info);
955 	}
956 
957 	*ops = net_dev_virtio_ops;
958 
959 	if (params->trans) {
960 		if (strcmp(params->trans, "mmio") == 0)
961 			trans = VIRTIO_MMIO;
962 		else if (strcmp(params->trans, "pci") == 0)
963 			trans = VIRTIO_PCI;
964 		else
965 			pr_warning("virtio-net: Unknown transport method : %s, "
966 				   "falling back to %s.", params->trans,
967 				   virtio_trans_name(trans));
968 	}
969 
970 	r = virtio_init(params->kvm, ndev, &ndev->vdev, ops, trans,
971 			PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
972 	if (r < 0) {
973 		free(ops);
974 		return r;
975 	}
976 
977 	if (params->vhost)
978 		virtio_net__vhost_init(params->kvm, ndev);
979 
980 	if (compat_id == -1)
981 		compat_id = virtio_compat_add_message("virtio-net", "CONFIG_VIRTIO_NET");
982 
983 	return 0;
984 }
985 
986 int virtio_net__init(struct kvm *kvm)
987 {
988 	int i, r;
989 
990 	for (i = 0; i < kvm->cfg.num_net_devices; i++) {
991 		kvm->cfg.net_params[i].kvm = kvm;
992 		r = virtio_net__init_one(&kvm->cfg.net_params[i]);
993 		if (r < 0)
994 			goto cleanup;
995 	}
996 
997 	if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) {
998 		static struct virtio_net_params net_params;
999 
1000 		net_params = (struct virtio_net_params) {
1001 			.guest_ip	= kvm->cfg.guest_ip,
1002 			.host_ip	= kvm->cfg.host_ip,
1003 			.kvm		= kvm,
1004 			.script		= kvm->cfg.script,
1005 			.mode		= NET_MODE_USER,
1006 		};
1007 		str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac);
1008 		str_to_mac(kvm->cfg.host_mac, net_params.host_mac);
1009 
1010 		r = virtio_net__init_one(&net_params);
1011 		if (r < 0)
1012 			goto cleanup;
1013 	}
1014 
1015 	return 0;
1016 
1017 cleanup:
1018 	virtio_net__exit(kvm);
1019 	return r;
1020 }
1021 virtio_dev_init(virtio_net__init);
1022 
1023 int virtio_net__exit(struct kvm *kvm)
1024 {
1025 	struct virtio_net_params *params;
1026 	struct net_dev *ndev;
1027 	struct list_head *ptr, *n;
1028 
1029 	list_for_each_safe(ptr, n, &ndevs) {
1030 		ndev = list_entry(ptr, struct net_dev, list);
1031 		params = ndev->params;
1032 		/* Cleanup any tap device which attached to bridge */
1033 		if (ndev->mode == NET_MODE_TAP &&
1034 		    strcmp(params->downscript, "none"))
1035 			virtio_net_exec_script(params->downscript, ndev->tap_name);
1036 
1037 		list_del(&ndev->list);
1038 		free(ndev);
1039 	}
1040 	return 0;
1041 }
1042 virtio_dev_exit(virtio_net__exit);
1043