xref: /kvmtool/virtio/net.c (revision de166e5f7edc93de3189c6848c40b39a4ac11a93)
1 #include "kvm/virtio-pci-dev.h"
2 #include "kvm/virtio-net.h"
3 #include "kvm/virtio.h"
4 #include "kvm/mutex.h"
5 #include "kvm/util.h"
6 #include "kvm/kvm.h"
7 #include "kvm/irq.h"
8 #include "kvm/uip.h"
9 #include "kvm/guest_compat.h"
10 #include "kvm/iovec.h"
11 #include "kvm/strbuf.h"
12 
13 #include <linux/vhost.h>
14 #include <linux/virtio_net.h>
15 #include <linux/if_tun.h>
16 #include <linux/types.h>
17 
18 #include <arpa/inet.h>
19 #include <net/if.h>
20 
21 #include <unistd.h>
22 #include <fcntl.h>
23 
24 #include <sys/socket.h>
25 #include <sys/ioctl.h>
26 #include <sys/types.h>
27 #include <sys/wait.h>
28 #include <sys/eventfd.h>
29 
/*
 * Ring size reported for every virtqueue of this device; also used to size
 * the per-thread iovec arrays.
 */
#define VIRTIO_NET_QUEUE_SIZE		256
/* Upper bound on the number of RX/TX queue pairs a device may expose. */
#define VIRTIO_NET_NUM_QUEUES		8
32 
33 struct net_dev;
34 
/*
 * Backend I/O hooks (TAP or user-mode network stack).
 *
 * rx: receive one packet into the given iovec array.
 * tx: transmit the packet described by the given iovec array.
 *
 * The u16 argument is the number of iovec entries. Callers in this file
 * treat the return value as a byte count, or a negative value on failure.
 */
struct net_dev_operations {
	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
};
39 
/* Per-virtqueue state; one entry per RX, TX and control queue. */
struct net_dev_queue {
	int				id;	/* virtqueue index, set in init_vq() */
	struct net_dev			*ndev;	/* owning device */
	struct virt_queue		vq;	/* the ring itself */
	pthread_t			thread;	/* worker thread created in init_vq() */
	struct mutex			lock;	/* protects the wait on cond */
	pthread_cond_t			cond;	/* signalled when the guest kicks the queue */
	int				gsi;	/* set by notify_vq_gsi() when vhost is in use */
	int				irqfd;	/* eventfd paired with gsi */
};
50 
/* State of one virtio-net device instance. */
struct net_dev {
	struct mutex			mutex;
	struct virtio_device		vdev;
	struct list_head		list;	/* link in the global ndevs list */

	/*
	 * Even indexes are RX queues, odd indexes are TX queues, and index
	 * queue_pairs * 2 is the control queue (see init_vq()/is_ctrl_vq()).
	 */
	struct net_dev_queue		queues[VIRTIO_NET_NUM_QUEUES * 2 + 1];
	struct virtio_net_config	config;	/* returned by get_config() */
	u32				queue_pairs;	/* clamped to 1..VIRTIO_NET_NUM_QUEUES */

	int				vhost_fd;	/* 0 means vhost is not used */
	int				tap_fd;
	char				tap_name[IFNAMSIZ];
	bool				tap_ufo;	/* TAP accepted TUN_F_UFO */

	int				mode;	/* NET_MODE_TAP or user mode */

	struct uip_info			info;	/* user-mode network stack state */
	struct net_dev_operations	*ops;	/* tap_ops or uip_ops */
	struct kvm			*kvm;

	struct virtio_net_params	*params;
};
73 
/* Every device created by virtio_net__init_one(); freed in virtio_net__exit(). */
static LIST_HEAD(ndevs);
/* Handle from virtio_compat_add_message(); -1 until the message is registered. */
static int compat_id = -1;

/* Payload capacity of the RX bounce buffer (the virtio-net header is added on top). */
#define MAX_PACKET_SIZE 65550
78 
79 static bool has_virtio_feature(struct net_dev *ndev, u32 feature)
80 {
81 	return ndev->vdev.features & (1 << feature);
82 }
83 
84 static int virtio_net_hdr_len(struct net_dev *ndev)
85 {
86 	if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ||
87 	    !ndev->vdev.legacy)
88 		return sizeof(struct virtio_net_hdr_mrg_rxbuf);
89 
90 	return sizeof(struct virtio_net_hdr);
91 }
92 
/*
 * Worker thread for one RX virtqueue.
 *
 * @p: the struct net_dev_queue this thread serves.
 *
 * Sleeps until the queue has available buffers, then repeatedly reads one
 * packet from the backend into a local bounce buffer and copies it into as
 * many guest buffers as needed. The thread exits when the backend rx hook
 * returns a negative value.
 */
static void *virtio_net_rx_thread(void *p)
{
	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
	struct net_dev_queue *queue = p;
	struct virt_queue *vq = &queue->vq;
	struct net_dev *ndev = queue->ndev;
	struct kvm *kvm;
	u16 out, in;
	u16 head;
	int len, copied;

	kvm__set_thread_name("virtio-net-rx");

	kvm = ndev->kvm;
	while (1) {
		/* Wait for virtio_net_handle_callback() to signal a guest kick. */
		mutex_lock(&queue->lock);
		if (!virt_queue__available(vq))
			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
		mutex_unlock(&queue->lock);

		while (virt_queue__available(vq)) {
			/* Bounce buffer: the packet is read here first, then split across guest buffers. */
			unsigned char buffer[MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)];
			struct iovec dummy_iov = {
				.iov_base = buffer,
				.iov_len  = sizeof(buffer),
			};
			struct virtio_net_hdr_mrg_rxbuf *hdr;
			u16 num_buffers;

			len = ndev->ops->rx(&dummy_iov, 1, ndev);
			if (len < 0) {
				pr_warning("%s: rx on vq %u failed (%d), exiting thread\n",
						__func__, queue->id, len);
				goto out_err;
			}

			copied = num_buffers = 0;
			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
			/* The header lives at the start of the first guest buffer of the packet. */
			hdr = iov[0].iov_base;
			while (copied < len) {
				/* Copy as much as fits into the current descriptor chain. */
				size_t iovsize = min_t(size_t, len - copied, iov_size(iov, in));

				memcpy_toiovec(iov, buffer + copied, iovsize);
				copied += iovsize;
				/* Record the used element; the used index is advanced once, below. */
				virt_queue__set_used_elem_no_update(vq, head, iovsize, num_buffers++);
				if (copied == len)
					break;
				/* Packet not finished: spin until the guest posts another buffer. */
				while (!virt_queue__available(vq))
					sleep(0);
				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
			}

			/*
			 * The device MUST set num_buffers, except in the case
			 * where the legacy driver did not negotiate
			 * VIRTIO_NET_F_MRG_RXBUF and the field does not exist.
			 */
			if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ||
			    !ndev->vdev.legacy)
				hdr->num_buffers = virtio_host_to_guest_u16(vq, num_buffers);

			/* Publish all buffers consumed by this packet at once. */
			virt_queue__used_idx_advance(vq, num_buffers);

			/* We should interrupt guest right now, otherwise latency is huge. */
			if (virtio_queue__should_signal(vq))
				ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
		}
	}

out_err:
	pthread_exit(NULL);
	return NULL;

}
167 
168 static void *virtio_net_tx_thread(void *p)
169 {
170 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
171 	struct net_dev_queue *queue = p;
172 	struct virt_queue *vq = &queue->vq;
173 	struct net_dev *ndev = queue->ndev;
174 	struct kvm *kvm;
175 	u16 out, in;
176 	u16 head;
177 	int len;
178 
179 	kvm__set_thread_name("virtio-net-tx");
180 
181 	kvm = ndev->kvm;
182 
183 	while (1) {
184 		mutex_lock(&queue->lock);
185 		if (!virt_queue__available(vq))
186 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
187 		mutex_unlock(&queue->lock);
188 
189 		while (virt_queue__available(vq)) {
190 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
191 			len = ndev->ops->tx(iov, out, ndev);
192 			if (len < 0) {
193 				pr_warning("%s: tx on vq %u failed (%d)\n",
194 						__func__, queue->id, errno);
195 				goto out_err;
196 			}
197 
198 			virt_queue__set_used_elem(vq, head, len);
199 		}
200 
201 		if (virtio_queue__should_signal(vq))
202 			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
203 	}
204 
205 out_err:
206 	pthread_exit(NULL);
207 	return NULL;
208 }
209 
210 static virtio_net_ctrl_ack virtio_net_handle_mq(struct kvm* kvm, struct net_dev *ndev, struct virtio_net_ctrl_hdr *ctrl)
211 {
212 	/* Not much to do here */
213 	return VIRTIO_NET_OK;
214 }
215 
216 static void *virtio_net_ctrl_thread(void *p)
217 {
218 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
219 	struct net_dev_queue *queue = p;
220 	struct virt_queue *vq = &queue->vq;
221 	struct net_dev *ndev = queue->ndev;
222 	u16 out, in, head;
223 	struct kvm *kvm = ndev->kvm;
224 	struct virtio_net_ctrl_hdr ctrl;
225 	virtio_net_ctrl_ack ack;
226 	size_t len;
227 
228 	kvm__set_thread_name("virtio-net-ctrl");
229 
230 	while (1) {
231 		mutex_lock(&queue->lock);
232 		if (!virt_queue__available(vq))
233 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
234 		mutex_unlock(&queue->lock);
235 
236 		while (virt_queue__available(vq)) {
237 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
238 			len = min(iov_size(iov, in), sizeof(ctrl));
239 			memcpy_fromiovec((void *)&ctrl, iov, len);
240 
241 			switch (ctrl.class) {
242 			case VIRTIO_NET_CTRL_MQ:
243 				ack = virtio_net_handle_mq(kvm, ndev, &ctrl);
244 				break;
245 			default:
246 				ack = VIRTIO_NET_ERR;
247 				break;
248 			}
249 			memcpy_toiovec(iov + in, &ack, sizeof(ack));
250 			virt_queue__set_used_elem(vq, head, sizeof(ack));
251 		}
252 
253 		if (virtio_queue__should_signal(vq))
254 			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
255 	}
256 
257 	pthread_exit(NULL);
258 
259 	return NULL;
260 }
261 
262 static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue)
263 {
264 	struct net_dev_queue *net_queue = &ndev->queues[queue];
265 
266 	if ((u32)queue >= (ndev->queue_pairs * 2 + 1)) {
267 		pr_warning("Unknown queue index %u", queue);
268 		return;
269 	}
270 
271 	mutex_lock(&net_queue->lock);
272 	pthread_cond_signal(&net_queue->cond);
273 	mutex_unlock(&net_queue->lock);
274 }
275 
276 static int virtio_net_request_tap(struct net_dev *ndev, struct ifreq *ifr,
277 				  const char *tapname)
278 {
279 	int ret;
280 
281 	memset(ifr, 0, sizeof(*ifr));
282 	ifr->ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
283 	if (tapname)
284 		strlcpy(ifr->ifr_name, tapname, sizeof(ifr->ifr_name));
285 
286 	ret = ioctl(ndev->tap_fd, TUNSETIFF, ifr);
287 
288 	if (ret >= 0)
289 		strlcpy(ndev->tap_name, ifr->ifr_name, sizeof(ndev->tap_name));
290 	return ret;
291 }
292 
/*
 * Run @script with @tap_name as its only argument and wait for it.
 *
 * Returns 0 when the script exited with status 0 (or did not exit normally,
 * which is not treated as a failure), and -1 when the child could not be
 * created, could not be waited for, or exited with a non-zero status.
 *
 * A failed fork() is reported instead of falling through to waitpid(-1),
 * which would wait for an unrelated child and read an unset status.
 */
static int virtio_net_exec_script(const char* script, const char *tap_name)
{
	pid_t pid;
	int status;

	pid = fork();
	if (pid < 0) {
		pr_warning("Fail to setup tap by %s", script);
		return -1;
	}

	if (pid == 0) {
		/* Child: only returns from execl() on failure. */
		execl(script, script, tap_name, (char *)NULL);
		_exit(1);
	}

	/* Retry when interrupted by a signal so status is always valid below. */
	while (waitpid(pid, &status, 0) < 0) {
		if (errno == EINTR)
			continue;
		pr_warning("Fail to setup tap by %s", script);
		return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
		pr_warning("Fail to setup tap by %s", script);
		return -1;
	}

	return 0;
}
311 
312 static bool virtio_net__tap_init(struct net_dev *ndev)
313 {
314 	int sock = socket(AF_INET, SOCK_STREAM, 0);
315 	int hdr_len;
316 	struct sockaddr_in sin = {0};
317 	struct ifreq ifr;
318 	const struct virtio_net_params *params = ndev->params;
319 	bool skipconf = !!params->tapif;
320 
321 	hdr_len = virtio_net_hdr_len(ndev);
322 	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
323 		pr_warning("Config tap device TUNSETVNETHDRSZ error");
324 
325 	if (strcmp(params->script, "none")) {
326 		if (virtio_net_exec_script(params->script, ndev->tap_name) < 0)
327 			goto fail;
328 	} else if (!skipconf) {
329 		memset(&ifr, 0, sizeof(ifr));
330 		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
331 		sin.sin_addr.s_addr = inet_addr(params->host_ip);
332 		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
333 		ifr.ifr_addr.sa_family = AF_INET;
334 		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
335 			pr_warning("Could not set ip address on tap device");
336 			goto fail;
337 		}
338 	}
339 
340 	if (!skipconf) {
341 		memset(&ifr, 0, sizeof(ifr));
342 		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
343 		ioctl(sock, SIOCGIFFLAGS, &ifr);
344 		ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
345 		if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
346 			pr_warning("Could not bring tap device up");
347 	}
348 
349 	close(sock);
350 
351 	return 1;
352 
353 fail:
354 	if (sock >= 0)
355 		close(sock);
356 	if (ndev->tap_fd >= 0)
357 		close(ndev->tap_fd);
358 
359 	return 0;
360 }
361 
362 static void virtio_net__tap_exit(struct net_dev *ndev)
363 {
364 	int sock;
365 	struct ifreq ifr;
366 
367 	if (ndev->params->tapif)
368 		return;
369 
370 	sock = socket(AF_INET, SOCK_STREAM, 0);
371 	strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
372 	ioctl(sock, SIOCGIFFLAGS, &ifr);
373 	ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING);
374 	if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0)
375 		pr_warning("Count not bring tap device down");
376 	close(sock);
377 }
378 
379 static bool virtio_net__tap_create(struct net_dev *ndev)
380 {
381 	int offload;
382 	struct ifreq ifr;
383 	const struct virtio_net_params *params = ndev->params;
384 	bool macvtap = (!!params->tapif) && (params->tapif[0] == '/');
385 
386 	/* Did the user already gave us the FD? */
387 	if (params->fd)
388 		ndev->tap_fd = params->fd;
389 	else {
390 		const char *tap_file = "/dev/net/tun";
391 
392 		/* Did the user ask us to use macvtap? */
393 		if (macvtap)
394 			tap_file = params->tapif;
395 
396 		ndev->tap_fd = open(tap_file, O_RDWR);
397 		if (ndev->tap_fd < 0) {
398 			pr_warning("Unable to open %s", tap_file);
399 			return 0;
400 		}
401 	}
402 
403 	if (!macvtap &&
404 	    virtio_net_request_tap(ndev, &ifr, params->tapif) < 0) {
405 		pr_warning("Config tap device error. Are you root?");
406 		goto fail;
407 	}
408 
409 	/*
410 	 * The UFO support had been removed from kernel in commit:
411 	 * ID: fb652fdfe83710da0ca13448a41b7ed027d0a984
412 	 * https://www.spinics.net/lists/netdev/msg443562.html
413 	 * In oder to support the older kernels without this commit,
414 	 * we set the TUN_F_UFO to offload by default to test the status of
415 	 * UFO kernel support.
416 	 */
417 	ndev->tap_ufo = true;
418 	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
419 	if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
420 		/*
421 		 * Is this failure caused by kernel remove the UFO support?
422 		 * Try TUNSETOFFLOAD without TUN_F_UFO.
423 		 */
424 		offload &= ~TUN_F_UFO;
425 		if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
426 			pr_warning("Config tap device TUNSETOFFLOAD error");
427 			goto fail;
428 		}
429 		ndev->tap_ufo = false;
430 	}
431 
432 	return 1;
433 
434 fail:
435 	if ((ndev->tap_fd >= 0) || (!params->fd) )
436 		close(ndev->tap_fd);
437 
438 	return 0;
439 }
440 
441 static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
442 {
443 	return writev(ndev->tap_fd, iov, out);
444 }
445 
446 static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
447 {
448 	return readv(ndev->tap_fd, iov, in);
449 }
450 
451 static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
452 {
453 	return uip_tx(iov, out, &ndev->info);
454 }
455 
456 static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
457 {
458 	return uip_rx(iov, in, &ndev->info);
459 }
460 
/* Backend used when the device runs in NET_MODE_TAP. */
static struct net_dev_operations tap_ops = {
	.rx	= tap_ops_rx,
	.tx	= tap_ops_tx,
};

/* Backend used for the user-mode network stack (any mode other than TAP). */
static struct net_dev_operations uip_ops = {
	.rx	= uip_ops_rx,
	.tx	= uip_ops_tx,
};
470 
471 static u8 *get_config(struct kvm *kvm, void *dev)
472 {
473 	struct net_dev *ndev = dev;
474 
475 	return ((u8 *)(&ndev->config));
476 }
477 
478 static size_t get_config_size(struct kvm *kvm, void *dev)
479 {
480 	struct net_dev *ndev = dev;
481 
482 	return sizeof(ndev->config);
483 }
484 
485 static u32 get_host_features(struct kvm *kvm, void *dev)
486 {
487 	u32 features;
488 	struct net_dev *ndev = dev;
489 
490 	features = 1UL << VIRTIO_NET_F_MAC
491 		| 1UL << VIRTIO_NET_F_CSUM
492 		| 1UL << VIRTIO_NET_F_HOST_TSO4
493 		| 1UL << VIRTIO_NET_F_HOST_TSO6
494 		| 1UL << VIRTIO_NET_F_GUEST_TSO4
495 		| 1UL << VIRTIO_NET_F_GUEST_TSO6
496 		| 1UL << VIRTIO_RING_F_EVENT_IDX
497 		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
498 		| 1UL << VIRTIO_NET_F_CTRL_VQ
499 		| 1UL << VIRTIO_NET_F_MRG_RXBUF
500 		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0)
501 		| 1UL << VIRTIO_F_ANY_LAYOUT;
502 
503 	/*
504 	 * The UFO feature for host and guest only can be enabled when the
505 	 * kernel has TAP UFO support.
506 	 */
507 	if (ndev->tap_ufo)
508 		features |= (1UL << VIRTIO_NET_F_HOST_UFO
509 				| 1UL << VIRTIO_NET_F_GUEST_UFO);
510 
511 	return features;
512 }
513 
514 static int virtio_net__vhost_set_features(struct net_dev *ndev)
515 {
516 	u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX;
517 	u64 vhost_features;
518 
519 	if (ioctl(ndev->vhost_fd, VHOST_GET_FEATURES, &vhost_features) != 0)
520 		die_perror("VHOST_GET_FEATURES failed");
521 
522 	/* make sure both side support mergable rx buffers */
523 	if (vhost_features & 1UL << VIRTIO_NET_F_MRG_RXBUF &&
524 			has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
525 		features |= 1UL << VIRTIO_NET_F_MRG_RXBUF;
526 
527 	return ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features);
528 }
529 
530 static void virtio_net_start(struct net_dev *ndev)
531 {
532 	if (ndev->mode == NET_MODE_TAP) {
533 		if (!virtio_net__tap_init(ndev))
534 			die_perror("TAP device initialized failed because");
535 
536 		if (ndev->vhost_fd &&
537 				virtio_net__vhost_set_features(ndev) != 0)
538 			die_perror("VHOST_SET_FEATURES failed");
539 	} else {
540 		ndev->info.vnet_hdr_len = virtio_net_hdr_len(ndev);
541 		uip_init(&ndev->info);
542 	}
543 }
544 
545 static void virtio_net_stop(struct net_dev *ndev)
546 {
547 	/* Undo whatever start() did */
548 	if (ndev->mode == NET_MODE_TAP)
549 		virtio_net__tap_exit(ndev);
550 	else
551 		uip_exit(&ndev->info);
552 }
553 
554 static void virtio_net_update_endian(struct net_dev *ndev)
555 {
556 	struct virtio_net_config *conf = &ndev->config;
557 
558 	conf->status = virtio_host_to_guest_u16(&ndev->vdev,
559 						VIRTIO_NET_S_LINK_UP);
560 	conf->max_virtqueue_pairs = virtio_host_to_guest_u16(&ndev->vdev,
561 							     ndev->queue_pairs);
562 
563 	/* Let TAP know about vnet header endianness */
564 	if (ndev->mode == NET_MODE_TAP &&
565 	    ndev->vdev.endian != VIRTIO_ENDIAN_HOST) {
566 		int enable_val = 1, disable_val = 0;
567 		int enable_req, disable_req;
568 
569 		if (ndev->vdev.endian == VIRTIO_ENDIAN_LE) {
570 			enable_req = TUNSETVNETLE;
571 			disable_req = TUNSETVNETBE;
572 		} else {
573 			enable_req = TUNSETVNETBE;
574 			disable_req = TUNSETVNETLE;
575 		}
576 
577 		ioctl(ndev->tap_fd, disable_req, &disable_val);
578 		if (ioctl(ndev->tap_fd, enable_req, &enable_val) < 0)
579 			pr_err("Config tap device TUNSETVNETLE/BE error");
580 	}
581 }
582 
583 static void notify_status(struct kvm *kvm, void *dev, u32 status)
584 {
585 	struct net_dev *ndev = dev;
586 
587 	if (status & VIRTIO__STATUS_CONFIG)
588 		virtio_net_update_endian(ndev);
589 
590 	if (status & VIRTIO__STATUS_START)
591 		virtio_net_start(dev);
592 	else if (status & VIRTIO__STATUS_STOP)
593 		virtio_net_stop(dev);
594 }
595 
596 static bool is_ctrl_vq(struct net_dev *ndev, u32 vq)
597 {
598 	return vq == (u32)(ndev->queue_pairs * 2);
599 }
600 
601 static int init_vq(struct kvm *kvm, void *dev, u32 vq)
602 {
603 	struct vhost_vring_state state = { .index = vq };
604 	struct vhost_vring_file file = { .index = vq };
605 	struct net_dev_queue *net_queue;
606 	struct vhost_vring_addr addr;
607 	struct net_dev *ndev = dev;
608 	struct virt_queue *queue;
609 	int r;
610 
611 	compat__remove_message(compat_id);
612 
613 	net_queue	= &ndev->queues[vq];
614 	net_queue->id	= vq;
615 	net_queue->ndev	= ndev;
616 	queue		= &net_queue->vq;
617 	virtio_init_device_vq(kvm, &ndev->vdev, queue, VIRTIO_NET_QUEUE_SIZE);
618 
619 	mutex_init(&net_queue->lock);
620 	pthread_cond_init(&net_queue->cond, NULL);
621 	if (is_ctrl_vq(ndev, vq)) {
622 		pthread_create(&net_queue->thread, NULL, virtio_net_ctrl_thread,
623 			       net_queue);
624 
625 		return 0;
626 	} else if (ndev->vhost_fd == 0 ) {
627 		if (vq & 1)
628 			pthread_create(&net_queue->thread, NULL,
629 				       virtio_net_tx_thread, net_queue);
630 		else
631 			pthread_create(&net_queue->thread, NULL,
632 				       virtio_net_rx_thread, net_queue);
633 
634 		return 0;
635 	}
636 
637 	if (queue->endian != VIRTIO_ENDIAN_HOST)
638 		die_perror("VHOST requires the same endianness in guest and host");
639 
640 	state.num = queue->vring.num;
641 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state);
642 	if (r < 0)
643 		die_perror("VHOST_SET_VRING_NUM failed");
644 	state.num = 0;
645 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state);
646 	if (r < 0)
647 		die_perror("VHOST_SET_VRING_BASE failed");
648 
649 	addr = (struct vhost_vring_addr) {
650 		.index = vq,
651 		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
652 		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
653 		.used_user_addr = (u64)(unsigned long)queue->vring.used,
654 	};
655 
656 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
657 	if (r < 0)
658 		die_perror("VHOST_SET_VRING_ADDR failed");
659 
660 	file.fd = ndev->tap_fd;
661 	r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file);
662 	if (r < 0)
663 		die_perror("VHOST_NET_SET_BACKEND failed");
664 
665 	return 0;
666 }
667 
668 static void exit_vq(struct kvm *kvm, void *dev, u32 vq)
669 {
670 	struct net_dev *ndev = dev;
671 	struct net_dev_queue *queue = &ndev->queues[vq];
672 
673 	if (!is_ctrl_vq(ndev, vq) && queue->gsi) {
674 		irq__del_irqfd(kvm, queue->gsi, queue->irqfd);
675 		close(queue->irqfd);
676 		queue->gsi = queue->irqfd = 0;
677 	}
678 
679 	/*
680 	 * TODO: vhost reset owner. It's the only way to cleanly stop vhost, but
681 	 * we can't restart it at the moment.
682 	 */
683 	if (ndev->vhost_fd && !is_ctrl_vq(ndev, vq)) {
684 		pr_warning("Cannot reset VHOST queue");
685 		ioctl(ndev->vhost_fd, VHOST_RESET_OWNER);
686 		return;
687 	}
688 
689 	/*
690 	 * Threads are waiting on cancellation points (readv or
691 	 * pthread_cond_wait) and should stop gracefully.
692 	 */
693 	pthread_cancel(queue->thread);
694 	pthread_join(queue->thread, NULL);
695 }
696 
697 static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
698 {
699 	struct net_dev *ndev = dev;
700 	struct net_dev_queue *queue = &ndev->queues[vq];
701 	struct vhost_vring_file file;
702 	int r;
703 
704 	if (ndev->vhost_fd == 0)
705 		return;
706 
707 	file = (struct vhost_vring_file) {
708 		.index	= vq,
709 		.fd	= eventfd(0, 0),
710 	};
711 
712 	r = irq__add_irqfd(kvm, gsi, file.fd, -1);
713 	if (r < 0)
714 		die_perror("KVM_IRQFD failed");
715 
716 	queue->irqfd = file.fd;
717 	queue->gsi = gsi;
718 
719 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file);
720 	if (r < 0)
721 		die_perror("VHOST_SET_VRING_CALL failed");
722 }
723 
724 static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
725 {
726 	struct net_dev *ndev = dev;
727 	struct vhost_vring_file file = {
728 		.index	= vq,
729 		.fd	= efd,
730 	};
731 	int r;
732 
733 	if (ndev->vhost_fd == 0 || is_ctrl_vq(ndev, vq))
734 		return;
735 
736 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file);
737 	if (r < 0)
738 		die_perror("VHOST_SET_VRING_KICK failed");
739 }
740 
741 static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
742 {
743 	struct net_dev *ndev = dev;
744 
745 	virtio_net_handle_callback(kvm, ndev, vq);
746 
747 	return 0;
748 }
749 
750 static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
751 {
752 	struct net_dev *ndev = dev;
753 
754 	return &ndev->queues[vq].vq;
755 }
756 
757 static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
758 {
759 	/* FIXME: dynamic */
760 	return VIRTIO_NET_QUEUE_SIZE;
761 }
762 
763 static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
764 {
765 	/* FIXME: dynamic */
766 	return size;
767 }
768 
769 static unsigned int get_vq_count(struct kvm *kvm, void *dev)
770 {
771 	struct net_dev *ndev = dev;
772 
773 	return ndev->queue_pairs * 2 + 1;
774 }
775 
/*
 * Template of the virtio callbacks for a net device. virtio_net__init_one()
 * copies it into a per-device allocation before calling virtio_init().
 */
static struct virtio_ops net_dev_virtio_ops = {
	.get_config		= get_config,
	.get_config_size	= get_config_size,
	.get_host_features	= get_host_features,
	.get_vq_count		= get_vq_count,
	.init_vq		= init_vq,
	.exit_vq		= exit_vq,
	.get_vq			= get_vq,
	.get_size_vq		= get_size_vq,
	.set_size_vq		= set_size_vq,
	.notify_vq		= notify_vq,
	.notify_vq_gsi		= notify_vq_gsi,
	.notify_vq_eventfd	= notify_vq_eventfd,
	.notify_status		= notify_status,
};
791 
792 static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev)
793 {
794 	struct kvm_mem_bank *bank;
795 	struct vhost_memory *mem;
796 	int r, i;
797 
798 	ndev->vhost_fd = open("/dev/vhost-net", O_RDWR);
799 	if (ndev->vhost_fd < 0)
800 		die_perror("Failed openning vhost-net device");
801 
802 	mem = calloc(1, sizeof(*mem) + kvm->mem_slots * sizeof(struct vhost_memory_region));
803 	if (mem == NULL)
804 		die("Failed allocating memory for vhost memory map");
805 
806 	i = 0;
807 	list_for_each_entry(bank, &kvm->mem_banks, list) {
808 		mem->regions[i] = (struct vhost_memory_region) {
809 			.guest_phys_addr = bank->guest_phys_addr,
810 			.memory_size	 = bank->size,
811 			.userspace_addr	 = (unsigned long)bank->host_addr,
812 		};
813 		i++;
814 	}
815 	mem->nregions = i;
816 
817 	r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER);
818 	if (r != 0)
819 		die_perror("VHOST_SET_OWNER failed");
820 
821 	r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
822 	if (r != 0)
823 		die_perror("VHOST_SET_MEM_TABLE failed");
824 
825 	ndev->vdev.use_vhost = true;
826 
827 	free(mem);
828 }
829 
/*
 * Parse a colon-separated hexadecimal MAC address ("aa:bb:cc:dd:ee:ff")
 * from @str into the six bytes at @mac. Bytes that cannot be parsed are
 * left untouched; no error is reported.
 */
static inline void str_to_mac(const char *str, char *mac)
{
	sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
	       &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]);
}
835 static int set_net_param(struct kvm *kvm, struct virtio_net_params *p,
836 			const char *param, const char *val)
837 {
838 	if (strcmp(param, "guest_mac") == 0) {
839 		str_to_mac(val, p->guest_mac);
840 	} else if (strcmp(param, "mode") == 0) {
841 		if (!strncmp(val, "user", 4)) {
842 			int i;
843 
844 			for (i = 0; i < kvm->cfg.num_net_devices; i++)
845 				if (kvm->cfg.net_params[i].mode == NET_MODE_USER)
846 					die("Only one usermode network device allowed at a time");
847 			p->mode = NET_MODE_USER;
848 		} else if (!strncmp(val, "tap", 3)) {
849 			p->mode = NET_MODE_TAP;
850 		} else if (!strncmp(val, "none", 4)) {
851 			kvm->cfg.no_net = 1;
852 			return -1;
853 		} else
854 			die("Unknown network mode %s, please use user, tap or none", kvm->cfg.network);
855 	} else if (strcmp(param, "script") == 0) {
856 		p->script = strdup(val);
857 	} else if (strcmp(param, "downscript") == 0) {
858 		p->downscript = strdup(val);
859 	} else if (strcmp(param, "guest_ip") == 0) {
860 		p->guest_ip = strdup(val);
861 	} else if (strcmp(param, "host_ip") == 0) {
862 		p->host_ip = strdup(val);
863 	} else if (strcmp(param, "trans") == 0) {
864 		p->trans = strdup(val);
865 	} else if (strcmp(param, "tapif") == 0) {
866 		p->tapif = strdup(val);
867 	} else if (strcmp(param, "vhost") == 0) {
868 		p->vhost = atoi(val);
869 	} else if (strcmp(param, "fd") == 0) {
870 		p->fd = atoi(val);
871 	} else if (strcmp(param, "mq") == 0) {
872 		p->mq = atoi(val);
873 	} else
874 		die("Unknown network parameter %s", param);
875 
876 	return 0;
877 }
878 
879 int netdev_parser(const struct option *opt, const char *arg, int unset)
880 {
881 	struct virtio_net_params p;
882 	char *buf = NULL, *cmd = NULL, *cur = NULL;
883 	bool on_cmd = true;
884 	struct kvm *kvm = opt->ptr;
885 
886 	if (arg) {
887 		buf = strdup(arg);
888 		if (buf == NULL)
889 			die("Failed allocating new net buffer");
890 		cur = strtok(buf, ",=");
891 	}
892 
893 	p = (struct virtio_net_params) {
894 		.guest_ip	= DEFAULT_GUEST_ADDR,
895 		.host_ip	= DEFAULT_HOST_ADDR,
896 		.script		= DEFAULT_SCRIPT,
897 		.downscript	= DEFAULT_SCRIPT,
898 		.mode		= NET_MODE_TAP,
899 	};
900 
901 	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
902 	p.guest_mac[5] += kvm->cfg.num_net_devices;
903 
904 	while (cur) {
905 		if (on_cmd) {
906 			cmd = cur;
907 		} else {
908 			if (set_net_param(kvm, &p, cmd, cur) < 0)
909 				goto done;
910 		}
911 		on_cmd = !on_cmd;
912 
913 		cur = strtok(NULL, ",=");
914 	};
915 
916 	kvm->cfg.num_net_devices++;
917 
918 	kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params));
919 	if (kvm->cfg.net_params == NULL)
920 		die("Failed adding new network device");
921 
922 	kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p;
923 
924 done:
925 	free(buf);
926 	return 0;
927 }
928 
929 static int virtio_net__init_one(struct virtio_net_params *params)
930 {
931 	int i, r;
932 	struct net_dev *ndev;
933 	struct virtio_ops *ops;
934 	enum virtio_trans trans = VIRTIO_DEFAULT_TRANS(params->kvm);
935 
936 	ndev = calloc(1, sizeof(struct net_dev));
937 	if (ndev == NULL)
938 		return -ENOMEM;
939 
940 	list_add_tail(&ndev->list, &ndevs);
941 
942 	ops = malloc(sizeof(*ops));
943 	if (ops == NULL)
944 		return -ENOMEM;
945 
946 	ndev->kvm = params->kvm;
947 	ndev->params = params;
948 
949 	mutex_init(&ndev->mutex);
950 	ndev->queue_pairs = max(1, min(VIRTIO_NET_NUM_QUEUES, params->mq));
951 
952 	for (i = 0 ; i < 6 ; i++) {
953 		ndev->config.mac[i]		= params->guest_mac[i];
954 		ndev->info.guest_mac.addr[i]	= params->guest_mac[i];
955 		ndev->info.host_mac.addr[i]	= params->host_mac[i];
956 	}
957 
958 	ndev->mode = params->mode;
959 	if (ndev->mode == NET_MODE_TAP) {
960 		ndev->ops = &tap_ops;
961 		if (!virtio_net__tap_create(ndev))
962 			die_perror("You have requested a TAP device, but creation of one has failed because");
963 	} else {
964 		ndev->info.host_ip		= ntohl(inet_addr(params->host_ip));
965 		ndev->info.guest_ip		= ntohl(inet_addr(params->guest_ip));
966 		ndev->info.guest_netmask	= ntohl(inet_addr("255.255.255.0"));
967 		ndev->info.buf_nr		= 20,
968 		ndev->ops = &uip_ops;
969 		uip_static_init(&ndev->info);
970 	}
971 
972 	*ops = net_dev_virtio_ops;
973 
974 	if (params->trans) {
975 		if (strcmp(params->trans, "mmio") == 0)
976 			trans = VIRTIO_MMIO;
977 		else if (strcmp(params->trans, "pci") == 0)
978 			trans = VIRTIO_PCI;
979 		else
980 			pr_warning("virtio-net: Unknown transport method : %s, "
981 				   "falling back to %s.", params->trans,
982 				   virtio_trans_name(trans));
983 	}
984 
985 	r = virtio_init(params->kvm, ndev, &ndev->vdev, ops, trans,
986 			PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
987 	if (r < 0) {
988 		free(ops);
989 		return r;
990 	}
991 
992 	if (params->vhost)
993 		virtio_net__vhost_init(params->kvm, ndev);
994 
995 	if (compat_id == -1)
996 		compat_id = virtio_compat_add_message("virtio-net", "CONFIG_VIRTIO_NET");
997 
998 	return 0;
999 }
1000 
1001 int virtio_net__init(struct kvm *kvm)
1002 {
1003 	int i, r;
1004 
1005 	for (i = 0; i < kvm->cfg.num_net_devices; i++) {
1006 		kvm->cfg.net_params[i].kvm = kvm;
1007 		r = virtio_net__init_one(&kvm->cfg.net_params[i]);
1008 		if (r < 0)
1009 			goto cleanup;
1010 	}
1011 
1012 	if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) {
1013 		static struct virtio_net_params net_params;
1014 
1015 		net_params = (struct virtio_net_params) {
1016 			.guest_ip	= kvm->cfg.guest_ip,
1017 			.host_ip	= kvm->cfg.host_ip,
1018 			.kvm		= kvm,
1019 			.script		= kvm->cfg.script,
1020 			.mode		= NET_MODE_USER,
1021 		};
1022 		str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac);
1023 		str_to_mac(kvm->cfg.host_mac, net_params.host_mac);
1024 
1025 		r = virtio_net__init_one(&net_params);
1026 		if (r < 0)
1027 			goto cleanup;
1028 	}
1029 
1030 	return 0;
1031 
1032 cleanup:
1033 	virtio_net__exit(kvm);
1034 	return r;
1035 }
1036 virtio_dev_init(virtio_net__init);
1037 
1038 int virtio_net__exit(struct kvm *kvm)
1039 {
1040 	struct virtio_net_params *params;
1041 	struct net_dev *ndev;
1042 	struct list_head *ptr, *n;
1043 
1044 	list_for_each_safe(ptr, n, &ndevs) {
1045 		ndev = list_entry(ptr, struct net_dev, list);
1046 		params = ndev->params;
1047 		/* Cleanup any tap device which attached to bridge */
1048 		if (ndev->mode == NET_MODE_TAP &&
1049 		    strcmp(params->downscript, "none"))
1050 			virtio_net_exec_script(params->downscript, ndev->tap_name);
1051 
1052 		list_del(&ndev->list);
1053 		free(ndev);
1054 	}
1055 	return 0;
1056 }
1057 virtio_dev_exit(virtio_net__exit);
1058