xref: /kvmtool/virtio/net.c (revision 16509081faf92c710c15bbc3ba5c60dc3c442f7b)
1 #include "kvm/virtio-pci-dev.h"
2 #include "kvm/virtio-net.h"
3 #include "kvm/virtio.h"
4 #include "kvm/mutex.h"
5 #include "kvm/util.h"
6 #include "kvm/kvm.h"
7 #include "kvm/irq.h"
8 #include "kvm/uip.h"
9 #include "kvm/guest_compat.h"
10 #include "kvm/iovec.h"
11 
12 #include <linux/vhost.h>
13 #include <linux/virtio_net.h>
14 #include <linux/if_tun.h>
15 #include <linux/types.h>
16 
17 #include <arpa/inet.h>
18 #include <net/if.h>
19 
20 #include <unistd.h>
21 #include <fcntl.h>
22 
23 #include <sys/socket.h>
24 #include <sys/ioctl.h>
25 #include <sys/types.h>
26 #include <sys/wait.h>
27 #include <sys/eventfd.h>
28 
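/*
 * Queue layout: each of the VIRTIO_NET_NUM_QUEUES pairs provides one rx
 * and one tx virtqueue, plus a single control virtqueue at the end
 * (hence the "* 2 + 1" sizing of net_dev::queues below).
 */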
29 #define VIRTIO_NET_QUEUE_SIZE		256
30 #define VIRTIO_NET_NUM_QUEUES		8
31 
32 struct net_dev;
33 
34 struct net_dev_operations {
35 	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
36 	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
37 };
38 
39 struct net_dev_queue {
40 	int				id;
41 	struct net_dev			*ndev;
42 	struct virt_queue		vq;
43 	pthread_t			thread;
44 	struct mutex			lock;
45 	pthread_cond_t			cond;
46 	int				gsi;
47 	int				irqfd;
48 };
49 
50 struct net_dev {
51 	struct mutex			mutex;
52 	struct virtio_device		vdev;
53 	struct list_head		list;
54 
55 	struct net_dev_queue		queues[VIRTIO_NET_NUM_QUEUES * 2 + 1];
56 	struct virtio_net_config	config;
57 	u32				features, queue_pairs;
58 
59 	int				vhost_fd;
60 	int				tap_fd;
61 	char				tap_name[IFNAMSIZ];
62 	bool				tap_ufo;
63 
64 	int				mode;
65 
66 	struct uip_info			info;
67 	struct net_dev_operations	*ops;
68 	struct kvm			*kvm;
69 
70 	struct virtio_net_params	*params;
71 };
72 
73 static LIST_HEAD(ndevs);
74 static int compat_id = -1;
75 
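/* Large enough for a maximal 64KiB IP datagram plus link-layer header. */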
76 #define MAX_PACKET_SIZE 65550
77 
78 static bool has_virtio_feature(struct net_dev *ndev, u32 feature)
79 {
80 	return ndev->features & (1 << feature);
81 }
82 
83 static void virtio_net_fix_tx_hdr(struct virtio_net_hdr *hdr, struct net_dev *ndev)
84 {
85 	hdr->hdr_len		= virtio_guest_to_host_u16(&ndev->vdev, hdr->hdr_len);
86 	hdr->gso_size		= virtio_guest_to_host_u16(&ndev->vdev, hdr->gso_size);
87 	hdr->csum_start		= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_start);
88 	hdr->csum_offset	= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_offset);
89 }
90 
91 static void virtio_net_fix_rx_hdr(struct virtio_net_hdr *hdr, struct net_dev *ndev)
92 {
93 	hdr->hdr_len		= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr_len);
94 	hdr->gso_size		= virtio_host_to_guest_u16(&ndev->vdev, hdr->gso_size);
95 	hdr->csum_start		= virtio_host_to_guest_u16(&ndev->vdev, hdr->csum_start);
96 	hdr->csum_offset	= virtio_host_to_guest_u16(&ndev->vdev, hdr->csum_offset);
97 }
98 
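/*
 * Per-queue rx thread: waits until the guest posts buffers, reads one
 * packet at a time from the backend (tap or uip) into a bounce buffer,
 * then copies it into as many descriptor chains as needed so that
 * VIRTIO_NET_F_MRG_RXBUF guests see a correct num_buffers count.
 */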
99 static void *virtio_net_rx_thread(void *p)
100 {
101 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
102 	struct net_dev_queue *queue = p;
103 	struct virt_queue *vq = &queue->vq;
104 	struct net_dev *ndev = queue->ndev;
105 	struct kvm *kvm;
106 	u16 out, in;
107 	u16 head;
108 	int len, copied;
109 
110 	kvm__set_thread_name("virtio-net-rx");
111 
112 	kvm = ndev->kvm;
113 	while (1) {
114 		mutex_lock(&queue->lock);
115 		if (!virt_queue__available(vq))
116 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
117 		mutex_unlock(&queue->lock);
118 
119 		while (virt_queue__available(vq)) {
120 			unsigned char buffer[MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)];
121 			struct iovec dummy_iov = {
122 				.iov_base = buffer,
123 				.iov_len  = sizeof(buffer),
124 			};
125 			struct virtio_net_hdr_mrg_rxbuf *hdr;
126 			u16 num_buffers;
127 
128 			len = ndev->ops->rx(&dummy_iov, 1, ndev);
129 			if (len < 0) {
130 				pr_warning("%s: rx on vq %u failed (%d), exiting thread",
131 						__func__, queue->id, len);
132 				goto out_err;
133 			}
134 
135 			copied = num_buffers = 0;
136 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
137 			hdr = iov[0].iov_base;
138 			while (copied < len) {
139 				size_t iovsize = min_t(size_t, len - copied, iov_size(iov, in));
140 
141 				memcpy_toiovec(iov, buffer + copied, iovsize);
142 				copied += iovsize;
143 				virt_queue__set_used_elem_no_update(vq, head, iovsize, num_buffers++);
144 				if (copied == len)
145 					break;
146 				while (!virt_queue__available(vq))
147 					sleep(0);
148 				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
149 			}
150 
151 			virtio_net_fix_rx_hdr(&hdr->hdr, ndev);
152 			if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
153 				hdr->num_buffers = virtio_host_to_guest_u16(vq, num_buffers);
154 
155 			virt_queue__used_idx_advance(vq, num_buffers);
156 
157 			/* Signal the guest immediately, otherwise rx latency suffers. */
158 			if (virtio_queue__should_signal(vq))
159 				ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
160 		}
161 	}
162 
163 out_err:
164 	pthread_exit(NULL);
165 	return NULL;
166 
167 }
168 
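/*
 * Per-queue tx thread: pops descriptor chains the guest has made
 * available, fixes up the virtio-net header endianness and hands the
 * iovec straight to the backend's tx hook (writev() for tap mode).
 */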
169 static void *virtio_net_tx_thread(void *p)
170 {
171 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
172 	struct net_dev_queue *queue = p;
173 	struct virt_queue *vq = &queue->vq;
174 	struct net_dev *ndev = queue->ndev;
175 	struct kvm *kvm;
176 	u16 out, in;
177 	u16 head;
178 	int len;
179 
180 	kvm__set_thread_name("virtio-net-tx");
181 
182 	kvm = ndev->kvm;
183 
184 	while (1) {
185 		mutex_lock(&queue->lock);
186 		if (!virt_queue__available(vq))
187 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
188 		mutex_unlock(&queue->lock);
189 
190 		while (virt_queue__available(vq)) {
191 			struct virtio_net_hdr *hdr;
192 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
193 			hdr = iov[0].iov_base;
194 			virtio_net_fix_tx_hdr(hdr, ndev);
195 			len = ndev->ops->tx(iov, out, ndev);
196 			if (len < 0) {
197 				pr_warning("%s: tx on vq %u failed (%d), exiting thread",
198 						__func__, queue->id, errno);
199 				goto out_err;
200 			}
201 
202 			virt_queue__set_used_elem(vq, head, len);
203 		}
204 
205 		if (virtio_queue__should_signal(vq))
206 			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
207 	}
208 
209 out_err:
210 	pthread_exit(NULL);
211 	return NULL;
212 }
213 
214 static virtio_net_ctrl_ack virtio_net_handle_mq(struct kvm *kvm, struct net_dev *ndev, struct virtio_net_ctrl_hdr *ctrl)
215 {
216 	/* Not much to do here */
217 	return VIRTIO_NET_OK;
218 }
219 
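/*
 * Control queue thread: the first iovec entry holds the request header,
 * the entry at iov[out] receives the ack status byte.
 */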
220 static void *virtio_net_ctrl_thread(void *p)
221 {
222 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
223 	struct net_dev_queue *queue = p;
224 	struct virt_queue *vq = &queue->vq;
225 	struct net_dev *ndev = queue->ndev;
226 	u16 out, in, head;
227 	struct kvm *kvm = ndev->kvm;
228 	struct virtio_net_ctrl_hdr *ctrl;
229 	virtio_net_ctrl_ack *ack;
230 
231 	kvm__set_thread_name("virtio-net-ctrl");
232 
233 	while (1) {
234 		mutex_lock(&queue->lock);
235 		if (!virt_queue__available(vq))
236 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
237 		mutex_unlock(&queue->lock);
238 
239 		while (virt_queue__available(vq)) {
240 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
241 			ctrl = iov[0].iov_base;
242 			ack = iov[out].iov_base;
243 
244 			switch (ctrl->class) {
245 			case VIRTIO_NET_CTRL_MQ:
246 				*ack = virtio_net_handle_mq(kvm, ndev, ctrl);
247 				break;
248 			default:
249 				*ack = VIRTIO_NET_ERR;
250 				break;
251 			}
252 			virt_queue__set_used_elem(vq, head, iov[out].iov_len);
253 		}
254 
255 		if (virtio_queue__should_signal(vq))
256 			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
257 	}
258 
259 	pthread_exit(NULL);
260 
261 	return NULL;
262 }
263 
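/* Wake up the processing thread attached to the given virtqueue. */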
264 static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue)
265 {
266 	struct net_dev_queue *net_queue;
267 
268 	if ((u32)queue >= (ndev->queue_pairs * 2 + 1)) {
269 		pr_warning("Unknown queue index %u", queue);
270 		return;
271 	}
272 	net_queue = &ndev->queues[queue];
273 	mutex_lock(&net_queue->lock);
274 	pthread_cond_signal(&net_queue->cond);
275 	mutex_unlock(&net_queue->lock);
276 }
277 
278 static int virtio_net_request_tap(struct net_dev *ndev, struct ifreq *ifr,
279 				  const char *tapname)
280 {
281 	int ret;
282 
283 	memset(ifr, 0, sizeof(*ifr));
284 	ifr->ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
285 	if (tapname)
286 		strncpy(ifr->ifr_name, tapname, sizeof(ifr->ifr_name));
287 
288 	ret = ioctl(ndev->tap_fd, TUNSETIFF, ifr);
289 
290 	if (ret >= 0)
291 		strncpy(ndev->tap_name, ifr->ifr_name, sizeof(ndev->tap_name));
292 	return ret;
293 }
294 
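/*
 * Run the user-supplied script with the tap interface name as its only
 * argument, and fail if the script exits with a non-zero status.
 */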
295 static int virtio_net_exec_script(const char* script, const char *tap_name)
296 {
297 	pid_t pid;
298 	int status;
299 
300 	pid = fork();
301 	if (pid == 0) {
302 		execl(script, script, tap_name, NULL);
303 		_exit(1);
304 	} else {
305 		waitpid(pid, &status, 0);
306 		if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
307 			pr_warning("Failed to set up tap device with script %s", script);
308 			return -1;
309 		}
310 	}
311 	return 0;
312 }
313 
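/*
 * Configure the tap device for a freshly started guest: set the vnet
 * header size, then either run the setup script or, for an interface we
 * created ourselves, assign the host IP and bring the link up.
 */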
314 static bool virtio_net__tap_init(struct net_dev *ndev)
315 {
316 	int sock = socket(AF_INET, SOCK_STREAM, 0);
317 	int hdr_len;
318 	struct sockaddr_in sin = {0};
319 	struct ifreq ifr;
320 	const struct virtio_net_params *params = ndev->params;
321 	bool skipconf = !!params->tapif;
322 
323 	hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
324 			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
325 			sizeof(struct virtio_net_hdr);
326 	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
327 		pr_warning("Failed to set tap vnet header size (TUNSETVNETHDRSZ)");
328 
329 	if (strcmp(params->script, "none")) {
330 		if (virtio_net_exec_script(params->script, ndev->tap_name) < 0)
331 			goto fail;
332 	} else if (!skipconf) {
333 		memset(&ifr, 0, sizeof(ifr));
334 		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
335 		sin.sin_addr.s_addr = inet_addr(params->host_ip);
336 		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
337 		ifr.ifr_addr.sa_family = AF_INET;
338 		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
339 			pr_warning("Could not set ip address on tap device");
340 			goto fail;
341 		}
342 	}
343 
344 	if (!skipconf) {
345 		memset(&ifr, 0, sizeof(ifr));
346 		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
347 		ioctl(sock, SIOCGIFFLAGS, &ifr);
348 		ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
349 		if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
350 			pr_warning("Could not bring tap device up");
351 	}
352 
353 	close(sock);
354 
355 	return 1;
356 
357 fail:
358 	if (sock >= 0)
359 		close(sock);
360 	if (ndev->tap_fd >= 0)
361 		close(ndev->tap_fd);
362 
363 	return 0;
364 }
365 
366 static void virtio_net__tap_exit(struct net_dev *ndev)
367 {
368 	int sock;
369 	struct ifreq ifr;
370 
371 	if (ndev->params->tapif)
372 		return;
373 
374 	sock = socket(AF_INET, SOCK_STREAM, 0);
375 	strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
376 	ioctl(sock, SIOCGIFFLAGS, &ifr);
377 	ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING);
378 	if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)	/* set, not get, the cleared flags */
379 		pr_warning("Could not bring tap device down");
380 	close(sock);
381 }
382 
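/*
 * Open (or adopt) the tap fd and negotiate offloads. Three sources are
 * supported: a caller-provided fd, a macvtap device node (a tapif value
 * starting with '/'), or a freshly created interface on /dev/net/tun.
 */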
383 static bool virtio_net__tap_create(struct net_dev *ndev)
384 {
385 	int offload;
386 	struct ifreq ifr;
387 	const struct virtio_net_params *params = ndev->params;
388 	bool macvtap = (!!params->tapif) && (params->tapif[0] == '/');
389 
390 	/* Did the user already give us an FD? */
391 	if (params->fd)
392 		ndev->tap_fd = params->fd;
393 	else {
394 		const char *tap_file = "/dev/net/tun";
395 
396 		/* Did the user ask us to use macvtap? */
397 		if (macvtap)
398 			tap_file = params->tapif;
399 
400 		ndev->tap_fd = open(tap_file, O_RDWR);
401 		if (ndev->tap_fd < 0) {
402 			pr_warning("Unable to open %s", tap_file);
403 			return 0;
404 		}
405 	}
406 
407 	if (!macvtap &&
408 	    virtio_net_request_tap(ndev, &ifr, params->tapif) < 0) {
409 		pr_warning("Failed to configure tap device (TUNSETIFF). Are you root?");
410 		goto fail;
411 	}
412 
413 	/*
414 	 * UFO support was removed from the kernel in commit
415 	 * fb652fdfe83710da0ca13448a41b7ed027d0a984:
416 	 * https://www.spinics.net/lists/netdev/msg443562.html
417 	 * In order to support older kernels without this commit, we
418 	 * request TUN_F_UFO by default and use the result of the
419 	 * TUNSETOFFLOAD ioctl to probe whether the kernel supports UFO.
420 	 */
421 	ndev->tap_ufo = true;
422 	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
423 	if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
424 		/*
425 		 * Was the failure caused by the kernel lacking UFO support?
426 		 * Retry TUNSETOFFLOAD without TUN_F_UFO.
427 		 */
428 		offload &= ~TUN_F_UFO;
429 		if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
430 			pr_warning("Failed to set tap offload flags (TUNSETOFFLOAD)");
431 			goto fail;
432 		}
433 		ndev->tap_ufo = false;
434 	}
435 
436 	return 1;
437 
438 fail:
439 	if (ndev->tap_fd >= 0 && !params->fd)	/* don't close a caller-provided fd */
440 		close(ndev->tap_fd);
441 
442 	return 0;
443 }
444 
445 static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
446 {
447 	return writev(ndev->tap_fd, iov, out);
448 }
449 
450 static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
451 {
452 	return readv(ndev->tap_fd, iov, in);
453 }
454 
455 static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
456 {
457 	return uip_tx(iov, out, &ndev->info);
458 }
459 
460 static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
461 {
462 	return uip_rx(iov, in, &ndev->info);
463 }
464 
465 static struct net_dev_operations tap_ops = {
466 	.rx	= tap_ops_rx,
467 	.tx	= tap_ops_tx,
468 };
469 
470 static struct net_dev_operations uip_ops = {
471 	.rx	= uip_ops_rx,
472 	.tx	= uip_ops_tx,
473 };
474 
475 static u8 *get_config(struct kvm *kvm, void *dev)
476 {
477 	struct net_dev *ndev = dev;
478 
479 	return ((u8 *)(&ndev->config));
480 }
481 
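/*
 * Features we can offer the guest. VIRTIO_NET_F_MQ is only advertised
 * when more than one queue pair was configured, and the UFO bits depend
 * on the probe done in virtio_net__tap_create().
 */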
482 static u32 get_host_features(struct kvm *kvm, void *dev)
483 {
484 	u32 features;
485 	struct net_dev *ndev = dev;
486 
487 	features = 1UL << VIRTIO_NET_F_MAC
488 		| 1UL << VIRTIO_NET_F_CSUM
489 		| 1UL << VIRTIO_NET_F_HOST_TSO4
490 		| 1UL << VIRTIO_NET_F_HOST_TSO6
491 		| 1UL << VIRTIO_NET_F_GUEST_TSO4
492 		| 1UL << VIRTIO_NET_F_GUEST_TSO6
493 		| 1UL << VIRTIO_RING_F_EVENT_IDX
494 		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
495 		| 1UL << VIRTIO_NET_F_CTRL_VQ
496 		| 1UL << VIRTIO_NET_F_MRG_RXBUF
497 		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
498 
499 	/*
500 	 * The host and guest UFO features can only be enabled when the
501 	 * kernel's TAP device supports UFO.
502 	 */
503 	if (ndev->tap_ufo)
504 		features |= (1UL << VIRTIO_NET_F_HOST_UFO
505 				| 1UL << VIRTIO_NET_F_GUEST_UFO);
506 
507 	return features;
508 }
509 
510 static int virtio_net__vhost_set_features(struct net_dev *ndev)
511 {
512 	u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX;
513 	u64 vhost_features;
514 
515 	if (ioctl(ndev->vhost_fd, VHOST_GET_FEATURES, &vhost_features) != 0)
516 		die_perror("VHOST_GET_FEATURES failed");
517 
518 	/* make sure both sides support mergeable rx buffers */
519 	if (vhost_features & 1UL << VIRTIO_NET_F_MRG_RXBUF &&
520 			has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
521 		features |= 1UL << VIRTIO_NET_F_MRG_RXBUF;
522 
523 	return ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features);
524 }
525 
526 static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
527 {
528 	struct net_dev *ndev = dev;
529 	struct virtio_net_config *conf = &ndev->config;
530 
531 	ndev->features = features;
532 
533 	conf->status = virtio_host_to_guest_u16(&ndev->vdev, conf->status);
534 	conf->max_virtqueue_pairs = virtio_host_to_guest_u16(&ndev->vdev,
535 							     conf->max_virtqueue_pairs);
536 }
537 
538 static void virtio_net_start(struct net_dev *ndev)
539 {
540 	if (ndev->mode == NET_MODE_TAP) {
541 		if (!virtio_net__tap_init(ndev))
542 			die_perror("TAP device initialization failed");
543 
544 		if (ndev->vhost_fd &&
545 				virtio_net__vhost_set_features(ndev) != 0)
546 			die_perror("VHOST_SET_FEATURES failed");
547 	} else {
548 		ndev->info.vnet_hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
549 						sizeof(struct virtio_net_hdr_mrg_rxbuf) :
550 						sizeof(struct virtio_net_hdr);
551 		uip_init(&ndev->info);
552 	}
553 }
554 
555 static void virtio_net_stop(struct net_dev *ndev)
556 {
557 	/* Undo whatever start() did */
558 	if (ndev->mode == NET_MODE_TAP)
559 		virtio_net__tap_exit(ndev);
560 	else
561 		uip_exit(&ndev->info);
562 }
563 
564 static void notify_status(struct kvm *kvm, void *dev, u32 status)
565 {
566 	if (status & VIRTIO__STATUS_START)
567 		virtio_net_start(dev);
568 	else if (status & VIRTIO__STATUS_STOP)
569 		virtio_net_stop(dev);
570 }
571 
572 static bool is_ctrl_vq(struct net_dev *ndev, u32 vq)
573 {
574 	return vq == (u32)(ndev->queue_pairs * 2);
575 }
576 
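/*
 * Even-numbered queues are rx, odd-numbered queues are tx, and the last
 * queue (queue_pairs * 2) is the control queue. Without vhost each data
 * queue gets its own servicing thread; with vhost the ring addresses are
 * handed to the kernel instead.
 */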
577 static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
578 		   u32 pfn)
579 {
580 	struct vhost_vring_state state = { .index = vq };
581 	struct net_dev_queue *net_queue;
582 	struct vhost_vring_addr addr;
583 	struct net_dev *ndev = dev;
584 	struct virt_queue *queue;
585 	void *p;
586 	int r;
587 
588 	compat__remove_message(compat_id);
589 
590 	net_queue	= &ndev->queues[vq];
591 	net_queue->id	= vq;
592 	net_queue->ndev	= ndev;
593 	queue		= &net_queue->vq;
594 	queue->pfn	= pfn;
595 	p		= virtio_get_vq(kvm, queue->pfn, page_size);
596 
597 	vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, align);
598 	virtio_init_device_vq(&ndev->vdev, queue);
599 
600 	mutex_init(&net_queue->lock);
601 	pthread_cond_init(&net_queue->cond, NULL);
602 	if (is_ctrl_vq(ndev, vq)) {
603 		pthread_create(&net_queue->thread, NULL, virtio_net_ctrl_thread,
604 			       net_queue);
605 
606 		return 0;
607 	} else if (ndev->vhost_fd == 0) {
608 		if (vq & 1)
609 			pthread_create(&net_queue->thread, NULL,
610 				       virtio_net_tx_thread, net_queue);
611 		else
612 			pthread_create(&net_queue->thread, NULL,
613 				       virtio_net_rx_thread, net_queue);
614 
615 		return 0;
616 	}
617 
618 	if (queue->endian != VIRTIO_ENDIAN_HOST)
619 		die_perror("VHOST requires the same endianness in guest and host");
620 
621 	state.num = queue->vring.num;
622 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state);
623 	if (r < 0)
624 		die_perror("VHOST_SET_VRING_NUM failed");
625 	state.num = 0;
626 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state);
627 	if (r < 0)
628 		die_perror("VHOST_SET_VRING_BASE failed");
629 
630 	addr = (struct vhost_vring_addr) {
631 		.index = vq,
632 		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
633 		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
634 		.used_user_addr = (u64)(unsigned long)queue->vring.used,
635 	};
636 
637 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
638 	if (r < 0)
639 		die_perror("VHOST_SET_VRING_ADDR failed");
640 
641 	return 0;
642 }
643 
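/*
 * Tear down one queue on device reset: drop any vhost irqfd, then
 * cancel and join the servicing thread. vhost itself can only be
 * stopped wholesale via VHOST_RESET_OWNER (see the TODO below).
 */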
644 static void exit_vq(struct kvm *kvm, void *dev, u32 vq)
645 {
646 	struct net_dev *ndev = dev;
647 	struct net_dev_queue *queue = &ndev->queues[vq];
648 
649 	if (!is_ctrl_vq(ndev, vq) && queue->gsi) {
650 		irq__del_irqfd(kvm, queue->gsi, queue->irqfd);
651 		close(queue->irqfd);
652 		queue->gsi = queue->irqfd = 0;
653 	}
654 
655 	/*
656 	 * TODO: vhost reset owner. It's the only way to cleanly stop vhost, but
657 	 * we can't restart it at the moment.
658 	 */
659 	if (ndev->vhost_fd && !is_ctrl_vq(ndev, vq)) {
660 		pr_warning("Cannot reset VHOST queue");
661 		ioctl(ndev->vhost_fd, VHOST_RESET_OWNER);
662 		return;
663 	}
664 
665 	/*
666 	 * Threads are waiting on cancellation points (readv or
667 	 * pthread_cond_wait) and should stop gracefully.
668 	 */
669 	pthread_cancel(queue->thread);
670 	pthread_join(queue->thread, NULL);
671 }
672 
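/*
 * vhost only: route the queue's interrupt through an eventfd/irqfd pair
 * so the kernel can signal the guest directly, and attach the tap fd as
 * the vhost-net backend.
 */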
673 static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
674 {
675 	struct net_dev *ndev = dev;
676 	struct net_dev_queue *queue = &ndev->queues[vq];
677 	struct vhost_vring_file file;
678 	int r;
679 
680 	if (ndev->vhost_fd == 0)
681 		return;
682 
683 	file = (struct vhost_vring_file) {
684 		.index	= vq,
685 		.fd	= eventfd(0, 0),
686 	};
687 
688 	r = irq__add_irqfd(kvm, gsi, file.fd, -1);
689 	if (r < 0)
690 		die_perror("KVM_IRQFD failed");
691 
692 	queue->irqfd = file.fd;
693 	queue->gsi = gsi;
694 
695 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file);
696 	if (r < 0)
697 		die_perror("VHOST_SET_VRING_CALL failed");
698 	file.fd = ndev->tap_fd;
699 	r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file);
700 	if (r != 0)
701 		die("VHOST_NET_SET_BACKEND failed %d", errno);
702 
703 }
704 
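/*
 * vhost only: hand the queue's notification eventfd to vhost so guest
 * kicks are consumed in the kernel rather than in this process.
 */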
705 static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
706 {
707 	struct net_dev *ndev = dev;
708 	struct vhost_vring_file file = {
709 		.index	= vq,
710 		.fd	= efd,
711 	};
712 	int r;
713 
714 	if (ndev->vhost_fd == 0 || is_ctrl_vq(ndev, vq))
715 		return;
716 
717 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file);
718 	if (r < 0)
719 		die_perror("VHOST_SET_VRING_KICK failed");
720 }
721 
722 static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
723 {
724 	struct net_dev *ndev = dev;
725 
726 	virtio_net_handle_callback(kvm, ndev, vq);
727 
728 	return 0;
729 }
730 
731 static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
732 {
733 	struct net_dev *ndev = dev;
734 
735 	return &ndev->queues[vq].vq;
736 }
737 
738 static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
739 {
740 	/* FIXME: dynamic */
741 	return VIRTIO_NET_QUEUE_SIZE;
742 }
743 
744 static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
745 {
746 	/* FIXME: dynamic */
747 	return size;
748 }
749 
750 static int get_vq_count(struct kvm *kvm, void *dev)
751 {
752 	struct net_dev *ndev = dev;
753 
754 	return ndev->queue_pairs * 2 + 1;
755 }
756 
757 static struct virtio_ops net_dev_virtio_ops = {
758 	.get_config		= get_config,
759 	.get_host_features	= get_host_features,
760 	.set_guest_features	= set_guest_features,
761 	.get_vq_count		= get_vq_count,
762 	.init_vq		= init_vq,
763 	.exit_vq		= exit_vq,
764 	.get_vq			= get_vq,
765 	.get_size_vq		= get_size_vq,
766 	.set_size_vq		= set_size_vq,
767 	.notify_vq		= notify_vq,
768 	.notify_vq_gsi		= notify_vq_gsi,
769 	.notify_vq_eventfd	= notify_vq_eventfd,
770 	.notify_status		= notify_status,
771 };
772 
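/*
 * Open /dev/vhost-net, become its owner and mirror the guest memory
 * banks into a vhost_memory table so the kernel can translate guest
 * physical addresses.
 */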
773 static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev)
774 {
775 	struct kvm_mem_bank *bank;
776 	struct vhost_memory *mem;
777 	int r, i;
778 
779 	ndev->vhost_fd = open("/dev/vhost-net", O_RDWR);
780 	if (ndev->vhost_fd < 0)
781 		die_perror("Failed opening vhost-net device");
782 
783 	mem = calloc(1, sizeof(*mem) + kvm->mem_slots * sizeof(struct vhost_memory_region));
784 	if (mem == NULL)
785 		die("Failed allocating memory for vhost memory map");
786 
787 	i = 0;
788 	list_for_each_entry(bank, &kvm->mem_banks, list) {
789 		mem->regions[i] = (struct vhost_memory_region) {
790 			.guest_phys_addr = bank->guest_phys_addr,
791 			.memory_size	 = bank->size,
792 			.userspace_addr	 = (unsigned long)bank->host_addr,
793 		};
794 		i++;
795 	}
796 	mem->nregions = i;
797 
798 	r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER);
799 	if (r != 0)
800 		die_perror("VHOST_SET_OWNER failed");
801 
802 	r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
803 	if (r != 0)
804 		die_perror("VHOST_SET_MEM_TABLE failed");
805 
806 	ndev->vdev.use_vhost = true;
807 
808 	free(mem);
809 }
810 
811 static inline void str_to_mac(const char *str, char *mac)
812 {
813 	sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
814 		mac, mac+1, mac+2, mac+3, mac+4, mac+5);
815 }
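
/* Apply a single key=val pair from a --network option to *p. */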
816 static int set_net_param(struct kvm *kvm, struct virtio_net_params *p,
817 			const char *param, const char *val)
818 {
819 	if (strcmp(param, "guest_mac") == 0) {
820 		str_to_mac(val, p->guest_mac);
821 	} else if (strcmp(param, "mode") == 0) {
822 		if (!strncmp(val, "user", 4)) {
823 			int i;
824 
825 			for (i = 0; i < kvm->cfg.num_net_devices; i++)
826 				if (kvm->cfg.net_params[i].mode == NET_MODE_USER)
827 					die("Only one usermode network device allowed at a time");
828 			p->mode = NET_MODE_USER;
829 		} else if (!strncmp(val, "tap", 3)) {
830 			p->mode = NET_MODE_TAP;
831 		} else if (!strncmp(val, "none", 4)) {
832 			kvm->cfg.no_net = 1;
833 			return -1;
834 		} else
835 			die("Unknown network mode %s, please use user, tap or none", val);
836 	} else if (strcmp(param, "script") == 0) {
837 		p->script = strdup(val);
838 	} else if (strcmp(param, "downscript") == 0) {
839 		p->downscript = strdup(val);
840 	} else if (strcmp(param, "guest_ip") == 0) {
841 		p->guest_ip = strdup(val);
842 	} else if (strcmp(param, "host_ip") == 0) {
843 		p->host_ip = strdup(val);
844 	} else if (strcmp(param, "trans") == 0) {
845 		p->trans = strdup(val);
846 	} else if (strcmp(param, "tapif") == 0) {
847 		p->tapif = strdup(val);
848 	} else if (strcmp(param, "vhost") == 0) {
849 		p->vhost = atoi(val);
850 	} else if (strcmp(param, "fd") == 0) {
851 		p->fd = atoi(val);
852 	} else if (strcmp(param, "mq") == 0) {
853 		p->mq = atoi(val);
854 	} else
855 		die("Unknown network parameter %s", param);
856 
857 	return 0;
858 }
859 
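/*
 * Parse one --network/-n option string (comma-separated key=val pairs)
 * into a new virtio_net_params entry, starting from tap-mode defaults.
 */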
860 int netdev_parser(const struct option *opt, const char *arg, int unset)
861 {
862 	struct virtio_net_params p;
863 	char *buf = NULL, *cmd = NULL, *cur = NULL;
864 	bool on_cmd = true;
865 	struct kvm *kvm = opt->ptr;
866 
867 	if (arg) {
868 		buf = strdup(arg);
869 		if (buf == NULL)
870 			die("Failed allocating new net buffer");
871 		cur = strtok(buf, ",=");
872 	}
873 
874 	p = (struct virtio_net_params) {
875 		.guest_ip	= DEFAULT_GUEST_ADDR,
876 		.host_ip	= DEFAULT_HOST_ADDR,
877 		.script		= DEFAULT_SCRIPT,
878 		.downscript	= DEFAULT_SCRIPT,
879 		.mode		= NET_MODE_TAP,
880 	};
881 
882 	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
883 	p.guest_mac[5] += kvm->cfg.num_net_devices;
884 
885 	while (cur) {
886 		if (on_cmd) {
887 			cmd = cur;
888 		} else {
889 			if (set_net_param(kvm, &p, cmd, cur) < 0)
890 				goto done;
891 		}
892 		on_cmd = !on_cmd;
893 
894 		cur = strtok(NULL, ",=");
895 	}
896 
897 	kvm->cfg.num_net_devices++;
898 
899 	kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params));
900 	if (kvm->cfg.net_params == NULL)
901 		die("Failed adding new network device");
902 
903 	kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p;
904 
905 done:
906 	free(buf);
907 	return 0;
908 }
909 
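/*
 * Instantiate a single virtio-net device: set up the backend (tap or
 * the uip user-mode stack), choose the transport (the architecture
 * default unless trans=pci/mmio was given) and register the device
 * with the virtio core, optionally enabling vhost-net.
 */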
910 static int virtio_net__init_one(struct virtio_net_params *params)
911 {
912 	int i, err;
913 	struct net_dev *ndev;
914 	struct virtio_ops *ops;
915 	enum virtio_trans trans = VIRTIO_DEFAULT_TRANS(params->kvm);
916 
917 	ndev = calloc(1, sizeof(struct net_dev));
918 	if (ndev == NULL)
919 		return -ENOMEM;
920 
921 	ops = malloc(sizeof(*ops));
922 	if (ops == NULL) {
923 		err = -ENOMEM;
924 		goto err_free_ndev;
925 	}
926 
927 	list_add_tail(&ndev->list, &ndevs);
928 
929 	ndev->kvm = params->kvm;
930 	ndev->params = params;
931 
932 	mutex_init(&ndev->mutex);
933 	ndev->queue_pairs = max(1, min(VIRTIO_NET_NUM_QUEUES, params->mq));
934 	ndev->config.status = VIRTIO_NET_S_LINK_UP;
935 	if (ndev->queue_pairs > 1)
936 		ndev->config.max_virtqueue_pairs = ndev->queue_pairs;
937 
938 	for (i = 0 ; i < 6 ; i++) {
939 		ndev->config.mac[i]		= params->guest_mac[i];
940 		ndev->info.guest_mac.addr[i]	= params->guest_mac[i];
941 		ndev->info.host_mac.addr[i]	= params->host_mac[i];
942 	}
943 
944 	ndev->mode = params->mode;
945 	if (ndev->mode == NET_MODE_TAP) {
946 		ndev->ops = &tap_ops;
947 		if (!virtio_net__tap_create(ndev))
948 			die_perror("Creation of the requested TAP device failed");
949 	} else {
950 		ndev->info.host_ip		= ntohl(inet_addr(params->host_ip));
951 		ndev->info.guest_ip		= ntohl(inet_addr(params->guest_ip));
952 		ndev->info.guest_netmask	= ntohl(inet_addr("255.255.255.0"));
953 		ndev->info.buf_nr		= 20;
954 		ndev->ops = &uip_ops;
955 		uip_static_init(&ndev->info);
956 	}
957 
958 	*ops = net_dev_virtio_ops;
959 
960 	if (params->trans) {
961 		if (strcmp(params->trans, "mmio") == 0)
962 			trans = VIRTIO_MMIO;
963 		else if (strcmp(params->trans, "pci") == 0)
964 			trans = VIRTIO_PCI;
965 		else
966 			pr_warning("virtio-net: Unknown transport method : %s, "
967 				   "falling back to %s.", params->trans,
968 				   virtio_trans_name(trans));
969 	}
970 
971 	virtio_init(params->kvm, ndev, &ndev->vdev, ops, trans,
972 		    PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
973 
974 	if (params->vhost)
975 		virtio_net__vhost_init(params->kvm, ndev);
976 
977 	if (compat_id == -1)
978 		compat_id = virtio_compat_add_message("virtio-net", "CONFIG_VIRTIO_NET");
979 
980 	return 0;
981 
982 err_free_ndev:
983 	free(ndev);
984 	return err;
985 }
986 
987 int virtio_net__init(struct kvm *kvm)
988 {
989 	int i;
990 
991 	for (i = 0; i < kvm->cfg.num_net_devices; i++) {
992 		kvm->cfg.net_params[i].kvm = kvm;
993 		virtio_net__init_one(&kvm->cfg.net_params[i]);
994 	}
995 
996 	if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) {
997 		static struct virtio_net_params net_params;
998 
999 		net_params = (struct virtio_net_params) {
1000 			.guest_ip	= kvm->cfg.guest_ip,
1001 			.host_ip	= kvm->cfg.host_ip,
1002 			.kvm		= kvm,
1003 			.script		= kvm->cfg.script,
1004 			.mode		= NET_MODE_USER,
1005 		};
1006 		str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac);
1007 		str_to_mac(kvm->cfg.host_mac, net_params.host_mac);
1008 
1009 		virtio_net__init_one(&net_params);
1010 	}
1011 
1012 	return 0;
1013 }
1014 virtio_dev_init(virtio_net__init);
1015 
1016 int virtio_net__exit(struct kvm *kvm)
1017 {
1018 	struct virtio_net_params *params;
1019 	struct net_dev *ndev;
1020 	struct list_head *ptr;
1021 
1022 	list_for_each(ptr, &ndevs) {
1023 		ndev = list_entry(ptr, struct net_dev, list);
1024 		params = ndev->params;
1025 		/* Clean up any tap device that the setup script attached to a bridge */
1026 		if (ndev->mode == NET_MODE_TAP &&
1027 		    strcmp(params->downscript, "none"))
1028 			virtio_net_exec_script(params->downscript, ndev->tap_name);
1029 	}
1030 	return 0;
1031 }
1032 virtio_dev_exit(virtio_net__exit);
1033