xref: /kvmtool/virtio/net.c (revision 31e0eacca520f60ac02dfaaaeaeddfcc132095c0)
1 #include "kvm/virtio-pci-dev.h"
2 #include "kvm/virtio-net.h"
3 #include "kvm/virtio.h"
4 #include "kvm/mutex.h"
5 #include "kvm/util.h"
6 #include "kvm/kvm.h"
7 #include "kvm/irq.h"
8 #include "kvm/uip.h"
9 #include "kvm/guest_compat.h"
10 #include "kvm/iovec.h"
11 #include "kvm/strbuf.h"
12 
13 #include <linux/vhost.h>
14 #include <linux/virtio_net.h>
15 #include <linux/if_tun.h>
16 #include <linux/types.h>
17 
18 #include <arpa/inet.h>
19 #include <net/if.h>
20 
21 #include <unistd.h>
22 #include <fcntl.h>
23 
24 #include <sys/socket.h>
25 #include <sys/ioctl.h>
26 #include <sys/types.h>
27 #include <sys/wait.h>
28 #include <sys/eventfd.h>
29 
/* Number of descriptors in each virtqueue ring (used by init_vq() and get_size_vq()). */
#define VIRTIO_NET_QUEUE_SIZE		256
/* Upper bound on RX/TX queue pairs; sizes net_dev.queues and clamps the "mq" parameter. */
#define VIRTIO_NET_NUM_QUEUES		8
32 
33 struct net_dev;
34 
35 struct net_dev_operations {
36 	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
37 	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
38 };
39 
/* Per-virtqueue state; one instance for every RX, TX and control queue. */
struct net_dev_queue {
	int				id;	/* virtqueue index, assigned in init_vq() */
	struct net_dev			*ndev;	/* owning device */
	struct virt_queue		vq;
	pthread_t			thread;	/* worker thread created in init_vq() */
	struct mutex			lock;	/* held around waits/signals on @cond */
	pthread_cond_t			cond;	/* signalled by virtio_net_handle_callback() */
	int				gsi;	/* vhost only: set in notify_vq_gsi(), 0 when unset */
	int				irqfd;	/* vhost only: eventfd registered as irqfd */
};
50 
/* State of one virtio-net device instance. */
struct net_dev {
	struct mutex			mutex;
	struct virtio_device		vdev;
	struct list_head		list;	/* link in the global ndevs list */

	/* Up to VIRTIO_NET_NUM_QUEUES RX/TX pairs plus one control queue. */
	struct net_dev_queue		queues[VIRTIO_NET_NUM_QUEUES * 2 + 1];
	struct virtio_net_config	config;
	/* features: bits stored by set_guest_features(); queue_pairs: clamped "mq" value. */
	u32				features, queue_pairs;

	int				vhost_fd;	/* 0 means vhost is not in use */
	int				tap_fd;
	char				tap_name[IFNAMSIZ];
	bool				tap_ufo;	/* TUNSETOFFLOAD accepted TUN_F_UFO */

	int				mode;	/* NET_MODE_TAP or NET_MODE_USER */

	struct uip_info			info;	/* user-mode (uip) backend state */
	struct net_dev_operations	*ops;	/* tap_ops or uip_ops */
	struct kvm			*kvm;

	struct virtio_net_params	*params;
};
73 
/* Every net_dev created by virtio_net__init_one(); torn down in virtio_net__exit(). */
static LIST_HEAD(ndevs);
/* Handle of the guest-compat message; -1 until it is registered in virtio_net__init_one(). */
static int compat_id = -1;

/* Payload capacity of the RX bounce buffer in virtio_net_rx_thread(). */
#define MAX_PACKET_SIZE 65550
78 
79 static bool has_virtio_feature(struct net_dev *ndev, u32 feature)
80 {
81 	return ndev->features & (1 << feature);
82 }
83 
84 static void virtio_net_fix_tx_hdr(struct virtio_net_hdr *hdr, struct net_dev *ndev)
85 {
86 	hdr->hdr_len		= virtio_guest_to_host_u16(&ndev->vdev, hdr->hdr_len);
87 	hdr->gso_size		= virtio_guest_to_host_u16(&ndev->vdev, hdr->gso_size);
88 	hdr->csum_start		= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_start);
89 	hdr->csum_offset	= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_offset);
90 }
91 
92 static void virtio_net_fix_rx_hdr(struct virtio_net_hdr *hdr, struct net_dev *ndev)
93 {
94 	hdr->hdr_len		= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr_len);
95 	hdr->gso_size		= virtio_host_to_guest_u16(&ndev->vdev, hdr->gso_size);
96 	hdr->csum_start		= virtio_host_to_guest_u16(&ndev->vdev, hdr->csum_start);
97 	hdr->csum_offset	= virtio_host_to_guest_u16(&ndev->vdev, hdr->csum_offset);
98 }
99 
100 static void *virtio_net_rx_thread(void *p)
101 {
102 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
103 	struct net_dev_queue *queue = p;
104 	struct virt_queue *vq = &queue->vq;
105 	struct net_dev *ndev = queue->ndev;
106 	struct kvm *kvm;
107 	u16 out, in;
108 	u16 head;
109 	int len, copied;
110 
111 	kvm__set_thread_name("virtio-net-rx");
112 
113 	kvm = ndev->kvm;
114 	while (1) {
115 		mutex_lock(&queue->lock);
116 		if (!virt_queue__available(vq))
117 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
118 		mutex_unlock(&queue->lock);
119 
120 		while (virt_queue__available(vq)) {
121 			unsigned char buffer[MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)];
122 			struct iovec dummy_iov = {
123 				.iov_base = buffer,
124 				.iov_len  = sizeof(buffer),
125 			};
126 			struct virtio_net_hdr_mrg_rxbuf *hdr;
127 			u16 num_buffers;
128 
129 			len = ndev->ops->rx(&dummy_iov, 1, ndev);
130 			if (len < 0) {
131 				pr_warning("%s: rx on vq %u failed (%d), exiting thread\n",
132 						__func__, queue->id, len);
133 				goto out_err;
134 			}
135 
136 			copied = num_buffers = 0;
137 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
138 			hdr = iov[0].iov_base;
139 			while (copied < len) {
140 				size_t iovsize = min_t(size_t, len - copied, iov_size(iov, in));
141 
142 				memcpy_toiovec(iov, buffer + copied, iovsize);
143 				copied += iovsize;
144 				virt_queue__set_used_elem_no_update(vq, head, iovsize, num_buffers++);
145 				if (copied == len)
146 					break;
147 				while (!virt_queue__available(vq))
148 					sleep(0);
149 				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
150 			}
151 
152 			virtio_net_fix_rx_hdr(&hdr->hdr, ndev);
153 			if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
154 				hdr->num_buffers = virtio_host_to_guest_u16(vq, num_buffers);
155 
156 			virt_queue__used_idx_advance(vq, num_buffers);
157 
158 			/* We should interrupt guest right now, otherwise latency is huge. */
159 			if (virtio_queue__should_signal(vq))
160 				ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
161 		}
162 	}
163 
164 out_err:
165 	pthread_exit(NULL);
166 	return NULL;
167 
168 }
169 
170 static void *virtio_net_tx_thread(void *p)
171 {
172 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
173 	struct net_dev_queue *queue = p;
174 	struct virt_queue *vq = &queue->vq;
175 	struct net_dev *ndev = queue->ndev;
176 	struct kvm *kvm;
177 	u16 out, in;
178 	u16 head;
179 	int len;
180 
181 	kvm__set_thread_name("virtio-net-tx");
182 
183 	kvm = ndev->kvm;
184 
185 	while (1) {
186 		mutex_lock(&queue->lock);
187 		if (!virt_queue__available(vq))
188 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
189 		mutex_unlock(&queue->lock);
190 
191 		while (virt_queue__available(vq)) {
192 			struct virtio_net_hdr *hdr;
193 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
194 			hdr = iov[0].iov_base;
195 			virtio_net_fix_tx_hdr(hdr, ndev);
196 			len = ndev->ops->tx(iov, out, ndev);
197 			if (len < 0) {
198 				pr_warning("%s: tx on vq %u failed (%d)\n",
199 						__func__, queue->id, errno);
200 				goto out_err;
201 			}
202 
203 			virt_queue__set_used_elem(vq, head, len);
204 		}
205 
206 		if (virtio_queue__should_signal(vq))
207 			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
208 	}
209 
210 out_err:
211 	pthread_exit(NULL);
212 	return NULL;
213 }
214 
215 static virtio_net_ctrl_ack virtio_net_handle_mq(struct kvm* kvm, struct net_dev *ndev, struct virtio_net_ctrl_hdr *ctrl)
216 {
217 	/* Not much to do here */
218 	return VIRTIO_NET_OK;
219 }
220 
221 static void *virtio_net_ctrl_thread(void *p)
222 {
223 	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
224 	struct net_dev_queue *queue = p;
225 	struct virt_queue *vq = &queue->vq;
226 	struct net_dev *ndev = queue->ndev;
227 	u16 out, in, head;
228 	struct kvm *kvm = ndev->kvm;
229 	struct virtio_net_ctrl_hdr *ctrl;
230 	virtio_net_ctrl_ack *ack;
231 
232 	kvm__set_thread_name("virtio-net-ctrl");
233 
234 	while (1) {
235 		mutex_lock(&queue->lock);
236 		if (!virt_queue__available(vq))
237 			pthread_cond_wait(&queue->cond, &queue->lock.mutex);
238 		mutex_unlock(&queue->lock);
239 
240 		while (virt_queue__available(vq)) {
241 			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
242 			ctrl = iov[0].iov_base;
243 			ack = iov[out].iov_base;
244 
245 			switch (ctrl->class) {
246 			case VIRTIO_NET_CTRL_MQ:
247 				*ack = virtio_net_handle_mq(kvm, ndev, ctrl);
248 				break;
249 			default:
250 				*ack = VIRTIO_NET_ERR;
251 				break;
252 			}
253 			virt_queue__set_used_elem(vq, head, iov[out].iov_len);
254 		}
255 
256 		if (virtio_queue__should_signal(vq))
257 			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, queue->id);
258 	}
259 
260 	pthread_exit(NULL);
261 
262 	return NULL;
263 }
264 
265 static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue)
266 {
267 	struct net_dev_queue *net_queue = &ndev->queues[queue];
268 
269 	if ((u32)queue >= (ndev->queue_pairs * 2 + 1)) {
270 		pr_warning("Unknown queue index %u", queue);
271 		return;
272 	}
273 
274 	mutex_lock(&net_queue->lock);
275 	pthread_cond_signal(&net_queue->cond);
276 	mutex_unlock(&net_queue->lock);
277 }
278 
279 static int virtio_net_request_tap(struct net_dev *ndev, struct ifreq *ifr,
280 				  const char *tapname)
281 {
282 	int ret;
283 
284 	memset(ifr, 0, sizeof(*ifr));
285 	ifr->ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
286 	if (tapname)
287 		strlcpy(ifr->ifr_name, tapname, sizeof(ifr->ifr_name));
288 
289 	ret = ioctl(ndev->tap_fd, TUNSETIFF, ifr);
290 
291 	if (ret >= 0)
292 		strlcpy(ndev->tap_name, ifr->ifr_name, sizeof(ndev->tap_name));
293 	return ret;
294 }
295 
/*
 * Run @script with @tap_name as its only argument and wait for it to finish.
 *
 * Returns 0 when the script exits with status 0, and -1 when the child
 * cannot be created or waited for, exits with a non-zero status, or
 * terminates abnormally (e.g. killed by a signal).
 */
static int virtio_net_exec_script(const char* script, const char *tap_name)
{
	pid_t pid;
	int status;

	pid = fork();
	if (pid < 0) {
		/* Without this check waitpid(-1, ...) would reap an unrelated child. */
		pr_warning("Fail to fork for %s", script);
		return -1;
	}

	if (pid == 0) {
		/* The variadic sentinel must be a pointer, not a plain NULL/0. */
		execl(script, script, tap_name, (char *)NULL);
		_exit(1);
	}

	/* Retry when interrupted so that status is always valid below. */
	while (waitpid(pid, &status, 0) < 0) {
		if (errno != EINTR) {
			pr_warning("Fail to wait for %s", script);
			return -1;
		}
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		pr_warning("Fail to setup tap by %s", script);
		return -1;
	}

	return 0;
}
314 
315 static bool virtio_net__tap_init(struct net_dev *ndev)
316 {
317 	int sock = socket(AF_INET, SOCK_STREAM, 0);
318 	int hdr_len;
319 	struct sockaddr_in sin = {0};
320 	struct ifreq ifr;
321 	const struct virtio_net_params *params = ndev->params;
322 	bool skipconf = !!params->tapif;
323 
324 	hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
325 			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
326 			sizeof(struct virtio_net_hdr);
327 	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
328 		pr_warning("Config tap device TUNSETVNETHDRSZ error");
329 
330 	if (strcmp(params->script, "none")) {
331 		if (virtio_net_exec_script(params->script, ndev->tap_name) < 0)
332 			goto fail;
333 	} else if (!skipconf) {
334 		memset(&ifr, 0, sizeof(ifr));
335 		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
336 		sin.sin_addr.s_addr = inet_addr(params->host_ip);
337 		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
338 		ifr.ifr_addr.sa_family = AF_INET;
339 		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
340 			pr_warning("Could not set ip address on tap device");
341 			goto fail;
342 		}
343 	}
344 
345 	if (!skipconf) {
346 		memset(&ifr, 0, sizeof(ifr));
347 		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
348 		ioctl(sock, SIOCGIFFLAGS, &ifr);
349 		ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
350 		if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
351 			pr_warning("Could not bring tap device up");
352 	}
353 
354 	close(sock);
355 
356 	return 1;
357 
358 fail:
359 	if (sock >= 0)
360 		close(sock);
361 	if (ndev->tap_fd >= 0)
362 		close(ndev->tap_fd);
363 
364 	return 0;
365 }
366 
367 static void virtio_net__tap_exit(struct net_dev *ndev)
368 {
369 	int sock;
370 	struct ifreq ifr;
371 
372 	if (ndev->params->tapif)
373 		return;
374 
375 	sock = socket(AF_INET, SOCK_STREAM, 0);
376 	strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ifr.ifr_name));
377 	ioctl(sock, SIOCGIFFLAGS, &ifr);
378 	ifr.ifr_flags &= ~(IFF_UP | IFF_RUNNING);
379 	if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0)
380 		pr_warning("Count not bring tap device down");
381 	close(sock);
382 }
383 
384 static bool virtio_net__tap_create(struct net_dev *ndev)
385 {
386 	int offload;
387 	struct ifreq ifr;
388 	const struct virtio_net_params *params = ndev->params;
389 	bool macvtap = (!!params->tapif) && (params->tapif[0] == '/');
390 
391 	/* Did the user already gave us the FD? */
392 	if (params->fd)
393 		ndev->tap_fd = params->fd;
394 	else {
395 		const char *tap_file = "/dev/net/tun";
396 
397 		/* Did the user ask us to use macvtap? */
398 		if (macvtap)
399 			tap_file = params->tapif;
400 
401 		ndev->tap_fd = open(tap_file, O_RDWR);
402 		if (ndev->tap_fd < 0) {
403 			pr_warning("Unable to open %s", tap_file);
404 			return 0;
405 		}
406 	}
407 
408 	if (!macvtap &&
409 	    virtio_net_request_tap(ndev, &ifr, params->tapif) < 0) {
410 		pr_warning("Config tap device error. Are you root?");
411 		goto fail;
412 	}
413 
414 	/*
415 	 * The UFO support had been removed from kernel in commit:
416 	 * ID: fb652fdfe83710da0ca13448a41b7ed027d0a984
417 	 * https://www.spinics.net/lists/netdev/msg443562.html
418 	 * In oder to support the older kernels without this commit,
419 	 * we set the TUN_F_UFO to offload by default to test the status of
420 	 * UFO kernel support.
421 	 */
422 	ndev->tap_ufo = true;
423 	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
424 	if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
425 		/*
426 		 * Is this failure caused by kernel remove the UFO support?
427 		 * Try TUNSETOFFLOAD without TUN_F_UFO.
428 		 */
429 		offload &= ~TUN_F_UFO;
430 		if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
431 			pr_warning("Config tap device TUNSETOFFLOAD error");
432 			goto fail;
433 		}
434 		ndev->tap_ufo = false;
435 	}
436 
437 	return 1;
438 
439 fail:
440 	if ((ndev->tap_fd >= 0) || (!params->fd) )
441 		close(ndev->tap_fd);
442 
443 	return 0;
444 }
445 
446 static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
447 {
448 	return writev(ndev->tap_fd, iov, out);
449 }
450 
451 static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
452 {
453 	return readv(ndev->tap_fd, iov, in);
454 }
455 
456 static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
457 {
458 	return uip_tx(iov, out, &ndev->info);
459 }
460 
461 static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
462 {
463 	return uip_rx(iov, in, &ndev->info);
464 }
465 
466 static struct net_dev_operations tap_ops = {
467 	.rx	= tap_ops_rx,
468 	.tx	= tap_ops_tx,
469 };
470 
471 static struct net_dev_operations uip_ops = {
472 	.rx	= uip_ops_rx,
473 	.tx	= uip_ops_tx,
474 };
475 
476 static u8 *get_config(struct kvm *kvm, void *dev)
477 {
478 	struct net_dev *ndev = dev;
479 
480 	return ((u8 *)(&ndev->config));
481 }
482 
483 static size_t get_config_size(struct kvm *kvm, void *dev)
484 {
485 	struct net_dev *ndev = dev;
486 
487 	return sizeof(ndev->config);
488 }
489 
490 static u32 get_host_features(struct kvm *kvm, void *dev)
491 {
492 	u32 features;
493 	struct net_dev *ndev = dev;
494 
495 	features = 1UL << VIRTIO_NET_F_MAC
496 		| 1UL << VIRTIO_NET_F_CSUM
497 		| 1UL << VIRTIO_NET_F_HOST_TSO4
498 		| 1UL << VIRTIO_NET_F_HOST_TSO6
499 		| 1UL << VIRTIO_NET_F_GUEST_TSO4
500 		| 1UL << VIRTIO_NET_F_GUEST_TSO6
501 		| 1UL << VIRTIO_RING_F_EVENT_IDX
502 		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
503 		| 1UL << VIRTIO_NET_F_CTRL_VQ
504 		| 1UL << VIRTIO_NET_F_MRG_RXBUF
505 		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
506 
507 	/*
508 	 * The UFO feature for host and guest only can be enabled when the
509 	 * kernel has TAP UFO support.
510 	 */
511 	if (ndev->tap_ufo)
512 		features |= (1UL << VIRTIO_NET_F_HOST_UFO
513 				| 1UL << VIRTIO_NET_F_GUEST_UFO);
514 
515 	return features;
516 }
517 
518 static int virtio_net__vhost_set_features(struct net_dev *ndev)
519 {
520 	u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX;
521 	u64 vhost_features;
522 
523 	if (ioctl(ndev->vhost_fd, VHOST_GET_FEATURES, &vhost_features) != 0)
524 		die_perror("VHOST_GET_FEATURES failed");
525 
526 	/* make sure both side support mergable rx buffers */
527 	if (vhost_features & 1UL << VIRTIO_NET_F_MRG_RXBUF &&
528 			has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
529 		features |= 1UL << VIRTIO_NET_F_MRG_RXBUF;
530 
531 	return ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features);
532 }
533 
534 static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
535 {
536 	struct net_dev *ndev = dev;
537 	struct virtio_net_config *conf = &ndev->config;
538 
539 	ndev->features = features;
540 
541 	conf->status = virtio_host_to_guest_u16(&ndev->vdev, conf->status);
542 	conf->max_virtqueue_pairs = virtio_host_to_guest_u16(&ndev->vdev,
543 							     conf->max_virtqueue_pairs);
544 }
545 
546 static void virtio_net_start(struct net_dev *ndev)
547 {
548 	if (ndev->mode == NET_MODE_TAP) {
549 		if (!virtio_net__tap_init(ndev))
550 			die_perror("TAP device initialized failed because");
551 
552 		if (ndev->vhost_fd &&
553 				virtio_net__vhost_set_features(ndev) != 0)
554 			die_perror("VHOST_SET_FEATURES failed");
555 	} else {
556 		ndev->info.vnet_hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
557 						sizeof(struct virtio_net_hdr_mrg_rxbuf) :
558 						sizeof(struct virtio_net_hdr);
559 		uip_init(&ndev->info);
560 	}
561 }
562 
563 static void virtio_net_stop(struct net_dev *ndev)
564 {
565 	/* Undo whatever start() did */
566 	if (ndev->mode == NET_MODE_TAP)
567 		virtio_net__tap_exit(ndev);
568 	else
569 		uip_exit(&ndev->info);
570 }
571 
572 static void notify_status(struct kvm *kvm, void *dev, u32 status)
573 {
574 	if (status & VIRTIO__STATUS_START)
575 		virtio_net_start(dev);
576 	else if (status & VIRTIO__STATUS_STOP)
577 		virtio_net_stop(dev);
578 }
579 
580 static bool is_ctrl_vq(struct net_dev *ndev, u32 vq)
581 {
582 	return vq == (u32)(ndev->queue_pairs * 2);
583 }
584 
585 static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
586 		   u32 pfn)
587 {
588 	struct vhost_vring_state state = { .index = vq };
589 	struct net_dev_queue *net_queue;
590 	struct vhost_vring_addr addr;
591 	struct net_dev *ndev = dev;
592 	struct virt_queue *queue;
593 	void *p;
594 	int r;
595 
596 	compat__remove_message(compat_id);
597 
598 	net_queue	= &ndev->queues[vq];
599 	net_queue->id	= vq;
600 	net_queue->ndev	= ndev;
601 	queue		= &net_queue->vq;
602 	queue->pfn	= pfn;
603 	p		= virtio_get_vq(kvm, queue->pfn, page_size);
604 
605 	vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, align);
606 	virtio_init_device_vq(&ndev->vdev, queue);
607 
608 	mutex_init(&net_queue->lock);
609 	pthread_cond_init(&net_queue->cond, NULL);
610 	if (is_ctrl_vq(ndev, vq)) {
611 		pthread_create(&net_queue->thread, NULL, virtio_net_ctrl_thread,
612 			       net_queue);
613 
614 		return 0;
615 	} else if (ndev->vhost_fd == 0 ) {
616 		if (vq & 1)
617 			pthread_create(&net_queue->thread, NULL,
618 				       virtio_net_tx_thread, net_queue);
619 		else
620 			pthread_create(&net_queue->thread, NULL,
621 				       virtio_net_rx_thread, net_queue);
622 
623 		return 0;
624 	}
625 
626 	if (queue->endian != VIRTIO_ENDIAN_HOST)
627 		die_perror("VHOST requires the same endianness in guest and host");
628 
629 	state.num = queue->vring.num;
630 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state);
631 	if (r < 0)
632 		die_perror("VHOST_SET_VRING_NUM failed");
633 	state.num = 0;
634 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state);
635 	if (r < 0)
636 		die_perror("VHOST_SET_VRING_BASE failed");
637 
638 	addr = (struct vhost_vring_addr) {
639 		.index = vq,
640 		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
641 		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
642 		.used_user_addr = (u64)(unsigned long)queue->vring.used,
643 	};
644 
645 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
646 	if (r < 0)
647 		die_perror("VHOST_SET_VRING_ADDR failed");
648 
649 	return 0;
650 }
651 
652 static void exit_vq(struct kvm *kvm, void *dev, u32 vq)
653 {
654 	struct net_dev *ndev = dev;
655 	struct net_dev_queue *queue = &ndev->queues[vq];
656 
657 	if (!is_ctrl_vq(ndev, vq) && queue->gsi) {
658 		irq__del_irqfd(kvm, queue->gsi, queue->irqfd);
659 		close(queue->irqfd);
660 		queue->gsi = queue->irqfd = 0;
661 	}
662 
663 	/*
664 	 * TODO: vhost reset owner. It's the only way to cleanly stop vhost, but
665 	 * we can't restart it at the moment.
666 	 */
667 	if (ndev->vhost_fd && !is_ctrl_vq(ndev, vq)) {
668 		pr_warning("Cannot reset VHOST queue");
669 		ioctl(ndev->vhost_fd, VHOST_RESET_OWNER);
670 		return;
671 	}
672 
673 	/*
674 	 * Threads are waiting on cancellation points (readv or
675 	 * pthread_cond_wait) and should stop gracefully.
676 	 */
677 	pthread_cancel(queue->thread);
678 	pthread_join(queue->thread, NULL);
679 }
680 
681 static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
682 {
683 	struct net_dev *ndev = dev;
684 	struct net_dev_queue *queue = &ndev->queues[vq];
685 	struct vhost_vring_file file;
686 	int r;
687 
688 	if (ndev->vhost_fd == 0)
689 		return;
690 
691 	file = (struct vhost_vring_file) {
692 		.index	= vq,
693 		.fd	= eventfd(0, 0),
694 	};
695 
696 	r = irq__add_irqfd(kvm, gsi, file.fd, -1);
697 	if (r < 0)
698 		die_perror("KVM_IRQFD failed");
699 
700 	queue->irqfd = file.fd;
701 	queue->gsi = gsi;
702 
703 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file);
704 	if (r < 0)
705 		die_perror("VHOST_SET_VRING_CALL failed");
706 	file.fd = ndev->tap_fd;
707 	r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file);
708 	if (r != 0)
709 		die("VHOST_NET_SET_BACKEND failed %d", errno);
710 
711 }
712 
713 static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
714 {
715 	struct net_dev *ndev = dev;
716 	struct vhost_vring_file file = {
717 		.index	= vq,
718 		.fd	= efd,
719 	};
720 	int r;
721 
722 	if (ndev->vhost_fd == 0 || is_ctrl_vq(ndev, vq))
723 		return;
724 
725 	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file);
726 	if (r < 0)
727 		die_perror("VHOST_SET_VRING_KICK failed");
728 }
729 
730 static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
731 {
732 	struct net_dev *ndev = dev;
733 
734 	virtio_net_handle_callback(kvm, ndev, vq);
735 
736 	return 0;
737 }
738 
739 static struct virt_queue *get_vq(struct kvm *kvm, void *dev, u32 vq)
740 {
741 	struct net_dev *ndev = dev;
742 
743 	return &ndev->queues[vq].vq;
744 }
745 
746 static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
747 {
748 	/* FIXME: dynamic */
749 	return VIRTIO_NET_QUEUE_SIZE;
750 }
751 
752 static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
753 {
754 	/* FIXME: dynamic */
755 	return size;
756 }
757 
758 static unsigned int get_vq_count(struct kvm *kvm, void *dev)
759 {
760 	struct net_dev *ndev = dev;
761 
762 	return ndev->queue_pairs * 2 + 1;
763 }
764 
765 static struct virtio_ops net_dev_virtio_ops = {
766 	.get_config		= get_config,
767 	.get_config_size	= get_config_size,
768 	.get_host_features	= get_host_features,
769 	.set_guest_features	= set_guest_features,
770 	.get_vq_count		= get_vq_count,
771 	.init_vq		= init_vq,
772 	.exit_vq		= exit_vq,
773 	.get_vq			= get_vq,
774 	.get_size_vq		= get_size_vq,
775 	.set_size_vq		= set_size_vq,
776 	.notify_vq		= notify_vq,
777 	.notify_vq_gsi		= notify_vq_gsi,
778 	.notify_vq_eventfd	= notify_vq_eventfd,
779 	.notify_status		= notify_status,
780 };
781 
782 static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev)
783 {
784 	struct kvm_mem_bank *bank;
785 	struct vhost_memory *mem;
786 	int r, i;
787 
788 	ndev->vhost_fd = open("/dev/vhost-net", O_RDWR);
789 	if (ndev->vhost_fd < 0)
790 		die_perror("Failed openning vhost-net device");
791 
792 	mem = calloc(1, sizeof(*mem) + kvm->mem_slots * sizeof(struct vhost_memory_region));
793 	if (mem == NULL)
794 		die("Failed allocating memory for vhost memory map");
795 
796 	i = 0;
797 	list_for_each_entry(bank, &kvm->mem_banks, list) {
798 		mem->regions[i] = (struct vhost_memory_region) {
799 			.guest_phys_addr = bank->guest_phys_addr,
800 			.memory_size	 = bank->size,
801 			.userspace_addr	 = (unsigned long)bank->host_addr,
802 		};
803 		i++;
804 	}
805 	mem->nregions = i;
806 
807 	r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER);
808 	if (r != 0)
809 		die_perror("VHOST_SET_OWNER failed");
810 
811 	r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
812 	if (r != 0)
813 		die_perror("VHOST_SET_MEM_TABLE failed");
814 
815 	ndev->vdev.use_vhost = true;
816 
817 	free(mem);
818 }
819 
/*
 * Parse a colon-separated hexadecimal MAC address ("aa:bb:cc:dd:ee:ff")
 * into the six bytes at @mac. Parsing stops at the first malformed field;
 * bytes after that point are left untouched.
 */
static inline void str_to_mac(const char *str, char *mac)
{
	sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
	       &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]);
}
825 static int set_net_param(struct kvm *kvm, struct virtio_net_params *p,
826 			const char *param, const char *val)
827 {
828 	if (strcmp(param, "guest_mac") == 0) {
829 		str_to_mac(val, p->guest_mac);
830 	} else if (strcmp(param, "mode") == 0) {
831 		if (!strncmp(val, "user", 4)) {
832 			int i;
833 
834 			for (i = 0; i < kvm->cfg.num_net_devices; i++)
835 				if (kvm->cfg.net_params[i].mode == NET_MODE_USER)
836 					die("Only one usermode network device allowed at a time");
837 			p->mode = NET_MODE_USER;
838 		} else if (!strncmp(val, "tap", 3)) {
839 			p->mode = NET_MODE_TAP;
840 		} else if (!strncmp(val, "none", 4)) {
841 			kvm->cfg.no_net = 1;
842 			return -1;
843 		} else
844 			die("Unknown network mode %s, please use user, tap or none", kvm->cfg.network);
845 	} else if (strcmp(param, "script") == 0) {
846 		p->script = strdup(val);
847 	} else if (strcmp(param, "downscript") == 0) {
848 		p->downscript = strdup(val);
849 	} else if (strcmp(param, "guest_ip") == 0) {
850 		p->guest_ip = strdup(val);
851 	} else if (strcmp(param, "host_ip") == 0) {
852 		p->host_ip = strdup(val);
853 	} else if (strcmp(param, "trans") == 0) {
854 		p->trans = strdup(val);
855 	} else if (strcmp(param, "tapif") == 0) {
856 		p->tapif = strdup(val);
857 	} else if (strcmp(param, "vhost") == 0) {
858 		p->vhost = atoi(val);
859 	} else if (strcmp(param, "fd") == 0) {
860 		p->fd = atoi(val);
861 	} else if (strcmp(param, "mq") == 0) {
862 		p->mq = atoi(val);
863 	} else
864 		die("Unknown network parameter %s", param);
865 
866 	return 0;
867 }
868 
869 int netdev_parser(const struct option *opt, const char *arg, int unset)
870 {
871 	struct virtio_net_params p;
872 	char *buf = NULL, *cmd = NULL, *cur = NULL;
873 	bool on_cmd = true;
874 	struct kvm *kvm = opt->ptr;
875 
876 	if (arg) {
877 		buf = strdup(arg);
878 		if (buf == NULL)
879 			die("Failed allocating new net buffer");
880 		cur = strtok(buf, ",=");
881 	}
882 
883 	p = (struct virtio_net_params) {
884 		.guest_ip	= DEFAULT_GUEST_ADDR,
885 		.host_ip	= DEFAULT_HOST_ADDR,
886 		.script		= DEFAULT_SCRIPT,
887 		.downscript	= DEFAULT_SCRIPT,
888 		.mode		= NET_MODE_TAP,
889 	};
890 
891 	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
892 	p.guest_mac[5] += kvm->cfg.num_net_devices;
893 
894 	while (cur) {
895 		if (on_cmd) {
896 			cmd = cur;
897 		} else {
898 			if (set_net_param(kvm, &p, cmd, cur) < 0)
899 				goto done;
900 		}
901 		on_cmd = !on_cmd;
902 
903 		cur = strtok(NULL, ",=");
904 	};
905 
906 	kvm->cfg.num_net_devices++;
907 
908 	kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params));
909 	if (kvm->cfg.net_params == NULL)
910 		die("Failed adding new network device");
911 
912 	kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p;
913 
914 done:
915 	free(buf);
916 	return 0;
917 }
918 
919 static int virtio_net__init_one(struct virtio_net_params *params)
920 {
921 	int i, r;
922 	struct net_dev *ndev;
923 	struct virtio_ops *ops;
924 	enum virtio_trans trans = VIRTIO_DEFAULT_TRANS(params->kvm);
925 
926 	ndev = calloc(1, sizeof(struct net_dev));
927 	if (ndev == NULL)
928 		return -ENOMEM;
929 
930 	list_add_tail(&ndev->list, &ndevs);
931 
932 	ops = malloc(sizeof(*ops));
933 	if (ops == NULL)
934 		return -ENOMEM;
935 
936 	ndev->kvm = params->kvm;
937 	ndev->params = params;
938 
939 	mutex_init(&ndev->mutex);
940 	ndev->queue_pairs = max(1, min(VIRTIO_NET_NUM_QUEUES, params->mq));
941 	ndev->config.status = VIRTIO_NET_S_LINK_UP;
942 	if (ndev->queue_pairs > 1)
943 		ndev->config.max_virtqueue_pairs = ndev->queue_pairs;
944 
945 	for (i = 0 ; i < 6 ; i++) {
946 		ndev->config.mac[i]		= params->guest_mac[i];
947 		ndev->info.guest_mac.addr[i]	= params->guest_mac[i];
948 		ndev->info.host_mac.addr[i]	= params->host_mac[i];
949 	}
950 
951 	ndev->mode = params->mode;
952 	if (ndev->mode == NET_MODE_TAP) {
953 		ndev->ops = &tap_ops;
954 		if (!virtio_net__tap_create(ndev))
955 			die_perror("You have requested a TAP device, but creation of one has failed because");
956 	} else {
957 		ndev->info.host_ip		= ntohl(inet_addr(params->host_ip));
958 		ndev->info.guest_ip		= ntohl(inet_addr(params->guest_ip));
959 		ndev->info.guest_netmask	= ntohl(inet_addr("255.255.255.0"));
960 		ndev->info.buf_nr		= 20,
961 		ndev->ops = &uip_ops;
962 		uip_static_init(&ndev->info);
963 	}
964 
965 	*ops = net_dev_virtio_ops;
966 
967 	if (params->trans) {
968 		if (strcmp(params->trans, "mmio") == 0)
969 			trans = VIRTIO_MMIO;
970 		else if (strcmp(params->trans, "pci") == 0)
971 			trans = VIRTIO_PCI;
972 		else
973 			pr_warning("virtio-net: Unknown transport method : %s, "
974 				   "falling back to %s.", params->trans,
975 				   virtio_trans_name(trans));
976 	}
977 
978 	r = virtio_init(params->kvm, ndev, &ndev->vdev, ops, trans,
979 			PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
980 	if (r < 0) {
981 		free(ops);
982 		return r;
983 	}
984 
985 	if (params->vhost)
986 		virtio_net__vhost_init(params->kvm, ndev);
987 
988 	if (compat_id == -1)
989 		compat_id = virtio_compat_add_message("virtio-net", "CONFIG_VIRTIO_NET");
990 
991 	return 0;
992 }
993 
994 int virtio_net__init(struct kvm *kvm)
995 {
996 	int i, r;
997 
998 	for (i = 0; i < kvm->cfg.num_net_devices; i++) {
999 		kvm->cfg.net_params[i].kvm = kvm;
1000 		r = virtio_net__init_one(&kvm->cfg.net_params[i]);
1001 		if (r < 0)
1002 			goto cleanup;
1003 	}
1004 
1005 	if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) {
1006 		static struct virtio_net_params net_params;
1007 
1008 		net_params = (struct virtio_net_params) {
1009 			.guest_ip	= kvm->cfg.guest_ip,
1010 			.host_ip	= kvm->cfg.host_ip,
1011 			.kvm		= kvm,
1012 			.script		= kvm->cfg.script,
1013 			.mode		= NET_MODE_USER,
1014 		};
1015 		str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac);
1016 		str_to_mac(kvm->cfg.host_mac, net_params.host_mac);
1017 
1018 		r = virtio_net__init_one(&net_params);
1019 		if (r < 0)
1020 			goto cleanup;
1021 	}
1022 
1023 	return 0;
1024 
1025 cleanup:
1026 	virtio_net__exit(kvm);
1027 	return r;
1028 }
1029 virtio_dev_init(virtio_net__init);
1030 
1031 int virtio_net__exit(struct kvm *kvm)
1032 {
1033 	struct virtio_net_params *params;
1034 	struct net_dev *ndev;
1035 	struct list_head *ptr, *n;
1036 
1037 	list_for_each_safe(ptr, n, &ndevs) {
1038 		ndev = list_entry(ptr, struct net_dev, list);
1039 		params = ndev->params;
1040 		/* Cleanup any tap device which attached to bridge */
1041 		if (ndev->mode == NET_MODE_TAP &&
1042 		    strcmp(params->downscript, "none"))
1043 			virtio_net_exec_script(params->downscript, ndev->tap_name);
1044 
1045 		list_del(&ndev->list);
1046 		free(ndev);
1047 	}
1048 	return 0;
1049 }
1050 virtio_dev_exit(virtio_net__exit);
1051