xref: /kvmtool/virtio/net.c (revision 702f4abb09ec469e4f14ba72d94a9661f2037987)
#include "kvm/virtio-pci-dev.h"
#include "kvm/virtio-net.h"
#include "kvm/virtio.h"
#include "kvm/ioport.h"
#include "kvm/types.h"
#include "kvm/mutex.h"
#include "kvm/util.h"
#include "kvm/kvm.h"
#include "kvm/pci.h"
#include "kvm/irq.h"
#include "kvm/uip.h"
#include "kvm/ioeventfd.h"
#include "kvm/guest_compat.h"

#include <linux/virtio_net.h>
#include <linux/if_tun.h>

#include <arpa/inet.h>
#include <net/if.h>

#include <unistd.h>
#include <assert.h>
#include <fcntl.h>

#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/wait.h>

#define VIRTIO_NET_QUEUE_SIZE		128
#define VIRTIO_NET_NUM_QUEUES		2
#define VIRTIO_NET_RX_QUEUE		0
#define VIRTIO_NET_TX_QUEUE		1

static struct pci_device_header pci_header = {
	.vendor_id			= PCI_VENDOR_ID_REDHAT_QUMRANET,
	.device_id			= PCI_DEVICE_ID_VIRTIO_NET,
	.header_type			= PCI_HEADER_TYPE_NORMAL,
	.revision_id			= 0,
	.class				= 0x020000,
	.subsys_vendor_id		= PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET,
	.subsys_id			= VIRTIO_ID_NET,
};

struct net_dev;

struct net_dev_operations {
	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
	int (*tx)(struct iovec *iov, u16 out, struct net_dev *ndev);
};

struct net_dev {
	pthread_mutex_t			mutex;

	struct virt_queue		vqs[VIRTIO_NET_NUM_QUEUES];
	struct virtio_net_config	config;
	u32				host_features;
	u32				guest_features;
	u16				config_vector;
	u8				status;
	u8				isr;
	u16				queue_selector;
	u16				base_addr;
	u32				vq_vector[VIRTIO_NET_NUM_QUEUES];
	u32				gsis[VIRTIO_NET_NUM_QUEUES];
	u32				msix_io_block;
	int				compat_id;
	bool				msix_enabled;

	pthread_t			io_rx_thread;
	pthread_mutex_t			io_rx_lock;
	pthread_cond_t			io_rx_cond;

	pthread_t			io_tx_thread;
	pthread_mutex_t			io_tx_lock;
	pthread_cond_t			io_tx_cond;

	int				tap_fd;
	char				tap_name[IFNAMSIZ];

	int				mode;

	struct uip_info			info;
	struct net_dev_operations	*ops;
};

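/*
 * Single global virtio-net device instance. host_features advertises a
 * fixed MAC plus checksum and TSO/UFO offloads in both directions; the
 * feature subset the guest accepts ends up in guest_features.
 */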
static struct net_dev ndev = {
	.mutex	= PTHREAD_MUTEX_INITIALIZER,

	.config = {
		.status			= VIRTIO_NET_S_LINK_UP,
	},
	.host_features			= 1UL << VIRTIO_NET_F_MAC
					| 1UL << VIRTIO_NET_F_CSUM
					| 1UL << VIRTIO_NET_F_HOST_UFO
					| 1UL << VIRTIO_NET_F_HOST_TSO4
					| 1UL << VIRTIO_NET_F_HOST_TSO6
					| 1UL << VIRTIO_NET_F_GUEST_UFO
					| 1UL << VIRTIO_NET_F_GUEST_TSO4
					| 1UL << VIRTIO_NET_F_GUEST_TSO6,
	.info = {
		.buf_nr			= 20,
	}
};

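/*
 * RX worker: sleeps on io_rx_cond until the guest posts receive buffers
 * (or a queue notify arrives), then drains the RX virtqueue. Each buffer
 * chain is filled by the backend's ->rx() and completed individually.
 */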
static void *virtio_net_rx_thread(void *p)
{
	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
	struct virt_queue *vq;
	struct kvm *kvm;
	u16 out, in;
	u16 head;
	int len;

	kvm	= p;
	vq	= &ndev.vqs[VIRTIO_NET_RX_QUEUE];

	while (1) {

		mutex_lock(&ndev.io_rx_lock);
		if (!virt_queue__available(vq))
			pthread_cond_wait(&ndev.io_rx_cond, &ndev.io_rx_lock);
		mutex_unlock(&ndev.io_rx_lock);

		while (virt_queue__available(vq)) {

			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);

			len = ndev.ops->rx(iov, in, &ndev);

			virt_queue__set_used_elem(vq, head, len);

			/* Interrupt the guest right away; deferring it would add receive latency. */
			kvm__irq_trigger(kvm, ndev.gsis[VIRTIO_NET_RX_QUEUE]);
		}

	}

	pthread_exit(NULL);
	return NULL;

}

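/*
 * TX worker: mirrors the RX thread, but hands outgoing buffer chains to
 * the backend's ->tx() and raises a single interrupt per drained batch
 * rather than per packet.
 */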
static void *virtio_net_tx_thread(void *p)
{
	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
	struct virt_queue *vq;
	struct kvm *kvm;
	u16 out, in;
	u16 head;
	int len;

	kvm	= p;
	vq	= &ndev.vqs[VIRTIO_NET_TX_QUEUE];

	while (1) {
		mutex_lock(&ndev.io_tx_lock);
		if (!virt_queue__available(vq))
			pthread_cond_wait(&ndev.io_tx_cond, &ndev.io_tx_lock);
		mutex_unlock(&ndev.io_tx_lock);

		while (virt_queue__available(vq)) {

			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);

			len = ndev.ops->tx(iov, out, &ndev);

			virt_queue__set_used_elem(vq, head, len);
		}

		kvm__irq_trigger(kvm, ndev.gsis[VIRTIO_NET_TX_QUEUE]);
	}

	pthread_exit(NULL);

	return NULL;

}

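/*
 * Writes past the common virtio PCI header land here. MSI-X vector
 * registers are handled first: the vector written for the currently
 * selected queue is turned into a GSI via an MSI-X route. Anything else
 * is treated as a byte-wide write into virtio_net_config, with the
 * offset rebased past the first 20 bytes of the legacy header.
 */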
static bool virtio_net_pci_io_device_specific_out(struct kvm *kvm, void *data,
							unsigned long offset, int size)
{
	u8 *config_space = (u8 *)&ndev.config;
	int type;
	u32 config_offset;

	type = virtio__get_dev_specific_field(offset - 20, ndev.msix_enabled, 0, &config_offset);
	if (type == VIRTIO_PCI_O_MSIX) {
		if (offset == VIRTIO_MSI_CONFIG_VECTOR) {
			ndev.config_vector	= ioport__read16(data);
		} else {
			u32 gsi;
			u32 vec;

			vec = ndev.vq_vector[ndev.queue_selector] = ioport__read16(data);

			gsi = irq__add_msix_route(kvm,
						  pci_header.msix.table[vec].low,
						  pci_header.msix.table[vec].high,
						  pci_header.msix.table[vec].data);

			ndev.gsis[ndev.queue_selector] = gsi;
		}
		return true;
	}

	if (size != 1)
		return false;

	if (config_offset >= sizeof(struct virtio_net_config))
		pr_error("config offset is too big: %u", config_offset);

	config_space[config_offset] = *(u8 *)data;

	return true;
}

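/*
 * Read-side counterpart of the handler above: returns the MSI-X vectors
 * or single bytes of virtio_net_config.
 */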
static bool virtio_net_pci_io_device_specific_in(void *data, unsigned long offset, int size)
{
	u8 *config_space = (u8 *)&ndev.config;
	int type;
	u32 config_offset;

	type = virtio__get_dev_specific_field(offset - 20, ndev.msix_enabled, 0, &config_offset);
	if (type == VIRTIO_PCI_O_MSIX) {
		if (offset == VIRTIO_MSI_CONFIG_VECTOR)
			ioport__write16(data, ndev.config_vector);
		else
			ioport__write16(data, ndev.vq_vector[ndev.queue_selector]);

		return true;
	}

	if (size != 1)
		return false;

	if (config_offset >= sizeof(struct virtio_net_config))
		pr_error("config offset is too big: %u", config_offset);

	ioport__write8(data, config_space[config_offset]);

	return true;
}

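/*
 * Guest reads from the device's legacy virtio PCI I/O BAR. Reading ISR
 * also lowers the interrupt line and clears the stored ISR value,
 * matching virtio's read-to-clear ISR semantics.
 */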
static bool virtio_net_pci_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
{
	unsigned long	offset	= port - ndev.base_addr;
	bool		ret	= true;

	mutex_lock(&ndev.mutex);

	switch (offset) {
	case VIRTIO_PCI_HOST_FEATURES:
		ioport__write32(data, ndev.host_features);
		break;
	case VIRTIO_PCI_GUEST_FEATURES:
		ret = false;
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		ioport__write32(data, ndev.vqs[ndev.queue_selector].pfn);
		break;
	case VIRTIO_PCI_QUEUE_NUM:
		ioport__write16(data, VIRTIO_NET_QUEUE_SIZE);
		break;
	case VIRTIO_PCI_QUEUE_SEL:
	case VIRTIO_PCI_QUEUE_NOTIFY:
		ret = false;
		break;
	case VIRTIO_PCI_STATUS:
		ioport__write8(data, ndev.status);
		break;
	case VIRTIO_PCI_ISR:
		ioport__write8(data, ndev.isr);
		kvm__irq_line(kvm, pci_header.irq_line, VIRTIO_IRQ_LOW);
		ndev.isr = VIRTIO_IRQ_LOW;
		break;
	default:
		ret = virtio_net_pci_io_device_specific_in(data, offset, size);
	}

	mutex_unlock(&ndev.mutex);

	return ret;
}

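/*
 * Queue notification: wake whichever I/O thread services the queue the
 * guest just kicked.
 */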
static void virtio_net_handle_callback(struct kvm *kvm, u16 queue_index)
{
	switch (queue_index) {
	case VIRTIO_NET_TX_QUEUE:
		mutex_lock(&ndev.io_tx_lock);
		pthread_cond_signal(&ndev.io_tx_cond);
		mutex_unlock(&ndev.io_tx_lock);
		break;
	case VIRTIO_NET_RX_QUEUE:
		mutex_lock(&ndev.io_rx_lock);
		pthread_cond_signal(&ndev.io_rx_cond);
		mutex_unlock(&ndev.io_rx_lock);
		break;
	default:
		pr_warning("Unknown queue index %u", queue_index);
	}
}

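/*
 * Guest writes to the legacy virtio PCI I/O BAR: feature negotiation,
 * virtqueue placement (QUEUE_PFN maps the ring in guest memory), queue
 * selection, queue notification and device status updates.
 */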
static bool virtio_net_pci_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
{
	unsigned long	offset		= port - ndev.base_addr;
	bool		ret		= true;

	mutex_lock(&ndev.mutex);

	switch (offset) {
	case VIRTIO_PCI_GUEST_FEATURES:
		ndev.guest_features	= ioport__read32(data);
		break;
	case VIRTIO_PCI_QUEUE_PFN: {
		struct virt_queue *queue;
		void *p;

		assert(ndev.queue_selector < VIRTIO_NET_NUM_QUEUES);

		compat__remove_message(ndev.compat_id);

		queue			= &ndev.vqs[ndev.queue_selector];
		queue->pfn		= ioport__read32(data);
		p			= guest_pfn_to_host(kvm, queue->pfn);

		vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);

		break;
	}
	case VIRTIO_PCI_QUEUE_SEL:
		ndev.queue_selector	= ioport__read16(data);
		break;
	case VIRTIO_PCI_QUEUE_NOTIFY: {
		u16 queue_index;

		queue_index		= ioport__read16(data);
		virtio_net_handle_callback(kvm, queue_index);
		break;
	}
	case VIRTIO_PCI_STATUS:
		ndev.status		= ioport__read8(data);
		break;
	default:
		ret = virtio_net_pci_io_device_specific_out(kvm, data, offset, size);
	}

	mutex_unlock(&ndev.mutex);

	return ret;
}

static void ioevent_callback(struct kvm *kvm, void *param)
{
	virtio_net_handle_callback(kvm, (u64)(long)param);
}

static struct ioport_operations virtio_net_io_ops = {
	.io_in	= virtio_net_pci_io_in,
	.io_out	= virtio_net_pci_io_out,
};

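/*
 * MMIO handler for the MSI-X table BAR. Accesses are mirrored into
 * pci_header.msix.table; note that any access, read or write, is taken
 * as a sign that the guest is using MSI-X.
 */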
static void callback_mmio(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
{
	void *table = pci_header.msix.table;
	if (is_write)
		memcpy(table + addr - ndev.msix_io_block, data, len);
	else
		memcpy(data, table + addr - ndev.msix_io_block, len);

	ndev.msix_enabled = 1;
}

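/*
 * TAP backend setup: open /dev/net/tun, request a vnet-header tap
 * interface, configure header size and offloads, then either run the
 * user-supplied script on the new interface or assign host_ip to it
 * directly, and finally bring the link up.
 */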
static bool virtio_net__tap_init(const struct virtio_net_parameters *params)
{
	int sock = socket(AF_INET, SOCK_STREAM, 0);
	int pid, status, offload, hdr_len;
	struct sockaddr_in sin = {0};
	struct ifreq ifr;

	ndev.tap_fd = open("/dev/net/tun", O_RDWR);
	if (ndev.tap_fd < 0) {
		pr_warning("Unable to open /dev/net/tun");
		goto fail;
	}

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
	if (ioctl(ndev.tap_fd, TUNSETIFF, &ifr) < 0) {
		pr_warning("Failed to configure tap device. Are you root?");
		goto fail;
	}

	strncpy(ndev.tap_name, ifr.ifr_name, sizeof(ndev.tap_name));

	if (ioctl(ndev.tap_fd, TUNSETNOCSUM, 1) < 0) {
		pr_warning("Failed to set TUNSETNOCSUM on tap device");
		goto fail;
	}

	hdr_len = sizeof(struct virtio_net_hdr);
	if (ioctl(ndev.tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0) {
		pr_warning("Failed to set TUNSETVNETHDRSZ on tap device");
	}

	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
	if (ioctl(ndev.tap_fd, TUNSETOFFLOAD, offload) < 0) {
		pr_warning("Failed to set TUNSETOFFLOAD on tap device");
		goto fail;
	}

	if (strcmp(params->script, "none")) {
		pid = fork();
		if (pid == 0) {
			execl(params->script, params->script, ndev.tap_name, NULL);
			_exit(1);
		} else {
			waitpid(pid, &status, 0);
			if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
				pr_warning("Failed to set up tap device with %s", params->script);
				goto fail;
			}
		}
	} else {
		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ndev.tap_name, sizeof(ifr.ifr_name));
		sin.sin_addr.s_addr = inet_addr(params->host_ip);
		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
		ifr.ifr_addr.sa_family = AF_INET;
		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
			pr_warning("Could not set IP address on tap device");
			goto fail;
		}
	}

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ndev.tap_name, sizeof(ifr.ifr_name));
	ioctl(sock, SIOCGIFFLAGS, &ifr);
	ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
	if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
		pr_warning("Could not bring tap device up");

	close(sock);

	return true;

fail:
	if (sock >= 0)
		close(sock);
	if (ndev.tap_fd >= 0)
		close(ndev.tap_fd);

	return false;
}

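/*
 * Set up the RX and TX condition variables and their locks, then start
 * the two worker threads.
 */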
static void virtio_net__io_thread_init(struct kvm *kvm)
{
	pthread_mutex_init(&ndev.io_rx_lock, NULL);
	pthread_cond_init(&ndev.io_rx_cond, NULL);

	pthread_mutex_init(&ndev.io_tx_lock, NULL);
	pthread_cond_init(&ndev.io_tx_cond, NULL);

	pthread_create(&ndev.io_rx_thread, NULL, virtio_net_rx_thread, (void *)kvm);
	pthread_create(&ndev.io_tx_thread, NULL, virtio_net_tx_thread, (void *)kvm);
}

static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
{
	return writev(ndev->tap_fd, iov, out);
}

static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
{
	return readv(ndev->tap_fd, iov, in);
}

static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
{
	return uip_tx(iov, out, &ndev->info);
}

static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
{
	return uip_rx(iov, in, &ndev->info);
}

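/*
 * Two interchangeable backends: "tap" forwards frames to a kernel tap
 * device with readv()/writev(), while "uip" hands them to the built-in
 * user-mode TCP/IP stack.
 */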
static struct net_dev_operations tap_ops = {
	.rx	= tap_ops_rx,
	.tx	= tap_ops_tx,
};

static struct net_dev_operations uip_ops = {
	.rx	= uip_ops_rx,
	.tx	= uip_ops_tx,
};

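/*
 * Device bring-up: allocate an IRQ and an I/O port range, register the
 * PCI device, copy the MAC/IP parameters, pick the tap or uip backend,
 * map the MSI-X table BAR, start the I/O threads and attach one
 * ioeventfd per queue so QUEUE_NOTIFY writes can be handled via eventfd
 * signalling instead of a synchronous port I/O exit.
 */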
void virtio_net__init(const struct virtio_net_parameters *params)
{
	struct ioevent ioevent;
	u8 dev, line, pin;
	u16 net_base_addr;
	int i;

	if (irq__register_device(VIRTIO_ID_NET, &dev, &pin, &line) < 0)
		return;

	pci_header.irq_pin  = pin;
	pci_header.irq_line = line;
	net_base_addr	    = ioport__register(IOPORT_EMPTY, &virtio_net_io_ops, IOPORT_SIZE, NULL);
	pci_header.bar[0]   = net_base_addr | PCI_BASE_ADDRESS_SPACE_IO;
	ndev.base_addr	    = net_base_addr;
	pci__register(&pci_header, dev);

	for (i = 0; i < 6; i++) {
		ndev.config.mac[i]		= params->guest_mac[i];
		ndev.info.guest_mac.addr[i]	= params->guest_mac[i];
		ndev.info.host_mac.addr[i]	= params->host_mac[i];
	}

	ndev.mode = params->mode;
	if (ndev.mode == NET_MODE_TAP) {
		virtio_net__tap_init(params);
		ndev.ops = &tap_ops;
	} else {
		ndev.info.host_ip		= ntohl(inet_addr(params->host_ip));
		ndev.info.guest_ip		= ntohl(inet_addr(params->guest_ip));
		ndev.info.guest_netmask		= ntohl(inet_addr("255.255.255.0"));
		uip_init(&ndev.info);
		ndev.ops = &uip_ops;
	}

	ndev.msix_io_block = pci_get_io_space_block();
	kvm__register_mmio(params->kvm, ndev.msix_io_block, 0x100, callback_mmio, NULL);
	pci_header.bar[1]	= ndev.msix_io_block |
				PCI_BASE_ADDRESS_SPACE_MEMORY |
				PCI_BASE_ADDRESS_MEM_TYPE_64;
	/* bar[2] is the continuation of bar[1] for 64bit addressing */
	pci_header.bar[2]	= 0;
	pci_header.status	= PCI_STATUS_CAP_LIST;
	pci_header.capabilities	= (void *)&pci_header.msix - (void *)&pci_header;

	pci_header.msix.cap = PCI_CAP_ID_MSIX;
	pci_header.msix.next = 0;
	pci_header.msix.table_size = (VIRTIO_NET_NUM_QUEUES + 1) | PCI_MSIX_FLAGS_ENABLE;
	pci_header.msix.table_offset = 1; /* Use BAR 1 */

	virtio_net__io_thread_init(params->kvm);

	for (i = 0; i < VIRTIO_NET_NUM_QUEUES; i++) {
		ioevent = (struct ioevent) {
			.io_addr	= net_base_addr + VIRTIO_PCI_QUEUE_NOTIFY,
			.io_len		= sizeof(u16),
			.fn		= ioevent_callback,
			.datamatch	= i,
			.fn_ptr		= (void *)(long)i,
			.fn_kvm		= params->kvm,
			.fd		= eventfd(0, 0),
		};

		ioeventfd__add_event(&ioevent);
	}

	ndev.compat_id = compat__add_message("virtio-net device was not detected",
						"While you have requested a virtio-net device, "
						"the guest kernel didn't seem to detect it.\n"
						"Please make sure that the kernel was compiled "
						"with CONFIG_VIRTIO_NET.");
}
570