/* kvmtool: virtio/net.c (revision cb83de6f9db657b4414820b24e1e83794fd9b649) */
1 #include "kvm/virtio-pci-dev.h"
2 #include "kvm/virtio-net.h"
3 #include "kvm/virtio.h"
4 #include "kvm/ioport.h"
5 #include "kvm/types.h"
6 #include "kvm/mutex.h"
7 #include "kvm/util.h"
8 #include "kvm/kvm.h"
9 #include "kvm/pci.h"
10 #include "kvm/irq.h"
11 #include "kvm/uip.h"
12 #include "kvm/ioeventfd.h"
13 #include "kvm/guest_compat.h"
14 
15 #include <linux/virtio_net.h>
16 #include <linux/if_tun.h>
17 
18 #include <arpa/inet.h>
19 #include <net/if.h>
20 
21 #include <unistd.h>
22 #include <assert.h>
23 #include <fcntl.h>
24 
25 #include <sys/socket.h>
26 #include <sys/ioctl.h>
27 #include <sys/types.h>
28 #include <sys/wait.h>
29 
30 #define VIRTIO_NET_QUEUE_SIZE		128
31 #define VIRTIO_NET_NUM_QUEUES		2
32 #define VIRTIO_NET_RX_QUEUE		0
33 #define VIRTIO_NET_TX_QUEUE		1
34 
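/*
 * PCI identity of the emulated NIC: Red Hat/Qumranet vendor with the
 * virtio-net device ID and the Ethernet-controller class code.
 */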
static struct pci_device_header pci_header = {
	.vendor_id			= PCI_VENDOR_ID_REDHAT_QUMRANET,
	.device_id			= PCI_DEVICE_ID_VIRTIO_NET,
	.header_type			= PCI_HEADER_TYPE_NORMAL,
	.revision_id			= 0,
	.class				= 0x020000,
	.subsys_vendor_id		= PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET,
	.subsys_id			= VIRTIO_ID_NET,
};

struct net_dev;

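/*
 * Backend hooks: rx() and tx() are implemented either by the host tap
 * device or by the built-in user-mode uip stack.
 */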
struct net_dev_operations {
	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
	int (*tx)(struct iovec *iov, u16 out, struct net_dev *ndev);
};

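/*
 * State of the (single) virtio-net device: virtqueues, feature bits,
 * MSI-X routing, the backing tap device and the RX/TX I/O threads.
 */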
struct net_dev {
	pthread_mutex_t			mutex;

	struct virt_queue		vqs[VIRTIO_NET_NUM_QUEUES];
	struct virtio_net_config	config;
	u32				host_features;
	u32				guest_features;
	u16				config_vector;
	u8				status;
	u8				isr;
	u16				queue_selector;
	u16				base_addr;
	u32				vq_vector[VIRTIO_NET_NUM_QUEUES];
	u32				gsis[VIRTIO_NET_NUM_QUEUES];
	u32				msix_io_block;
	int				compat_id;

	pthread_t			io_rx_thread;
	pthread_mutex_t			io_rx_lock;
	pthread_cond_t			io_rx_cond;

	pthread_t			io_tx_thread;
	pthread_mutex_t			io_tx_lock;
	pthread_cond_t			io_tx_cond;

	int				tap_fd;
	char				tap_name[IFNAMSIZ];

	int				mode;

	struct uip_info			info;
	struct net_dev_operations	*ops;
};

static struct net_dev ndev = {
	.mutex	= PTHREAD_MUTEX_INITIALIZER,

	.config = {
		.status			= VIRTIO_NET_S_LINK_UP,
	},
	.host_features			= 1UL << VIRTIO_NET_F_MAC
					| 1UL << VIRTIO_NET_F_CSUM
					| 1UL << VIRTIO_NET_F_HOST_UFO
					| 1UL << VIRTIO_NET_F_HOST_TSO4
					| 1UL << VIRTIO_NET_F_HOST_TSO6
					| 1UL << VIRTIO_NET_F_GUEST_UFO
					| 1UL << VIRTIO_NET_F_GUEST_TSO4
					| 1UL << VIRTIO_NET_F_GUEST_TSO6,
	.info = {
		.buf_nr			= 20,
	}
};

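/*
 * RX thread: sleeps until the guest posts receive buffers, then pulls
 * each descriptor chain, fills it via the backend's rx() and interrupts
 * the guest per buffer to keep receive latency low.
 */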
static void *virtio_net_rx_thread(void *p)
{
	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
	struct virt_queue *vq;
	struct kvm *kvm;
	u16 out, in;
	u16 head;
	int len;

	kvm	= p;
	vq	= &ndev.vqs[VIRTIO_NET_RX_QUEUE];

	while (1) {
		mutex_lock(&ndev.io_rx_lock);
		if (!virt_queue__available(vq))
			pthread_cond_wait(&ndev.io_rx_cond, &ndev.io_rx_lock);
		mutex_unlock(&ndev.io_rx_lock);

		while (virt_queue__available(vq)) {
			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);

			len = ndev.ops->rx(iov, in, &ndev);

			virt_queue__set_used_elem(vq, head, len);

			/*
			 * Interrupt the guest for every received buffer;
			 * batching the notification here would add serious
			 * receive latency.
			 */
			kvm__irq_trigger(kvm, ndev.gsis[VIRTIO_NET_RX_QUEUE]);
		}
	}

	pthread_exit(NULL);
	return NULL;
}

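/*
 * TX thread: mirrors the RX thread, but signals the guest only once per
 * batch of transmitted buffers.
 */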
static void *virtio_net_tx_thread(void *p)
{
	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
	struct virt_queue *vq;
	struct kvm *kvm;
	u16 out, in;
	u16 head;
	int len;

	kvm	= p;
	vq	= &ndev.vqs[VIRTIO_NET_TX_QUEUE];

	while (1) {
		mutex_lock(&ndev.io_tx_lock);
		if (!virt_queue__available(vq))
			pthread_cond_wait(&ndev.io_tx_cond, &ndev.io_tx_lock);
		mutex_unlock(&ndev.io_tx_lock);

		while (virt_queue__available(vq)) {
			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);

			len = ndev.ops->tx(iov, out, &ndev);

			virt_queue__set_used_elem(vq, head, len);
		}

		kvm__irq_trigger(kvm, ndev.gsis[VIRTIO_NET_TX_QUEUE]);
	}

	pthread_exit(NULL);
	return NULL;
}

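/*
 * Reads beyond the common virtio PCI registers fall into the
 * device-specific config space (struct virtio_net_config), which the
 * guest accesses one byte at a time.
 */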
static bool virtio_net_pci_io_device_specific_in(void *data, unsigned long offset, int size)
{
	u8 *config_space = (u8 *)&ndev.config;

	if (size != 1)
		return false;

	if ((offset - VIRTIO_MSI_CONFIG_VECTOR) >= sizeof(struct virtio_net_config))
		pr_error("config offset is too big: %lu", offset - VIRTIO_MSI_CONFIG_VECTOR);

	ioport__write8(data, config_space[offset - VIRTIO_MSI_CONFIG_VECTOR]);

	return true;
}

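/*
 * Handle guest port reads from the device's I/O window (legacy
 * virtio-pci register layout).
 */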
static bool virtio_net_pci_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
{
	unsigned long	offset	= port - ndev.base_addr;
	bool		ret	= true;

	mutex_lock(&ndev.mutex);

	switch (offset) {
	case VIRTIO_PCI_HOST_FEATURES:
		ioport__write32(data, ndev.host_features);
		break;
	case VIRTIO_PCI_GUEST_FEATURES:
		ret = false;
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		ioport__write32(data, ndev.vqs[ndev.queue_selector].pfn);
		break;
	case VIRTIO_PCI_QUEUE_NUM:
		ioport__write16(data, VIRTIO_NET_QUEUE_SIZE);
		break;
	case VIRTIO_PCI_QUEUE_SEL:
	case VIRTIO_PCI_QUEUE_NOTIFY:
		ret = false;
		break;
	case VIRTIO_PCI_STATUS:
		ioport__write8(data, ndev.status);
		break;
	case VIRTIO_PCI_ISR:
		ioport__write8(data, ndev.isr);
		kvm__irq_line(kvm, pci_header.irq_line, VIRTIO_IRQ_LOW);
		ndev.isr = VIRTIO_IRQ_LOW;
		break;
	default:
		ret = virtio_net_pci_io_device_specific_in(data, offset, size);
	}

	mutex_unlock(&ndev.mutex);

	return ret;
}

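/* Wake the I/O thread that services the notified queue. */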
static void virtio_net_handle_callback(struct kvm *kvm, u16 queue_index)
{
	switch (queue_index) {
	case VIRTIO_NET_TX_QUEUE:
		mutex_lock(&ndev.io_tx_lock);
		pthread_cond_signal(&ndev.io_tx_cond);
		mutex_unlock(&ndev.io_tx_lock);
		break;
	case VIRTIO_NET_RX_QUEUE:
		mutex_lock(&ndev.io_rx_lock);
		pthread_cond_signal(&ndev.io_rx_cond);
		mutex_unlock(&ndev.io_rx_lock);
		break;
	default:
		pr_warning("Unknown queue index %u", queue_index);
	}
}

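/*
 * Handle guest port writes: feature negotiation, virtqueue setup, queue
 * notifications and MSI-X vector programming.
 */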
static bool virtio_net_pci_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
{
	unsigned long	offset		= port - ndev.base_addr;
	bool		ret		= true;

	mutex_lock(&ndev.mutex);

	switch (offset) {
	case VIRTIO_PCI_GUEST_FEATURES:
		ndev.guest_features	= ioport__read32(data);
		break;
	case VIRTIO_PCI_QUEUE_PFN: {
		struct virt_queue *queue;
		void *p;

		assert(ndev.queue_selector < VIRTIO_NET_NUM_QUEUES);

		compat__remove_message(ndev.compat_id);

		queue			= &ndev.vqs[ndev.queue_selector];
		queue->pfn		= ioport__read32(data);
		p			= guest_pfn_to_host(kvm, queue->pfn);

		vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN);

		break;
	}
	case VIRTIO_PCI_QUEUE_SEL:
		ndev.queue_selector	= ioport__read16(data);
		break;
	case VIRTIO_PCI_QUEUE_NOTIFY: {
		u16 queue_index;

		queue_index		= ioport__read16(data);
		virtio_net_handle_callback(kvm, queue_index);
		break;
	}
	case VIRTIO_PCI_STATUS:
		ndev.status		= ioport__read8(data);
		break;
	case VIRTIO_MSI_CONFIG_VECTOR:
		ndev.config_vector	= ioport__read16(data);
		break;
	case VIRTIO_MSI_QUEUE_VECTOR: {
		u32 gsi;
		u32 vec;

		vec = ndev.vq_vector[ndev.queue_selector] = ioport__read16(data);

		gsi = irq__add_msix_route(kvm,
					  pci_header.msix.table[vec].low,
					  pci_header.msix.table[vec].high,
					  pci_header.msix.table[vec].data);

		ndev.gsis[ndev.queue_selector] = gsi;
		break;
	}
	default:
		ret			= false;
	}

	mutex_unlock(&ndev.mutex);

	return ret;
}

static void ioevent_callback(struct kvm *kvm, void *param)
{
	virtio_net_handle_callback(kvm, (u64)(long)param);
}

static struct ioport_operations virtio_net_io_ops = {
	.io_in	= virtio_net_pci_io_in,
	.io_out	= virtio_net_pci_io_out,
};

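/*
 * MMIO handler for the BAR backing the MSI-X table: guest accesses are
 * copied to and from the table kept in pci_header.
 */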
static void callback_mmio(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
{
	void *table = pci_header.msix.table;

	if (is_write)
		memcpy(table + addr - ndev.msix_io_block, data, len);
	else
		memcpy(data, table + addr - ndev.msix_io_block, len);
}

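/*
 * Open and configure the host tap device: request a vnet header, enable
 * checksum/TSO/UFO offloads, then either run the user-supplied setup
 * script or assign the host IP and bring the interface up directly.
 */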
static bool virtio_net__tap_init(const struct virtio_net_parameters *params)
{
	int sock = socket(AF_INET, SOCK_STREAM, 0);
	int pid, status, offload, hdr_len;
	struct sockaddr_in sin = {0};
	struct ifreq ifr;

	ndev.tap_fd = open("/dev/net/tun", O_RDWR);
	if (ndev.tap_fd < 0) {
		pr_warning("Unable to open /dev/net/tun");
		goto fail;
	}

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
	if (ioctl(ndev.tap_fd, TUNSETIFF, &ifr) < 0) {
		pr_warning("Failed to configure tap device (TUNSETIFF). Are you root?");
		goto fail;
	}

	strncpy(ndev.tap_name, ifr.ifr_name, sizeof(ndev.tap_name));

	if (ioctl(ndev.tap_fd, TUNSETNOCSUM, 1) < 0) {
		pr_warning("Failed to configure tap device (TUNSETNOCSUM)");
		goto fail;
	}

	hdr_len = sizeof(struct virtio_net_hdr);
	if (ioctl(ndev.tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
		pr_warning("Failed to configure tap device (TUNSETVNETHDRSZ)");

	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
	if (ioctl(ndev.tap_fd, TUNSETOFFLOAD, offload) < 0) {
		pr_warning("Failed to configure tap device (TUNSETOFFLOAD)");
		goto fail;
	}

	if (strcmp(params->script, "none")) {
		pid = fork();
		if (pid == 0) {
			execl(params->script, params->script, ndev.tap_name, NULL);
			_exit(1);
		} else {
			waitpid(pid, &status, 0);
			if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
				pr_warning("Failed to set up tap device with %s", params->script);
				goto fail;
			}
		}
	} else {
		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ndev.tap_name, sizeof(ndev.tap_name));
		sin.sin_addr.s_addr = inet_addr(params->host_ip);
		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
		ifr.ifr_addr.sa_family = AF_INET;
		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
			pr_warning("Could not set ip address on tap device");
			goto fail;
		}
	}

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ndev.tap_name, sizeof(ndev.tap_name));
	ioctl(sock, SIOCGIFFLAGS, &ifr);
	ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
	if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
		pr_warning("Could not bring tap device up");

	close(sock);

	return true;

fail:
	if (sock >= 0)
		close(sock);
	if (ndev.tap_fd >= 0)
		close(ndev.tap_fd);

	return false;
}

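/* Set up the RX/TX locks and condition variables, then start both I/O threads. */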
static void virtio_net__io_thread_init(struct kvm *kvm)
{
	pthread_mutex_init(&ndev.io_rx_lock, NULL);
	pthread_cond_init(&ndev.io_rx_cond, NULL);

	pthread_mutex_init(&ndev.io_tx_lock, NULL);
	pthread_cond_init(&ndev.io_tx_cond, NULL);

	pthread_create(&ndev.io_rx_thread, NULL, virtio_net_rx_thread, (void *)kvm);
	pthread_create(&ndev.io_tx_thread, NULL, virtio_net_tx_thread, (void *)kvm);
}

static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
{
	return writev(ndev->tap_fd, iov, out);
}

static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
{
	return readv(ndev->tap_fd, iov, in);
}

static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
{
	return uip_tx(iov, out, &ndev->info);
}

static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
{
	return uip_rx(iov, in, &ndev->info);
}

static struct net_dev_operations tap_ops = {
	.rx	= tap_ops_rx,
	.tx	= tap_ops_tx,
};

static struct net_dev_operations uip_ops = {
	.rx	= uip_ops_rx,
	.tx	= uip_ops_tx,
};

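/*
 * Device bring-up: register the IRQ and I/O ports, expose the PCI header
 * with an MSI-X capability, pick a backend (tap or the uip user-mode
 * stack), start the I/O threads and install a notify ioeventfd for each
 * queue.
 */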
void virtio_net__init(const struct virtio_net_parameters *params)
{
	struct ioevent ioevent;
	u8 dev, line, pin;
	u16 net_base_addr;
	int i;

	if (irq__register_device(VIRTIO_ID_NET, &dev, &pin, &line) < 0)
		return;

	pci_header.irq_pin  = pin;
	pci_header.irq_line = line;
	net_base_addr	    = ioport__register(IOPORT_EMPTY, &virtio_net_io_ops, IOPORT_SIZE, NULL);
	pci_header.bar[0]   = net_base_addr | PCI_BASE_ADDRESS_SPACE_IO;
	ndev.base_addr	    = net_base_addr;
	pci__register(&pci_header, dev);

	for (i = 0; i < 6; i++) {
		ndev.config.mac[i]		= params->guest_mac[i];
		ndev.info.guest_mac.addr[i]	= params->guest_mac[i];
		ndev.info.host_mac.addr[i]	= params->host_mac[i];
	}

	ndev.mode = params->mode;
	if (ndev.mode == NET_MODE_TAP) {
		virtio_net__tap_init(params);
		ndev.ops = &tap_ops;
	} else {
		ndev.info.host_ip		= ntohl(inet_addr(params->host_ip));
		ndev.info.guest_ip		= ntohl(inet_addr(params->guest_ip));
		ndev.info.guest_netmask		= ntohl(inet_addr("255.255.255.0"));
		uip_init(&ndev.info);
		ndev.ops = &uip_ops;
	}

	ndev.msix_io_block = pci_get_io_space_block();
	kvm__register_mmio(params->kvm, ndev.msix_io_block, 0x100, callback_mmio, NULL);
	pci_header.bar[1]	= ndev.msix_io_block |
				PCI_BASE_ADDRESS_SPACE_MEMORY |
				PCI_BASE_ADDRESS_MEM_TYPE_64;
	/* bar[2] is the continuation of bar[1] for 64bit addressing */
	pci_header.bar[2]	= 0;
	pci_header.status	= PCI_STATUS_CAP_LIST;
	pci_header.capabilities	= (void *)&pci_header.msix - (void *)&pci_header;

	pci_header.msix.cap = PCI_CAP_ID_MSIX;
	pci_header.msix.next = 0;
	pci_header.msix.table_size = (VIRTIO_NET_NUM_QUEUES + 1) | PCI_MSIX_FLAGS_ENABLE;
	pci_header.msix.table_offset = 1; /* Use BAR 1 */

	virtio_net__io_thread_init(params->kvm);

	for (i = 0; i < VIRTIO_NET_NUM_QUEUES; i++) {
		ioevent = (struct ioevent) {
			.io_addr	= net_base_addr + VIRTIO_PCI_QUEUE_NOTIFY,
			.io_len		= sizeof(u16),
			.fn		= ioevent_callback,
			.datamatch	= i,
			.fn_ptr		= (void *)(long)i,
			.fn_kvm		= params->kvm,
			.fd		= eventfd(0, 0),
		};

		ioeventfd__add_event(&ioevent);
	}

	ndev.compat_id = compat__add_message("virtio-net device was not detected",
						"While you have requested a virtio-net device, "
						"the guest kernel didn't seem to detect it.\n"
						"Please make sure that the kernel was compiled "
						"with CONFIG_VIRTIO_NET.");
}
534