1 #include "kvm/virtio-pci-dev.h" 2 #include "kvm/virtio-net.h" 3 #include "kvm/virtio.h" 4 #include "kvm/ioport.h" 5 #include "kvm/types.h" 6 #include "kvm/mutex.h" 7 #include "kvm/util.h" 8 #include "kvm/kvm.h" 9 #include "kvm/pci.h" 10 #include "kvm/irq.h" 11 #include "kvm/uip.h" 12 #include "kvm/ioeventfd.h" 13 14 #include <linux/virtio_net.h> 15 #include <linux/if_tun.h> 16 17 #include <arpa/inet.h> 18 #include <net/if.h> 19 20 #include <unistd.h> 21 #include <assert.h> 22 #include <fcntl.h> 23 24 #include <sys/socket.h> 25 #include <sys/ioctl.h> 26 #include <sys/types.h> 27 #include <sys/wait.h> 28 29 #define VIRTIO_NET_QUEUE_SIZE 128 30 #define VIRTIO_NET_NUM_QUEUES 2 31 #define VIRTIO_NET_RX_QUEUE 0 32 #define VIRTIO_NET_TX_QUEUE 1 33 34 static struct pci_device_header pci_header = { 35 .vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET, 36 .device_id = PCI_DEVICE_ID_VIRTIO_NET, 37 .header_type = PCI_HEADER_TYPE_NORMAL, 38 .revision_id = 0, 39 .class = 0x020000, 40 .subsys_vendor_id = PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET, 41 .subsys_id = VIRTIO_ID_NET, 42 }; 43 44 struct net_dev; 45 46 struct net_dev_operations { 47 int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev); 48 int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev); 49 }; 50 51 struct net_dev { 52 pthread_mutex_t mutex; 53 54 struct virt_queue vqs[VIRTIO_NET_NUM_QUEUES]; 55 struct virtio_net_config config; 56 u32 host_features; 57 u32 guest_features; 58 u16 config_vector; 59 u8 status; 60 u8 isr; 61 u16 queue_selector; 62 u16 base_addr; 63 u32 vq_vector[VIRTIO_NET_NUM_QUEUES]; 64 u32 gsis[VIRTIO_NET_NUM_QUEUES]; 65 u32 msix_io_block; 66 67 pthread_t io_rx_thread; 68 pthread_mutex_t io_rx_lock; 69 pthread_cond_t io_rx_cond; 70 71 pthread_t io_tx_thread; 72 pthread_mutex_t io_tx_lock; 73 pthread_cond_t io_tx_cond; 74 75 int tap_fd; 76 char tap_name[IFNAMSIZ]; 77 78 int mode; 79 80 struct uip_info info; 81 struct net_dev_operations *ops; 82 }; 83 84 static struct net_dev ndev = { 85 .mutex = PTHREAD_MUTEX_INITIALIZER, 86 87 .config = { 88 .status = VIRTIO_NET_S_LINK_UP, 89 }, 90 .host_features = 1UL << VIRTIO_NET_F_MAC 91 | 1UL << VIRTIO_NET_F_CSUM 92 | 1UL << VIRTIO_NET_F_HOST_UFO 93 | 1UL << VIRTIO_NET_F_HOST_TSO4 94 | 1UL << VIRTIO_NET_F_HOST_TSO6 95 | 1UL << VIRTIO_NET_F_GUEST_UFO 96 | 1UL << VIRTIO_NET_F_GUEST_TSO4 97 | 1UL << VIRTIO_NET_F_GUEST_TSO6, 98 .info = { 99 .buf_nr = 20, 100 } 101 }; 102 103 static void *virtio_net_rx_thread(void *p) 104 { 105 struct iovec iov[VIRTIO_NET_QUEUE_SIZE]; 106 struct virt_queue *vq; 107 struct kvm *kvm; 108 u16 out, in; 109 u16 head; 110 int len; 111 112 kvm = p; 113 vq = &ndev.vqs[VIRTIO_NET_RX_QUEUE]; 114 115 while (1) { 116 117 mutex_lock(&ndev.io_rx_lock); 118 if (!virt_queue__available(vq)) 119 pthread_cond_wait(&ndev.io_rx_cond, &ndev.io_rx_lock); 120 mutex_unlock(&ndev.io_rx_lock); 121 122 while (virt_queue__available(vq)) { 123 124 head = virt_queue__get_iov(vq, iov, &out, &in, kvm); 125 126 len = ndev.ops->rx(iov, in, &ndev); 127 128 virt_queue__set_used_elem(vq, head, len); 129 130 /* We should interrupt guest right now, otherwise latency is huge. */ 131 kvm__irq_trigger(kvm, ndev.gsis[VIRTIO_NET_RX_QUEUE]); 132 } 133 134 } 135 136 pthread_exit(NULL); 137 return NULL; 138 139 } 140 141 static void *virtio_net_tx_thread(void *p) 142 { 143 struct iovec iov[VIRTIO_NET_QUEUE_SIZE]; 144 struct virt_queue *vq; 145 struct kvm *kvm; 146 u16 out, in; 147 u16 head; 148 int len; 149 150 kvm = p; 151 vq = &ndev.vqs[VIRTIO_NET_TX_QUEUE]; 152 153 while (1) { 154 mutex_lock(&ndev.io_tx_lock); 155 if (!virt_queue__available(vq)) 156 pthread_cond_wait(&ndev.io_tx_cond, &ndev.io_tx_lock); 157 mutex_unlock(&ndev.io_tx_lock); 158 159 while (virt_queue__available(vq)) { 160 161 head = virt_queue__get_iov(vq, iov, &out, &in, kvm); 162 163 len = ndev.ops->tx(iov, out, &ndev); 164 165 virt_queue__set_used_elem(vq, head, len); 166 } 167 168 kvm__irq_trigger(kvm, ndev.gsis[VIRTIO_NET_TX_QUEUE]); 169 } 170 171 pthread_exit(NULL); 172 173 return NULL; 174 175 } 176 177 static bool virtio_net_pci_io_device_specific_in(void *data, unsigned long offset, int size, u32 count) 178 { 179 u8 *config_space = (u8 *)&ndev.config; 180 181 if (size != 1 || count != 1) 182 return false; 183 184 if ((offset - VIRTIO_MSI_CONFIG_VECTOR) > sizeof(struct virtio_net_config)) 185 pr_error("config offset is too big: %li", offset - VIRTIO_MSI_CONFIG_VECTOR); 186 187 ioport__write8(data, config_space[offset - VIRTIO_MSI_CONFIG_VECTOR]); 188 189 return true; 190 } 191 192 static bool virtio_net_pci_io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size, u32 count) 193 { 194 unsigned long offset = port - ndev.base_addr; 195 bool ret = true; 196 197 mutex_lock(&ndev.mutex); 198 199 switch (offset) { 200 case VIRTIO_PCI_HOST_FEATURES: 201 ioport__write32(data, ndev.host_features); 202 break; 203 case VIRTIO_PCI_GUEST_FEATURES: 204 ret = false; 205 break; 206 case VIRTIO_PCI_QUEUE_PFN: 207 ioport__write32(data, ndev.vqs[ndev.queue_selector].pfn); 208 break; 209 case VIRTIO_PCI_QUEUE_NUM: 210 ioport__write16(data, VIRTIO_NET_QUEUE_SIZE); 211 break; 212 case VIRTIO_PCI_QUEUE_SEL: 213 case VIRTIO_PCI_QUEUE_NOTIFY: 214 ret = false; 215 break; 216 case VIRTIO_PCI_STATUS: 217 ioport__write8(data, ndev.status); 218 break; 219 case VIRTIO_PCI_ISR: 220 ioport__write8(data, ndev.isr); 221 kvm__irq_line(kvm, pci_header.irq_line, VIRTIO_IRQ_LOW); 222 ndev.isr = VIRTIO_IRQ_LOW; 223 break; 224 case VIRTIO_MSI_CONFIG_VECTOR: 225 ioport__write16(data, ndev.config_vector); 226 break; 227 case VIRTIO_MSI_QUEUE_VECTOR: 228 ioport__write16(data, ndev.vq_vector[ndev.queue_selector]); 229 break; 230 default: 231 ret = virtio_net_pci_io_device_specific_in(data, offset, size, count); 232 }; 233 234 mutex_unlock(&ndev.mutex); 235 236 return ret; 237 } 238 239 static void virtio_net_handle_callback(struct kvm *kvm, u16 queue_index) 240 { 241 switch (queue_index) { 242 case VIRTIO_NET_TX_QUEUE: 243 mutex_lock(&ndev.io_tx_lock); 244 pthread_cond_signal(&ndev.io_tx_cond); 245 mutex_unlock(&ndev.io_tx_lock); 246 break; 247 case VIRTIO_NET_RX_QUEUE: 248 mutex_lock(&ndev.io_rx_lock); 249 pthread_cond_signal(&ndev.io_rx_cond); 250 mutex_unlock(&ndev.io_rx_lock); 251 break; 252 default: 253 pr_warning("Unknown queue index %u", queue_index); 254 } 255 } 256 257 static bool virtio_net_pci_io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size, u32 count) 258 { 259 unsigned long offset = port - ndev.base_addr; 260 bool ret = true; 261 262 mutex_lock(&ndev.mutex); 263 264 switch (offset) { 265 case VIRTIO_PCI_GUEST_FEATURES: 266 ndev.guest_features = ioport__read32(data); 267 break; 268 case VIRTIO_PCI_QUEUE_PFN: { 269 struct virt_queue *queue; 270 void *p; 271 272 assert(ndev.queue_selector < VIRTIO_NET_NUM_QUEUES); 273 274 queue = &ndev.vqs[ndev.queue_selector]; 275 queue->pfn = ioport__read32(data); 276 p = guest_pfn_to_host(kvm, queue->pfn); 277 278 vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, VIRTIO_PCI_VRING_ALIGN); 279 280 break; 281 } 282 case VIRTIO_PCI_QUEUE_SEL: 283 ndev.queue_selector = ioport__read16(data); 284 break; 285 case VIRTIO_PCI_QUEUE_NOTIFY: { 286 u16 queue_index; 287 288 queue_index = ioport__read16(data); 289 virtio_net_handle_callback(kvm, queue_index); 290 break; 291 } 292 case VIRTIO_PCI_STATUS: 293 ndev.status = ioport__read8(data); 294 break; 295 case VIRTIO_MSI_CONFIG_VECTOR: 296 ndev.config_vector = ioport__read16(data); 297 break; 298 case VIRTIO_MSI_QUEUE_VECTOR: { 299 u32 gsi; 300 u32 vec; 301 302 vec = ndev.vq_vector[ndev.queue_selector] = ioport__read16(data); 303 304 gsi = irq__add_msix_route(kvm, 305 pci_header.msix.table[vec].low, 306 pci_header.msix.table[vec].high, 307 pci_header.msix.table[vec].data); 308 309 ndev.gsis[ndev.queue_selector] = gsi; 310 break; 311 } 312 default: 313 ret = false; 314 }; 315 316 mutex_unlock(&ndev.mutex); 317 318 return ret; 319 } 320 321 static void ioevent_callback(struct kvm *kvm, void *param) 322 { 323 virtio_net_handle_callback(kvm, (u64)(long)param); 324 } 325 326 static struct ioport_operations virtio_net_io_ops = { 327 .io_in = virtio_net_pci_io_in, 328 .io_out = virtio_net_pci_io_out, 329 }; 330 331 static void callback_mmio(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr) 332 { 333 void *table = pci_header.msix.table; 334 if (is_write) 335 memcpy(table + addr - ndev.msix_io_block, data, len); 336 else 337 memcpy(data, table + addr - ndev.msix_io_block, len); 338 } 339 340 static bool virtio_net__tap_init(const struct virtio_net_parameters *params) 341 { 342 int sock = socket(AF_INET, SOCK_STREAM, 0); 343 int pid, status, offload, hdr_len; 344 struct sockaddr_in sin = {0}; 345 struct ifreq ifr; 346 347 ndev.tap_fd = open("/dev/net/tun", O_RDWR); 348 if (ndev.tap_fd < 0) { 349 pr_warning("Unable to open /dev/net/tun"); 350 goto fail; 351 } 352 353 memset(&ifr, 0, sizeof(ifr)); 354 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 355 if (ioctl(ndev.tap_fd, TUNSETIFF, &ifr) < 0) { 356 pr_warning("Config tap device error. Are you root?"); 357 goto fail; 358 } 359 360 strncpy(ndev.tap_name, ifr.ifr_name, sizeof(ndev.tap_name)); 361 362 if (ioctl(ndev.tap_fd, TUNSETNOCSUM, 1) < 0) { 363 pr_warning("Config tap device TUNSETNOCSUM error"); 364 goto fail; 365 } 366 367 hdr_len = sizeof(struct virtio_net_hdr); 368 if (ioctl(ndev.tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0) { 369 pr_warning("Config tap device TUNSETVNETHDRSZ error"); 370 } 371 372 offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO; 373 if (ioctl(ndev.tap_fd, TUNSETOFFLOAD, offload) < 0) { 374 pr_warning("Config tap device TUNSETOFFLOAD error"); 375 goto fail; 376 } 377 378 if (strcmp(params->script, "none")) { 379 pid = fork(); 380 if (pid == 0) { 381 execl(params->script, params->script, ndev.tap_name, NULL); 382 _exit(1); 383 } else { 384 waitpid(pid, &status, 0); 385 if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { 386 pr_warning("Fail to setup tap by %s", params->script); 387 goto fail; 388 } 389 } 390 } else { 391 memset(&ifr, 0, sizeof(ifr)); 392 strncpy(ifr.ifr_name, ndev.tap_name, sizeof(ndev.tap_name)); 393 sin.sin_addr.s_addr = inet_addr(params->host_ip); 394 memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr)); 395 ifr.ifr_addr.sa_family = AF_INET; 396 if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) { 397 pr_warning("Could not set ip address on tap device"); 398 goto fail; 399 } 400 } 401 402 memset(&ifr, 0, sizeof(ifr)); 403 strncpy(ifr.ifr_name, ndev.tap_name, sizeof(ndev.tap_name)); 404 ioctl(sock, SIOCGIFFLAGS, &ifr); 405 ifr.ifr_flags |= IFF_UP | IFF_RUNNING; 406 if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) 407 pr_warning("Could not bring tap device up"); 408 409 close(sock); 410 411 return 1; 412 413 fail: 414 if (sock >= 0) 415 close(sock); 416 if (ndev.tap_fd >= 0) 417 close(ndev.tap_fd); 418 419 return 0; 420 } 421 422 static void virtio_net__io_thread_init(struct kvm *kvm) 423 { 424 pthread_mutex_init(&ndev.io_rx_lock, NULL); 425 pthread_cond_init(&ndev.io_tx_cond, NULL); 426 427 pthread_mutex_init(&ndev.io_rx_lock, NULL); 428 pthread_cond_init(&ndev.io_tx_cond, NULL); 429 430 pthread_create(&ndev.io_rx_thread, NULL, virtio_net_rx_thread, (void *)kvm); 431 pthread_create(&ndev.io_tx_thread, NULL, virtio_net_tx_thread, (void *)kvm); 432 } 433 434 static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev) 435 { 436 return writev(ndev->tap_fd, iov, out); 437 } 438 439 static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev) 440 { 441 return readv(ndev->tap_fd, iov, in); 442 } 443 444 static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev) 445 { 446 return uip_tx(iov, out, &ndev->info); 447 } 448 449 static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev) 450 { 451 return uip_rx(iov, in, &ndev->info); 452 } 453 454 static struct net_dev_operations tap_ops = { 455 .rx = tap_ops_rx, 456 .tx = tap_ops_tx, 457 }; 458 459 static struct net_dev_operations uip_ops = { 460 .rx = uip_ops_rx, 461 .tx = uip_ops_tx, 462 }; 463 464 void virtio_net__init(const struct virtio_net_parameters *params) 465 { 466 struct ioevent ioevent; 467 u8 dev, line, pin; 468 u16 net_base_addr; 469 int i; 470 471 if (irq__register_device(VIRTIO_ID_NET, &dev, &pin, &line) < 0) 472 return; 473 474 pci_header.irq_pin = pin; 475 pci_header.irq_line = line; 476 net_base_addr = ioport__register(IOPORT_EMPTY, &virtio_net_io_ops, IOPORT_SIZE, NULL); 477 pci_header.bar[0] = net_base_addr | PCI_BASE_ADDRESS_SPACE_IO; 478 ndev.base_addr = net_base_addr; 479 pci__register(&pci_header, dev); 480 481 for (i = 0 ; i < 6 ; i++) { 482 ndev.config.mac[i] = params->guest_mac[i]; 483 ndev.info.guest_mac.addr[i] = params->guest_mac[i]; 484 ndev.info.host_mac.addr[i] = params->host_mac[i]; 485 } 486 487 ndev.mode = params->mode; 488 if (ndev.mode == NET_MODE_TAP) { 489 virtio_net__tap_init(params); 490 ndev.ops = &tap_ops; 491 } else { 492 ndev.info.host_ip = ntohl(inet_addr(params->host_ip)); 493 ndev.info.guest_ip = ntohl(inet_addr(params->guest_ip)); 494 ndev.info.guest_netmask = ntohl(inet_addr("255.255.255.0")); 495 uip_init(&ndev.info); 496 ndev.ops = &uip_ops; 497 } 498 499 ndev.msix_io_block = pci_get_io_space_block(); 500 kvm__register_mmio(params->kvm, ndev.msix_io_block, 0x100, callback_mmio, NULL); 501 pci_header.bar[1] = ndev.msix_io_block | 502 PCI_BASE_ADDRESS_SPACE_MEMORY | 503 PCI_BASE_ADDRESS_MEM_TYPE_64; 504 /* bar[2] is the continuation of bar[1] for 64bit addressing */ 505 pci_header.bar[2] = 0; 506 pci_header.status = PCI_STATUS_CAP_LIST; 507 pci_header.capabilities = (void *)&pci_header.msix - (void *)&pci_header; 508 509 pci_header.msix.cap = PCI_CAP_ID_MSIX; 510 pci_header.msix.next = 0; 511 pci_header.msix.table_size = (VIRTIO_NET_NUM_QUEUES + 1) | PCI_MSIX_FLAGS_ENABLE; 512 pci_header.msix.table_offset = 1; /* Use BAR 1 */ 513 514 virtio_net__io_thread_init(params->kvm); 515 516 for (i = 0; i < VIRTIO_NET_NUM_QUEUES; i++) { 517 ioevent = (struct ioevent) { 518 .io_addr = net_base_addr + VIRTIO_PCI_QUEUE_NOTIFY, 519 .io_len = sizeof(u16), 520 .fn = ioevent_callback, 521 .datamatch = i, 522 .fn_ptr = (void *)(long)i, 523 .fn_kvm = params->kvm, 524 .fd = eventfd(0, 0), 525 }; 526 527 ioeventfd__add_event(&ioevent); 528 } 529 } 530