/*
 * RDMA protocol and interfaces
 *
 * Copyright IBM, Corp. 2010-2013
 * Copyright Red Hat, Inc. 2015-2016
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */
#include "rdma.h"
#include "migration-stats.h"
#include "qemu-file.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include <rdma/rdma_cma.h>
/* This is only for non-live state being migrated. */
/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes.
 */
#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))
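/*
 * Illustrative sketch, not part of the original file: given the bit
 * layout documented above, a 64-bit wr_id for an RDMA write can be
 * packed and unpacked as below. The example_* helpers are hypothetical;
 * the RDMA_WRID_*_SHIFT constants (16 and 30) are the ones referenced
 * by the masks above.
 */
static inline uint64_t example_make_wr_id(uint64_t type, uint64_t index,
                                          uint64_t chunk)
{
    return type | (index << RDMA_WRID_BLOCK_SHIFT) |
                  (chunk << RDMA_WRID_CHUNK_SHIFT);
}

static inline void example_split_wr_id(uint64_t wr_id, uint64_t *type,
                                       uint64_t *index, uint64_t *chunk)
{
    *type = wr_id & RDMA_WRID_TYPE_MASK;
    *index = (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
    *chunk = wr_id >> RDMA_WRID_CHUNK_SHIFT;
}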
/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */

/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 */
    RDMA_CONTROL_QEMU_FILE,          /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_UNREGISTER_REQUEST, /* dynamic UN-registration */

/*
 * This is *not* used for RDMA writes, only IB Send/Recv.
 */

/*
 * Negotiate RDMA capabilities during connection-setup time.
 */
static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}
/*
 * Representation of a RAMBlock from an RDMA perspective.
 */
    struct ibv_mr **pmr;       /* MRs for chunk-level registration */
    struct ibv_mr *mr;         /* MR for non-chunk-level registration */
    uint32_t *remote_keys;     /* rkeys for chunk-level registration */
    uint32_t remote_rkey;      /* rkey for non-chunk-level registration */
/*
 * This gets transmitted by the dest during connection-time so that the
 * source has the information needed to perform the actual RDMA.
 */
static void dest_block_to_network(RDMADestBlock *db)
{
    db->remote_host_addr = htonll(db->remote_host_addr);
    db->offset = htonll(db->offset);
    db->length = htonll(db->length);
    db->remote_rkey = htonl(db->remote_rkey);
}

static void network_to_dest_block(RDMADestBlock *db)
{
    db->remote_host_addr = ntohll(db->remote_host_addr);
    db->offset = ntohll(db->offset);
    db->length = ntohll(db->length);
    db->remote_rkey = ntohl(db->remote_rkey);
}
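/*
 * Illustrative sketch, not part of the original file: once a dest block
 * is back in host byte order, the remote address targeted by an RDMA
 * write of a given ram_addr_t is offset-relative. example_remote_addr()
 * is a hypothetical helper; the same arithmetic appears later in
 * qemu_rdma_write_one().
 */
static inline uint64_t example_remote_addr(const RDMADestBlock *db,
                                           uint64_t current_addr)
{
    /* current_addr lies within [db->offset, db->offset + db->length) */
    return db->remote_host_addr + (current_addr - db->offset);
}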
 * the RAMBlock descriptions at connection-time.

/*
 * Main data structure for RDMA state.
 *
 * having more than one RDMA connection open at the same time.
 */

/*
 * infiniband-specific variables for opening the device,
 * i.e. cm_id->verbs, cm_id->channel, and cm_id->qp.
 */

#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}

/*
 * the actual RDMA operation.
 */
static bool rdma_errored(RDMAContext *rdma)
{
    if (rdma->errored && !rdma->error_reported) {
        error_report("RDMA is in an error state waiting migration"
                     " to abort!");
        rdma->error_reported = true;
    }
    return rdma->errored;
}
static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
    RDMALocalBlock *local_block;
    local_block = &rdma->local_ram_blocks.block[reg->current_index];

    if (local_block->is_ram_block) {
        /*
         * current_addr as passed in is an address in the local ram_addr_t
         * space, so translate it into the destination's space.
         */
        reg->key.current_addr -= local_block->offset;
        reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
    }
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}
static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
    comp->value = htonl(comp->value);
    /*
     * comp->offset as passed in is an address in the local ram_addr_t
     * space, so translate it into the destination's space.
     */
    comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
    comp->offset += rdma->dest_blocks[comp->block_idx].offset;
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}
/*
 * the RDMA operation.
 */
static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma,
                                                   Error **errp),
                                   Error **errp);
static uint64_t ram_chunk_index(const uint8_t *start, const uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                uint64_t i)
{
    return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
                                  (i << RDMA_REG_CHUNK_SHIFT));
}

static uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                              uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i + 1);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}
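/*
 * Illustrative sketch, not part of the original file: a host address
 * maps to a chunk index by its byte distance from the start of the
 * block, so with a hypothetical 1MB chunk size, offsets 0..1MB-1 fall
 * in chunk 0, 1MB..2MB-1 in chunk 1, and so on; the final chunk is
 * clamped to the block's end by ram_chunk_end().
 */
static void example_chunk_math(const RDMALocalBlock *block, uint8_t *host)
{
    uint64_t chunk = ram_chunk_index(block->local_host_addr, host);
    uint8_t *start = ram_chunk_start(block, chunk);
    uint8_t *end = ram_chunk_end(block, chunk); /* clamped to block end */

    assert(host >= start && host < end);
}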
static void rdma_add_block(RDMAContext *rdma, const char *block_name,
                           void *host_addr,
                           ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block;
    RDMALocalBlock *old = local->block;

    local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);

    if (local->nb_blocks) {
        if (rdma->blockmap) {
            for (int x = 0; x < local->nb_blocks; x++) {
                g_hash_table_remove(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset);
                g_hash_table_insert(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset,
                                    &local->block[x]);
            }
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->block_name = g_strdup(block_name);
    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->src_index = ~0U; /* Filled in by the receipt of the block list */
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_new0(uint32_t, block->nb_chunks);

    block->is_ram_block = local->init ? false : true;

    if (rdma->blockmap) {
        g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
    }

    trace_rdma_add_block(block_name, local->nb_blocks,
                         (uintptr_t) block->local_host_addr,
                         block->offset, block->length,
                         (uintptr_t) (block->local_host_addr + block->length),
                         BITS_TO_LONGS(block->nb_chunks) *
                             sizeof(unsigned long) * 8,
                         block->nb_chunks);

    local->nb_blocks++;
}
static void qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    int ret;

    assert(rdma->blockmap == NULL);
    ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
    assert(!ret);
    trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
    rdma->dest_blocks = g_new0(RDMADestBlock,
                               rdma->local_ram_blocks.nb_blocks);
    local->init = true;
}
static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *old = local->block;

    if (rdma->blockmap) {
        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
    }
    if (block->pmr) {
        for (int j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    g_free(block->block_name);
    block->block_name = NULL;

    if (rdma->blockmap) {
        for (int x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap,
                                (void *)(uintptr_t)old[x].offset);
        }
    }

    if (local->nb_blocks > 1) {
        local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                   sizeof(RDMALocalBlock) *
                       (local->nb_blocks - (block->index + 1)));
            for (int x = block->index; x < local->nb_blocks - 1; x++) {
                local->block[x].index--;
            }
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
                            block->offset, block->length,
                            (uintptr_t)(block->local_host_addr + block->length),
                            BITS_TO_LONGS(block->nb_chunks) *
                                sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks && rdma->blockmap) {
        for (int x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)(uintptr_t)local->block[x].offset,
                                &local->block[x]);
        }
    }
}
/*
 * Trace RDMA device open, with device details.
 */
                            verbs->device->name,
                            verbs->device->dev_name,
                            verbs->device->dev_path,
                            verbs->device->ibdev_path,
/*
 * Trace RDMA gid addressing information.
 * Useful for understanding the RDMA device hierarchy in the kernel.
 */
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    trace_qemu_rdma_dump_gid(who, sgid, dgid);
}
/*
 * Figure out which RDMA device corresponds to the requested IP hostname
 * Also create the initial connection manager identifiers for opening
 * the connection.
 */
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    int ret;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        error_setg(errp, "RDMA ERROR: RDMA hostname has not been set");
        return -1;
    }

    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        error_setg(errp, "RDMA ERROR: could not create CM channel");
        return -1;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret < 0) {
        error_setg(errp, "RDMA ERROR: could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret) {
        error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
                   rdma->host);
        goto err_resolve_get_addr;
    }

    /* try each returned address until one resolves */
    for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) {

        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        trace_qemu_rdma_resolve_host_trying(rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                                RDMA_RESOLVE_TIMEOUT_MS);
        if (ret >= 0) {
            goto route;
        }
    }

    error_setg(errp, "RDMA ERROR: could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret < 0) {
        error_setg(errp, "RDMA ERROR: could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        error_setg(errp,
                   "RDMA ERROR: result not equal to event_addr_resolved %s",
                   rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* resolve route */
    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret < 0) {
        error_setg(errp, "RDMA ERROR: could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret < 0) {
        error_setg(errp, "RDMA ERROR: could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        error_setg(errp, "RDMA ERROR: "
                   "result not equal to event_route_resolved: %s",
                   rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return -1;
}
/*
 * Create protection domain and completion queues.
 */
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma, Error **errp)
{
    /* allocate pd */
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        return -1;
    }

    /* create receive completion channel */
    rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->recv_comp_channel) {
        goto err_alloc_pd_cq;
    }

    /*
     * Completion queue can be filled by read work requests.
     */
    rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                                  NULL, rdma->recv_comp_channel, 0);
    if (!rdma->recv_cq) {
        goto err_alloc_pd_cq;
    }

    /* create send completion channel */
    rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->send_comp_channel) {
        goto err_alloc_pd_cq;
    }

    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                                  NULL, rdma->send_comp_channel, 0);
    if (!rdma->send_cq) {
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->recv_comp_channel) {
        ibv_destroy_comp_channel(rdma->recv_comp_channel);
    }
    if (rdma->send_comp_channel) {
        ibv_destroy_comp_channel(rdma->send_comp_channel);
    }
    if (rdma->recv_cq) {
        ibv_destroy_cq(rdma->recv_cq);
        rdma->recv_cq = NULL;
    }
    rdma->pd = NULL;
    rdma->recv_comp_channel = NULL;
    rdma->send_comp_channel = NULL;
    return -1;
}
/*
 * Create queue pairs dedicated to the connection.
 */
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
{
    struct ibv_qp_init_attr attr = { 0 };

    attr.send_cq = rdma->send_cq;
    attr.recv_cq = rdma->recv_cq;

    if (rdma_create_qp(rdma->cm_id, rdma->pd, &attr) < 0) {
        return -1;
    }

    rdma->qp = rdma->cm_id->qp;
    return 0;
}
/* Check whether On-Demand Paging is supported by the RDMA device */
static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma, Error **errp)
{
    int i;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    for (i = 0; i < local->nb_blocks; i++) {
        int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;

        local->block[i].mr =
            ibv_reg_mr(rdma->pd,
                       local->block[i].local_host_addr,
                       local->block[i].length, access);

        /* fall back to On-Demand Paging if the device supports it */
        if (!local->block[i].mr &&
            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
            access |= IBV_ACCESS_ON_DEMAND;
            local->block[i].mr =
                ibv_reg_mr(rdma->pd,
                           local->block[i].local_host_addr,
                           local->block[i].length, access);
            trace_qemu_rdma_register_odp_mr(local->block[i].block_name);

            if (local->block[i].mr) {
                qemu_rdma_advise_prefetch_mr(rdma->pd,
                                (uintptr_t)local->block[i].local_host_addr,
                                local->block[i].length,
                                local->block[i].mr->lkey,
                                local->block[i].block_name,
                                true);
            }
        }

        if (!local->block[i].mr) {
            goto err;
        }
        rdma->total_registrations++;
    }

    return 0;

err:
    for (i--; i >= 0; i--) {
        ibv_dereg_mr(local->block[i].mr);
        local->block[i].mr = NULL;
        rdma->total_registrations--;
    }

    return -1;
}
static void qemu_rdma_search_ram_block(RDMAContext *rdma,
                                       uintptr_t block_offset,
                                       uint64_t offset,
                                       uint64_t length,
                                       uint64_t *block_index,
                                       uint64_t *chunk_index)
{
    uint64_t current_addr = block_offset + offset;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    assert(block);
    assert(current_addr >= block->offset);
    assert((current_addr + length) <= (block->offset + block->length));

    *block_index = block->index;
    *chunk_index = ram_chunk_index(block->local_host_addr,
                block->local_host_addr + (current_addr - block->offset));
}
/*
 * to perform the actual RDMA operation.
 */
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
                                           RDMALocalBlock *block,
                                           uintptr_t host_addr,
                                           uint32_t *lkey, uint32_t *rkey,
                                           int chunk,
                                           uint8_t *chunk_start,
                                           uint8_t *chunk_end)
{
    /* The whole-block MR, if present, already covers this chunk */
    if (block->mr) {
        if (lkey) {
            *lkey = block->mr->lkey;
        }
        if (rkey) {
            *rkey = block->mr->rkey;
        }
        return 0;
    }

    /* allocate memory to store chunk MRs */
    if (!block->pmr) {
        block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
    }

    /*
     * If 'rkey', then we're the destination, so grant access to the source.
     * If 'lkey', then we're the source, so grant access only to ourselves.
     */
    if (!block->pmr[chunk]) {
        uint64_t len = chunk_end - chunk_start;
        int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE
                          : 0;

        block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);

        /* fall back to On-Demand Paging if the device supports it */
        if (!block->pmr[chunk] &&
            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
            access |= IBV_ACCESS_ON_DEMAND;
            block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
            trace_qemu_rdma_register_odp_mr(block->block_name);

            if (block->pmr[chunk]) {
                qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
                                             len, block->pmr[chunk]->lkey,
                                             block->block_name, rkey);
            }
        }
    }
    if (!block->pmr[chunk]) {
        return -1;
    }
    rdma->total_registrations++;

    if (lkey) {
        *lkey = block->pmr[chunk]->lkey;
    }
    if (rkey) {
        *rkey = block->pmr[chunk]->rkey;
    }
    return 0;
}
static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
{
    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    if (rdma->wr_data[idx].control_mr) {
        rdma->total_registrations++;
        return 0;
    }
    return -1;
}
/*
 * Perform a non-optimized memory unregistration after every transfer
 * for demonstration purposes, only if pin-all is not requested.
 *
 *    - for bit clearing
 *    - and for receipt of unregister messages
 */
static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
{
    Error *err = NULL;

    while (rdma->unregistrations[rdma->unregister_current]) {
        int ret;
        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
        uint64_t chunk =
            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block =
            &(rdma->local_ram_blocks.block[index]);
        RDMARegister reg = { .current_index = index };
        RDMAControlHeader head = { .len = sizeof(RDMARegister),
                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
                                   .repeat = 1,
                                 };

        trace_qemu_rdma_unregister_waiting_proc(chunk,
                                                rdma->unregister_current);

        rdma->unregistrations[rdma->unregister_current] = 0;
        rdma->unregister_current++;

        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
            rdma->unregister_current = 0;
        }

        /*
         * Unregistration is speculative (because migration is single-threaded
         * and the chunk may still be in flight), so if the memory is in use
         * for transmission, abort and retry on the next completion.
         */
        clear_bit(chunk, block->unregister_bitmap);

        if (test_bit(chunk, block->transit_bitmap)) {
            continue;
        }

        ret = ibv_dereg_mr(block->pmr[chunk]);
        block->pmr[chunk] = NULL;
        block->remote_keys[chunk] = 0;

        if (ret != 0) {
            return -1;
        }
        rdma->total_registrations--;

        register_to_network(rdma, &reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                      NULL, NULL, NULL, &err);
        if (ret < 0) {
            error_report_err(err);
            return -1;
        }
    }

    return 0;
}
static int qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
                          uint64_t *wr_id_out, uint32_t *byte_len)
{
    struct ibv_wc wc;
    uint64_t wr_id;
    int ret = ibv_poll_cq(cq, 1, &wc);

    if (!ret) {
        *wr_id_out = RDMA_WRID_NONE;
        return 0;
    }
    if (ret < 0) {
        return -1;
    }

    wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;

    if (wc.status != IBV_WC_SUCCESS) {
        return -1;
    }

    if (rdma->control_ready_expected &&
        (wr_id >= RDMA_WRID_RECV_CONTROL)) {
        trace_qemu_rdma_poll_recv(wr_id - RDMA_WRID_RECV_CONTROL, wr_id,
                                  rdma->nb_sent);
        rdma->control_ready_expected = 0;
    }

    if (wr_id == RDMA_WRID_RDMA_WRITE) {
        uint64_t chunk =
            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        trace_qemu_rdma_poll_write(wr_id, rdma->nb_sent,
                                   index, chunk, block->local_host_addr,
                                   (void *)(uintptr_t)block->remote_host_addr);
        clear_bit(chunk, block->transit_bitmap);

        if (rdma->nb_sent > 0) {
            rdma->nb_sent--;
        }
    } else {
        trace_qemu_rdma_poll_other(wr_id, rdma->nb_sent);
    }

    *wr_id_out = wc.wr_id;
    if (byte_len) {
        *byte_len = wc.byte_len;
    }
    return 0;
}
/*
 * Returns 0 on success, non-0 on error.
 */
static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
                                       struct ibv_comp_channel *comp_channel)
{
    struct rdma_cm_event *cm_event;

    /*
     * On the destination, the coroutine can simply yield on the fd;
     * on the source (a separate thread) we must poll instead, so that
     * 'cancel' and errors are still handled.
     */
    if (rdma->migration_started_on_destination &&
        migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
        yield_until_fd_readable(comp_channel->fd);
    } else {
        while (!rdma->errored && !rdma->received_error) {
            GPollFD pfds[2];
            pfds[0].fd = comp_channel->fd;
            pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[0].revents = 0;

            pfds[1].fd = rdma->channel->fd;
            pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[1].revents = 0;

            /* 0.1s timeout, should be fine for a 'cancel' */
            switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
            case 2:
            case 1: /* fd active */
                if (pfds[0].revents) {
                    return 0;
                }

                if (pfds[1].revents) {
                    if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
                        return -1;
                    }

                    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
                        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
                        rdma_ack_cm_event(cm_event);
                        return -1;
                    }
                    rdma_ack_cm_event(cm_event);
                }
                break;

            case 0: /* timeout, go around again */
                break;

            default: /* Error of some type -
                      * look at the errno value for the error
                      */
                return -1;
            }

            if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
                /* Bail out and let the cancellation happen */
                return -1;
            }
        }
    }

    /* The migration is over if we received an error */
    if (rdma->received_error) {
        return -1;
    }
    return -rdma->errored;
}
static struct ibv_comp_channel *to_channel(RDMAContext *rdma, uint64_t wrid)
{
    return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
                                           rdma->recv_comp_channel;
}

static struct ibv_cq *to_cq(RDMAContext *rdma, uint64_t wrid)
{
    return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
}
/*
 * The only exception is actual RDMA Write completions. These
 * completions only need to be recorded, but do not actually
 * need further processing.
 */
static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
                                    uint64_t wrid_requested,
                                    uint32_t *byte_len)
{
    int num_cq_events = 0, ret;
    struct ibv_cq *cq;
    void *cq_ctx;
    uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
    struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
    struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);

    if (ibv_req_notify_cq(poll_cq, 0)) {
        return -1;
    }
    /* poll cq first */
    while (wr_id != wrid_requested) {
        ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
        if (ret < 0) {
            return -1;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    if (wr_id == wrid_requested) {
        return 0;
    }

    while (1) {
        ret = qemu_rdma_wait_comp_channel(rdma, ch);
        if (ret < 0) {
            goto err_block_for_wrid;
        }

        if (ibv_get_cq_event(ch, &cq, &cq_ctx)) {
            goto err_block_for_wrid;
        }

        num_cq_events++;

        if (ibv_req_notify_cq(cq, 0)) {
            goto err_block_for_wrid;
        }

        while (wr_id != wrid_requested) {
            ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
            if (ret < 0) {
                goto err_block_for_wrid;
            }

            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

            if (wr_id == RDMA_WRID_NONE) {
                break;
            }
        }

        if (wr_id == wrid_requested) {
            if (num_cq_events) {
                ibv_ack_cq_events(cq, num_cq_events);
            }
            return 0;
        }
    }

err_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }

    rdma->errored = true;
    return -1;
}
static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
                                       RDMAControlHeader *head,
                                       Error **errp)
{
    int ret;
    RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
    struct ibv_send_wr *bad_wr;
    struct ibv_sge sge = {
        .addr = (uintptr_t)(wr->control),
        .length = head->len + sizeof(RDMAControlHeader),
        .lkey = wr->control_mr->lkey,
    };
    struct ibv_send_wr send_wr = {
        .wr_id = RDMA_WRID_SEND_CONTROL,
        .opcode = IBV_WR_SEND,
        .send_flags = IBV_SEND_SIGNALED,
        .sg_list = &sge,
        .num_sge = 1,
    };

    trace_qemu_rdma_post_send_control(control_desc(head->type));

    /*
     * We don't actually need to do a memcpy() in here if we used
     * the "sge" properly, but since this is not performance-critical
     * (not RAM in a performance-critical path), it's OK for now.
     *
     * The copy makes the RDMAControlHeader simpler to manipulate
     * for the time being.
     */
    assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
    memcpy(wr->control, head, sizeof(RDMAControlHeader));
    control_to_network((void *) wr->control);

    if (buf) {
        memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
    }

    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret > 0) {
        return -1;
    }

    ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
    if (ret < 0) {
        error_setg(errp, "rdma migration: send polling control error");
        return -1;
    }

    return 0;
}
static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx,
                                       Error **errp)
{
    struct ibv_recv_wr *bad_wr;
    struct ibv_sge sge = {
        .addr = (uintptr_t)(rdma->wr_data[idx].control),
        .length = RDMA_CONTROL_MAX_BUFFER,
        .lkey = rdma->wr_data[idx].control_mr->lkey,
    };
    struct ibv_recv_wr recv_wr = {
        .wr_id = RDMA_WRID_RECV_CONTROL + idx,
        .sg_list = &sge,
        .num_sge = 1,
    };

    if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
        return -1;
    }

    return 0;
}
static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
                                           RDMAControlHeader *head,
                                           uint32_t expecting, int idx,
                                           Error **errp)
{
    uint32_t byte_len;
    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
                                       &byte_len);

    if (ret < 0) {
        error_setg(errp, "rdma migration: recv polling control error!");
        return -1;
    }

    network_to_control((void *) rdma->wr_data[idx].control);
    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));

    if (expecting == RDMA_CONTROL_NONE) {
        trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
                                                   head->type);
    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
        error_setg(errp, "Was expecting a %s (%d) control message"
                   ", but got: %s (%d), length: %d",
                   control_desc(expecting), expecting,
                   control_desc(head->type), head->type, head->len);
        if (head->type == RDMA_CONTROL_ERROR) {
            rdma->received_error = true;
        }
        return -1;
    }
    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
        error_setg(errp, "too long length: %d", head->len);
        return -1;
    }
    if (sizeof(*head) + head->len != byte_len) {
        error_setg(errp, "Malformed length: %d byte_len %d",
                   head->len, byte_len);
        return -1;
    }

    return 0;
}
static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
                                  RDMAControlHeader *head)
{
    rdma->wr_data[idx].control_len = head->len;
    rdma->wr_data[idx].control_curr =
        rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
}
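/*
 * Illustrative sketch, not part of the original file: one control-channel
 * round trip as qemu_rdma_exchange_send() (declared earlier, defined
 * below) performs it for dynamic chunk registration. The message types
 * and the RDMARegister payload mirror the registration path in
 * qemu_rdma_write_one(); the helper itself is hypothetical.
 */
static int example_register_round_trip(RDMAContext *rdma, int index,
                                       Error **errp)
{
    RDMARegister reg = { .current_index = index };
    RDMAControlHeader head = { .len = sizeof(RDMARegister),
                               .type = RDMA_CONTROL_REGISTER_REQUEST,
                               .repeat = 1 };
    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
    int reg_result_idx;

    /* byte-swap the payload, send it, and block for the piggy-backed reply */
    register_to_network(rdma, &reg);
    if (qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg, &resp,
                                &reg_result_idx, NULL, errp) < 0) {
        return -1;
    }

    /* the response payload now sits in wr_data[reg_result_idx] */
    network_to_result((RDMARegisterResult *)
                      rdma->wr_data[reg_result_idx].control_curr);
    return 0;
}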
/*
 * This is an 'atomic' high-level operation to deliver a single, unified
 * control-channel message.
 *
 * The extra (optional) response is used during registration to free us
 * from having to perform an *additional* exchange of messages just to
 * provide a response, by instead piggy-backing on the acknowledgement.
 */
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma,
                                                   Error **errp),
                                   Error **errp)
{
    int ret;

    /* Wait for the dest to signal readiness with a READY message */
    if (rdma->control_ready_expected) {
        RDMAControlHeader resp_ignored;

        ret = qemu_rdma_exchange_get_response(rdma, &resp_ignored,
                                              RDMA_CONTROL_READY,
                                              RDMA_WRID_READY, errp);
        if (ret < 0) {
            return -1;
        }
    }

    /* If the user is expecting a response, post a WR in anticipation */
    if (resp) {
        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA, errp);
        if (ret < 0) {
            return -1;
        }
    }

    /* Post a WR to replace the one we just consumed for the READY message */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
    if (ret < 0) {
        return -1;
    }

    /* Deliver the control message that was requested */
    ret = qemu_rdma_post_send_control(rdma, data, head, errp);

    if (ret < 0) {
        return -1;
    }

    /* If we're expecting a response, block and wait for it */
    if (resp) {
        if (callback) {
            ret = callback(rdma, errp);
            if (ret < 0) {
                return -1;
            }
        }

        trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
        ret = qemu_rdma_exchange_get_response(rdma, resp,
                                              resp->type, RDMA_WRID_DATA,
                                              errp);

        if (ret < 0) {
            return -1;
        }

        qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
        if (resp_idx) {
            *resp_idx = RDMA_WRID_DATA;
        }
        trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
    }

    rdma->control_ready_expected = 1;

    return 0;
}
/*
 * This is an 'atomic' high-level operation to receive a single, unified
 * control-channel message.
 */
static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint32_t expecting, Error **errp)
{
    RDMAControlHeader ready = {
        .len = 0,
        .type = RDMA_CONTROL_READY,
        .repeat = 1,
    };
    int ret;

    /* Inform the source that we're ready to receive a message */
    ret = qemu_rdma_post_send_control(rdma, NULL, &ready, errp);

    if (ret < 0) {
        return -1;
    }

    /* Block and wait for the message */
    ret = qemu_rdma_exchange_get_response(rdma, head,
                                          expecting, RDMA_WRID_READY, errp);

    if (ret < 0) {
        return -1;
    }

    qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);

    /* Post a new RECV work request to replace the one we just consumed */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
    if (ret < 0) {
        return -1;
    }

    return 0;
}
/*
 * Write an actual chunk of memory using RDMA.
 *
 * If we're using dynamic registration on the dest-side, we have to
 * send a registration command first.
 */
static int qemu_rdma_write_one(RDMAContext *rdma,
                               int current_index, uint64_t current_addr,
                               uint64_t length, Error **errp)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;
    int reg_result_idx, ret, count = 0;
    uint64_t chunk, chunks;
    uint8_t *chunk_start, *chunk_end;
    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
    RDMARegister reg;
    RDMARegisterResult *reg_result;
    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
    RDMAControlHeader head = { .len = sizeof(RDMARegister),
                               .type = RDMA_CONTROL_REGISTER_REQUEST,
                               .repeat = 1 };

retry:
    sge.addr = (uintptr_t)(block->local_host_addr +
                           (current_addr - block->offset));
    sge.length = length;

    chunk = ram_chunk_index(block->local_host_addr,
                            (uint8_t *)(uintptr_t)sge.addr);
    chunk_start = ram_chunk_start(block, chunk);

    if (block->is_ram_block) {
        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    } else {
        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    }

    chunk_end = ram_chunk_end(block, chunk + chunks);

    while (test_bit(chunk, block->transit_bitmap)) {
        trace_qemu_rdma_write_one_block(count++, current_index, chunk,
                sge.addr, length, rdma->nb_sent, block->nb_chunks);

        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            error_setg(errp, "Failed to Wait for previous write to complete "
                       "block %d chunk %" PRIu64 " current %" PRIu64
                       " len %" PRIu64 " %d",
                       current_index, chunk, sge.addr, length, rdma->nb_sent);
            return -1;
        }
    }

    if (!rdma->pin_all || !block->is_ram_block) {
        if (!block->remote_keys[chunk]) {
            /*
             * This chunk has not yet been registered, so first check to see
             * if the entire chunk is zero. If so, tell the other side to
             * memset() + madvise() the entire chunk without RDMA.
             */
            if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
                RDMACompress comp = { .offset = current_addr,
                                      .value = 0,
                                      .block_idx = current_index,
                                      .length = length };

                head.len = sizeof(comp);
                head.type = RDMA_CONTROL_COMPRESS;

                compress_to_network(rdma, &comp);
                ret = qemu_rdma_exchange_send(rdma, &head,
                                (uint8_t *) &comp, NULL, NULL, NULL, errp);
                if (ret < 0) {
                    return -1;
                }
                return 1;
            }

            /* Otherwise, tell the other side to register */
            reg.current_index = current_index;
            if (block->is_ram_block) {
                reg.key.current_addr = current_addr;
            } else {
                reg.key.offset = current_addr;
            }
            reg.chunks = chunks;

            register_to_network(rdma, &reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                    &resp, &reg_result_idx, NULL, errp);
            if (ret < 0) {
                return -1;
            }

            /* overlap our own registration with the one we just sent */
            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                error_setg(errp, "cannot get lkey");
                return -1;
            }

            reg_result = (RDMARegisterResult *)
                    rdma->wr_data[reg_result_idx].control_curr;

            network_to_result(reg_result);

            trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
                                                 reg_result->rkey, chunk);

            block->remote_keys[chunk] = reg_result->rkey;
            block->remote_host_addr = reg_result->host_addr;
        } else {
            /* already registered before */
            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                error_setg(errp, "cannot get lkey!");
                return -1;
            }
        }
        send_wr.wr.rdma.rkey = block->remote_keys[chunk];
    } else {
        send_wr.wr.rdma.rkey = block->remote_rkey;

        if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                            &sge.lkey, NULL, chunk,
                                            chunk_start, chunk_end)) {
            error_setg(errp, "cannot get lkey!");
            return -1;
        }
    }

    send_wr.opcode = IBV_WR_RDMA_WRITE;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;
    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
                                  (current_addr - block->offset);

    trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
                                   length);

    /*
     * ibv_post_send() does not return negative error numbers,
     * per the specification they are positive - no idea why.
     */
    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret == ENOMEM) {
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            error_setg(errp, "rdma migration: failed to make "
                             "room in full send queue!");
            return -1;
        }
        goto retry;
    } else if (ret > 0) {
        error_setg_errno(errp, ret,
                         "rdma migration: post rdma write failed");
        return -1;
    }

    set_bit(chunk, block->transit_bitmap);
    /*
     * We are adding to transferred the amount of data written, but no
     * overhead at all. I will assume that RDMA is magical and don't
     * need to break this assumption, but this being RDMA, who knows.
     */
    rdma->total_writes++;

    return 0;
}
/*
 * Push out any unwritten RDMA operations.
 */
static int qemu_rdma_write_flush(RDMAContext *rdma, Error **errp)
{
    int ret;

    if (!rdma->current_length) {
        return 0;
    }

    ret = qemu_rdma_write_one(rdma, rdma->current_index, rdma->current_addr,
                              rdma->current_length, errp);

    if (ret < 0) {
        return -1;
    }

    if (ret == 0) {
        rdma->nb_sent++;
        trace_qemu_rdma_write_flush(rdma->nb_sent);
    }

    rdma->current_length = 0;
    rdma->current_addr = 0;

    return 0;
}
static inline bool qemu_rdma_buffer_mergeable(RDMAContext *rdma,
                                              uint64_t offset, uint64_t len)
{
    RDMALocalBlock *block;
    uint8_t *host_addr;
    uint8_t *chunk_end;

    if (rdma->current_index < 0) {
        return false;
    }

    if (rdma->current_chunk < 0) {
        return false;
    }

    block = &(rdma->local_ram_blocks.block[rdma->current_index]);
    host_addr = block->local_host_addr + (offset - block->offset);
    chunk_end = ram_chunk_end(block, rdma->current_chunk);

    if (rdma->current_length == 0) {
        return false;
    }

    /* Only merge into the chunk sequentially */
    if (offset != (rdma->current_addr + rdma->current_length)) {
        return false;
    }

    if (offset < block->offset) {
        return false;
    }

    if ((offset + len) > (block->offset + block->length)) {
        return false;
    }

    if ((host_addr + len) > chunk_end) {
        return false;
    }

    return true;
}
static int qemu_rdma_write(RDMAContext *rdma,
                           uint64_t block_offset, uint64_t offset,
                           uint64_t len, Error **errp)
{
    uint64_t current_addr = block_offset + offset;
    uint64_t index = rdma->current_index;
    uint64_t chunk = rdma->current_chunk;

    /* If we cannot merge it, we flush the current buffer first */
    if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) {
        if (qemu_rdma_write_flush(rdma, errp) < 0) {
            return -1;
        }
        rdma->current_length = 0;
        rdma->current_addr = current_addr;

        qemu_rdma_search_ram_block(rdma, block_offset,
                                   offset, len, &index, &chunk);
        rdma->current_index = index;
        rdma->current_chunk = chunk;
    }

    /* merge it */
    rdma->current_length += len;

    /* flush it if buffer is too large */
    if (rdma->current_length >= RDMA_MERGE_MAX) {
        return qemu_rdma_write_flush(rdma, errp);
    }

    return 0;
}
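/*
 * Illustrative sketch, not part of the original file: how the merging in
 * qemu_rdma_write() behaves for a sequence of pages. The block offset
 * and page addresses are hypothetical; error handling is omitted.
 */
static void example_merge_behaviour(RDMAContext *rdma, Error **errp)
{
    /* two adjacent 4KB pages coalesce into one pending RDMA write */
    qemu_rdma_write(rdma, 0, 0x0000, 0x1000, errp);
    qemu_rdma_write(rdma, 0, 0x1000, 0x1000, errp); /* merged */

    /* a non-adjacent page flushes the merged write and starts a new one */
    qemu_rdma_write(rdma, 0, 0x8000, 0x1000, errp);
}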
static void qemu_rdma_cleanup(RDMAContext *rdma)
{
    Error *err = NULL;

    if (rdma->cm_id && rdma->connected) {
        if ((rdma->errored ||
             migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
            !rdma->received_error) {
            RDMAControlHeader head = { .len = 0,
                                       .type = RDMA_CONTROL_ERROR,
                                       .repeat = 1,
                                     };
            if (qemu_rdma_post_send_control(rdma, NULL, &head, &err) < 0) {
                error_report_err(err);
            }
        }
        rdma_disconnect(rdma->cm_id);
        rdma->connected = false;
    }

    if (rdma->channel) {
        qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
    }
    g_free(rdma->dest_blocks);
    rdma->dest_blocks = NULL;

    for (int i = 0; i < RDMA_WRID_MAX; i++) {
        if (rdma->wr_data[i].control_mr) {
            rdma->total_registrations--;
            ibv_dereg_mr(rdma->wr_data[i].control_mr);
        }
        rdma->wr_data[i].control_mr = NULL;
    }

    if (rdma->local_ram_blocks.block) {
        while (rdma->local_ram_blocks.nb_blocks) {
            rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
        }
    }

    if (rdma->qp) {
        rdma_destroy_qp(rdma->cm_id);
        rdma->qp = NULL;
    }
    if (rdma->recv_cq) {
        ibv_destroy_cq(rdma->recv_cq);
        rdma->recv_cq = NULL;
    }
    if (rdma->send_cq) {
        ibv_destroy_cq(rdma->send_cq);
        rdma->send_cq = NULL;
    }
    if (rdma->recv_comp_channel) {
        ibv_destroy_comp_channel(rdma->recv_comp_channel);
        rdma->recv_comp_channel = NULL;
    }
    if (rdma->send_comp_channel) {
        ibv_destroy_comp_channel(rdma->send_comp_channel);
        rdma->send_comp_channel = NULL;
    }
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
        rdma->pd = NULL;
    }
    if (rdma->cm_id) {
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
    }

    /* on the destination side, listen_id and channel are shared */
    if (rdma->listen_id) {
        if (!rdma->is_return_path) {
            rdma_destroy_id(rdma->listen_id);
        }
        rdma->listen_id = NULL;

        if (rdma->channel) {
            if (!rdma->is_return_path) {
                rdma_destroy_event_channel(rdma->channel);
            }
            rdma->channel = NULL;
        }
    }

    if (rdma->channel) {
        rdma_destroy_event_channel(rdma->channel);
        rdma->channel = NULL;
    }
    g_free(rdma->host);
    rdma->host = NULL;
}
static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
{
    int ret;

    /*
     * Will be validated against destination's actual capabilities
     * after the connect() completes.
     */
    rdma->pin_all = pin_all;

    ret = qemu_rdma_resolve_host(rdma, errp);
    if (ret < 0) {
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_pd_cq(rdma, errp);
    if (ret < 0) {
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret < 0) {
        error_setg(errp, "RDMA ERROR: rdma migration: error allocating qp!");
        goto err_rdma_source_init;
    }

    qemu_rdma_init_ram_blocks(rdma);

    /* Build the hash that maps from offset to RAMBlock */
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    for (int i = 0; i < rdma->local_ram_blocks.nb_blocks; i++) {
        g_hash_table_insert(rdma->blockmap,
                (void *)(uintptr_t)rdma->local_ram_blocks.block[i].offset,
                &rdma->local_ram_blocks.block[i]);
    }

    for (int i = 0; i < RDMA_WRID_MAX; i++) {
        ret = qemu_rdma_reg_control(rdma, i);
        if (ret < 0) {
            error_setg(errp, "RDMA ERROR: rdma migration: error "
                             "registering %d control!", i);
            goto err_rdma_source_init;
        }
    }

    return 0;

err_rdma_source_init:
    qemu_rdma_cleanup(rdma);
    return -1;
}
static int qemu_get_cm_event_timeout(RDMAContext *rdma,
                                     struct rdma_cm_event **cm_event,
                                     int timeout, Error **errp)
{
    int ret;
    struct pollfd poll_fd = {
        .fd = rdma->channel->fd,
        .events = POLLIN,
        .revents = 0,
    };

    do {
        ret = poll(&poll_fd, 1, timeout);
    } while (ret < 0 && errno == EINTR);

    if (ret == 0) {
        error_setg(errp, "RDMA ERROR: poll cm event timeout");
        return -1;
    } else if (ret < 0) {
        error_setg(errp, "RDMA ERROR: failed to poll cm event, errno=%i",
                   errno);
        return -1;
    } else if (poll_fd.revents & POLLIN) {
        if (rdma_get_cm_event(rdma->channel, cm_event) < 0) {
            error_setg(errp, "RDMA ERROR: failed to get cm event");
            return -1;
        }
        return 0;
    } else {
        error_setg(errp, "RDMA ERROR: no POLLIN event, revent=%x",
                   poll_fd.revents);
        return -1;
    }
}
static int qemu_rdma_connect(RDMAContext *rdma, bool return_path,
                             Error **errp)
{
    RDMACapabilities cap = { .version = RDMA_CONTROL_VERSION_CURRENT,
                             .flags = 0 };
    struct rdma_conn_param conn_param = { .initiator_depth = 2,
                                          .retry_count = 5,
                                          .private_data = &cap,
                                          .private_data_len = sizeof(cap) };
    struct rdma_cm_event *cm_event;
    int ret;

    /*
     * Only negotiate the capability with destination if the user
     * on the source first requested the capability.
     */
    if (rdma->pin_all) {
        cap.flags |= RDMA_CAPABILITY_PIN_ALL;
    }

    caps_to_network(&cap);

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
    if (ret < 0) {
        goto err_rdma_source_connect;
    }

    ret = rdma_connect(rdma->cm_id, &conn_param);
    if (ret < 0) {
        error_setg_errno(errp, errno,
                         "RDMA ERROR: connecting to destination!");
        goto err_rdma_source_connect;
    }

    if (return_path) {
        ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
    } else {
        ret = rdma_get_cm_event(rdma->channel, &cm_event);
        if (ret < 0) {
            error_setg_errno(errp, errno,
                             "RDMA ERROR: failed to get cm event");
        }
    }
    if (ret < 0) {
        goto err_rdma_source_connect;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_setg(errp, "RDMA ERROR: connecting to destination!");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_source_connect;
    }
    rdma->connected = true;

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
    network_to_caps(&cap);

    /*
     * Verify that the *requested* capabilities are supported by the
     * destination and disable them otherwise.
     */
    if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
        warn_report("RDMA: Server cannot support pinning all memory. "
                    "Will register memory dynamically.");
        rdma->pin_all = false;
    }

    trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);

    rdma_ack_cm_event(cm_event);

    rdma->control_ready_expected = 1;
    rdma->nb_sent = 0;
    return 0;

err_rdma_source_connect:
    qemu_rdma_cleanup(rdma);
    return -1;
}
static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_cm_id *listen_id;
    char ip[40] = "unknown";
    struct rdma_addrinfo *res, *e;
    char port_str[16];
    int reuse = 1;

    for (int i = 0; i < RDMA_WRID_MAX; i++) {
        rdma->wr_data[i].control_len = 0;
        rdma->wr_data[i].control_curr = NULL;
    }

    if (!rdma->host || !rdma->host[0]) {
        error_setg(errp, "RDMA ERROR: RDMA host is not set!");
        rdma->errored = true;
        return -1;
    }
    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        error_setg(errp, "RDMA ERROR: could not create rdma event channel");
        rdma->errored = true;
        return -1;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
    if (ret < 0) {
        error_setg(errp, "RDMA ERROR: could not create cm_id!");
        goto err_dest_init_create_listen_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret) {
        error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
                   rdma->host);
        goto err_dest_init_bind_addr;
    }

    ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
                          &reuse, sizeof reuse);
    if (ret < 0) {
        error_setg(errp, "RDMA ERROR: Error: could not set REUSEADDR option");
        goto err_dest_init_bind_addr;
    }

    /* try each returned address until one binds */
    for (e = res; e != NULL; e = e->ai_next) {

        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        trace_qemu_rdma_dest_init_trying(rdma->host, ip);
        ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
        if (ret >= 0) {
            goto listen;
        }
    }

    error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
    goto err_dest_init_bind_addr;

listen:
    rdma->listen_id = listen_id;
    qemu_rdma_dump_gid("dest_init", listen_id);
    return 0;

err_dest_init_bind_addr:
    rdma_destroy_id(listen_id);
err_dest_init_create_listen_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    rdma->errored = true;
    return -1;
}
static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
                                            RDMAContext *rdma)
{
    for (int i = 0; i < RDMA_WRID_MAX; i++) {
        rdma_return_path->wr_data[i].control_len = 0;
        rdma_return_path->wr_data[i].control_curr = NULL;
    }

    /* the CM channel and CM id are shared */
    rdma_return_path->channel = rdma->channel;
    rdma_return_path->listen_id = rdma->listen_id;

    rdma->return_path = rdma_return_path;
    rdma_return_path->return_path = rdma;
    rdma_return_path->is_return_path = true;
}
static RDMAContext *qemu_rdma_data_init(InetSocketAddress *saddr, Error **errp)
{
    RDMAContext *rdma = NULL;

    rdma = g_new0(RDMAContext, 1);
    rdma->current_index = -1;
    rdma->current_chunk = -1;

    rdma->host = g_strdup(saddr->host);
    rdma->port = atoi(saddr->port);
    return rdma;
}
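/*
 * Illustrative sketch, not part of the original file: the source-side
 * bring-up order implied by the functions above. The helper is
 * hypothetical and skips the pin-all capability and error reporting.
 */
static RDMAContext *example_source_setup(InetSocketAddress *saddr,
                                         Error **errp)
{
    RDMAContext *rdma = qemu_rdma_data_init(saddr, errp);

    if (rdma &&
        qemu_rdma_source_init(rdma, false /* pin_all */, errp) == 0 &&
        qemu_rdma_connect(rdma, false /* return_path */, errp) == 0) {
        return rdma; /* QP connected, first READY recv posted */
    }
    return NULL;
}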
/*
 * QEMUFile interface to the control channel.
 * SEND messages for control only.
 * VM's ram is handled with regular RDMA messages.
 */
static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
                                       const struct iovec *iov,
                                       size_t niov,
                                       int *fds,
                                       size_t nfds,
                                       int flags,
                                       Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    RDMAContext *rdma;
    int ret;
    ssize_t done = 0;
    size_t len;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);

    if (!rdma) {
        error_setg(errp, "RDMA control channel output is not set");
        return -1;
    }

    if (rdma->errored) {
        error_setg(errp,
                   "RDMA is in an error state waiting migration to abort!");
        return -1;
    }

    /*
     * Push out any writes that
     * we're queued up for VM's ram.
     */
    ret = qemu_rdma_write_flush(rdma, errp);
    if (ret < 0) {
        rdma->errored = true;
        return -1;
    }

    for (int i = 0; i < niov; i++) {
        size_t remaining = iov[i].iov_len;
        uint8_t *data = (void *)iov[i].iov_base;
        while (remaining) {
            RDMAControlHeader head = {};

            len = MIN(remaining, RDMA_SEND_INCREMENT);
            remaining -= len;

            head.len = len;
            head.type = RDMA_CONTROL_QEMU_FILE;

            ret = qemu_rdma_exchange_send(rdma, &head,
                                          data, NULL, NULL, NULL, errp);

            if (ret < 0) {
                rdma->errored = true;
                return -1;
            }

            data += len;
            done += len;
        }
    }

    return done;
}
static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
                             size_t size, int idx)
{
    size_t len = 0;

    if (rdma->wr_data[idx].control_len) {
        trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);

        len = MIN(size, rdma->wr_data[idx].control_len);
        memcpy(buf, rdma->wr_data[idx].control_curr, len);
        rdma->wr_data[idx].control_curr += len;
        rdma->wr_data[idx].control_len -= len;
    }

    return len;
}
/*
 * RDMA links don't use bytestreams, so we have to
 * return bytes to QEMUFile opportunistically.
 */
static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
                                      const struct iovec *iov,
                                      size_t niov,
                                      int **fds,
                                      size_t *nfds,
                                      int flags,
                                      Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    RDMAContext *rdma;
    RDMAControlHeader head;
    int ret;
    ssize_t done = 0;
    size_t len;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        error_setg(errp, "RDMA control channel input is not set");
        return -1;
    }

    if (rdma->errored) {
        error_setg(errp,
                   "RDMA is in an error state waiting migration to abort!");
        return -1;
    }

    for (int i = 0; i < niov; i++) {
        size_t want = iov[i].iov_len;
        uint8_t *data = (void *)iov[i].iov_base;

        /* dish out the bytes left over from the last SEND message first */
        len = qemu_rdma_fill(rdma, data, want, 0);
        done += len;
        want -= len;

        if (want == 0) {
            continue;
        }
        if (done > 0) {
            break;
        }

        /* nothing buffered at all, so wait for more to arrive */
        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE,
                                      errp);

        if (ret < 0) {
            rdma->errored = true;
            return -1;
        }

        /* the SEND was received with new bytes, now try again */
        len = qemu_rdma_fill(rdma, data, want, 0);
        done += len;
        want -= len;

        /* still didn't get enough, so just return what we have */
        if (want) {
            if (done == 0) {
                return QIO_CHANNEL_ERR_BLOCK;
            }
            break;
        }
    }
    return done;
}
static int qemu_rdma_drain_cq(RDMAContext *rdma)
{
    Error *err = NULL;

    if (qemu_rdma_write_flush(rdma, &err) < 0) {
        error_report_err(err);
        return -1;
    }

    while (rdma->nb_sent) {
        if (qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL) < 0) {
            error_report("rdma migration: complete polling error!");
            return -1;
        }
    }

    qemu_rdma_unregister_waiting(rdma);

    return 0;
}
static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
                                         bool blocking,
                                         Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    /* XXX we should make readv/writev actually honour this :-) */
    rioc->blocking = blocking;
    return 0;
}
static gboolean
qio_channel_rdma_source_prepare(GSource *source,
                                gint *timeout)
{
    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
    RDMAContext *rdma;
    GIOCondition cond = 0;
    *timeout = -1;

    RCU_READ_LOCK_GUARD();
    if (rsource->condition == G_IO_IN) {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
    } else {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
    }

    if (!rdma) {
        return FALSE;
    }

    if (rdma->wr_data[0].control_len) {
        cond |= G_IO_IN;
    }
    cond |= G_IO_OUT;

    return cond & rsource->condition;
}
static gboolean
qio_channel_rdma_source_check(GSource *source)
{
    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
    RDMAContext *rdma;
    GIOCondition cond = 0;

    RCU_READ_LOCK_GUARD();
    if (rsource->condition == G_IO_IN) {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
    } else {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
    }

    if (!rdma) {
        return FALSE;
    }

    if (rdma->wr_data[0].control_len) {
        cond |= G_IO_IN;
    }
    cond |= G_IO_OUT;

    return cond & rsource->condition;
}
static gboolean
qio_channel_rdma_source_dispatch(GSource *source,
                                 GSourceFunc callback,
                                 gpointer user_data)
{
    QIOChannelFunc func = (QIOChannelFunc)callback;
    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
    RDMAContext *rdma;
    GIOCondition cond = 0;

    RCU_READ_LOCK_GUARD();
    if (rsource->condition == G_IO_IN) {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
    } else {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
    }

    if (!rdma) {
        return FALSE;
    }

    if (rdma->wr_data[0].control_len) {
        cond |= G_IO_IN;
    }
    cond |= G_IO_OUT;

    return (*func)(QIO_CHANNEL(rsource->rioc),
                   (cond & rsource->condition),
                   user_data);
}
static void
qio_channel_rdma_source_finalize(GSource *source)
{
    QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;

    object_unref(OBJECT(ssource->rioc));
}
static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
                                              GIOCondition condition)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    QIOChannelRDMASource *ssource;
    GSource *source;

    source = g_source_new(&qio_channel_rdma_source_funcs,
                          sizeof(QIOChannelRDMASource));
    ssource = (QIOChannelRDMASource *)source;

    ssource->rioc = rioc;
    object_ref(OBJECT(rioc));

    ssource->condition = condition;

    return source;
}
static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
                                                AioContext *read_ctx,
                                                IOHandler *io_read,
                                                AioContext *write_ctx,
                                                IOHandler *io_write,
                                                void *opaque)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);

    if (io_read) {
        aio_set_fd_handler(read_ctx, rioc->rdmain->recv_comp_channel->fd,
                           io_read, io_write, NULL, NULL, opaque);
        aio_set_fd_handler(read_ctx, rioc->rdmain->send_comp_channel->fd,
                           io_read, io_write, NULL, NULL, opaque);
    } else {
        aio_set_fd_handler(write_ctx, rioc->rdmaout->recv_comp_channel->fd,
                           io_read, io_write, NULL, NULL, opaque);
        aio_set_fd_handler(write_ctx, rioc->rdmaout->send_comp_channel->fd,
                           io_read, io_write, NULL, NULL, opaque);
    }
}
static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
{
    if (rcu->rdmain) {
        qemu_rdma_cleanup(rcu->rdmain);
    }

    if (rcu->rdmaout) {
        qemu_rdma_cleanup(rcu->rdmaout);
    }

    g_free(rcu->rdmain);
    g_free(rcu->rdmaout);
    g_free(rcu);
}
2961 rdmain = rioc->rdmain; in qio_channel_rdma_close()
2963 qatomic_rcu_set(&rioc->rdmain, NULL); in qio_channel_rdma_close()
2966 rdmaout = rioc->rdmaout; in qio_channel_rdma_close()
2968 qatomic_rcu_set(&rioc->rdmaout, NULL); in qio_channel_rdma_close()
2971 rcu->rdmain = rdmain; in qio_channel_rdma_close()
2972 rcu->rdmaout = rdmaout; in qio_channel_rdma_close()
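/*
 * Editor's note: close() publishes NULL into rioc->rdmain/rdmaout with
 * qatomic_rcu_set() and defers the real teardown to
 * qio_channel_rdma_close_rcu() via call_rcu(), so readers that already
 * fetched a context pointer inside an RCU read section finish before
 * qemu_rdma_cleanup() and g_free() run.
 */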
2988 rdmain = qatomic_rcu_read(&rioc->rdmain); in qio_channel_rdma_shutdown()
2989 rdmaout = qatomic_rcu_read(&rioc->rdmaout); in qio_channel_rdma_shutdown()
2994 rdmain->errored = true; in qio_channel_rdma_shutdown()
2999 rdmaout->errored = true; in qio_channel_rdma_shutdown()
3005 rdmain->errored = true; in qio_channel_rdma_shutdown()
3008 rdmaout->errored = true; in qio_channel_rdma_shutdown()
3030 * @pages_sent : User-specified pointer to indicate how many pages were in qemu_rdma_save_page()
3039 RDMAContext *rdma; in qemu_rdma_save_page() local
3043 rdma = qatomic_rcu_read(&rioc->rdmaout); in qemu_rdma_save_page()
3045 if (!rdma) { in qemu_rdma_save_page()
3046 return -1; in qemu_rdma_save_page()
3049 if (rdma_errored(rdma)) { in qemu_rdma_save_page()
3050 return -1; in qemu_rdma_save_page()
3058 * an actual RDMA write will occur and a new chunk will be formed. in qemu_rdma_save_page()
3060 ret = qemu_rdma_write(rdma, block_offset, offset, size, &err); in qemu_rdma_save_page()
3075 ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL); in qemu_rdma_save_page()
3078 error_report("rdma migration: polling error"); in qemu_rdma_save_page()
3091 ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL); in qemu_rdma_save_page()
3094 error_report("rdma migration: polling error"); in qemu_rdma_save_page()
3108 rdma->errored = true; in qemu_rdma_save_page()
3109 return -1; in qemu_rdma_save_page()
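/*
 * Editor's sketch: after queueing the write, qemu_rdma_save_page()
 * opportunistically reaps completions from both the receive and send
 * CQs without blocking, keeping the send queue from filling up between
 * pages.  Reduced form of one reap pass ('cq' is an assumption):
 */
static void sketch_reap_completions(struct ibv_cq *cq)
{
    struct ibv_wc wc;

    /* Drain whatever has already completed; never block here. */
    while (ibv_poll_cq(cq, 1, &wc) > 0) {
        if (wc.status != IBV_WC_SUCCESS) {
            break;                  /* caller must mark the channel errored */
        }
    }
}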
3131 RDMAContext *rdma = opaque; in rdma_cm_poll_handler() local
3135 if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) { in rdma_cm_poll_handler()
3140 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED || in rdma_cm_poll_handler()
3141 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { in rdma_cm_poll_handler()
3142 if (!rdma->errored && in rdma_cm_poll_handler()
3143 migration_incoming_get_current()->state != in rdma_cm_poll_handler()
3145 error_report("RDMA: received CM event %d", cm_event->event); in rdma_cm_poll_handler()
3146 rdma->errored = true; in rdma_cm_poll_handler()
3147 if (rdma->return_path) { in rdma_cm_poll_handler()
3148 rdma->return_path->errored = true; in rdma_cm_poll_handler()
3152 if (mis->loadvm_co) { in rdma_cm_poll_handler()
3153 qemu_coroutine_enter(mis->loadvm_co); in rdma_cm_poll_handler()
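/*
 * Editor's sketch: rdma_cm_poll_handler() follows the canonical
 * librdmacm shape - fetch the event, inspect it, acknowledge it.
 * Skipping rdma_ack_cm_event() pins the event and makes a later
 * rdma_destroy_id() hang.  Minimal standalone form:
 */
static int sketch_poll_cm_event(struct rdma_event_channel *channel)
{
    struct rdma_cm_event *ev;
    enum rdma_cm_event_type type;

    if (rdma_get_cm_event(channel, &ev) < 0) {
        return -1;
    }
    type = ev->event;
    rdma_ack_cm_event(ev);
    return (type == RDMA_CM_EVENT_DISCONNECTED ||
            type == RDMA_CM_EVENT_DEVICE_REMOVAL) ? -1 : 0;
}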
3160 static int qemu_rdma_accept(RDMAContext *rdma) in qemu_rdma_accept() argument
3175 ret = rdma_get_cm_event(rdma->channel, &cm_event); in qemu_rdma_accept()
3180 if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { in qemu_rdma_accept()
3185 isock->host = g_strdup(rdma->host); in qemu_rdma_accept()
3186 isock->port = g_strdup_printf("%d", rdma->port); in qemu_rdma_accept()
3193 && !rdma->is_return_path) { in qemu_rdma_accept()
3200 qemu_rdma_return_path_dest_init(rdma_return_path, rdma); in qemu_rdma_accept()
3203 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap)); in qemu_rdma_accept()
3208 error_report("Unknown source RDMA version: %d, bailing...", in qemu_rdma_accept()
3224 rdma->pin_all = true; in qemu_rdma_accept()
3227 rdma->cm_id = cm_event->id; in qemu_rdma_accept()
3228 verbs = cm_event->id->verbs; in qemu_rdma_accept()
3232 trace_qemu_rdma_accept_pin_state(rdma->pin_all); in qemu_rdma_accept()
3238 if (!rdma->verbs) { in qemu_rdma_accept()
3239 rdma->verbs = verbs; in qemu_rdma_accept()
3240 } else if (rdma->verbs != verbs) { in qemu_rdma_accept()
3241 error_report("ibv context not matching %p, %p!", rdma->verbs, in qemu_rdma_accept()
3248 ret = qemu_rdma_alloc_pd_cq(rdma, &err); in qemu_rdma_accept()
3254 ret = qemu_rdma_alloc_qp(rdma); in qemu_rdma_accept()
3256 error_report("rdma migration: error allocating qp!"); in qemu_rdma_accept()
3260 qemu_rdma_init_ram_blocks(rdma); in qemu_rdma_accept()
3263 ret = qemu_rdma_reg_control(rdma, i); in qemu_rdma_accept()
3265 error_report("rdma: error registering %d control", i); in qemu_rdma_accept()
3272 && !rdma->is_return_path) { in qemu_rdma_accept()
3273 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, in qemu_rdma_accept()
3275 (void *)(intptr_t)rdma->return_path); in qemu_rdma_accept()
3277 qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler, in qemu_rdma_accept()
3278 NULL, rdma); in qemu_rdma_accept()
3281 ret = rdma_accept(rdma->cm_id, &conn_param); in qemu_rdma_accept()
3287 ret = rdma_get_cm_event(rdma->channel, &cm_event); in qemu_rdma_accept()
3293 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { in qemu_rdma_accept()
3300 rdma->connected = true; in qemu_rdma_accept()
3302 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, &err); in qemu_rdma_accept()
3308 qemu_rdma_dump_gid("dest_connect", rdma->cm_id); in qemu_rdma_accept()
3313 rdma->errored = true; in qemu_rdma_accept()
3314 qemu_rdma_cleanup(rdma); in qemu_rdma_accept()
3316 return -1; in qemu_rdma_accept()
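/*
 * Editor's sketch (standard librdmacm passive side, all names are
 * assumptions): the bare accept sequence that qemu_rdma_accept()
 * implements above, minus capability negotiation, PD/CQ/QP setup and
 * QEMU state tracking.
 */
static int sketch_passive_accept(struct rdma_event_channel *channel,
                                 struct rdma_cm_id **conn_id)
{
    struct rdma_cm_event *ev;
    struct rdma_conn_param param = { .responder_resources = 2 };

    if (rdma_get_cm_event(channel, &ev) < 0) {
        return -1;
    }
    if (ev->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(ev);
        return -1;
    }
    *conn_id = ev->id;                  /* fresh id for this connection */
    rdma_ack_cm_event(ev);

    /* PD, CQs and the QP must be created on (*conn_id)->verbs here. */

    if (rdma_accept(*conn_id, &param) < 0) {
        return -1;
    }
    if (rdma_get_cm_event(channel, &ev) < 0) {
        return -1;
    }
    if (ev->event != RDMA_CM_EVENT_ESTABLISHED) {
        rdma_ack_cm_event(ev);
        return -1;
    }
    rdma_ack_cm_event(ev);
    return 0;
}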
3321 unsigned int a_index = ((const RDMALocalBlock *)a)->src_index; in dest_ram_sort_func()
3322 unsigned int b_index = ((const RDMALocalBlock *)b)->src_index; in dest_ram_sort_func()
3324 return (a_index < b_index) ? -1 : (a_index != b_index); in dest_ram_sort_func()
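/*
 * Editor's note: the comparator above packs the usual three-way result
 * into one expression - (a_index < b_index) yields -1, equality yields
 * 0, and (a_index != b_index) yields 1 for the greater case - so
 * qsort() reorders the destination's RAMBlocks to match the source's
 * block indices.
 */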
3330 * can perform RDMA operations.
3350 RDMAContext *rdma; in rdma_registration_handle() local
3368 rdma = qatomic_rcu_read(&rioc->rdmain); in rdma_registration_handle()
3370 if (!rdma) { in rdma_registration_handle()
3371 return -1; in rdma_registration_handle()
3374 if (rdma_errored(rdma)) { in rdma_registration_handle()
3375 return -1; in rdma_registration_handle()
3378 local = &rdma->local_ram_blocks; in rdma_registration_handle()
3382 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE, &err); in rdma_registration_handle()
3390 error_report("rdma: Too many requests in this message (%d)." in rdma_registration_handle()
3397 comp = (RDMACompress *) rdma->wr_data[idx].control_curr; in rdma_registration_handle()
3400 trace_rdma_registration_handle_compress(comp->length, in rdma_registration_handle()
3401 comp->block_idx, in rdma_registration_handle()
3402 comp->offset); in rdma_registration_handle()
3403 if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) { in rdma_registration_handle()
3404 error_report("rdma: 'compress' bad block index %u (vs %d)", in rdma_registration_handle()
3405 (unsigned int)comp->block_idx, in rdma_registration_handle()
3406 rdma->local_ram_blocks.nb_blocks); in rdma_registration_handle()
3409 block = &(rdma->local_ram_blocks.block[comp->block_idx]); in rdma_registration_handle()
3411 host_addr = block->local_host_addr + in rdma_registration_handle()
3412 (comp->offset - block->offset); in rdma_registration_handle()
3413 if (comp->value) { in rdma_registration_handle()
3414 error_report("rdma: Zero page with non-zero (%d) value", in rdma_registration_handle()
3415 comp->value); in rdma_registration_handle()
3418 ram_handle_zero(host_addr, comp->length); in rdma_registration_handle()
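/*
 * Editor's note: a "compress" message with value 0 is the zero-page
 * case - only the descriptor crosses the wire, and ram_handle_zero()
 * materialises the page locally (zeroing only when it is not already
 * zero, so untouched pages are not faulted in needlessly).  Only a
 * zero fill value is supported here, hence the error above.
 */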
3432 qsort(rdma->local_ram_blocks.block, in rdma_registration_handle()
3433 rdma->local_ram_blocks.nb_blocks, in rdma_registration_handle()
3435 for (int i = 0; i < local->nb_blocks; i++) { in rdma_registration_handle()
3436 local->block[i].index = i; in rdma_registration_handle()
3439 if (rdma->pin_all) { in rdma_registration_handle()
3440 ret = qemu_rdma_reg_whole_ram_blocks(rdma, &err); in rdma_registration_handle()
3453 for (int i = 0; i < local->nb_blocks; i++) { in rdma_registration_handle()
3454 rdma->dest_blocks[i].remote_host_addr = in rdma_registration_handle()
3455 (uintptr_t)(local->block[i].local_host_addr); in rdma_registration_handle()
3457 if (rdma->pin_all) { in rdma_registration_handle()
3458 rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey; in rdma_registration_handle()
3461 rdma->dest_blocks[i].offset = local->block[i].offset; in rdma_registration_handle()
3462 rdma->dest_blocks[i].length = local->block[i].length; in rdma_registration_handle()
3464 dest_block_to_network(&rdma->dest_blocks[i]); in rdma_registration_handle()
3466 local->block[i].block_name, in rdma_registration_handle()
3467 local->block[i].offset, in rdma_registration_handle()
3468 local->block[i].length, in rdma_registration_handle()
3469 local->block[i].local_host_addr, in rdma_registration_handle()
3470 local->block[i].src_index); in rdma_registration_handle()
3473 blocks.len = rdma->local_ram_blocks.nb_blocks in rdma_registration_handle()
3477 ret = qemu_rdma_post_send_control(rdma, in rdma_registration_handle()
3478 (uint8_t *) rdma->dest_blocks, &blocks, in rdma_registration_handle()
3491 registers = (RDMARegister *) rdma->wr_data[idx].control_curr; in rdma_registration_handle()
3503 reg->current_index, reg->key.current_addr, reg->chunks); in rdma_registration_handle()
3505 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) { in rdma_registration_handle()
3506 error_report("rdma: 'register' bad block index %u (vs %d)", in rdma_registration_handle()
3507 (unsigned int)reg->current_index, in rdma_registration_handle()
3508 rdma->local_ram_blocks.nb_blocks); in rdma_registration_handle()
3511 block = &(rdma->local_ram_blocks.block[reg->current_index]); in rdma_registration_handle()
3512 if (block->is_ram_block) { in rdma_registration_handle()
3513 if (block->offset > reg->key.current_addr) { in rdma_registration_handle()
3514 error_report("rdma: bad register address for block %s" in rdma_registration_handle()
3516 block->block_name, block->offset, in rdma_registration_handle()
3517 reg->key.current_addr); in rdma_registration_handle()
3520 host_addr = (block->local_host_addr + in rdma_registration_handle()
3521 (reg->key.current_addr - block->offset)); in rdma_registration_handle()
3522 chunk = ram_chunk_index(block->local_host_addr, in rdma_registration_handle()
3525 chunk = reg->key.chunk; in rdma_registration_handle()
3526 host_addr = block->local_host_addr + in rdma_registration_handle()
3527 (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT)); in rdma_registration_handle()
3529 if (host_addr < (void *)block->local_host_addr) { in rdma_registration_handle()
3530 error_report("rdma: bad chunk for block %s" in rdma_registration_handle()
3532 block->block_name, reg->key.chunk); in rdma_registration_handle()
3537 chunk_end = ram_chunk_end(block, chunk + reg->chunks); in rdma_registration_handle()
3538 /* avoid "-Waddress-of-packed-member" warning */ in rdma_registration_handle()
3540 if (qemu_rdma_register_and_get_keys(rdma, block, in rdma_registration_handle()
3546 reg_result->rkey = tmp_rkey; in rdma_registration_handle()
3548 reg_result->host_addr = (uintptr_t)block->local_host_addr; in rdma_registration_handle()
3550 trace_rdma_registration_handle_register_rkey(reg_result->rkey); in rdma_registration_handle()
3555 ret = qemu_rdma_post_send_control(rdma, in rdma_registration_handle()
3566 registers = (RDMARegister *) rdma->wr_data[idx].control_curr; in rdma_registration_handle()
3573 reg->current_index, reg->key.chunk); in rdma_registration_handle()
3575 block = &(rdma->local_ram_blocks.block[reg->current_index]); in rdma_registration_handle()
3577 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]); in rdma_registration_handle()
3578 block->pmr[reg->key.chunk] = NULL; in rdma_registration_handle()
3581 error_report("rdma unregistration chunk failed: %s", in rdma_registration_handle()
3586 rdma->total_registrations--; in rdma_registration_handle()
3588 trace_rdma_registration_handle_unregister_success(reg->key.chunk); in rdma_registration_handle()
3591 ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp, &err); in rdma_registration_handle()
3608 rdma->errored = true; in rdma_registration_handle()
3609 return -1; in rdma_registration_handle()
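/*
 * Editor's sketch: the REGISTER/UNREGISTER cases above pivot on a
 * chunk-sized memory region.  qemu_rdma_register_and_get_keys() boils
 * down to an ibv_reg_mr() whose rkey is returned to the source, and
 * the UNREGISTER path is its inverse via ibv_dereg_mr(), after which
 * the old rkey is dead and the source is told so before it can write
 * again.  Hypothetical reduced forms:
 */
static struct ibv_mr *sketch_register_chunk(struct ibv_pd *pd,
                                            void *chunk_start,
                                            size_t chunk_len)
{
    /* REMOTE_WRITE is what lets the source RDMA into this chunk. */
    return ibv_reg_mr(pd, chunk_start, chunk_len,
                      IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
}

static int sketch_unregister_chunk(struct ibv_mr **pmr_slot)
{
    int ret = ibv_dereg_mr(*pmr_slot);

    *pmr_slot = NULL;                   /* allow later re-registration */
    return ret;
}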
3621 int found = -1; in rdma_block_notification_handle()
3629 RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmain); in rdma_block_notification_handle() local
3631 if (!rdma) { in rdma_block_notification_handle()
3632 return -1; in rdma_block_notification_handle()
3636 for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) { in rdma_block_notification_handle()
3637 if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) { in rdma_block_notification_handle()
3643 if (found == -1) { in rdma_block_notification_handle()
3645 return -1; in rdma_block_notification_handle()
3648 rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index; in rdma_block_notification_handle()
3649 trace_rdma_block_notification_handle(name, rdma->next_src_index); in rdma_block_notification_handle()
3650 rdma->next_src_index++; in rdma_block_notification_handle()
3663 RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmaout); in rdma_registration_start() local
3664 if (!rdma) { in rdma_registration_start()
3665 return -1; in rdma_registration_start()
3668 if (rdma_errored(rdma)) { in rdma_registration_start()
3669 return -1; in rdma_registration_start()
3685 RDMAContext *rdma; in rdma_registration_stop() local
3695 rdma = qatomic_rcu_read(&rioc->rdmaout); in rdma_registration_stop()
3696 if (!rdma) { in rdma_registration_stop()
3697 return -1; in rdma_registration_stop()
3700 if (rdma_errored(rdma)) { in rdma_registration_stop()
3701 return -1; in rdma_registration_stop()
3705 ret = qemu_rdma_drain_cq(rdma); in rdma_registration_stop()
3713 RDMALocalBlocks *local = &rdma->local_ram_blocks; in rdma_registration_stop()
3727 ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp, in rdma_registration_stop()
3728 &reg_result_idx, rdma->pin_all ? in rdma_registration_stop()
3733 return -1; in rdma_registration_stop()
3741 * (dynamic chunk registration disabled - pin everything with one rkey.) in rdma_registration_stop()
3743 * (dynamic chunk registration enabled - pin individual chunks.) in rdma_registration_stop()
3750 if (local->nb_blocks != nb_dest_blocks) { in rdma_registration_stop()
3752 local->nb_blocks, nb_dest_blocks); in rdma_registration_stop()
3755 rdma->errored = true; in rdma_registration_stop()
3756 return -1; in rdma_registration_stop()
3759 qemu_rdma_move_header(rdma, reg_result_idx, &resp); in rdma_registration_stop()
3760 memcpy(rdma->dest_blocks, in rdma_registration_stop()
3761 rdma->wr_data[reg_result_idx].control_curr, resp.len); in rdma_registration_stop()
3763 network_to_dest_block(&rdma->dest_blocks[i]); in rdma_registration_stop()
3766 if (rdma->dest_blocks[i].length != local->block[i].length) { in rdma_registration_stop()
3769 local->block[i].block_name, i, in rdma_registration_stop()
3770 local->block[i].length, in rdma_registration_stop()
3771 rdma->dest_blocks[i].length); in rdma_registration_stop()
3772 rdma->errored = true; in rdma_registration_stop()
3773 return -1; in rdma_registration_stop()
3775 local->block[i].remote_host_addr = in rdma_registration_stop()
3776 rdma->dest_blocks[i].remote_host_addr; in rdma_registration_stop()
3777 local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey; in rdma_registration_stop()
3784 ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL, &err); in rdma_registration_stop()
3793 rdma->errored = true; in rdma_registration_stop()
3794 return -1; in rdma_registration_stop()
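/*
 * Editor's note: the ordering in rdma_registration_stop() is
 * load-bearing - the CQ is drained first so every outstanding RDMA
 * write has landed, then (on the setup round) the RAM-blocks
 * request/response verifies that both sides agree on block count and
 * lengths before any remote keys are trusted, and the final control
 * round trip tells the destination it may stop servicing registration
 * requests.
 */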
3800 if (rioc->rdmain) { in qio_channel_rdma_finalize()
3801 qemu_rdma_cleanup(rioc->rdmain); in qio_channel_rdma_finalize()
3802 g_free(rioc->rdmain); in qio_channel_rdma_finalize()
3803 rioc->rdmain = NULL; in qio_channel_rdma_finalize()
3805 if (rioc->rdmaout) { in qio_channel_rdma_finalize()
3806 qemu_rdma_cleanup(rioc->rdmaout); in qio_channel_rdma_finalize()
3807 g_free(rioc->rdmaout); in qio_channel_rdma_finalize()
3808 rioc->rdmaout = NULL; in qio_channel_rdma_finalize()
3817 ioc_klass->io_writev = qio_channel_rdma_writev; in qio_channel_rdma_class_init()
3818 ioc_klass->io_readv = qio_channel_rdma_readv; in qio_channel_rdma_class_init()
3819 ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking; in qio_channel_rdma_class_init()
3820 ioc_klass->io_close = qio_channel_rdma_close; in qio_channel_rdma_class_init()
3821 ioc_klass->io_create_watch = qio_channel_rdma_create_watch; in qio_channel_rdma_class_init()
3822 ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler; in qio_channel_rdma_class_init()
3823 ioc_klass->io_shutdown = qio_channel_rdma_shutdown; in qio_channel_rdma_class_init()
3841 static QEMUFile *rdma_new_input(RDMAContext *rdma) in rdma_new_input() argument
3845 rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc)); in rdma_new_input()
3846 rioc->rdmain = rdma; in rdma_new_input()
3847 rioc->rdmaout = rdma->return_path; in rdma_new_input()
3849 return rioc->file; in rdma_new_input()
3852 static QEMUFile *rdma_new_output(RDMAContext *rdma) in rdma_new_output() argument
3856 rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc)); in rdma_new_output()
3857 rioc->rdmaout = rdma; in rdma_new_output()
3858 rioc->rdmain = rdma->return_path; in rdma_new_output()
3860 return rioc->file; in rdma_new_output()
3865 RDMAContext *rdma = opaque; in rdma_accept_incoming_migration() local
3869 if (qemu_rdma_accept(rdma) < 0) { in rdma_accept_incoming_migration()
3870 error_report("RDMA ERROR: Migration initialization failed"); in rdma_accept_incoming_migration()
3876 if (rdma->is_return_path) { in rdma_accept_incoming_migration()
3880 f = rdma_new_input(rdma); in rdma_accept_incoming_migration()
3882 error_report("RDMA ERROR: could not open RDMA for input"); in rdma_accept_incoming_migration()
3883 qemu_rdma_cleanup(rdma); in rdma_accept_incoming_migration()
3887 rdma->migration_started_on_destination = 1; in rdma_accept_incoming_migration()
3896 RDMAContext *rdma; in rdma_start_incoming_migration() local
3902 error_setg(errp, "RDMA: cannot disable RAM discard"); in rdma_start_incoming_migration()
3906 rdma = qemu_rdma_data_init(host_port, errp); in rdma_start_incoming_migration()
3907 if (rdma == NULL) { in rdma_start_incoming_migration()
3911 ret = qemu_rdma_dest_init(rdma, errp); in rdma_start_incoming_migration()
3918 ret = rdma_listen(rdma->listen_id, 5); in rdma_start_incoming_migration()
3921 error_setg(errp, "RDMA ERROR: listening on socket!"); in rdma_start_incoming_migration()
3926 s->rdma_migration = true; in rdma_start_incoming_migration()
3927 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, in rdma_start_incoming_migration()
3928 NULL, (void *)(intptr_t)rdma); in rdma_start_incoming_migration()
3932 qemu_rdma_cleanup(rdma); in rdma_start_incoming_migration()
3934 if (rdma) { in rdma_start_incoming_migration()
3935 g_free(rdma->host); in rdma_start_incoming_migration()
3937 g_free(rdma); in rdma_start_incoming_migration()
3945 RDMAContext *rdma; in rdma_start_outgoing_migration() local
3950 error_setg(errp, "RDMA: cannot disable RAM discard"); in rdma_start_outgoing_migration()
3954 rdma = qemu_rdma_data_init(host_port, errp); in rdma_start_outgoing_migration()
3955 if (rdma == NULL) { in rdma_start_outgoing_migration()
3959 ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp); in rdma_start_outgoing_migration()
3966 ret = qemu_rdma_connect(rdma, false, errp); in rdma_start_outgoing_migration()
3972 /* RDMA postcopy need a separate queue pair for return path */ in rdma_start_outgoing_migration()
3993 rdma->return_path = rdma_return_path; in rdma_start_outgoing_migration()
3994 rdma_return_path->return_path = rdma; in rdma_start_outgoing_migration()
3995 rdma_return_path->is_return_path = true; in rdma_start_outgoing_migration()
4000 s->to_dst_file = rdma_new_output(rdma); in rdma_start_outgoing_migration()
4001 s->rdma_migration = true; in rdma_start_outgoing_migration()
4005 qemu_rdma_cleanup(rdma); in rdma_start_outgoing_migration()
4007 g_free(rdma); in rdma_start_outgoing_migration()
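/*
 * Editor's sketch (standard librdmacm active side, all names are
 * assumptions): the connect sequence that qemu_rdma_source_init() and
 * qemu_rdma_connect() drive for the outgoing migration above.
 */
static int sketch_expect(struct rdma_event_channel *ch,
                         enum rdma_cm_event_type want)
{
    struct rdma_cm_event *ev;
    int ok;

    if (rdma_get_cm_event(ch, &ev) < 0) {
        return -1;
    }
    ok = (ev->event == want) ? 0 : -1;
    rdma_ack_cm_event(ev);
    return ok;
}

static int sketch_active_connect(struct rdma_event_channel *ch,
                                 struct rdma_cm_id *id, struct sockaddr *dst)
{
    struct rdma_conn_param param = { .initiator_depth = 2, .retry_count = 5 };

    if (rdma_resolve_addr(id, NULL, dst, 2000) < 0 ||
        sketch_expect(ch, RDMA_CM_EVENT_ADDR_RESOLVED) < 0) {
        return -1;
    }
    if (rdma_resolve_route(id, 2000) < 0 ||
        sketch_expect(ch, RDMA_CM_EVENT_ROUTE_RESOLVED) < 0) {
        return -1;
    }
    /* PD, CQs and the QP must exist on id->verbs before connecting. */
    if (rdma_connect(id, &param) < 0 ||
        sketch_expect(ch, RDMA_CM_EVENT_ESTABLISHED) < 0) {
        return -1;
    }
    return 0;
}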