1 /* 2 * Dirty page rate limit implementation code 3 * 4 * Copyright (c) 2022 CHINA TELECOM CO.,LTD. 5 * 6 * Authors: 7 * Hyman Huang(黄勇) <huangy81@chinatelecom.cn> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or later. 10 * See the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include "qemu/main-loop.h" 15 #include "qapi/qapi-commands-migration.h" 16 #include "qobject/qdict.h" 17 #include "qapi/error.h" 18 #include "system/dirtyrate.h" 19 #include "system/dirtylimit.h" 20 #include "monitor/hmp.h" 21 #include "monitor/monitor.h" 22 #include "exec/memory.h" 23 #include "exec/target_page.h" 24 #include "hw/boards.h" 25 #include "system/kvm.h" 26 #include "trace.h" 27 #include "migration/misc.h" 28 29 /* 30 * Dirtylimit stop working if dirty page rate error 31 * value less than DIRTYLIMIT_TOLERANCE_RANGE 32 */ 33 #define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */ 34 /* 35 * Plus or minus vcpu sleep time linearly if dirty 36 * page rate error value percentage over 37 * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT. 38 * Otherwise, plus or minus a fixed vcpu sleep time. 39 */ 40 #define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50 41 /* 42 * Max vcpu sleep time percentage during a cycle 43 * composed of dirty ring full and sleep time. 44 */ 45 #define DIRTYLIMIT_THROTTLE_PCT_MAX 99 46 47 struct { 48 VcpuStat stat; 49 bool running; 50 QemuThread thread; 51 } *vcpu_dirty_rate_stat; 52 53 typedef struct VcpuDirtyLimitState { 54 int cpu_index; 55 bool enabled; 56 /* 57 * Quota dirty page rate, unit is MB/s 58 * zero if not enabled. 59 */ 60 uint64_t quota; 61 } VcpuDirtyLimitState; 62 63 struct { 64 VcpuDirtyLimitState *states; 65 /* Max cpus number configured by user */ 66 int max_cpus; 67 /* Number of vcpu under dirtylimit */ 68 int limited_nvcpu; 69 } *dirtylimit_state; 70 71 /* protect dirtylimit_state */ 72 static QemuMutex dirtylimit_mutex; 73 74 /* dirtylimit thread quit if dirtylimit_quit is true */ 75 static bool dirtylimit_quit; 76 77 static void vcpu_dirty_rate_stat_collect(void) 78 { 79 VcpuStat stat; 80 int i = 0; 81 int64_t period = DIRTYLIMIT_CALC_TIME_MS; 82 83 if (migrate_dirty_limit() && migration_is_running()) { 84 period = migrate_vcpu_dirty_limit_period(); 85 } 86 87 /* calculate vcpu dirtyrate */ 88 vcpu_calculate_dirtyrate(period, 89 &stat, 90 GLOBAL_DIRTY_LIMIT, 91 false); 92 93 for (i = 0; i < stat.nvcpu; i++) { 94 vcpu_dirty_rate_stat->stat.rates[i].id = i; 95 vcpu_dirty_rate_stat->stat.rates[i].dirty_rate = 96 stat.rates[i].dirty_rate; 97 } 98 99 g_free(stat.rates); 100 } 101 102 static void *vcpu_dirty_rate_stat_thread(void *opaque) 103 { 104 rcu_register_thread(); 105 106 /* start log sync */ 107 global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true); 108 109 while (qatomic_read(&vcpu_dirty_rate_stat->running)) { 110 vcpu_dirty_rate_stat_collect(); 111 if (dirtylimit_in_service()) { 112 dirtylimit_process(); 113 } 114 } 115 116 /* stop log sync */ 117 global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false); 118 119 rcu_unregister_thread(); 120 return NULL; 121 } 122 123 int64_t vcpu_dirty_rate_get(int cpu_index) 124 { 125 DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates; 126 return qatomic_read_i64(&rates[cpu_index].dirty_rate); 127 } 128 129 void vcpu_dirty_rate_stat_start(void) 130 { 131 if (qatomic_read(&vcpu_dirty_rate_stat->running)) { 132 return; 133 } 134 135 qatomic_set(&vcpu_dirty_rate_stat->running, 1); 136 qemu_thread_create(&vcpu_dirty_rate_stat->thread, 137 "dirtyrate-stat", 138 vcpu_dirty_rate_stat_thread, 139 NULL, 140 QEMU_THREAD_JOINABLE); 141 } 142 143 void vcpu_dirty_rate_stat_stop(void) 144 { 145 qatomic_set(&vcpu_dirty_rate_stat->running, 0); 146 dirtylimit_state_unlock(); 147 bql_unlock(); 148 qemu_thread_join(&vcpu_dirty_rate_stat->thread); 149 bql_lock(); 150 dirtylimit_state_lock(); 151 } 152 153 void vcpu_dirty_rate_stat_initialize(void) 154 { 155 MachineState *ms = MACHINE(qdev_get_machine()); 156 int max_cpus = ms->smp.max_cpus; 157 158 vcpu_dirty_rate_stat = 159 g_malloc0(sizeof(*vcpu_dirty_rate_stat)); 160 161 vcpu_dirty_rate_stat->stat.nvcpu = max_cpus; 162 vcpu_dirty_rate_stat->stat.rates = 163 g_new0(DirtyRateVcpu, max_cpus); 164 165 vcpu_dirty_rate_stat->running = false; 166 } 167 168 void vcpu_dirty_rate_stat_finalize(void) 169 { 170 g_free(vcpu_dirty_rate_stat->stat.rates); 171 vcpu_dirty_rate_stat->stat.rates = NULL; 172 173 g_free(vcpu_dirty_rate_stat); 174 vcpu_dirty_rate_stat = NULL; 175 } 176 177 void dirtylimit_state_lock(void) 178 { 179 qemu_mutex_lock(&dirtylimit_mutex); 180 } 181 182 void dirtylimit_state_unlock(void) 183 { 184 qemu_mutex_unlock(&dirtylimit_mutex); 185 } 186 187 static void 188 __attribute__((__constructor__)) dirtylimit_mutex_init(void) 189 { 190 qemu_mutex_init(&dirtylimit_mutex); 191 } 192 193 static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index) 194 { 195 return &dirtylimit_state->states[cpu_index]; 196 } 197 198 void dirtylimit_state_initialize(void) 199 { 200 MachineState *ms = MACHINE(qdev_get_machine()); 201 int max_cpus = ms->smp.max_cpus; 202 int i; 203 204 dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state)); 205 206 dirtylimit_state->states = 207 g_new0(VcpuDirtyLimitState, max_cpus); 208 209 for (i = 0; i < max_cpus; i++) { 210 dirtylimit_state->states[i].cpu_index = i; 211 } 212 213 dirtylimit_state->max_cpus = max_cpus; 214 trace_dirtylimit_state_initialize(max_cpus); 215 } 216 217 void dirtylimit_state_finalize(void) 218 { 219 g_free(dirtylimit_state->states); 220 dirtylimit_state->states = NULL; 221 222 g_free(dirtylimit_state); 223 dirtylimit_state = NULL; 224 225 trace_dirtylimit_state_finalize(); 226 } 227 228 bool dirtylimit_in_service(void) 229 { 230 return !!dirtylimit_state; 231 } 232 233 bool dirtylimit_vcpu_index_valid(int cpu_index) 234 { 235 MachineState *ms = MACHINE(qdev_get_machine()); 236 237 return !(cpu_index < 0 || 238 cpu_index >= ms->smp.max_cpus); 239 } 240 241 static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate) 242 { 243 static uint64_t max_dirtyrate; 244 uint64_t dirty_ring_size_MiB; 245 246 dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size()); 247 248 if (max_dirtyrate < dirtyrate) { 249 max_dirtyrate = dirtyrate; 250 } 251 252 return dirty_ring_size_MiB * 1000000 / max_dirtyrate; 253 } 254 255 static inline bool dirtylimit_done(uint64_t quota, 256 uint64_t current) 257 { 258 uint64_t min, max; 259 260 min = MIN(quota, current); 261 max = MAX(quota, current); 262 263 return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false; 264 } 265 266 static inline bool 267 dirtylimit_need_linear_adjustment(uint64_t quota, 268 uint64_t current) 269 { 270 uint64_t min, max; 271 272 min = MIN(quota, current); 273 max = MAX(quota, current); 274 275 return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT; 276 } 277 278 static void dirtylimit_set_throttle(CPUState *cpu, 279 uint64_t quota, 280 uint64_t current) 281 { 282 int64_t ring_full_time_us = 0; 283 uint64_t sleep_pct = 0; 284 uint64_t throttle_us = 0; 285 286 if (current == 0) { 287 cpu->throttle_us_per_full = 0; 288 return; 289 } 290 291 ring_full_time_us = dirtylimit_dirty_ring_full_time(current); 292 293 if (dirtylimit_need_linear_adjustment(quota, current)) { 294 if (quota < current) { 295 sleep_pct = (current - quota) * 100 / current; 296 throttle_us = 297 ring_full_time_us * sleep_pct / (double)(100 - sleep_pct); 298 cpu->throttle_us_per_full += throttle_us; 299 } else { 300 sleep_pct = (quota - current) * 100 / quota; 301 throttle_us = 302 ring_full_time_us * sleep_pct / (double)(100 - sleep_pct); 303 cpu->throttle_us_per_full -= throttle_us; 304 } 305 306 trace_dirtylimit_throttle_pct(cpu->cpu_index, 307 sleep_pct, 308 throttle_us); 309 } else { 310 if (quota < current) { 311 cpu->throttle_us_per_full += ring_full_time_us / 10; 312 } else { 313 cpu->throttle_us_per_full -= ring_full_time_us / 10; 314 } 315 } 316 317 /* 318 * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario), 319 * current dirty page rate may never reach the quota, we should stop 320 * increasing sleep time? 321 */ 322 cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full, 323 ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX); 324 325 cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0); 326 } 327 328 static void dirtylimit_adjust_throttle(CPUState *cpu) 329 { 330 uint64_t quota = 0; 331 uint64_t current = 0; 332 int cpu_index = cpu->cpu_index; 333 334 quota = dirtylimit_vcpu_get_state(cpu_index)->quota; 335 current = vcpu_dirty_rate_get(cpu_index); 336 337 if (!dirtylimit_done(quota, current)) { 338 dirtylimit_set_throttle(cpu, quota, current); 339 } 340 341 return; 342 } 343 344 void dirtylimit_process(void) 345 { 346 CPUState *cpu; 347 348 if (!qatomic_read(&dirtylimit_quit)) { 349 dirtylimit_state_lock(); 350 351 if (!dirtylimit_in_service()) { 352 dirtylimit_state_unlock(); 353 return; 354 } 355 356 CPU_FOREACH(cpu) { 357 if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) { 358 continue; 359 } 360 dirtylimit_adjust_throttle(cpu); 361 } 362 dirtylimit_state_unlock(); 363 } 364 } 365 366 void dirtylimit_change(bool start) 367 { 368 if (start) { 369 qatomic_set(&dirtylimit_quit, 0); 370 } else { 371 qatomic_set(&dirtylimit_quit, 1); 372 } 373 } 374 375 void dirtylimit_set_vcpu(int cpu_index, 376 uint64_t quota, 377 bool enable) 378 { 379 trace_dirtylimit_set_vcpu(cpu_index, quota); 380 381 if (enable) { 382 dirtylimit_state->states[cpu_index].quota = quota; 383 if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) { 384 dirtylimit_state->limited_nvcpu++; 385 } 386 } else { 387 dirtylimit_state->states[cpu_index].quota = 0; 388 if (dirtylimit_state->states[cpu_index].enabled) { 389 dirtylimit_state->limited_nvcpu--; 390 } 391 } 392 393 dirtylimit_state->states[cpu_index].enabled = enable; 394 } 395 396 void dirtylimit_set_all(uint64_t quota, 397 bool enable) 398 { 399 MachineState *ms = MACHINE(qdev_get_machine()); 400 int max_cpus = ms->smp.max_cpus; 401 int i; 402 403 for (i = 0; i < max_cpus; i++) { 404 dirtylimit_set_vcpu(i, quota, enable); 405 } 406 } 407 408 void dirtylimit_vcpu_execute(CPUState *cpu) 409 { 410 if (cpu->throttle_us_per_full) { 411 dirtylimit_state_lock(); 412 413 if (dirtylimit_in_service() && 414 dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) { 415 dirtylimit_state_unlock(); 416 trace_dirtylimit_vcpu_execute(cpu->cpu_index, 417 cpu->throttle_us_per_full); 418 419 g_usleep(cpu->throttle_us_per_full); 420 return; 421 } 422 423 dirtylimit_state_unlock(); 424 } 425 } 426 427 static void dirtylimit_init(void) 428 { 429 dirtylimit_state_initialize(); 430 dirtylimit_change(true); 431 vcpu_dirty_rate_stat_initialize(); 432 vcpu_dirty_rate_stat_start(); 433 } 434 435 static void dirtylimit_cleanup(void) 436 { 437 vcpu_dirty_rate_stat_stop(); 438 vcpu_dirty_rate_stat_finalize(); 439 dirtylimit_change(false); 440 dirtylimit_state_finalize(); 441 } 442 443 /* 444 * dirty page rate limit is not allowed to set if migration 445 * is running with dirty-limit capability enabled. 446 */ 447 static bool dirtylimit_is_allowed(void) 448 { 449 if (migration_is_running() && 450 !migration_thread_is_self() && 451 migrate_dirty_limit() && 452 dirtylimit_in_service()) { 453 return false; 454 } 455 return true; 456 } 457 458 void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index, 459 int64_t cpu_index, 460 Error **errp) 461 { 462 if (!kvm_enabled() || !kvm_dirty_ring_enabled()) { 463 return; 464 } 465 466 if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) { 467 error_setg(errp, "incorrect cpu index specified"); 468 return; 469 } 470 471 if (!dirtylimit_is_allowed()) { 472 error_setg(errp, "can't cancel dirty page rate limit while" 473 " migration is running"); 474 return; 475 } 476 477 if (!dirtylimit_in_service()) { 478 return; 479 } 480 481 dirtylimit_state_lock(); 482 483 if (has_cpu_index) { 484 dirtylimit_set_vcpu(cpu_index, 0, false); 485 } else { 486 dirtylimit_set_all(0, false); 487 } 488 489 if (!dirtylimit_state->limited_nvcpu) { 490 dirtylimit_cleanup(); 491 } 492 493 dirtylimit_state_unlock(); 494 } 495 496 void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 497 { 498 int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1); 499 Error *err = NULL; 500 501 qmp_cancel_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, &err); 502 if (err) { 503 hmp_handle_error(mon, err); 504 return; 505 } 506 507 monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query " 508 "dirty limit for virtual CPU]\n"); 509 } 510 511 void qmp_set_vcpu_dirty_limit(bool has_cpu_index, 512 int64_t cpu_index, 513 uint64_t dirty_rate, 514 Error **errp) 515 { 516 if (!kvm_enabled() || !kvm_dirty_ring_enabled()) { 517 error_setg(errp, "dirty page limit feature requires KVM with" 518 " accelerator property 'dirty-ring-size' set'"); 519 return; 520 } 521 522 if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) { 523 error_setg(errp, "incorrect cpu index specified"); 524 return; 525 } 526 527 if (!dirtylimit_is_allowed()) { 528 error_setg(errp, "can't set dirty page rate limit while" 529 " migration is running"); 530 return; 531 } 532 533 if (!dirty_rate) { 534 qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp); 535 return; 536 } 537 538 dirtylimit_state_lock(); 539 540 if (!dirtylimit_in_service()) { 541 dirtylimit_init(); 542 } 543 544 if (has_cpu_index) { 545 dirtylimit_set_vcpu(cpu_index, dirty_rate, true); 546 } else { 547 dirtylimit_set_all(dirty_rate, true); 548 } 549 550 dirtylimit_state_unlock(); 551 } 552 553 void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 554 { 555 int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate"); 556 int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1); 557 Error *err = NULL; 558 559 if (dirty_rate < 0) { 560 error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate); 561 goto out; 562 } 563 564 qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err); 565 566 out: 567 hmp_handle_error(mon, err); 568 } 569 570 /* Return the max throttle time of each virtual CPU */ 571 uint64_t dirtylimit_throttle_time_per_round(void) 572 { 573 CPUState *cpu; 574 int64_t max = 0; 575 576 CPU_FOREACH(cpu) { 577 if (cpu->throttle_us_per_full > max) { 578 max = cpu->throttle_us_per_full; 579 } 580 } 581 582 return max; 583 } 584 585 /* 586 * Estimate average dirty ring full time of each virtaul CPU. 587 * Return 0 if guest doesn't dirty memory. 588 */ 589 uint64_t dirtylimit_ring_full_time(void) 590 { 591 CPUState *cpu; 592 uint64_t curr_rate = 0; 593 int nvcpus = 0; 594 595 CPU_FOREACH(cpu) { 596 if (cpu->running) { 597 nvcpus++; 598 curr_rate += vcpu_dirty_rate_get(cpu->cpu_index); 599 } 600 } 601 602 if (!curr_rate || !nvcpus) { 603 return 0; 604 } 605 606 return dirtylimit_dirty_ring_full_time(curr_rate / nvcpus); 607 } 608 609 static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index) 610 { 611 DirtyLimitInfo *info = NULL; 612 613 info = g_malloc0(sizeof(*info)); 614 info->cpu_index = cpu_index; 615 info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota; 616 info->current_rate = vcpu_dirty_rate_get(cpu_index); 617 618 return info; 619 } 620 621 static struct DirtyLimitInfoList *dirtylimit_query_all(void) 622 { 623 int i, index; 624 DirtyLimitInfo *info = NULL; 625 DirtyLimitInfoList *head = NULL, **tail = &head; 626 627 dirtylimit_state_lock(); 628 629 if (!dirtylimit_in_service()) { 630 dirtylimit_state_unlock(); 631 return NULL; 632 } 633 634 for (i = 0; i < dirtylimit_state->max_cpus; i++) { 635 index = dirtylimit_state->states[i].cpu_index; 636 if (dirtylimit_vcpu_get_state(index)->enabled) { 637 info = dirtylimit_query_vcpu(index); 638 QAPI_LIST_APPEND(tail, info); 639 } 640 } 641 642 dirtylimit_state_unlock(); 643 644 return head; 645 } 646 647 struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp) 648 { 649 return dirtylimit_query_all(); 650 } 651 652 void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 653 { 654 DirtyLimitInfoList *info; 655 g_autoptr(DirtyLimitInfoList) head = NULL; 656 Error *err = NULL; 657 658 if (!dirtylimit_in_service()) { 659 monitor_printf(mon, "Dirty page limit not enabled!\n"); 660 return; 661 } 662 663 head = qmp_query_vcpu_dirty_limit(&err); 664 if (err) { 665 hmp_handle_error(mon, err); 666 return; 667 } 668 669 for (info = head; info != NULL; info = info->next) { 670 monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s)," 671 " current rate %"PRIi64 " (MB/s)\n", 672 info->value->cpu_index, 673 info->value->limit_rate, 674 info->value->current_rate); 675 } 676 } 677