1 /* 2 * Dirty page rate limit implementation code 3 * 4 * Copyright (c) 2022 CHINA TELECOM CO.,LTD. 5 * 6 * Authors: 7 * Hyman Huang(黄勇) <huangy81@chinatelecom.cn> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or later. 10 * See the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include "qemu/main-loop.h" 15 #include "qapi/qapi-commands-migration.h" 16 #include "qobject/qdict.h" 17 #include "qapi/error.h" 18 #include "system/dirtyrate.h" 19 #include "system/dirtylimit.h" 20 #include "monitor/hmp.h" 21 #include "monitor/monitor.h" 22 #include "system/memory.h" 23 #include "exec/target_page.h" 24 #include "hw/boards.h" 25 #include "system/kvm.h" 26 #include "trace.h" 27 #include "migration/misc.h" 28 29 /* 30 * Dirtylimit stop working if dirty page rate error 31 * value less than DIRTYLIMIT_TOLERANCE_RANGE 32 */ 33 #define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */ 34 /* 35 * Plus or minus vcpu sleep time linearly if dirty 36 * page rate error value percentage over 37 * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT. 38 * Otherwise, plus or minus a fixed vcpu sleep time. 39 */ 40 #define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50 41 /* 42 * Max vcpu sleep time percentage during a cycle 43 * composed of dirty ring full and sleep time. 44 */ 45 #define DIRTYLIMIT_THROTTLE_PCT_MAX 99 46 47 struct { 48 VcpuStat stat; 49 bool running; 50 QemuThread thread; 51 } *vcpu_dirty_rate_stat; 52 53 typedef struct VcpuDirtyLimitState { 54 int cpu_index; 55 bool enabled; 56 /* 57 * Quota dirty page rate, unit is MB/s 58 * zero if not enabled. 59 */ 60 uint64_t quota; 61 } VcpuDirtyLimitState; 62 63 struct { 64 VcpuDirtyLimitState *states; 65 /* Max cpus number configured by user */ 66 int max_cpus; 67 /* Number of vcpu under dirtylimit */ 68 int limited_nvcpu; 69 } *dirtylimit_state; 70 71 /* protect dirtylimit_state */ 72 static QemuMutex dirtylimit_mutex; 73 74 /* dirtylimit thread quit if dirtylimit_quit is true */ 75 static bool dirtylimit_quit; 76 77 static void vcpu_dirty_rate_stat_collect(void) 78 { 79 VcpuStat stat; 80 int i = 0; 81 int64_t period = DIRTYLIMIT_CALC_TIME_MS; 82 83 if (migrate_dirty_limit() && migration_is_running()) { 84 period = migrate_vcpu_dirty_limit_period(); 85 } 86 87 /* calculate vcpu dirtyrate */ 88 vcpu_calculate_dirtyrate(period, 89 &stat, 90 GLOBAL_DIRTY_LIMIT, 91 false); 92 93 for (i = 0; i < stat.nvcpu; i++) { 94 vcpu_dirty_rate_stat->stat.rates[i].id = i; 95 vcpu_dirty_rate_stat->stat.rates[i].dirty_rate = 96 stat.rates[i].dirty_rate; 97 } 98 99 g_free(stat.rates); 100 } 101 102 static void *vcpu_dirty_rate_stat_thread(void *opaque) 103 { 104 rcu_register_thread(); 105 106 /* start log sync */ 107 global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true); 108 109 while (qatomic_read(&vcpu_dirty_rate_stat->running)) { 110 vcpu_dirty_rate_stat_collect(); 111 if (dirtylimit_in_service()) { 112 dirtylimit_process(); 113 } 114 } 115 116 /* stop log sync */ 117 global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false); 118 119 rcu_unregister_thread(); 120 return NULL; 121 } 122 123 int64_t vcpu_dirty_rate_get(int cpu_index) 124 { 125 DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates; 126 return qatomic_read_i64(&rates[cpu_index].dirty_rate); 127 } 128 129 void vcpu_dirty_rate_stat_start(void) 130 { 131 if (qatomic_read(&vcpu_dirty_rate_stat->running)) { 132 return; 133 } 134 135 qatomic_set(&vcpu_dirty_rate_stat->running, 1); 136 qemu_thread_create(&vcpu_dirty_rate_stat->thread, 137 "dirtyrate-stat", 138 vcpu_dirty_rate_stat_thread, 139 NULL, 140 QEMU_THREAD_JOINABLE); 141 } 142 143 void vcpu_dirty_rate_stat_stop(void) 144 { 145 qatomic_set(&vcpu_dirty_rate_stat->running, 0); 146 dirtylimit_state_unlock(); 147 bql_unlock(); 148 qemu_thread_join(&vcpu_dirty_rate_stat->thread); 149 bql_lock(); 150 dirtylimit_state_lock(); 151 } 152 153 void vcpu_dirty_rate_stat_initialize(void) 154 { 155 MachineState *ms = MACHINE(qdev_get_machine()); 156 int max_cpus = ms->smp.max_cpus; 157 158 vcpu_dirty_rate_stat = 159 g_malloc0(sizeof(*vcpu_dirty_rate_stat)); 160 161 vcpu_dirty_rate_stat->stat.nvcpu = max_cpus; 162 vcpu_dirty_rate_stat->stat.rates = 163 g_new0(DirtyRateVcpu, max_cpus); 164 165 vcpu_dirty_rate_stat->running = false; 166 } 167 168 void vcpu_dirty_rate_stat_finalize(void) 169 { 170 g_free(vcpu_dirty_rate_stat->stat.rates); 171 vcpu_dirty_rate_stat->stat.rates = NULL; 172 173 g_free(vcpu_dirty_rate_stat); 174 vcpu_dirty_rate_stat = NULL; 175 } 176 177 void dirtylimit_state_lock(void) 178 { 179 qemu_mutex_lock(&dirtylimit_mutex); 180 } 181 182 void dirtylimit_state_unlock(void) 183 { 184 qemu_mutex_unlock(&dirtylimit_mutex); 185 } 186 187 static void 188 __attribute__((__constructor__)) dirtylimit_mutex_init(void) 189 { 190 qemu_mutex_init(&dirtylimit_mutex); 191 } 192 193 static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index) 194 { 195 return &dirtylimit_state->states[cpu_index]; 196 } 197 198 void dirtylimit_state_initialize(void) 199 { 200 MachineState *ms = MACHINE(qdev_get_machine()); 201 int max_cpus = ms->smp.max_cpus; 202 int i; 203 204 dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state)); 205 206 dirtylimit_state->states = 207 g_new0(VcpuDirtyLimitState, max_cpus); 208 209 for (i = 0; i < max_cpus; i++) { 210 dirtylimit_state->states[i].cpu_index = i; 211 } 212 213 dirtylimit_state->max_cpus = max_cpus; 214 trace_dirtylimit_state_initialize(max_cpus); 215 } 216 217 void dirtylimit_state_finalize(void) 218 { 219 g_free(dirtylimit_state->states); 220 dirtylimit_state->states = NULL; 221 222 g_free(dirtylimit_state); 223 dirtylimit_state = NULL; 224 225 trace_dirtylimit_state_finalize(); 226 } 227 228 bool dirtylimit_in_service(void) 229 { 230 return !!dirtylimit_state; 231 } 232 233 bool dirtylimit_vcpu_index_valid(int cpu_index) 234 { 235 MachineState *ms = MACHINE(qdev_get_machine()); 236 237 return !(cpu_index < 0 || 238 cpu_index >= ms->smp.max_cpus); 239 } 240 241 static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate) 242 { 243 static uint64_t max_dirtyrate; 244 uint64_t dirty_ring_size_MiB; 245 246 dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size()); 247 248 if (max_dirtyrate < dirtyrate) { 249 max_dirtyrate = dirtyrate; 250 } 251 252 return dirty_ring_size_MiB * 1000000 / max_dirtyrate; 253 } 254 255 static inline bool dirtylimit_done(uint64_t quota, 256 uint64_t current) 257 { 258 uint64_t min, max; 259 260 min = MIN(quota, current); 261 max = MAX(quota, current); 262 263 return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false; 264 } 265 266 static inline bool 267 dirtylimit_need_linear_adjustment(uint64_t quota, 268 uint64_t current) 269 { 270 uint64_t min, max; 271 272 min = MIN(quota, current); 273 max = MAX(quota, current); 274 275 return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT; 276 } 277 278 static void dirtylimit_set_throttle(CPUState *cpu, 279 uint64_t quota, 280 uint64_t current) 281 { 282 int64_t ring_full_time_us = 0; 283 uint64_t sleep_pct = 0; 284 uint64_t throttle_us = 0; 285 286 if (current == 0) { 287 cpu->throttle_us_per_full = 0; 288 return; 289 } 290 291 ring_full_time_us = dirtylimit_dirty_ring_full_time(current); 292 293 if (dirtylimit_need_linear_adjustment(quota, current)) { 294 if (quota < current) { 295 sleep_pct = (current - quota) * 100 / current; 296 throttle_us = 297 ring_full_time_us * sleep_pct / (double)(100 - sleep_pct); 298 cpu->throttle_us_per_full += throttle_us; 299 } else { 300 sleep_pct = (quota - current) * 100 / quota; 301 throttle_us = 302 ring_full_time_us * sleep_pct / (double)(100 - sleep_pct); 303 cpu->throttle_us_per_full -= throttle_us; 304 } 305 306 trace_dirtylimit_throttle_pct(cpu->cpu_index, 307 sleep_pct, 308 throttle_us); 309 } else { 310 if (quota < current) { 311 cpu->throttle_us_per_full += ring_full_time_us / 10; 312 } else { 313 cpu->throttle_us_per_full -= ring_full_time_us / 10; 314 } 315 } 316 317 /* 318 * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario), 319 * current dirty page rate may never reach the quota, we should stop 320 * increasing sleep time? 321 */ 322 cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full, 323 ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX); 324 325 cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0); 326 } 327 328 static void dirtylimit_adjust_throttle(CPUState *cpu) 329 { 330 uint64_t quota = 0; 331 uint64_t current = 0; 332 int cpu_index = cpu->cpu_index; 333 334 quota = dirtylimit_vcpu_get_state(cpu_index)->quota; 335 current = vcpu_dirty_rate_get(cpu_index); 336 337 if (!dirtylimit_done(quota, current)) { 338 dirtylimit_set_throttle(cpu, quota, current); 339 } 340 } 341 342 void dirtylimit_process(void) 343 { 344 CPUState *cpu; 345 346 if (!qatomic_read(&dirtylimit_quit)) { 347 dirtylimit_state_lock(); 348 349 if (!dirtylimit_in_service()) { 350 dirtylimit_state_unlock(); 351 return; 352 } 353 354 CPU_FOREACH(cpu) { 355 if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) { 356 continue; 357 } 358 dirtylimit_adjust_throttle(cpu); 359 } 360 dirtylimit_state_unlock(); 361 } 362 } 363 364 void dirtylimit_change(bool start) 365 { 366 if (start) { 367 qatomic_set(&dirtylimit_quit, 0); 368 } else { 369 qatomic_set(&dirtylimit_quit, 1); 370 } 371 } 372 373 void dirtylimit_set_vcpu(int cpu_index, 374 uint64_t quota, 375 bool enable) 376 { 377 trace_dirtylimit_set_vcpu(cpu_index, quota); 378 379 if (enable) { 380 dirtylimit_state->states[cpu_index].quota = quota; 381 if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) { 382 dirtylimit_state->limited_nvcpu++; 383 } 384 } else { 385 dirtylimit_state->states[cpu_index].quota = 0; 386 if (dirtylimit_state->states[cpu_index].enabled) { 387 dirtylimit_state->limited_nvcpu--; 388 } 389 } 390 391 dirtylimit_state->states[cpu_index].enabled = enable; 392 } 393 394 void dirtylimit_set_all(uint64_t quota, 395 bool enable) 396 { 397 MachineState *ms = MACHINE(qdev_get_machine()); 398 int max_cpus = ms->smp.max_cpus; 399 int i; 400 401 for (i = 0; i < max_cpus; i++) { 402 dirtylimit_set_vcpu(i, quota, enable); 403 } 404 } 405 406 void dirtylimit_vcpu_execute(CPUState *cpu) 407 { 408 if (cpu->throttle_us_per_full) { 409 dirtylimit_state_lock(); 410 411 if (dirtylimit_in_service() && 412 dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) { 413 dirtylimit_state_unlock(); 414 trace_dirtylimit_vcpu_execute(cpu->cpu_index, 415 cpu->throttle_us_per_full); 416 417 g_usleep(cpu->throttle_us_per_full); 418 return; 419 } 420 421 dirtylimit_state_unlock(); 422 } 423 } 424 425 static void dirtylimit_init(void) 426 { 427 dirtylimit_state_initialize(); 428 dirtylimit_change(true); 429 vcpu_dirty_rate_stat_initialize(); 430 vcpu_dirty_rate_stat_start(); 431 } 432 433 static void dirtylimit_cleanup(void) 434 { 435 vcpu_dirty_rate_stat_stop(); 436 vcpu_dirty_rate_stat_finalize(); 437 dirtylimit_change(false); 438 dirtylimit_state_finalize(); 439 } 440 441 /* 442 * dirty page rate limit is not allowed to set if migration 443 * is running with dirty-limit capability enabled. 444 */ 445 static bool dirtylimit_is_allowed(void) 446 { 447 if (migration_is_running() && 448 !migration_thread_is_self() && 449 migrate_dirty_limit() && 450 dirtylimit_in_service()) { 451 return false; 452 } 453 return true; 454 } 455 456 void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index, 457 int64_t cpu_index, 458 Error **errp) 459 { 460 if (!kvm_enabled() || !kvm_dirty_ring_enabled()) { 461 return; 462 } 463 464 if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) { 465 error_setg(errp, "incorrect cpu index specified"); 466 return; 467 } 468 469 if (!dirtylimit_is_allowed()) { 470 error_setg(errp, "can't cancel dirty page rate limit while" 471 " migration is running"); 472 return; 473 } 474 475 if (!dirtylimit_in_service()) { 476 return; 477 } 478 479 dirtylimit_state_lock(); 480 481 if (has_cpu_index) { 482 dirtylimit_set_vcpu(cpu_index, 0, false); 483 } else { 484 dirtylimit_set_all(0, false); 485 } 486 487 if (!dirtylimit_state->limited_nvcpu) { 488 dirtylimit_cleanup(); 489 } 490 491 dirtylimit_state_unlock(); 492 } 493 494 void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 495 { 496 int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1); 497 Error *err = NULL; 498 499 qmp_cancel_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, &err); 500 if (err) { 501 hmp_handle_error(mon, err); 502 return; 503 } 504 505 monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query " 506 "dirty limit for virtual CPU]\n"); 507 } 508 509 void qmp_set_vcpu_dirty_limit(bool has_cpu_index, 510 int64_t cpu_index, 511 uint64_t dirty_rate, 512 Error **errp) 513 { 514 if (!kvm_enabled() || !kvm_dirty_ring_enabled()) { 515 error_setg(errp, "dirty page limit feature requires KVM with" 516 " accelerator property 'dirty-ring-size' set'"); 517 return; 518 } 519 520 if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) { 521 error_setg(errp, "incorrect cpu index specified"); 522 return; 523 } 524 525 if (!dirtylimit_is_allowed()) { 526 error_setg(errp, "can't set dirty page rate limit while" 527 " migration is running"); 528 return; 529 } 530 531 if (!dirty_rate) { 532 qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp); 533 return; 534 } 535 536 dirtylimit_state_lock(); 537 538 if (!dirtylimit_in_service()) { 539 dirtylimit_init(); 540 } 541 542 if (has_cpu_index) { 543 dirtylimit_set_vcpu(cpu_index, dirty_rate, true); 544 } else { 545 dirtylimit_set_all(dirty_rate, true); 546 } 547 548 dirtylimit_state_unlock(); 549 } 550 551 void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 552 { 553 int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate"); 554 int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1); 555 Error *err = NULL; 556 557 if (dirty_rate < 0) { 558 error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate); 559 goto out; 560 } 561 562 qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err); 563 564 out: 565 hmp_handle_error(mon, err); 566 } 567 568 /* Return the max throttle time of each virtual CPU */ 569 uint64_t dirtylimit_throttle_time_per_round(void) 570 { 571 CPUState *cpu; 572 int64_t max = 0; 573 574 CPU_FOREACH(cpu) { 575 if (cpu->throttle_us_per_full > max) { 576 max = cpu->throttle_us_per_full; 577 } 578 } 579 580 return max; 581 } 582 583 /* 584 * Estimate average dirty ring full time of each virtaul CPU. 585 * Return 0 if guest doesn't dirty memory. 586 */ 587 uint64_t dirtylimit_ring_full_time(void) 588 { 589 CPUState *cpu; 590 uint64_t curr_rate = 0; 591 int nvcpus = 0; 592 593 CPU_FOREACH(cpu) { 594 if (cpu->running) { 595 nvcpus++; 596 curr_rate += vcpu_dirty_rate_get(cpu->cpu_index); 597 } 598 } 599 600 if (!curr_rate || !nvcpus) { 601 return 0; 602 } 603 604 return dirtylimit_dirty_ring_full_time(curr_rate / nvcpus); 605 } 606 607 static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index) 608 { 609 DirtyLimitInfo *info = NULL; 610 611 info = g_malloc0(sizeof(*info)); 612 info->cpu_index = cpu_index; 613 info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota; 614 info->current_rate = vcpu_dirty_rate_get(cpu_index); 615 616 return info; 617 } 618 619 static struct DirtyLimitInfoList *dirtylimit_query_all(void) 620 { 621 int i, index; 622 DirtyLimitInfo *info = NULL; 623 DirtyLimitInfoList *head = NULL, **tail = &head; 624 625 dirtylimit_state_lock(); 626 627 if (!dirtylimit_in_service()) { 628 dirtylimit_state_unlock(); 629 return NULL; 630 } 631 632 for (i = 0; i < dirtylimit_state->max_cpus; i++) { 633 index = dirtylimit_state->states[i].cpu_index; 634 if (dirtylimit_vcpu_get_state(index)->enabled) { 635 info = dirtylimit_query_vcpu(index); 636 QAPI_LIST_APPEND(tail, info); 637 } 638 } 639 640 dirtylimit_state_unlock(); 641 642 return head; 643 } 644 645 struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp) 646 { 647 return dirtylimit_query_all(); 648 } 649 650 void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict) 651 { 652 DirtyLimitInfoList *info; 653 g_autoptr(DirtyLimitInfoList) head = NULL; 654 Error *err = NULL; 655 656 if (!dirtylimit_in_service()) { 657 monitor_printf(mon, "Dirty page limit not enabled!\n"); 658 return; 659 } 660 661 head = qmp_query_vcpu_dirty_limit(&err); 662 if (err) { 663 hmp_handle_error(mon, err); 664 return; 665 } 666 667 for (info = head; info != NULL; info = info->next) { 668 monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s)," 669 " current rate %"PRIi64 " (MB/s)\n", 670 info->value->cpu_index, 671 info->value->limit_rate, 672 info->value->current_rate); 673 } 674 } 675