// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
	struct gov_attr_set	attr_set;
	unsigned int		rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy	*policy;

	struct sugov_tunables	*tunables;
	struct list_head	tunables_hook;

	raw_spinlock_t		update_lock;
	u64			last_freq_update_time;
	s64			freq_update_delay_ns;
	unsigned int		next_freq;
	unsigned int		cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work		irq_work;
	struct kthread_work	work;
	struct mutex		work_lock;
	struct kthread_worker	worker;
	struct task_struct	*thread;
	bool			work_in_progress;

	bool			limits_changed;
	bool			need_freq_update;
};

struct sugov_cpu {
	struct update_util_data	update_util;
	struct sugov_policy	*sg_policy;
	unsigned int		cpu;

	bool			iowait_boost_pending;
	unsigned int		iowait_boost;
	u64			last_update;

	unsigned long		util;
	unsigned long		bw_min;

	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long		saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_update_commit() call may not for the fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * This is needed on the slow switching platforms too to prevent CPUs
	 * going offline from leaving stale IRQ work items behind.
	 */
	if (!cpufreq_this_cpu_can_update(sg_policy->policy))
		return false;

	if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
		WRITE_ONCE(sg_policy->limits_changed, false);
		sg_policy->need_freq_update = true;

		/*
		 * The above limits_changed update must occur before the reads
		 * of policy limits in cpufreq_driver_resolve_freq() or a policy
		 * limits update might be missed, so use a memory barrier to
		 * ensure it.
		 *
		 * This pairs with the write memory barrier in sugov_limits().
		 */
		smp_mb();

		return true;
	} else if (sg_policy->need_freq_update) {
		/* ignore_dl_rate_limit() wants a new frequency to be found. */
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;

	return delta_ns >= sg_policy->freq_update_delay_ns;
}
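
/*
 * Illustrative example (numbers are not from the code): with rate_limit_us
 * set to 2000, freq_update_delay_ns is 2,000,000 ns, so an update arriving
 * 1.5 ms after the last committed frequency change is skipped by the check
 * above, unless the policy limits have changed or need_freq_update was set.
 */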

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
				   unsigned int next_freq)
{
	if (sg_policy->need_freq_update) {
		sg_policy->need_freq_update = false;
		/*
		 * The policy limits have changed, but if the return value of
		 * cpufreq_driver_resolve_freq() after applying the new limits
		 * is still equal to the previously selected frequency, the
		 * driver callback need not be invoked unless the driver
		 * specifically wants that to happen on every update of the
		 * policy limits.
		 */
		if (sg_policy->next_freq == next_freq &&
		    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
			return false;
	} else if (sg_policy->next_freq == next_freq) {
		return false;
	}

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	return true;
}

static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
	if (!sg_policy->work_in_progress) {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_capacity_ref_freq - get the reference frequency that has been used to
 * correlate frequency and compute capacity for a given cpufreq policy. We use
 * the CPU managing it for the arch_scale_freq_ref() call in the function.
 * @policy: the cpufreq policy of the CPU in question.
 *
 * Return: the reference CPU frequency to compute a capacity.
 */
static __always_inline
unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
{
	unsigned int freq = arch_scale_freq_ref(policy->cpu);

	if (freq)
		return freq;

	if (arch_scale_freq_invariant())
		return policy->cpuinfo.max_freq;

	/*
	 * Apply a 25% margin so that we select a higher frequency than
	 * the current one before the CPU is fully busy:
	 */
	return policy->cur + (policy->cur >> 2);
}

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal to or greater than the
 * raw next_freq (as calculated above) is returned, subject to policy min/max
 * and cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq;

	freq = get_capacity_ref_freq(policy);
	freq = map_util_freq(util, freq, max);

	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
		return sg_policy->next_freq;

	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
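
/*
 * Worked example of the formula in the comment above (illustrative numbers):
 * with a reference frequency of 2 GHz (2000000 kHz), max = 1024 and a raw
 * utilization of 650, next_freq = 1.25 * 2000000 * 650 / 1024, i.e. roughly
 * 1.59 GHz. In the code the C = 1.25 headroom is applied to util before it
 * reaches this function (see sugov_effective_cpu_perf() below), and
 * cpufreq_driver_resolve_freq() then picks the lowest supported frequency at
 * or above the raw value, within the policy limits.
 */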

unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
				       unsigned long min,
				       unsigned long max)
{
	/* Add dvfs headroom to actual utilization */
	actual = map_util_perf(actual);
	/* Actually we don't need to target the max performance */
	if (actual < max)
		max = actual;

	/*
	 * Ensure at least minimum performance while providing more compute
	 * capacity when possible.
	 */
	return max(min, max);
}

static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
	unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);

	if (!scx_switched_all())
		util += cpu_util_cfs_boost(sg_cpu->cpu);
	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
	util = max(util, boost);
	sg_cpu->bw_min = min;
	sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - sg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
	sg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sg_cpu->iowait_boost &&
	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (sg_cpu->iowait_boost_pending)
		return;
	sg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (sg_cpu->iowait_boost) {
		sg_cpu->iowait_boost =
			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}
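
/*
 * Example of the ramp-up above: assuming SCHED_CAPACITY_SCALE is 1024,
 * IOWAIT_BOOST_MIN is 128, so successive qualifying IO wakeups (at least one
 * request per tick) grow the boost as 128 -> 256 -> 512 -> 1024, where it
 * saturates at the utilization of the maximum OPP.
 */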

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @max_cap: the max CPU capacity
 *
 * A CPU running a task which has woken up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which also appears to have been idle for at least one tick also has
 * its IO boost utilization reset.
 *
 * This mechanism is designed to boost tasks that wait on IO frequently, while
 * being more conservative about tasks that do sporadic IO operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
					unsigned long max_cap)
{
	/* No boost currently required */
	if (!sg_cpu->iowait_boost)
		return 0;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sugov_iowait_reset(sg_cpu, time, false))
		return 0;

	if (!sg_cpu->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		sg_cpu->iowait_boost >>= 1;
		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
			sg_cpu->iowait_boost = 0;
			return 0;
		}
	}

	sg_cpu->iowait_boost_pending = false;

	/*
	 * sg_cpu->util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}
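
/*
 * Example of the decay and scaling above (illustrative numbers): on a CPU
 * with max_cap = 446 and a current boost of 512, the value returned is
 * 512 * 446 >> SCHED_CAPACITY_SHIFT = 223 in capacity units. With no further
 * boost requests, subsequent updates halve the boost (512 -> 256 -> 128) and,
 * once it drops below IOWAIT_BOOST_MIN, clear it to 0.
 */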

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls;
	bool ret;

	/*
	 * The heuristics in this function are for the fair class. For SCX, the
	 * performance target comes directly from the BPF scheduler. Let's just
	 * follow it.
	 */
	if (scx_switched_all())
		return false;

	/* if capped by uclamp_max, always update to be in compliance */
	if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
		return false;

	/*
	 * Maintain the frequency if the CPU has not been idle recently, as
	 * reduction is likely to be premature.
	 */
	idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
	ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else
static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
		sg_cpu->sg_policy->need_freq_update = true;
}

static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
					      u64 time, unsigned long max_cap,
					      unsigned int flags)
{
	unsigned long boost;

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu);

	if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
		return false;

	boost = sugov_iowait_apply(sg_cpu, time, max_cap);
	sugov_get_util(sg_cpu, boost);

	return true;
}

static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
				     unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int cached_freq = sg_policy->cached_raw_freq;
	unsigned long max_cap;
	unsigned int next_f;

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

	next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);

	if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
	    !sg_policy->need_freq_update) {
		next_f = sg_policy->next_freq;

		/* Restore cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = cached_freq;
	}

	if (!sugov_update_next_freq(sg_policy, time, next_f))
		return;

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		cpufreq_driver_fast_switch(sg_policy->policy, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}
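
/*
 * In the fast-switch case above the driver changes the frequency directly in
 * this (scheduler) context; otherwise the request is recorded and handed off
 * via irq_work to the sugov kthread (sugov_deferred_update() ->
 * sugov_irq_work() -> sugov_work()), which performs the switch with
 * __cpufreq_driver_target().
 */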

static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
				     unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	unsigned long prev_util = sg_cpu->util;
	unsigned long max_cap;

	/*
	 * Fall back to the "frequency" path if frequency invariance is not
	 * supported, because the direct mapping between the utilization and
	 * the performance levels depends on the frequency invariance.
	 */
	if (!arch_scale_freq_invariant()) {
		sugov_update_single_freq(hook, time, flags);
		return;
	}

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

	if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
		sg_cpu->util = prev_util;

	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
				   sg_cpu->util, max_cap);

	sg_cpu->sg_policy->last_freq_update_time = time;
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util = 0, max_cap;
	unsigned int j;

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long boost;

		boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
		sugov_get_util(j_sg_cpu, boost);

		util = max(j_sg_cpu->util, util);
	}

	return get_next_freq(sg_policy, util, max_cap);
}

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu);

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, time);

		if (!sugov_update_next_freq(sg_policy, time, next_f))
			goto unlock;

		if (sg_policy->policy->fast_switch_enabled)
			cpufreq_driver_fast_switch(sg_policy->policy, next_f);
		else
			sugov_deferred_update(sg_policy);
	}
unlock:
	raw_spin_unlock(&sg_policy->update_lock);
}
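
/*
 * Example for the shared path (illustrative numbers): on a policy spanning
 * four CPUs whose effective utilizations, after IO boost and DVFS headroom,
 * come out as 200, 650, 100 and 300, sugov_next_freq_shared() derives the
 * request from util = 650, i.e. the busiest CPU sets the frequency for the
 * whole policy.
 */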

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold sg_policy->update_lock briefly to handle the case where
	 * sg_policy->next_freq is read here and then updated by
	 * sugov_deferred_update() just before work_in_progress is set to false
	 * here, in which case we could miss queueing the new update.
	 *
	 * Note: if work was queued after the update_lock is released,
	 * sugov_work() will just be called again by the kthread_work code and
	 * the request will be processed before the sugov thread sleeps.
	 */
	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
	freq = sg_policy->next_freq;
	sg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
	&rate_limit_us.attr,
	NULL
};
ATTRIBUTE_GROUPS(sugov);

static void sugov_tunables_free(struct kobject *kobj)
{
	struct gov_attr_set *attr_set = to_gov_attr_set(kobj);

	kfree(to_sugov_tunables(attr_set));
}

static const struct kobj_type sugov_tunables_ktype = {
	.default_groups = sugov_groups,
	.sysfs_ops = &governor_sysfs_ops,
	.release = &sugov_tunables_free,
};
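
/*
 * Usage note: rate_limit_us is exposed through the governor's sysfs
 * attributes, typically as
 *   /sys/devices/system/cpu/cpufreq/policy<N>/schedutil/rate_limit_us
 * (or a single shared schedutil directory when the driver does not use
 * per-policy governors). Writing, for example, 2000 to it re-applies
 * freq_update_delay_ns for every policy attached to this tunables set, as
 * rate_limit_us_store() above shows.
 */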

/********************** cpufreq governor interface *********************/

static struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size = sizeof(struct sched_attr),
		.sched_policy = SCHED_DEADLINE,
		.sched_flags = SCHED_FLAG_SUGOV,
		.sched_nice = 0,
		.sched_priority = 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
		 */
		.sched_runtime = NSEC_PER_MSEC,
		.sched_deadline = 10 * NSEC_PER_MSEC,
		.sched_period = 10 * NSEC_PER_MSEC,
	};
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setattr_nocheck(thread, &attr);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	if (policy->dvfs_possible_from_any_cpu)
		set_cpus_allowed_ptr(thread, policy->related_cpus);
	else
		kthread_bind_mask(thread, policy->related_cpus);

	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}
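
/*
 * Note on the slow-path worker created above: it runs as a SCHED_DEADLINE
 * task with SCHED_FLAG_SUGOV, which marks it as a special deadline entity
 * whose (placeholder) runtime/deadline/period values are not meant to be
 * enforced as real bandwidth; the intent is that frequency-change requests
 * are serviced promptly rather than queued behind other load.
 */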

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_clear_global_tunables(void)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	/*
	 * Schedutil is the preferred governor for EAS, so rebuild sched domains
	 * on governor changes to make sure the scheduler knows about them.
	 */
	em_rebuild_sched_domains();
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_clear_global_tunables();

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_clear_global_tunables();

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);

	em_rebuild_sched_domains();
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = 0;
	sg_policy->work_in_progress = false;
	sg_policy->limits_changed = false;
	sg_policy->cached_raw_freq = 0;

	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);

	if (policy_is_shared(policy))
		uu = sugov_update_shared;
	else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
		uu = sugov_update_single_perf;
	else
		uu = sugov_update_single_freq;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu = cpu;
		sg_cpu->sg_policy = sg_policy;
		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
	}
	return 0;
}
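
/*
 * Hook selection above: shared (multi-CPU) policies always use
 * sugov_update_shared(); a single-CPU policy uses sugov_update_single_perf()
 * when the driver supports both fast switching and ->adjust_perf() (on x86,
 * intel_pstate in passive HWP mode is a typical example), and falls back to
 * sugov_update_single_freq() otherwise.
 */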

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	/*
	 * The limits_changed update below must not be observed before the
	 * policy limits updates done in cpufreq_set_policy(), or a policy
	 * limits update might be missed, so use a memory barrier to enforce
	 * that ordering.
	 *
	 * This pairs with the memory barrier in sugov_should_update_freq().
	 */
	smp_wmb();

	WRITE_ONCE(sg_policy->limits_changed, true);
}

static struct cpufreq_governor schedutil_gov = {
	.name = "schedutil",
	.owner = THIS_MODULE,
	.flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
	.init = sugov_init,
	.exit = sugov_exit,
	.start = sugov_start,
	.stop = sugov_stop,
	.limits = sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

bool sugov_is_governor(struct cpufreq_policy *policy)
{
	return policy->governor == &schedutil_gov;
}

cpufreq_governor_init(schedutil_gov);
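
/*
 * Runtime selection: as with any cpufreq governor, schedutil can be chosen
 * per policy via sysfs, e.g.
 *   echo schedutil > /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
 * or made the default at build time with CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
 * (see the cpufreq_default_governor() definition above).
 */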