// SPDX-License-Identifier: GPL-2.0
/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */
#include <uapi/linux/sched/types.h>
#include "sched.h"

#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

struct sugov_tunables {
	struct gov_attr_set	attr_set;
	unsigned int		rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy	*policy;

	struct sugov_tunables	*tunables;
	struct list_head	tunables_hook;

	raw_spinlock_t		update_lock;
	u64			last_freq_update_time;
	s64			freq_update_delay_ns;
	unsigned int		next_freq;
	unsigned int		cached_raw_freq;

	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work		irq_work;
	struct kthread_work	work;
	struct mutex		work_lock;
	struct kthread_worker	worker;
	struct task_struct	*thread;
	bool			work_in_progress;

	bool			limits_changed;
	bool			need_freq_update;
};

struct sugov_cpu {
	struct update_util_data	update_util;
	struct sugov_policy	*sg_policy;
	unsigned int		cpu;

	bool			iowait_boost_pending;
	unsigned int		iowait_boost;
	u64			last_update;

	unsigned long		util;
	unsigned long		bw_min;

	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long		saved_idle_calls;
#endif
};

static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_update_commit() call may not for the fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * This is needed on the slow switching platforms too to prevent CPUs
	 * going offline from leaving stale IRQ work items behind.
	 */
	if (!cpufreq_this_cpu_can_update(sg_policy->policy))
		return false;

	if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
		WRITE_ONCE(sg_policy->limits_changed, false);
		sg_policy->need_freq_update = true;

		/*
		 * The above limits_changed update must occur before the reads
		 * of policy limits in cpufreq_driver_resolve_freq() or a policy
		 * limits update might be missed, so use a memory barrier to
		 * ensure it.
		 *
		 * This pairs with the write memory barrier in sugov_limits().
		 */
		smp_mb();

		return true;
	} else if (sg_policy->need_freq_update) {
		/* ignore_dl_rate_limit() wants a new frequency to be found. */
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;

	return delta_ns >= sg_policy->freq_update_delay_ns;
}
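
/*
 * Illustrative example (editorial sketch, values assumed): with
 * rate_limit_us = 2000, freq_update_delay_ns is 2,000,000 ns, so a
 * utilization update arriving less than 2 ms after the previous frequency
 * change is ignored unless the policy limits have changed or
 * need_freq_update has been set. The actual default comes from
 * cpufreq_policy_transition_delay_us() at init time.
 */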

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
				   unsigned int next_freq)
{
	if (sg_policy->need_freq_update) {
		sg_policy->need_freq_update = false;
		/*
		 * The policy limits have changed, but if the return value of
		 * cpufreq_driver_resolve_freq() after applying the new limits
		 * is still equal to the previously selected frequency, the
		 * driver callback need not be invoked unless the driver
		 * specifically wants that to happen on every update of the
		 * policy limits.
		 */
		if (sg_policy->next_freq == next_freq &&
		    !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
			return false;
	} else if (sg_policy->next_freq == next_freq) {
		return false;
	}

	sg_policy->next_freq = next_freq;
	sg_policy->last_freq_update_time = time;

	return true;
}

static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
	if (!sg_policy->work_in_progress) {
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}

/**
 * get_capacity_ref_freq - get the reference frequency that has been used to
 * correlate frequency and compute capacity for a given cpufreq policy. We use
 * the CPU managing it for the arch_scale_freq_ref() call in the function.
 * @policy: the cpufreq policy of the CPU in question.
 *
 * Return: the reference CPU frequency to compute a capacity.
 */
static __always_inline
unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
{
	unsigned int freq = arch_scale_freq_ref(policy->cpu);

	if (freq)
		return freq;

	if (arch_scale_freq_invariant())
		return policy->cpuinfo.max_freq;

	/*
	 * Apply a 25% margin so that we select a higher frequency than
	 * the current one before the CPU is fully busy:
	 */
	return policy->cur + (policy->cur >> 2);
}
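
/*
 * Worked example (editorial, values assumed): without a reference frequency
 * and without frequency invariance, a CPU currently running at
 * policy->cur = 1600000 kHz yields 1600000 + (1600000 >> 2) = 2000000 kHz,
 * i.e. the current frequency plus the 25% margin described above.
 */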

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal to or greater than the
 * raw next_freq (as calculated above) is returned, subject to policy min/max
 * and cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq;

	freq = get_capacity_ref_freq(policy);
	freq = map_util_freq(util, freq, max);

	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
		return sg_policy->next_freq;

	sg_policy->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
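
/*
 * Worked example of the formula documented above (editorial, numbers
 * assumed): with C = 1.25, max_freq = 2000000 kHz, util = 512 and
 * max = 1024, the raw target is 1.25 * 2000000 * 512 / 1024 = 1250000 kHz;
 * cpufreq_driver_resolve_freq() then picks the lowest driver-supported
 * frequency at or above that value within the policy limits.
 */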

unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
				       unsigned long min,
				       unsigned long max)
{
	/* Add dvfs headroom to actual utilization */
	actual = map_util_perf(actual);
	/* Actually we don't need to target the max performance */
	if (actual < max)
		max = actual;

	/*
	 * Ensure at least minimum performance while providing more compute
	 * capacity when possible.
	 */
	return max(min, max);
}
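
/*
 * Worked example (editorial, values assumed and assuming map_util_perf()
 * adds roughly 25% headroom): with actual = 400, min = 100 and max = 1024,
 * the headroom-adjusted value is about 500, which is below max, so the
 * result is max(100, 500) = 500. A value above max would be clamped to max,
 * and a value below min would be raised to min.
 */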

static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
	unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);

	if (!scx_switched_all())
		util += cpu_util_cfs_boost(sg_cpu->cpu);
	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
	util = max(util, boost);
	sg_cpu->bw_min = min;
	sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}

/**
 * sugov_iowait_reset() - Reset the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - sg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
	sg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sg_cpu->iowait_boost &&
	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (sg_cpu->iowait_boost_pending)
		return;
	sg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (sg_cpu->iowait_boost) {
		sg_cpu->iowait_boost =
			min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}
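
/*
 * Illustrative sequence (editorial, assuming SCHED_CAPACITY_SCALE = 1024):
 * IOWAIT_BOOST_MIN is 128, so back-to-back IO wakeups arriving at least once
 * per tick ramp the boost through 128 -> 256 -> 512 -> 1024 and then saturate
 * at SCHED_CAPACITY_SCALE. A gap longer than a tick restarts the ramp via
 * sugov_iowait_reset().
 */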

/**
 * sugov_iowait_apply() - Apply the IO boost to a CPU.
 * @sg_cpu: the sugov data for the cpu to boost
 * @time: the update time from the caller
 * @max_cap: the max CPU capacity
 *
 * A CPU running a task which has woken up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * sugov_iowait_boost(), and it is instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which also appears to have been idle for at least one tick has its
 * IO boost utilization reset as well.
 *
 * This mechanism is designed to boost tasks that frequently wait on IO, while
 * being more conservative on tasks doing only sporadic IO operations.
 */
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
					unsigned long max_cap)
{
	/* No boost currently required */
	if (!sg_cpu->iowait_boost)
		return 0;

	/* Reset boost if the CPU appears to have been idle enough */
	if (sugov_iowait_reset(sg_cpu, time, false))
		return 0;

	if (!sg_cpu->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		sg_cpu->iowait_boost >>= 1;
		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
			sg_cpu->iowait_boost = 0;
			return 0;
		}
	}

	sg_cpu->iowait_boost_pending = false;

	/*
	 * sg_cpu->util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}
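
/*
 * Worked example (editorial, values assumed, SCHED_CAPACITY_SHIFT = 10): on
 * a CPU with max_cap = 512 and iowait_boost = 256, the returned boost is
 * (256 * 512) >> 10 = 128, i.e. the boost is rescaled to that CPU's capacity
 * before being compared with sg_cpu->util.
 */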

#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
	unsigned long idle_calls;
	bool ret;

	/*
	 * The heuristic in this function is for the fair class. For SCX, the
	 * performance target comes directly from the BPF scheduler. Let's just
	 * follow it.
	 */
	if (scx_switched_all())
		return false;

	/* if capped by uclamp_max, always update to be in compliance */
	if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
		return false;

	/*
	 * Maintain the frequency if the CPU has not been idle recently, as
	 * reduction is likely to be premature.
	 */
	idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
	ret = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;
	return ret;
}
#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* !CONFIG_NO_HZ_COMMON */

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
		sg_cpu->sg_policy->need_freq_update = true;
}

static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
					      u64 time, unsigned long max_cap,
					      unsigned int flags)
{
	unsigned long boost;

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu);

	if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
		return false;

	boost = sugov_iowait_apply(sg_cpu, time, max_cap);
	sugov_get_util(sg_cpu, boost);

	return true;
}

static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
				     unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int cached_freq = sg_policy->cached_raw_freq;
	unsigned long max_cap;
	unsigned int next_f;

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

	next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);

	if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
	    !sg_policy->need_freq_update) {
		next_f = sg_policy->next_freq;

		/* Restore cached freq as next_freq has changed */
		sg_policy->cached_raw_freq = cached_freq;
	}

	if (!sugov_update_next_freq(sg_policy, time, next_f))
		return;

	/*
	 * This code runs under rq->lock for the target CPU, so it won't run
	 * concurrently on two different CPUs for the same target and it is not
	 * necessary to acquire the lock in the fast switch case.
	 */
	if (sg_policy->policy->fast_switch_enabled) {
		cpufreq_driver_fast_switch(sg_policy->policy, next_f);
	} else {
		raw_spin_lock(&sg_policy->update_lock);
		sugov_deferred_update(sg_policy);
		raw_spin_unlock(&sg_policy->update_lock);
	}
}

static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
				     unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	unsigned long prev_util = sg_cpu->util;
	unsigned long max_cap;

	/*
	 * Fall back to the "frequency" path if frequency invariance is not
	 * supported, because the direct mapping between the utilization and
	 * the performance levels depends on the frequency invariance.
	 */
	if (!arch_scale_freq_invariant()) {
		sugov_update_single_freq(hook, time, flags);
		return;
	}

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
		return;

	if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
		sg_cpu->util = prev_util;

	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
				   sg_cpu->util, max_cap);

	sg_cpu->sg_policy->last_freq_update_time = time;
}

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util = 0, max_cap;
	unsigned int j;

	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
		unsigned long boost;

		boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
		sugov_get_util(j_sg_cpu, boost);

		util = max(j_sg_cpu->util, util);
	}

	return get_next_freq(sg_policy, util, max_cap);
}
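
/*
 * Illustrative example (editorial, values assumed): for a shared policy
 * covering CPUs with per-CPU utilization targets of 200, 512 and 300, the
 * policy-wide request uses the maximum, util = 512, so the frequency is
 * chosen to satisfy the busiest CPU in the policy.
 */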

static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned int next_f;

	raw_spin_lock(&sg_policy->update_lock);

	sugov_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	ignore_dl_rate_limit(sg_cpu);

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, time);

		if (!sugov_update_next_freq(sg_policy, time, next_f))
			goto unlock;

		if (sg_policy->policy->fast_switch_enabled)
			cpufreq_driver_fast_switch(sg_policy->policy, next_f);
		else
			sugov_deferred_update(sg_policy);
	}
unlock:
	raw_spin_unlock(&sg_policy->update_lock);
}

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold sg_policy->update_lock shortly to handle the case where:
	 * if sg_policy->next_freq is read here and then updated by
	 * sugov_deferred_update() just before work_in_progress is set to false
	 * here, we may miss queueing the new update.
	 *
	 * Note: If a work was queued after the update_lock is released,
	 * sugov_work() will just be called again by kthread_work code; and the
	 * request will be processed before the sugov thread sleeps.
	 */
	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
	freq = sg_policy->next_freq;
	sg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);
}

static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}

/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);

static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}

static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attrs[] = {
	&rate_limit_us.attr,
	NULL
};
ATTRIBUTE_GROUPS(sugov);
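
/*
 * Illustrative usage (editorial; assuming per-policy tunables, the exact
 * sysfs path may vary):
 *
 *   # cat /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *   2000
 *   # echo 5000 > /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *
 * The store callback above converts the new value to nanoseconds and applies
 * it to every policy attached to this tunables set.
 */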

static void sugov_tunables_free(struct kobject *kobj)
{
	struct gov_attr_set *attr_set = to_gov_attr_set(kobj);

	kfree(to_sugov_tunables(attr_set));
}

static const struct kobj_type sugov_tunables_ktype = {
	.default_groups = sugov_groups,
	.sysfs_ops = &governor_sysfs_ops,
	.release = &sugov_tunables_free,
};

/********************** cpufreq governor interface *********************/

static struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}

static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size		= sizeof(struct sched_attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_flags	= SCHED_FLAG_SUGOV,
		.sched_nice	= 0,
		.sched_priority	= 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
		 */
		.sched_runtime	= NSEC_PER_MSEC,
		.sched_deadline	= 10 * NSEC_PER_MSEC,
		.sched_period	= 10 * NSEC_PER_MSEC,
	};
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setattr_nocheck(thread, &attr);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	if (policy->dvfs_possible_from_any_cpu)
		set_cpus_allowed_ptr(thread, policy->related_cpus);
	else
		kthread_bind_mask(thread, policy->related_cpus);

	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}

static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}

static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_clear_global_tunables(void)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;
}

static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);

	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	/*
	 * Schedutil is the preferred governor for EAS, so rebuild sched domains
	 * on governor changes to make sure the scheduler knows about them.
	 */
	em_rebuild_sched_domains();
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	kobject_put(&tunables->attr_set.kobj);
	policy->governor_data = NULL;
	sugov_clear_global_tunables();

stop_kthread:
	sugov_kthread_stop(sg_policy);
	mutex_unlock(&global_tunables_lock);

free_sg_policy:
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}

static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_clear_global_tunables();

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);

	em_rebuild_sched_domains();
}

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
	unsigned int cpu;

	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time	= 0;
	sg_policy->next_freq			= 0;
	sg_policy->work_in_progress		= false;
	sg_policy->limits_changed		= false;
	sg_policy->cached_raw_freq		= 0;

	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);

	if (policy_is_shared(policy))
		uu = sugov_update_shared;
	else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
		uu = sugov_update_single_perf;
	else
		uu = sugov_update_single_freq;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu = cpu;
		sg_cpu->sg_policy = sg_policy;
		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
	}
	return 0;
}

static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}

static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	/*
	 * The limits_changed update below must take place before the updates
	 * of policy limits in cpufreq_set_policy() or a policy limits update
	 * might be missed, so use a memory barrier to ensure it.
	 *
	 * This pairs with the memory barrier in sugov_should_update_freq().
	 */
	smp_wmb();

	WRITE_ONCE(sg_policy->limits_changed, true);
}

static struct cpufreq_governor schedutil_gov = {
	.name			= "schedutil",
	.owner			= THIS_MODULE,
	.flags			= CPUFREQ_GOV_DYNAMIC_SWITCHING,
	.init			= sugov_init,
	.exit			= sugov_exit,
	.start			= sugov_start,
	.stop			= sugov_stop,
	.limits			= sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif

bool sugov_is_governor(struct cpufreq_policy *policy)
{
	return policy->governor == &schedutil_gov;
}

cpufreq_governor_init(schedutil_gov);