xref: /linux/drivers/thermal/intel/intel_powerclamp.c (revision a970ed18812d0cf5e1f54401403300bb35b36433)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * intel_powerclamp.c - package c-state idle injection
4  *
5  * Copyright (c) 2012-2023, Intel Corporation.
6  *
7  * Authors:
8  *     Arjan van de Ven <arjan@linux.intel.com>
9  *     Jacob Pan <jacob.jun.pan@linux.intel.com>
10  *
11  *	TODO:
12  *           1. better handle wakeup from external interrupts, currently a fixed
13  *              compensation is added to clamping duration when excessive amount
14  *              of wakeups are observed during idle time. the reason is that in
15  *              case of external interrupts without need for ack, clamping down
16  *              cpu in non-irq context does not reduce irq. for majority of the
17  *              cases, clamping down cpu does help reduce irq as well, we should
18  *              be able to differentiate the two cases and give a quantitative
19  *              solution for the irqs that we can control. perhaps based on
20  *              get_cpu_iowait_time_us()
21  *
22  *	     2. synchronization with other hw blocks
23  */
24 
25 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
26 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/math64.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/idle_inject.h>

#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
39 
/*
 * Upper limit for the idle ratio, in percent. Also the number of entries
 * in the per-ratio calibration table and the max state reported to the
 * thermal layer.
 */
#define MAX_TARGET_RATIO (100U)

/*
 * For each undisturbed clamping period (no extra wake ups during idle
 * time), the confidence counter of the given target ratio is incremented.
 * CONFIDENCE_OK is the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)

/*
 * Default idle injection duration. The driver adjusts the run time to
 * meet the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
51 
52 static struct dentry *debug_dir;
53 static bool poll_pkg_cstate_enable;
54 
55 /* Idle ratio observed using package C-state counters */
56 static unsigned int current_ratio;
57 
58 /* Skip the idle injection till set to true */
59 static bool should_skip;
60 
61 struct powerclamp_data {
62 	unsigned int cpu;
63 	unsigned int count;
64 	unsigned int guard;
65 	unsigned int window_size_now;
66 	unsigned int target_ratio;
67 	bool clamping;
68 };
69 
70 static struct powerclamp_data powerclamp_data;
71 
72 static struct thermal_cooling_device *cooling_dev;
73 
74 static DEFINE_MUTEX(powerclamp_lock);
75 
76 /* This duration is in microseconds */
77 static unsigned int duration;
78 static unsigned int pkg_cstate_ratio_cur;
79 static unsigned int window_size;
80 
duration_set(const char * arg,const struct kernel_param * kp)81 static int duration_set(const char *arg, const struct kernel_param *kp)
82 {
83 	int ret = 0;
84 	unsigned long new_duration;
85 
86 	ret = kstrtoul(arg, 10, &new_duration);
87 	if (ret)
88 		goto exit;
89 	if (new_duration > 25 || new_duration < 6) {
90 		pr_err("Out of recommended range %lu, between 6-25ms\n",
91 			new_duration);
92 		ret = -EINVAL;
93 		goto exit;
94 	}
95 
96 	mutex_lock(&powerclamp_lock);
97 	duration = clamp(new_duration, 6ul, 25ul) * 1000;
98 	mutex_unlock(&powerclamp_lock);
99 exit:
100 
101 	return ret;
102 }
103 
duration_get(char * buf,const struct kernel_param * kp)104 static int duration_get(char *buf, const struct kernel_param *kp)
105 {
106 	int ret;
107 
108 	mutex_lock(&powerclamp_lock);
109 	ret = sysfs_emit(buf, "%d\n", duration / 1000);
110 	mutex_unlock(&powerclamp_lock);
111 
112 	return ret;
113 }
114 
115 static const struct kernel_param_ops duration_ops = {
116 	.set = duration_set,
117 	.get = duration_get,
118 };
119 
120 module_param_cb(duration, &duration_ops, NULL, 0644);
121 MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
122 
123 #define DEFAULT_MAX_IDLE	50
124 #define MAX_ALL_CPU_IDLE	75
125 
126 static u8 max_idle = DEFAULT_MAX_IDLE;
127 
128 static cpumask_var_t idle_injection_cpu_mask;
129 
allocate_copy_idle_injection_mask(const struct cpumask * copy_mask)130 static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask)
131 {
132 	if (cpumask_available(idle_injection_cpu_mask))
133 		goto copy_mask;
134 
135 	/* This mask is allocated only one time and freed during module exit */
136 	if (!alloc_cpumask_var(&idle_injection_cpu_mask, GFP_KERNEL))
137 		return -ENOMEM;
138 
139 copy_mask:
140 	cpumask_copy(idle_injection_cpu_mask, copy_mask);
141 
142 	return 0;
143 }
144 
145 /* Return true if the cpumask and idle percent combination is invalid */
check_invalid(cpumask_var_t mask,u8 idle)146 static bool check_invalid(cpumask_var_t mask, u8 idle)
147 {
148 	if (cpumask_equal(cpu_present_mask, mask) && idle > MAX_ALL_CPU_IDLE)
149 		return true;
150 
151 	return false;
152 }
153 
cpumask_set(const char * arg,const struct kernel_param * kp)154 static int cpumask_set(const char *arg, const struct kernel_param *kp)
155 {
156 	cpumask_var_t new_mask;
157 	int ret;
158 
159 	mutex_lock(&powerclamp_lock);
160 
161 	/* Can't set mask when cooling device is in use */
162 	if (powerclamp_data.clamping) {
163 		ret = -EAGAIN;
164 		goto skip_cpumask_set;
165 	}
166 
167 	ret = alloc_cpumask_var(&new_mask, GFP_KERNEL);
168 	if (!ret)
169 		goto skip_cpumask_set;
170 
171 	ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
172 			   nr_cpumask_bits);
173 	if (ret)
174 		goto free_cpumask_set;
175 
176 	if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
177 		ret = -EINVAL;
178 		goto free_cpumask_set;
179 	}
180 
181 	/*
182 	 * When module parameters are passed from kernel command line
183 	 * during insmod, the module parameter callback is called
184 	 * before powerclamp_init(), so we can't assume that some
185 	 * cpumask can be allocated and copied before here. Also
186 	 * in this case this cpumask is used as the default mask.
187 	 */
188 	ret = allocate_copy_idle_injection_mask(new_mask);
189 
190 free_cpumask_set:
191 	free_cpumask_var(new_mask);
192 skip_cpumask_set:
193 	mutex_unlock(&powerclamp_lock);
194 
195 	return ret;
196 }
197 
cpumask_get(char * buf,const struct kernel_param * kp)198 static int cpumask_get(char *buf, const struct kernel_param *kp)
199 {
200 	if (!cpumask_available(idle_injection_cpu_mask))
201 		return -ENODEV;
202 
203 	return cpumap_print_to_pagebuf(false, buf, idle_injection_cpu_mask);
204 }
205 
206 static const struct kernel_param_ops cpumask_ops = {
207 	.set = cpumask_set,
208 	.get = cpumask_get,
209 };
210 
211 module_param_cb(cpumask, &cpumask_ops, NULL, 0644);
212 MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection.");
213 
max_idle_set(const char * arg,const struct kernel_param * kp)214 static int max_idle_set(const char *arg, const struct kernel_param *kp)
215 {
216 	u8 new_max_idle;
217 	int ret = 0;
218 
219 	mutex_lock(&powerclamp_lock);
220 
221 	/* Can't set mask when cooling device is in use */
222 	if (powerclamp_data.clamping) {
223 		ret = -EAGAIN;
224 		goto skip_limit_set;
225 	}
226 
227 	ret = kstrtou8(arg, 10, &new_max_idle);
228 	if (ret)
229 		goto skip_limit_set;
230 
231 	if (new_max_idle > MAX_TARGET_RATIO) {
232 		ret = -EINVAL;
233 		goto skip_limit_set;
234 	}
235 
236 	if (!cpumask_available(idle_injection_cpu_mask)) {
237 		ret = allocate_copy_idle_injection_mask(cpu_present_mask);
238 		if (ret)
239 			goto skip_limit_set;
240 	}
241 
242 	if (check_invalid(idle_injection_cpu_mask, new_max_idle)) {
243 		ret = -EINVAL;
244 		goto skip_limit_set;
245 	}
246 
247 	max_idle = new_max_idle;
248 
249 skip_limit_set:
250 	mutex_unlock(&powerclamp_lock);
251 
252 	return ret;
253 }
254 
255 static const struct kernel_param_ops max_idle_ops = {
256 	.set = max_idle_set,
257 	.get = param_get_byte,
258 };
259 
260 module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
261 MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");
262 
263 struct powerclamp_calibration_data {
264 	unsigned long confidence;  /* used for calibration, basically a counter
265 				    * gets incremented each time a clamping
266 				    * period is completed without extra wakeups
267 				    * once that counter is reached given level,
268 				    * compensation is deemed usable.
269 				    */
270 	unsigned long steady_comp; /* steady state compensation used when
271 				    * no extra wakeups occurred.
272 				    */
273 	unsigned long dynamic_comp; /* compensate excessive wakeup from idle
274 				     * mostly from external interrupts.
275 				     */
276 };
277 
278 static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
279 
window_size_set(const char * arg,const struct kernel_param * kp)280 static int window_size_set(const char *arg, const struct kernel_param *kp)
281 {
282 	int ret = 0;
283 	unsigned long new_window_size;
284 
285 	ret = kstrtoul(arg, 10, &new_window_size);
286 	if (ret)
287 		goto exit_win;
288 	if (new_window_size > 10 || new_window_size < 2) {
289 		pr_err("Out of recommended window size %lu, between 2-10\n",
290 			new_window_size);
291 		ret = -EINVAL;
292 	}
293 
294 	window_size = clamp(new_window_size, 2ul, 10ul);
295 	smp_mb();
296 
297 exit_win:
298 
299 	return ret;
300 }
301 
302 static const struct kernel_param_ops window_size_ops = {
303 	.set = window_size_set,
304 	.get = param_get_int,
305 };
306 
307 module_param_cb(window_size, &window_size_ops, &window_size, 0644);
308 MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
309 	"\tpowerclamp controls idle ratio within this window. larger\n"
310 	"\twindow size results in slower response time but more smooth\n"
311 	"\tclamping results. default to 2.");
312 
313 struct pkg_cstate_info {
314 	bool skip;
315 	int msr_index;
316 	int cstate_id;
317 };
318 
319 #define PKG_CSTATE_INIT(id) {				\
320 		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
321 		.cstate_id = id				\
322 			}
323 
324 static struct pkg_cstate_info pkg_cstates[] = {
325 	PKG_CSTATE_INIT(2),
326 	PKG_CSTATE_INIT(3),
327 	PKG_CSTATE_INIT(6),
328 	PKG_CSTATE_INIT(7),
329 	PKG_CSTATE_INIT(8),
330 	PKG_CSTATE_INIT(9),
331 	PKG_CSTATE_INIT(10),
332 	{NULL},
333 };
334 
has_pkg_state_counter(void)335 static bool has_pkg_state_counter(void)
336 {
337 	u64 val;
338 	struct pkg_cstate_info *info = pkg_cstates;
339 
340 	/* check if any one of the counter msrs exists */
341 	while (info->msr_index) {
342 		if (!rdmsrq_safe(info->msr_index, &val))
343 			return true;
344 		info++;
345 	}
346 
347 	return false;
348 }
349 
pkg_state_counter(void)350 static u64 pkg_state_counter(void)
351 {
352 	u64 val;
353 	u64 count = 0;
354 	struct pkg_cstate_info *info = pkg_cstates;
355 
356 	while (info->msr_index) {
357 		if (!info->skip) {
358 			if (!rdmsrq_safe(info->msr_index, &val))
359 				count += val;
360 			else
361 				info->skip = true;
362 		}
363 		info++;
364 	}
365 
366 	return count;
367 }
368 
get_compensation(int ratio)369 static unsigned int get_compensation(int ratio)
370 {
371 	unsigned int comp = 0;
372 
373 	if (!poll_pkg_cstate_enable)
374 		return 0;
375 
376 	/* we only use compensation if all adjacent ones are good */
377 	if (ratio == 1 &&
378 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
379 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
380 		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
381 		comp = (cal_data[ratio].steady_comp +
382 			cal_data[ratio + 1].steady_comp +
383 			cal_data[ratio + 2].steady_comp) / 3;
384 	} else if (ratio == MAX_TARGET_RATIO - 1 &&
385 		cal_data[ratio].confidence >= CONFIDENCE_OK &&
386 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
387 		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
388 		comp = (cal_data[ratio].steady_comp +
389 			cal_data[ratio - 1].steady_comp +
390 			cal_data[ratio - 2].steady_comp) / 3;
391 	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
392 		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
393 		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
394 		comp = (cal_data[ratio].steady_comp +
395 			cal_data[ratio - 1].steady_comp +
396 			cal_data[ratio + 1].steady_comp) / 3;
397 	}
398 
399 	/* do not exceed limit */
400 	if (comp + ratio >= MAX_TARGET_RATIO)
401 		comp = MAX_TARGET_RATIO - ratio - 1;
402 
403 	return comp;
404 }
405 
adjust_compensation(int target_ratio,unsigned int win)406 static void adjust_compensation(int target_ratio, unsigned int win)
407 {
408 	int delta;
409 	struct powerclamp_calibration_data *d = &cal_data[target_ratio];
410 
411 	/*
412 	 * adjust compensations if confidence level has not been reached.
413 	 */
414 	if (d->confidence >= CONFIDENCE_OK)
415 		return;
416 
417 	delta = powerclamp_data.target_ratio - current_ratio;
418 	/* filter out bad data */
419 	if (delta >= 0 && delta <= (1+target_ratio/10)) {
420 		if (d->steady_comp)
421 			d->steady_comp =
422 				roundup(delta+d->steady_comp, 2)/2;
423 		else
424 			d->steady_comp = delta;
425 		d->confidence++;
426 	}
427 }
428 
powerclamp_adjust_controls(unsigned int target_ratio,unsigned int guard,unsigned int win)429 static bool powerclamp_adjust_controls(unsigned int target_ratio,
430 				unsigned int guard, unsigned int win)
431 {
432 	static u64 msr_last, tsc_last;
433 	u64 msr_now, tsc_now;
434 	u64 val64;
435 
436 	/* check result for the last window */
437 	msr_now = pkg_state_counter();
438 	tsc_now = rdtsc();
439 
440 	/* calculate pkg cstate vs tsc ratio */
441 	if (!msr_last || !tsc_last)
442 		current_ratio = 1;
443 	else if (tsc_now-tsc_last) {
444 		val64 = 100*(msr_now-msr_last);
445 		do_div(val64, (tsc_now-tsc_last));
446 		current_ratio = val64;
447 	}
448 
449 	/* update record */
450 	msr_last = msr_now;
451 	tsc_last = tsc_now;
452 
453 	adjust_compensation(target_ratio, win);
454 
455 	/* if we are above target+guard, skip */
456 	return powerclamp_data.target_ratio + guard <= current_ratio;
457 }
458 
459 /*
460  * This function calculates runtime from the current target ratio.
461  * This function gets called under powerclamp_lock.
462  */
get_run_time(void)463 static unsigned int get_run_time(void)
464 {
465 	unsigned int compensated_ratio;
466 	unsigned int runtime;
467 
468 	/*
469 	 * make sure user selected ratio does not take effect until
470 	 * the next round. adjust target_ratio if user has changed
471 	 * target such that we can converge quickly.
472 	 */
473 	powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
474 	powerclamp_data.window_size_now = window_size;
475 
476 	/*
477 	 * systems may have different ability to enter package level
478 	 * c-states, thus we need to compensate the injected idle ratio
479 	 * to achieve the actual target reported by the HW.
480 	 */
481 	compensated_ratio = powerclamp_data.target_ratio +
482 		get_compensation(powerclamp_data.target_ratio);
483 	if (compensated_ratio <= 0)
484 		compensated_ratio = 1;
485 
486 	runtime = duration * 100 / compensated_ratio - duration;
487 
488 	return runtime;
489 }
490 
491 /*
492  * 1 HZ polling while clamping is active, useful for userspace
493  * to monitor actual idle ratio.
494  */
495 static void poll_pkg_cstate(struct work_struct *dummy);
496 static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
poll_pkg_cstate(struct work_struct * dummy)497 static void poll_pkg_cstate(struct work_struct *dummy)
498 {
499 	static u64 msr_last;
500 	static u64 tsc_last;
501 
502 	u64 msr_now;
503 	u64 tsc_now;
504 	u64 val64;
505 
506 	msr_now = pkg_state_counter();
507 	tsc_now = rdtsc();
508 
509 	/* calculate pkg cstate vs tsc ratio */
510 	if (!msr_last || !tsc_last)
511 		pkg_cstate_ratio_cur = 1;
512 	else {
513 		if (tsc_now - tsc_last) {
514 			val64 = 100 * (msr_now - msr_last);
515 			do_div(val64, (tsc_now - tsc_last));
516 			pkg_cstate_ratio_cur = val64;
517 		}
518 	}
519 
520 	/* update record */
521 	msr_last = msr_now;
522 	tsc_last = tsc_now;
523 
524 	mutex_lock(&powerclamp_lock);
525 	if (powerclamp_data.clamping)
526 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
527 	mutex_unlock(&powerclamp_lock);
528 }
529 
530 static struct idle_inject_device *ii_dev;
531 
532 /*
533  * This function is called from idle injection core on timer expiry
534  * for the run duration. This allows powerclamp to readjust or skip
535  * injecting idle for this cycle.
536  */
idle_inject_update(void)537 static bool idle_inject_update(void)
538 {
539 	bool update = false;
540 
541 	/* We can't sleep in this callback */
542 	if (!mutex_trylock(&powerclamp_lock))
543 		return true;
544 
545 	if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {
546 
547 		should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio,
548 							 powerclamp_data.guard,
549 							 powerclamp_data.window_size_now);
550 		update = true;
551 	}
552 
553 	if (update) {
554 		unsigned int runtime = get_run_time();
555 
556 		idle_inject_set_duration(ii_dev, runtime, duration);
557 	}
558 
559 	powerclamp_data.count++;
560 
561 	mutex_unlock(&powerclamp_lock);
562 
563 	if (should_skip)
564 		return false;
565 
566 	return true;
567 }
568 
569 /* This function starts idle injection by calling idle_inject_start() */
trigger_idle_injection(void)570 static void trigger_idle_injection(void)
571 {
572 	unsigned int runtime = get_run_time();
573 
574 	idle_inject_set_duration(ii_dev, runtime, duration);
575 	idle_inject_start(ii_dev);
576 	powerclamp_data.clamping = true;
577 }
578 
579 /*
580  * This function is called from start_power_clamp() to register
581  * CPUS with powercap idle injection register and set default
582  * idle duration and latency.
583  */
powerclamp_idle_injection_register(void)584 static int powerclamp_idle_injection_register(void)
585 {
586 	poll_pkg_cstate_enable = false;
587 	if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
588 		ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update);
589 		if (topology_max_packages() == 1 && topology_max_dies_per_package() == 1)
590 			poll_pkg_cstate_enable = true;
591 	} else {
592 		ii_dev = idle_inject_register(idle_injection_cpu_mask);
593 	}
594 
595 	if (!ii_dev) {
596 		pr_err("powerclamp: idle_inject_register failed\n");
597 		return -EAGAIN;
598 	}
599 
600 	idle_inject_set_duration(ii_dev, TICK_USEC, duration);
601 	idle_inject_set_latency(ii_dev, UINT_MAX);
602 
603 	return 0;
604 }
605 
606 /*
607  * This function is called from end_power_clamp() to stop idle injection
608  * and unregister CPUS from powercap idle injection core.
609  */
remove_idle_injection(void)610 static void remove_idle_injection(void)
611 {
612 	if (!powerclamp_data.clamping)
613 		return;
614 
615 	powerclamp_data.clamping = false;
616 	idle_inject_stop(ii_dev);
617 }
618 
619 /*
620  * This function is called when user change the cooling device
621  * state from zero to some other value.
622  */
start_power_clamp(void)623 static int start_power_clamp(void)
624 {
625 	int ret;
626 
627 	ret = powerclamp_idle_injection_register();
628 	if (!ret) {
629 		trigger_idle_injection();
630 		if (poll_pkg_cstate_enable)
631 			schedule_delayed_work(&poll_pkg_cstate_work, 0);
632 	}
633 
634 	return ret;
635 }
636 
637 /*
638  * This function is called when user change the cooling device
639  * state from non zero value zero.
640  */
end_power_clamp(void)641 static void end_power_clamp(void)
642 {
643 	if (powerclamp_data.clamping) {
644 		remove_idle_injection();
645 		idle_inject_unregister(ii_dev);
646 	}
647 }
648 
powerclamp_get_max_state(struct thermal_cooling_device * cdev,unsigned long * state)649 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
650 				 unsigned long *state)
651 {
652 	*state = MAX_TARGET_RATIO;
653 
654 	return 0;
655 }
656 
powerclamp_get_cur_state(struct thermal_cooling_device * cdev,unsigned long * state)657 static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
658 				 unsigned long *state)
659 {
660 	mutex_lock(&powerclamp_lock);
661 	*state = powerclamp_data.target_ratio;
662 	mutex_unlock(&powerclamp_lock);
663 
664 	return 0;
665 }
666 
powerclamp_set_cur_state(struct thermal_cooling_device * cdev,unsigned long new_target_ratio)667 static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
668 				 unsigned long new_target_ratio)
669 {
670 	int ret = 0;
671 
672 	mutex_lock(&powerclamp_lock);
673 
674 	new_target_ratio = clamp(new_target_ratio, 0UL,
675 				(unsigned long) (max_idle - 1));
676 
677 	if (powerclamp_data.target_ratio == new_target_ratio)
678 		goto exit_set;
679 
680 	if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
681 		pr_info("Start idle injection to reduce power\n");
682 		powerclamp_data.target_ratio = new_target_ratio;
683 		ret = start_power_clamp();
684 		if (ret)
685 			powerclamp_data.target_ratio = 0;
686 		goto exit_set;
687 	} else	if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) {
688 		pr_info("Stop forced idle injection\n");
689 		end_power_clamp();
690 		powerclamp_data.target_ratio = 0;
691 	} else	/* adjust currently running */ {
692 		unsigned int runtime;
693 
694 		powerclamp_data.target_ratio = new_target_ratio;
695 		runtime = get_run_time();
696 		idle_inject_set_duration(ii_dev, runtime, duration);
697 	}
698 
699 exit_set:
700 	mutex_unlock(&powerclamp_lock);
701 
702 	return ret;
703 }
704 
705 /* bind to generic thermal layer as cooling device*/
706 static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
707 	.get_max_state = powerclamp_get_max_state,
708 	.get_cur_state = powerclamp_get_cur_state,
709 	.set_cur_state = powerclamp_set_cur_state,
710 };
711 
712 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
713 	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
714 	{}
715 };
716 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
717 
powerclamp_probe(void)718 static int __init powerclamp_probe(void)
719 {
720 
721 	if (!x86_match_cpu(intel_powerclamp_ids)) {
722 		pr_err("CPU does not support MWAIT\n");
723 		return -ENODEV;
724 	}
725 
726 	/* The goal for idle time alignment is to achieve package cstate. */
727 	if (!has_pkg_state_counter()) {
728 		pr_info("No package C-state available\n");
729 		return -ENODEV;
730 	}
731 
732 	return 0;
733 }
734 
powerclamp_debug_show(struct seq_file * m,void * unused)735 static int powerclamp_debug_show(struct seq_file *m, void *unused)
736 {
737 	int i = 0;
738 
739 	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
740 	for (i = 0; i < MAX_TARGET_RATIO; i++) {
741 		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
742 			i,
743 			cal_data[i].confidence,
744 			cal_data[i].steady_comp,
745 			cal_data[i].dynamic_comp);
746 	}
747 
748 	return 0;
749 }
750 
751 DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);
752 
powerclamp_create_debug_files(void)753 static inline void powerclamp_create_debug_files(void)
754 {
755 	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
756 
757 	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
758 			    &powerclamp_debug_fops);
759 }
760 
powerclamp_init(void)761 static int __init powerclamp_init(void)
762 {
763 	int retval;
764 
765 	/* probe cpu features and ids here */
766 	retval = powerclamp_probe();
767 	if (retval)
768 		return retval;
769 
770 	mutex_lock(&powerclamp_lock);
771 	if (!cpumask_available(idle_injection_cpu_mask))
772 		retval = allocate_copy_idle_injection_mask(cpu_present_mask);
773 	mutex_unlock(&powerclamp_lock);
774 
775 	if (retval)
776 		return retval;
777 
778 	/* set default limit, maybe adjusted during runtime based on feedback */
779 	window_size = 2;
780 
781 	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
782 						      &powerclamp_cooling_ops);
783 	if (IS_ERR(cooling_dev))
784 		return -ENODEV;
785 
786 	if (!duration)
787 		duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES);
788 
789 	powerclamp_create_debug_files();
790 
791 	return 0;
792 }
793 module_init(powerclamp_init);
794 
powerclamp_exit(void)795 static void __exit powerclamp_exit(void)
796 {
797 	mutex_lock(&powerclamp_lock);
798 	end_power_clamp();
799 	mutex_unlock(&powerclamp_lock);
800 
801 	thermal_cooling_device_unregister(cooling_dev);
802 
803 	cancel_delayed_work_sync(&poll_pkg_cstate_work);
804 	debugfs_remove_recursive(debug_dir);
805 
806 	if (cpumask_available(idle_injection_cpu_mask))
807 		free_cpumask_var(idle_injection_cpu_mask);
808 }
809 module_exit(powerclamp_exit);
810 
811 MODULE_IMPORT_NS("IDLE_INJECT");
812 
813 MODULE_LICENSE("GPL");
814 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
815 MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
816 MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
817