// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent the overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value, hiperdispatch falls back to giving high
 * capacity only to the entitled CPUs. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs, running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (it can vary
 * between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as steal time on those COREs is less than 30%,
 * thereby delaying the throughput loss caused by using SMP threads.
 */
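
/*
 * For illustration: with the default HD_STEAL_THRESHOLD of 30, a
 * smoothed steal time of e.g. 12% keeps all online COREs at high
 * capacity, while e.g. 35% restricts high capacity to the entitled
 * (vertical high and medium) COREs only.
 */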

#include <linux/cpufeature.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

#define HD_DELAY_FACTOR		(4)
#define HD_DELAY_INTERVAL	(HZ / 4)
#define HD_STEAL_THRESHOLD	30
#define HD_STEAL_AVG_WEIGHT	16

static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
static int hd_online_cores;		/* Current online CORE count */

static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;	/* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;	/* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;	/* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);

static int hd_set_hiperdispatch_mode(int enable)
{
	if (!cpu_has_topology())
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}

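/* Clear hiperdispatch's CORE bookkeeping so it can be rebuilt via hd_add_core(). */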
void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}

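/*
 * Account an online CPU's CORE according to its polarization: vertical
 * high and medium COREs count as entitled, vertical medium and low CPUs
 * are collected in hd_vmvl_cpumask, and vertical low COREs are tracked
 * in hd_vl_coremask.
 */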
void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

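/*
 * Accumulate how long the system spent with all COREs at high capacity
 * ("greedy", hd_high_time) versus with vertical low COREs at low
 * capacity ("conservative", hd_low_time); both are exposed via debugfs.
 */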
static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * Check if hiperdispatch is active; if not, set prev to 0.
	 * This way it is possible to differentiate the first update iteration
	 * after enabling hiperdispatch.
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}

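/*
 * Apply the current capacity target during a topology update: the first
 * (hd_high_capacity_cores - hd_entitled_cores) vertical low COREs get
 * high capacity, the remaining vertical low COREs get low capacity, and
 * hd_high_capacity_cores is recounted to match what was actually set.
 */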
static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}

void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}

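/*
 * (Re)start hiperdispatch if it has something to do: update the time
 * counters and, when enabled and more COREs are online than entitled,
 * (re)arm the recurring work after an initial delay of
 * HD_DELAY_INTERVAL * hd_delay_factor and adjust CORE capacities.
 * Returns 1 if capacities were adjusted, 0 otherwise.
 */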
int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}

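/*
 * Exponential moving average of the steal percentage with weight
 * HD_STEAL_AVG_WEIGHT: avg = (avg * 15 + new) / 16, which smooths out
 * short spikes in steal time.
 */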
static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}

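/*
 * Calculate the average steal time percentage of the vertical medium
 * and low CPUs since the previous invocation, based on the delta of
 * their CPUTIME_STEAL counters over the elapsed wall-clock time.
 */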
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium and low CPUs, the steal time
	 * is 0, as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}

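/*
 * Recurring worker: compare the smoothed steal percentage against
 * hd_steal_threshold, pick the new high capacity CORE target (all
 * online COREs below the threshold, only the entitled COREs above it)
 * and schedule a topology update when the target changes. Re-arms
 * itself every HD_DELAY_INTERVAL.
 */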
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If the online cores are less than or equal to the entitled cores,
	 * hiperdispatch does not need to make any adjustments, so call a
	 * topology update to disable hiperdispatch.
	 * Normally this check is handled on topology update, but during CPU
	 * unplug, topology and cpu mask updates are done in reverse
	 * order, causing hd_enable_hiperdispatch() to get stale data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}

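/*
 * Sysctl handler for s390.hiperdispatch: accepts 0 or 1, and schedules
 * a topology update whenever the mode actually changes.
 */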
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname = ctl->procname,
		.data = &hiperdispatch,
		.maxlen = sizeof(int),
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}

static const struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname = "hiperdispatch",
		.mode = 0644,
		.proc_handler = hiperdispatch_ctl_handler,
	},
};

static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name = "hiperdispatch",
	.attrs = hd_attrs,
};

static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}
late_initcall(hd_init);