// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are being accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent the overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value, hiperdispatch falls back to giving high
 * capacities to entitled CPUs. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (can change
 * between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as steal time on those COREs is less than 30%,
 * therefore delaying the throughput loss caused by using SMP threads.
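 *
 * For example (illustrative numbers only): with HD_STEAL_THRESHOLD of
 * 30, an averaged steal time of 12% keeps every online CORE at high
 * capacity, while an average of 35% scales the vertical low COREs back
 * down so that only the entitled COREs keep high capacity.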
 */

#include <linux/cpufeature.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

#define HD_DELAY_FACTOR		(4)
#define HD_DELAY_INTERVAL	(HZ / 4)
#define HD_STEAL_THRESHOLD	30
#define HD_STEAL_AVG_WEIGHT	16

static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
static int hd_online_cores;		/* Current online CORE count */

static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;	/* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;	/* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;	/* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);

static int hd_set_hiperdispatch_mode(int enable)
{
	if (!cpu_has_topology())
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}

void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}

void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * Check if hiperdispatch is active; if not, set prev to 0.
	 * This way it is possible to differentiate the first update
	 * iteration after enabling hiperdispatch.
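	 *
	 * ktime_after(prev, 0) below is false for the zero sentinel, so
	 * the interval spanning a (re-)enable is accounted to neither
	 * counter.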
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}

static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}

void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}

int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}

static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}

static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium or low CPUs, steal time is 0,
	 * as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}

static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If online cores are less than or equal to entitled cores,
	 * hiperdispatch does not need to make any adjustments; call a
	 * topology update to disable hiperdispatch.
	 * Normally this check is handled on topology update, but during
	 * CPU hot unplug, topology and cpu mask updates are done in
	 * reverse order, causing hd_enable_hiperdispatch() to get stale
	 * data.
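	 *
	 * The steal value evaluated below is an exponential moving
	 * average (see hd_steal_avg()): with HD_STEAL_AVG_WEIGHT == 16,
	 * each sample contributes 1/16th, e.g. a single 100% spike on
	 * top of a 0% average only raises it to ~6%, so short bursts do
	 * not flip the capacity decision immediately.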
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}

static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname = ctl->procname,
		.data = &hiperdispatch,
		.maxlen = sizeof(int),
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}

static const struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname = "hiperdispatch",
		.mode = 0644,
		.proc_handler = hiperdispatch_ctl_handler,
	},
};

static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name = "hiperdispatch",
	.attrs = hd_attrs,
};

static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");
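
/*
 * Usage sketch (paths assumed, not taken from this file): the time
 * counters above are wired to debugfs files in
 * hd_create_debugfs_counters() below, typically visible under
 * /sys/kernel/debug/s390/hiperdispatch/, while hd_steal_threshold and
 * hd_delay_factor appear under the cpu subsystem root, typically
 * /sys/devices/system/cpu/hiperdispatch/. For example:
 *   cat /sys/kernel/debug/s390/hiperdispatch/greedy_time_ms
 *   echo 25 > /sys/devices/system/cpu/hiperdispatch/hd_steal_threshold
 */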

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}
late_initcall(hd_init);
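
/*
 * Runtime control sketch (the /proc path is assumed from the "s390"
 * sysctl table registered in hd_init(), not verified here):
 *   echo 1 > /proc/sys/s390/hiperdispatch
 * toggles hiperdispatch at runtime, complementing the
 * CONFIG_HIPERDISPATCH_ON build-time default.
 */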