Lines Matching +full:domain +full:- +full:idle +full:- +full:state

1 // SPDX-License-Identifier: GPL-2.0
3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
5 * Built-in idle CPU tracking policy.
14 /* Enable/disable built-in idle CPU selection policy */
17 /* Enable/disable per-node idle cpumasks */
28 * cpumasks to track idle CPUs within each NUMA node.
31 * a single global cpumask is used to track all the idle CPUs in the system.
39 * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
45 * Per-node idle cpumasks.
50 * Return the idle masks associated with a target @node.
52 * NUMA_NO_NODE identifies the global idle cpumask.
61 * per-node idle cpumasks are disabled.
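For orientation, the idle masks referenced by the hits above pair a plain CPU mask with an SMT (whole-core) mask, with one instance per NUMA node plus a global fallback. A minimal sketch of the data structure and lookup helper, consistent with the idle_cpumask(node)->cpu / ->smt and scx_idle_node_masks[node] lines elsewhere in this listing (the struct and global-mask names are assumptions, not quotes from the file):

struct scx_idle_cpus {
        cpumask_var_t cpu;      /* CPUs that are currently idle */
        cpumask_var_t smt;      /* CPUs whose whole SMT core is idle */
};

/* host-wide masks, used when SCX_OPS_BUILTIN_IDLE_PER_NODE is not set */
static struct scx_idle_cpus scx_idle_global_masks;

/* per-NUMA-node masks, used when SCX_OPS_BUILTIN_IDLE_PER_NODE is set */
static struct scx_idle_cpus **scx_idle_node_masks;

static struct scx_idle_cpus *idle_cpumask(int node)
{
        return node == NUMA_NO_NODE ? &scx_idle_global_masks :
                                      scx_idle_node_masks[node];
}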
74 struct cpumask *idle_cpus = idle_cpumask(node)->cpu; in scx_idle_test_and_clear_cpu()
79 * cluster is not wholly idle either way. This also prevents in scx_idle_test_and_clear_cpu()
84 struct cpumask *idle_smts = idle_cpumask(node)->smt; in scx_idle_test_and_clear_cpu()
89 * @cpu is never cleared from the idle SMT mask. Ensure that in scx_idle_test_and_clear_cpu()
107 * Pick an idle CPU in a specific NUMA node.
115 cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); in pick_idle_cpu_in_node()
120 return -EBUSY; in pick_idle_cpu_in_node()
123 cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); in pick_idle_cpu_in_node()
125 return -EBUSY; in pick_idle_cpu_in_node()
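Putting the matched lines together, the node-local pick prefers a CPU from a fully idle SMT core, falls back to any idle CPU unless SCX_PICK_IDLE_CORE forbids that, and retries if claiming the CPU races with another waker. A sketch of that flow (the retry/claim structure is reconstructed around the quoted lines, not copied verbatim):

static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags)
{
        int cpu;

retry:
        if (sched_smt_active()) {
                /* prefer CPUs whose whole core is idle */
                cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed);
                if (cpu < nr_cpu_ids)
                        goto found;

                if (flags & SCX_PICK_IDLE_CORE)
                        return -EBUSY;
        }

        cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
        if (cpu >= nr_cpu_ids)
                return -EBUSY;

found:
        /* claim the CPU; if another waker won the race, try again */
        if (scx_idle_test_and_clear_cpu(cpu))
                return cpu;
        goto retry;
}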
135 * Tracks nodes that have not yet been visited when searching for an idle
141 * Search for an idle CPU across all nodes, excluding @node.
146 s32 cpu = -EBUSY; in pick_idle_cpu_from_online_nodes()
165 * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU in pick_idle_cpu_from_online_nodes()
170 * in a per-node array, instead of actually traversing them every in pick_idle_cpu_from_online_nodes()
184 * Find an idle CPU in the system, starting from @node.
205 return -EBUSY; in scx_pick_idle_cpu()
214 * Return the number of CPUs in the same LLC domain as @cpu (or zero if the LLC
215 * domain is not defined).
225 return sd->span_weight; in llc_weight()
229 * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
230 * domain is not defined).
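The LLC helpers walk the scheduler topology through the per-CPU sd_llc pointer; a sketch consistent with the span_weight line above (exact locking and guards in the real file may differ):

static unsigned int llc_weight(s32 cpu)
{
        struct sched_domain *sd;

        sd = rcu_dereference(per_cpu(sd_llc, cpu));
        if (!sd)
                return 0;

        return sd->span_weight;
}

static const struct cpumask *llc_span(s32 cpu)
{
        struct sched_domain *sd;

        sd = rcu_dereference(per_cpu(sd_llc, cpu));
        if (!sd)
                return NULL;

        return sched_domain_span(sd);
}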
244 * Return the number of CPUs in the same NUMA domain as @cpu (or zero if the
245 * NUMA domain is not defined).
255 sg = sd->groups; in numa_weight()
259 return sg->group_weight; in numa_weight()
263 * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
264 * domain is not defined).
274 sg = sd->groups; in numa_span()
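The NUMA counterpart goes one step further and looks at the single sched_group of the per-CPU sd_numa domain, matching the sd->groups and group_weight lines above; a reconstruction sketch:

static unsigned int numa_weight(s32 cpu)
{
        struct sched_domain *sd;
        struct sched_group *sg;

        sd = rcu_dereference(per_cpu(sd_numa, cpu));
        if (!sd)
                return 0;
        sg = sd->groups;
        if (!sg)
                return 0;

        return sg->group_weight;
}

numa_span() performs the same lookup and, per the sg = sd->groups hit above, returns the cpumask of that node-spanning group (sched_group_span(sg)) instead of its weight.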
300 * - LLC 0: cpu0..cpu7 in llc_numa_mismatch()
301 * - LLC 1: cpu8..cpu15 [offline] in llc_numa_mismatch()
304 * - LLC 0: cpu16..cpu23 in llc_numa_mismatch()
305 * - LLC 1: cpu24..cpu31 in llc_numa_mismatch()
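The mismatch test illustrated by this example topology reduces to comparing the LLC weight and the NUMA weight of every online CPU; something along these lines (a reconstruction, not the verbatim function):

static bool llc_numa_mismatch(void)
{
        int cpu;

        /*
         * Check every online CPU: looking only at the first one (cpu0 in the
         * example above) could wrongly conclude that LLC and NUMA domains
         * fully overlap, even though NUMA 1 contains two distinct LLCs.
         */
        for_each_online_cpu(cpu)
                if (llc_weight(cpu) != numa_weight(cpu))
                        return true;

        return false;
}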
320 * Initialize topology-aware scheduling.
323 * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
327 * CPU belongs to a single LLC domain, and that each LLC domain is entirely
337 * Enable LLC domain optimization only when there are multiple LLC in scx_idle_update_selcpu_topology()
339 * single LLC domain, the idle CPU selection logic can choose any in scx_idle_update_selcpu_topology()
342 * Note that it is sufficient to check the LLC domain of the first in scx_idle_update_selcpu_topology()
343 * online CPU to determine whether a single LLC domain includes all in scx_idle_update_selcpu_topology()
360 * If all CPUs belong to the same NUMA node and the same LLC domain, in scx_idle_update_selcpu_topology()
362 * for an idle CPU in the same domain twice is redundant. in scx_idle_update_selcpu_topology()
365 * optimization, as we would naturally select idle CPUs within in scx_idle_update_selcpu_topology()
366 * specific NUMA nodes by querying the corresponding per-node cpumask. in scx_idle_update_selcpu_topology()
368 if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { in scx_idle_update_selcpu_topology()
379 pr_debug("sched_ext: LLC idle selection %s\n", in scx_idle_update_selcpu_topology()
381 pr_debug("sched_ext: NUMA idle selection %s\n", in scx_idle_update_selcpu_topology()
395 * Built-in CPU idle selection policy:
397 * 1. Prioritize full-idle cores:
398 * - always prioritize CPUs from fully idle cores (both logical CPUs are
399 * idle) to avoid interference caused by SMT.
402 * - prefer the last used CPU to take advantage of cached data (L1, L2) and
405 * 3. Pick a CPU within the same LLC (Last-Level Cache):
406 * - if the above conditions aren't met, pick a CPU that shares the same LLC
410 * - choose a CPU from the same NUMA node to reduce memory access latency.
412 * 5. Pick any idle CPU usable by the task.
422 * Return the picked CPU if idle, or a negative value otherwise.
440 * Determine the scheduling domain only if the task is allowed to run in scx_select_cpu_dfl()
444 * updating a cpumask every time we need to select an idle CPU (which in scx_select_cpu_dfl()
446 * if a task's scheduling domain is restricted by user-space (through in scx_select_cpu_dfl()
447 * CPU affinity), the task will simply use the flat scheduling domain in scx_select_cpu_dfl()
448 * defined by user-space. in scx_select_cpu_dfl()
450 if (p->nr_cpus_allowed >= num_possible_cpus()) { in scx_select_cpu_dfl()
465 * If the waker's CPU is cache affine and prev_cpu is idle, in scx_select_cpu_dfl()
483 * Checking only for the presence of idle CPUs is also in scx_select_cpu_dfl()
485 * piled up on it even if there is an idle core elsewhere on in scx_select_cpu_dfl()
489 if (!(current->flags & PF_EXITING) && in scx_select_cpu_dfl()
490 cpu_rq(cpu)->scx.local_dsq.nr == 0 && in scx_select_cpu_dfl()
492 !cpumask_empty(idle_cpumask(waker_node)->cpu)) { in scx_select_cpu_dfl()
493 if (cpumask_test_cpu(cpu, p->cpus_ptr)) in scx_select_cpu_dfl()
499 * If CPU has SMT, any wholly idle CPU is likely a better pick than in scx_select_cpu_dfl()
500 * partially idle @prev_cpu. in scx_select_cpu_dfl()
504 * Keep using @prev_cpu if it's part of a fully idle core. in scx_select_cpu_dfl()
506 if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && in scx_select_cpu_dfl()
513 * Search for any fully idle core in the same LLC domain. in scx_select_cpu_dfl()
522 * Search for any fully idle core in the same NUMA node. in scx_select_cpu_dfl()
531 * Search for any full-idle core usable by the task. in scx_select_cpu_dfl()
533 * If the node-aware idle CPU selection policy is enabled in scx_select_cpu_dfl()
538 cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE); in scx_select_cpu_dfl()
543 * Give up if we're strictly looking for a full-idle SMT in scx_select_cpu_dfl()
547 cpu = -EBUSY; in scx_select_cpu_dfl()
553 * Use @prev_cpu if it's idle. in scx_select_cpu_dfl()
561 * Search for any idle CPU in the same LLC domain. in scx_select_cpu_dfl()
570 * Search for any idle CPU in the same NUMA node. in scx_select_cpu_dfl()
579 * Search for any idle CPU usable by the task. in scx_select_cpu_dfl()
581 * If the node-aware idle CPU selection policy is enabled in scx_select_cpu_dfl()
586 cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); in scx_select_cpu_dfl()
595 * Initialize global and per-node idle cpumasks.
601 /* Allocate global idle cpumasks */ in scx_idle_init_masks()
605 /* Allocate per-node idle cpumasks */ in scx_idle_init_masks()
615 BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); in scx_idle_init_masks()
616 BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); in scx_idle_init_masks()
620 static void update_builtin_idle(int cpu, bool idle) in update_builtin_idle() argument
623 struct cpumask *idle_cpus = idle_cpumask(node)->cpu; in update_builtin_idle()
625 assign_cpu(cpu, idle_cpus, idle); in update_builtin_idle()
630 struct cpumask *idle_smts = idle_cpumask(node)->smt; in update_builtin_idle()
632 if (idle) { in update_builtin_idle()
635 * only for optimization and self-correcting. in update_builtin_idle()
648 * Update the idle state of a CPU to @idle.
651 * scheduler of an actual idle state transition (idle to busy or vice
652 * versa). If @do_notify is false, only the idle state in the idle masks is
655 * This distinction is necessary, because an idle CPU can be "reserved" and
658 * to idle without a true state transition. Refreshing the idle masks
659 * without invoking ops.update_idle() ensures accurate idle state tracking
660 * while avoiding unnecessary updates and maintaining balanced state
663 void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) in __scx_update_idle() argument
671 * the idle thread and vice versa. in __scx_update_idle()
673 * Idle transitions are indicated by do_notify being set to true, in __scx_update_idle()
677 SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); in __scx_update_idle()
680 * Update the idle masks: in __scx_update_idle()
681 * - for real idle transitions (do_notify == true) in __scx_update_idle()
682 * - for idle-to-idle transitions (indicated by the previous task in __scx_update_idle()
683 * being the idle thread, managed by pick_task_idle()) in __scx_update_idle()
685 * Skip updating idle masks if the previous task is not the idle in __scx_update_idle()
687 * transitioning from a task to the idle thread (calling this in __scx_update_idle()
690 * In this way we can avoid updating the idle masks twice, in __scx_update_idle()
694 if (do_notify || is_idle_task(rq->curr)) in __scx_update_idle()
695 update_builtin_idle(cpu, idle); in __scx_update_idle()
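From the BPF side, ops.update_idle() is the hook these notifications feed. A minimal sketch of a scheduler that mirrors the kernel's idle state into its own map, assuming the usual scx BPF skeleton (scx/common.bpf.h, BPF_STRUCT_OPS) and a hypothetical map name and size:

#include <scx/common.bpf.h>

/* hypothetical scheduler-private idle flags, one entry per possible CPU */
struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1024);
        __type(key, u32);
        __type(value, u8);
} my_idle_state SEC(".maps");

void BPF_STRUCT_OPS(myops_update_idle, s32 cpu, bool idle)
{
        u32 key = cpu;
        u8 *state = bpf_map_lookup_elem(&my_idle_state, &key);

        if (state)
                *state = idle;
}

As the scx_idle_enable() check further down in this listing shows, defining ops.update_idle() disables the built-in idle tracking unless SCX_OPS_KEEP_BUILTIN_IDLE is also set in ops->flags.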
703 * Consider all online cpus idle. Should converge to the actual state in reset_idle_masks()
706 if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { in reset_idle_masks()
707 cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); in reset_idle_masks()
708 cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); in reset_idle_masks()
715 cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); in reset_idle_masks()
716 cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); in reset_idle_masks()
723 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) in scx_idle_enable()
728 if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) in scx_idle_enable()
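The per-node behaviour is opted into from the BPF scheduler through the ops flag checked here; a sketch of how that looks in a scheduler's struct sched_ext_ops definition (the scheduler and operation names are placeholders):

SEC(".struct_ops")
struct sched_ext_ops numa_idle_ops = {
        .select_cpu     = (void *)numa_idle_select_cpu,
        .enqueue        = (void *)numa_idle_enqueue,
        .flags          = SCX_OPS_BUILTIN_IDLE_PER_NODE,
        .name           = "numa_idle",
};

With this flag set, the flat (NUMA-unaware) kfuncs further down, such as scx_bpf_pick_idle_cpu() and scx_bpf_pick_any_cpu(), fail with "per-node idle tracking is enabled" and the *_node variants must be used instead.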
751 scx_ops_error("per-node idle tracking is disabled"); in validate_node()
752 return -EOPNOTSUPP; in validate_node()
757 return -ENOENT; in validate_node()
762 return -EINVAL; in validate_node()
768 return -EINVAL; in validate_node()
781 scx_ops_error("built-in idle tracking is disabled"); in check_builtin_idle_enabled()
786 * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
803 * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
807 * @is_idle: out parameter indicating whether the returned CPU is idle
809 * Can only be called from ops.select_cpu() if the built-in CPU selection is
810 * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
814 * currently idle and thus a good candidate for direct dispatching.
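A typical ops.select_cpu() simply delegates to this default policy and dispatches directly when an idle CPU was found. A sketch, following the pattern in Documentation/scheduler/sched-ext.rst (the insertion kfunc name follows recent scx naming; older trees call it scx_bpf_dispatch()):

#include <scx/common.bpf.h>

s32 BPF_STRUCT_OPS(myops_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
        bool is_idle = false;
        s32 cpu;

        cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
        if (is_idle) {
                /* the picked CPU is idle: queue directly on its local DSQ */
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
        }

        return cpu;
}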
845 * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
846 * idle-tracking per-CPU cpumask of a target NUMA node.
849 * Returns an empty cpumask if idle tracking is not enabled, if @node is
860 return idle_cpumask(node)->cpu; in scx_bpf_get_idle_cpumask_node()
867 * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
868 * per-CPU cpumask.
870 * Returns an empty mask if idle tracking is not enabled, or running on a
884 return idle_cpumask(NUMA_NO_NODE)->cpu; in scx_bpf_get_idle_cpumask()
891 * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
892 * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
896 * Returns an empty cpumask if idle tracking is not enabled, if @node is
908 return idle_cpumask(node)->smt; in scx_bpf_get_idle_smtmask_node()
910 return idle_cpumask(node)->cpu; in scx_bpf_get_idle_smtmask_node()
917 * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
918 * per-physical-core cpumask. Can be used to determine if an entire physical
921 * Returns an empty mask if idle tracking is not enabled, or running on a
936 return idle_cpumask(NUMA_NO_NODE)->smt; in scx_bpf_get_idle_smtmask()
938 return idle_cpumask(NUMA_NO_NODE)->cpu; in scx_bpf_get_idle_smtmask()
945 * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
946 * either the percpu, or SMT idle-tracking cpumask.
953 * a reference to a global idle cpumask, which is read-only in the in scx_bpf_put_idle_cpumask()
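These kptr getters pair with scx_bpf_put_idle_cpumask(); a sketch of checking whether any CPU in a given mask is currently idle (the helper name and the use of the bpf_cpumask_intersects() kfunc are assumptions):

static bool any_idle_in(const struct cpumask *candidates)
{
        const struct cpumask *idle;
        bool ret;

        idle = scx_bpf_get_idle_cpumask();
        ret = bpf_cpumask_intersects(candidates, idle);
        scx_bpf_put_idle_cpumask(idle);

        return ret;
}

The same pattern works with scx_bpf_get_idle_smtmask() when only fully idle physical cores are of interest.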
960 * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
961 * @cpu: cpu to test and clear idle for
963 * Returns %true if @cpu was idle and its idle state was successfully cleared.
981 * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
986 * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
988 * Returns the picked idle cpu number on success, or -%EBUSY if no matching
1010 * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
1014 * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
1015 * number on success. -%EBUSY if no matching cpu was found.
1017 * Idle CPU tracking may race against CPU scheduling state transitions. For
1018 * example, this function may return -%EBUSY as CPUs are transitioning into the
1019 * idle state. If the caller then assumes that there will be dispatch events on
1035 scx_ops_error("per-node idle tracking is enabled"); in scx_bpf_pick_idle_cpu()
1036 return -EBUSY; in scx_bpf_pick_idle_cpu()
1040 return -EBUSY; in scx_bpf_pick_idle_cpu()
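Together with scx_bpf_test_and_clear_cpu_idle() above, this kfunc is the building block for hand-rolled idle selection. A sketch of a select_cpu path that first tries to reclaim @prev_cpu and then falls back to any idle CPU, assuming the default global idle tracking (with SCX_OPS_BUILTIN_IDLE_PER_NODE the *_node variants apply) and the recent scx_bpf_dsq_insert() naming:

#include <scx/common.bpf.h>

s32 BPF_STRUCT_OPS(custom_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
        s32 cpu;

        /* sticking to the previous CPU preserves cache warmth */
        if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
                return prev_cpu;
        }

        /* otherwise claim any idle CPU the task is allowed to run on */
        cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
        if (cpu >= 0) {
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
                return cpu;
        }

        /* no idle CPU was found: stay on the previous CPU without dispatching */
        return prev_cpu;
}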
1046 * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
1052 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
1053 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked CPU
1054 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
1060 * the CPU idle state).
1063 * set, this function can't tell which CPUs are idle and will always pick any
1086 return -EBUSY; in scx_bpf_pick_any_cpu_node()
1090 * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
1094 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
1095 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked CPU
1096 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
1100 * set, this function can't tell which CPUs are idle and will always pick any
1112 scx_ops_error("per-node idle tracking is enabled"); in scx_bpf_pick_any_cpu()
1113 return -EBUSY; in scx_bpf_pick_any_cpu()
1126 return -EBUSY; in scx_bpf_pick_any_cpu()
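A common use of this "idle if possible, otherwise anything" helper is picking a target CPU from ops.enqueue() when the task has to be placed somewhere even if nothing is idle. A sketch; the DSQ id, operation name, and scx_bpf_dsq_insert() are assumptions, while scx_bpf_kick_cpu() with SCX_KICK_IDLE is the documented kick interface:

#include <scx/common.bpf.h>

void BPF_STRUCT_OPS(myops_enqueue, struct task_struct *p, u64 enq_flags)
{
        s32 cpu;

        /* we need some CPU even when nothing is idle */
        cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
        if (cpu < 0)
                return;         /* cannot happen for a task's own cpumask; bail out defensively */

        /* queue on that CPU's local DSQ and wake it up if it is idle */
        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, enq_flags);
        scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}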