Lines Matching +full:slice +full:per +full:line

1 /* SPDX-License-Identifier: GPL-2.0 */
3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
36 SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
37 SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
38 SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
54 * SYS ACT: System-defined exit actions
55 * SYS RSN: System-defined exit reasons
56 * USR : User-defined exit codes and reasons
59 * actions and/or system reasons with a user-defined exit code.
74 /* %SCX_EXIT_* - broad category of the exit reason */
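The exit fragments above describe struct scx_exit_info, which carries the %SCX_EXIT_* kind, an optional user-defined exit code, and human-readable reason/message strings to the unloading scheduler. A minimal sketch (not kernel code) of an ops.exit() callback stashing that information for the user-space loader, assuming the BPF_STRUCT_OPS helper macro from the in-tree tools/sched_ext headers; the global names are illustrative:

/* illustrative globals, read from the BPF skeleton after unload */
s32 last_exit_kind;
s64 last_exit_code;

void BPF_STRUCT_OPS(sketch_exit, struct scx_exit_info *ei)
{
        last_exit_kind = ei->kind;      /* %SCX_EXIT_* category */
        last_exit_code = ei->exit_code; /* user code, e.g. set via scx_bpf_exit() */
}

A BPF-initiated unregistration (%SCX_EXIT_UNREG_BPF) carrying such a user code is normally requested through the scx_bpf_exit() helper macro, which wraps the scx_bpf_exit_bstr() kfunc.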
97 * Keep built-in idle tracking even if ops.update_idle() is implemented.
103 * keeps running the current task even after its slice expires. If this
131 * the default slice on enqueue. If this ops flag is set, they also go
135 * only select the current CPU. Also, p->cpus_ptr will only contain its
136 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
137 * and thus may disagree with cpumask_weight(p->cpus_ptr).
144 * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
158 * If set, enable per-node idle cpumasks. If clear, use a single global
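The SCX_OPS_* fragments above are operation-table flags that the BPF scheduler sets at load time. A sketch of how they are wired up, assuming the SCX_OPS_DEFINE convenience macro from the in-tree tools/sched_ext headers; the callback symbols are placeholders for programs like the other sketches in this listing:

SCX_OPS_DEFINE(sketch_ops,
               .select_cpu      = (void *)sketch_select_cpu,
               .enqueue         = (void *)sketch_enqueue,
               .dispatch        = (void *)sketch_dispatch,
               .update_idle     = (void *)sketch_update_idle,
               /* keep the built-in idle mask usable despite ops.update_idle() */
               .flags           = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST,
               .timeout_ms      = 5000,
               .name            = "sketch");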
197 /* argument container for ops->cgroup_init() */
215 * Argument container for ops->cpu_acquire(). Currently empty, but may be
220 /* argument container for ops->cpu_release() */
241 * struct sched_ext_ops - Operation table for BPF scheduler implementation
258 * saves a small bit of overhead down the line.
302 * on the scheduling logic, this can lead to confusing behaviors - e.g.
322 * When not %NULL, @prev is an SCX task with its slice depleted. If
324 * @prev->scx.flags, it is not enqueued yet and will be enqueued after
335 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
346 * execution state transitions. A task becomes ->runnable() on a CPU,
347 * and then goes through one or more ->running() and ->stopping() pairs
348 * as it runs on the CPU, and eventually becomes ->quiescent() when it's
353 * - waking up (%SCX_ENQ_WAKEUP)
354 * - being moved from another CPU
355 * - being restored after temporarily taken off the queue for an
358 * This and ->enqueue() are related but not coupled. This operation
359 * notifies @p's state transition and may not be followed by ->enqueue()
362 * task may be ->enqueue()'d without being preceded by this operation
363 * e.g. after exhausting its slice.
371 * See ->runnable() for explanation on the task state notifiers.
380 * See ->runnable() for explanation on the task state notifiers. If
381 * !@runnable, ->quiescent() will be invoked after this operation
391 * See ->runnable() for explanation on the task state notifiers.
395 * - sleeping (%SCX_DEQ_SLEEP)
396 * - being moved to another CPU
397 * - being temporarily taken off the queue for an attribute change
400 * This and ->dequeue() are related but not coupled. This operation
401 * notifies @p's state transition and may not be preceded by ->dequeue()
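The notifiers above bracket a task's execution: runnable(), then one or more running()/stopping() pairs, then quiescent(). A common use, modeled loosely on the in-tree scx_simple example, is charging the consumed slice to the task's virtual time from ops.stopping() (a sketch; vtime_now is an illustrative global clock):

static u64 vtime_now;   /* illustrative global vtime clock */

void BPF_STRUCT_OPS(sketch_running, struct task_struct *p)
{
        /* keep the global clock from falling behind running tasks */
        if ((s64)(p->scx.dsq_vtime - vtime_now) > 0)
                vtime_now = p->scx.dsq_vtime;
}

void BPF_STRUCT_OPS(sketch_stopping, struct task_struct *p, bool runnable)
{
        /* charge the slice actually consumed, scaled inversely by weight */
        p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
}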
416 * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
422 * @core_sched_before: Task ordering for core-sched
426 * Used by core-sched to determine the ordering between two tasks. See
427 * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
428 * core-sched.
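ops.core_sched_before() only matters when core scheduling is in use; it answers whether task @a should run before task @b on the SMT siblings of a core. A sketch that orders by dsq_vtime instead of the default core_sched_at timestamp:

bool BPF_STRUCT_OPS(sketch_core_sched_before, struct task_struct *a,
                    struct task_struct *b)
{
        /* lower virtual time runs first; wrap-safe signed comparison */
        return (s64)(a->scx.dsq_vtime - b->scx.dsq_vtime) < 0;
}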
464 * state. By default, implementing this operation disables the built-in
467 * - scx_bpf_select_cpu_dfl()
468 * - scx_bpf_test_and_clear_cpu_idle()
469 * - scx_bpf_pick_idle_cpu()
474 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
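The idle-tracking kfuncs listed above stay available as long as ops.update_idle() is not implemented or %SCX_OPS_KEEP_BUILTIN_IDLE is set. A sketch of ops.select_cpu() that uses the default picker and direct-dispatches when an idle CPU is found, mirroring the in-tree scx_simple example:

s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p, s32 prev_cpu,
                   u64 wake_flags)
{
        bool is_idle = false;
        s32 cpu;

        cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
        if (is_idle)
                /* idle CPU found: skip ops.enqueue() and run @p there directly */
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

        return cpu;
}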
497 * caller should consult @args->reason to determine the cause.
510 * Return 0 for success, -errno for failure. An error return while
517 * @exit_task: Exit a previously-running task from the system
584 * Return 0 for success, -errno for failure. An error return while
609 * Return 0 for success, -errno for failure. An error return aborts the
726 * Must be a non-zero valid BPF object name including only isalnum(),
759 * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
769 * invoked in a ->cpu_release() callback, and the task is again
770 * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
772 * of the ->cpu_acquire() callback.
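When a higher-priority sched class takes a CPU, ops.cpu_release() gives the scheduler a chance to deal with tasks still sitting on that CPU's local DSQ. A minimal sketch pushes them back through ops.enqueue() with the scx_bpf_reenqueue_local() kfunc:

void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
        /* re-run ops.enqueue() for tasks stranded on this CPU's local DSQ */
        scx_bpf_reenqueue_local();
}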
782 * The BPF scheduler is responsible for triggering a follow-up
801 * The generic core-sched layer decided to execute the task even though
823 * current task of the target CPU is an SCX task, its ->scx.slice is
856 * sched_ext_entity->ops_state
861 * NONE -> QUEUEING -> QUEUED -> DISPATCHING
864 * \-------------------------------/
889 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
897 #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
932 { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
970 * Non-NULL values are used for direct dispatch from enqueue path. A valid
979 * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
980 * to avoid live-locking in bypass mode where all tasks are dispatched to
981 * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
1017 char line[SCX_EXIT_MSG_LEN]; member
1034 .cpu = -1,
1064 return jiffies_to_msecs(at - now); in jiffies_delta_msecs()
1066 return -(long)jiffies_to_msecs(now - at); in jiffies_delta_msecs()
1072 return ~((1 << fls(flags)) - 1); in higher_bits()
1084 return (s32)(a - b) < 0; in u32_before()
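u32_before() depends on two's-complement wraparound: the unsigned difference is reinterpreted as signed, so ordering survives counter overflow. A standalone illustration in plain userspace C (not kernel code):

#include <assert.h>
#include <stdint.h>

static int u32_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

int main(void)
{
        /* 0xffffffff - 2 wraps to 0xfffffffd, i.e. -3 as int32_t: still "before" */
        assert(u32_before(0xffffffffu, 2u));
        /* the ordinary, non-wrapping case */
        assert(u32_before(10u, 20u));
        return 0;
}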
1108 WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, in scx_kf_allow()
1109 "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", in scx_kf_allow()
1110 current->scx.kf_mask, mask); in scx_kf_allow()
1111 current->scx.kf_mask |= mask; in scx_kf_allow()
1118 current->scx.kf_mask &= ~mask; in scx_kf_disallow()
1181 * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
1184 * for non-nesting operations due to the way the tasks are tracked.
1193 current->scx.kf_tasks[0] = task; \
1195 current->scx.kf_tasks[0] = NULL; \
1202 current->scx.kf_tasks[0] = task; \
1204 current->scx.kf_tasks[0] = NULL; \
1212 current->scx.kf_tasks[0] = task0; \
1213 current->scx.kf_tasks[1] = task1; \
1215 current->scx.kf_tasks[0] = NULL; \
1216 current->scx.kf_tasks[1] = NULL; \
1223 if (unlikely(!(current->scx.kf_mask & mask))) { in scx_kf_allowed()
1225 mask, current->scx.kf_mask); in scx_kf_allowed()
1237 (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { in scx_kf_allowed()
1243 (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { in scx_kf_allowed()
1258 if (unlikely((p != current->scx.kf_tasks[0] && in scx_kf_allowed_on_arg_tasks()
1259 p != current->scx.kf_tasks[1]))) { in scx_kf_allowed_on_arg_tasks()
1269 return !current->scx.kf_mask; in scx_kf_allowed_if_unlocked()
1273 * nldsq_next_task - Iterate to the next task in a non-local DSQ
1286 lockdep_assert_held(&dsq->lock); in nldsq_next_task()
1289 list_node = &cur->scx.dsq_list.node; in nldsq_next_task()
1291 list_node = &dsq->list; in nldsq_next_task()
1296 list_node = list_node->prev; in nldsq_next_task()
1298 list_node = list_node->next; in nldsq_next_task()
1300 if (list_node == &dsq->list) in nldsq_next_task()
1305 } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); in nldsq_next_task()
1316 * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
1317 * dispatch order. BPF-visible iterator is opaque and larger to allow future
1337 u64 slice; member
1358 * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
1376 ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); in scx_task_iter_start()
1380 iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; in scx_task_iter_start()
1381 list_add(&iter->cursor.tasks_node, &scx_tasks); in scx_task_iter_start()
1382 iter->locked = NULL; in scx_task_iter_start()
1383 iter->cnt = 0; in scx_task_iter_start()
1388 if (iter->locked) { in __scx_task_iter_rq_unlock()
1389 task_rq_unlock(iter->rq, iter->locked, &iter->rf); in __scx_task_iter_rq_unlock()
1390 iter->locked = NULL; in __scx_task_iter_rq_unlock()
1395 * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
1409 * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
1410 * @iter: iterator to re-lock
1412 * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
1413 * doesn't re-lock the rq lock. Must be called before other iterator operations.
1421 * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
1430 list_del_init(&iter->cursor.tasks_node); in scx_task_iter_stop()
1435 * scx_task_iter_next - Next task
1439 * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
1444 struct list_head *cursor = &iter->cursor.tasks_node; in scx_task_iter_next()
1447 if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { in scx_task_iter_next()
1454 if (&pos->tasks_node == &scx_tasks) in scx_task_iter_next()
1456 if (!(pos->flags & SCX_TASK_CURSOR)) { in scx_task_iter_next()
1457 list_move(cursor, &pos->tasks_node); in scx_task_iter_next()
1467 * scx_task_iter_next_locked - Next non-idle task with its rq locked
1470 * Visit the non-idle task with its rq lock held. Allows callers to specify
1483 * while loading the BPF scheduler and vice-versa while in scx_task_iter_next_locked()
1487 * - It's unsafe to use __setscheduler_prio() on an init_task to in scx_task_iter_next_locked()
1491 * - ops.init/exit_task() can easily be confused if called with in scx_task_iter_next_locked()
1498 * - %PF_IDLE may not be set for an init_task whose CPU hasn't in scx_task_iter_next_locked()
1501 * - %PF_IDLE can be set on tasks that are not init_tasks. See in scx_task_iter_next_locked()
1506 if (p->sched_class != &idle_sched_class) in scx_task_iter_next_locked()
1512 iter->rq = task_rq_lock(p, &iter->rf); in scx_task_iter_next_locked()
1513 iter->locked = p; in scx_task_iter_next_locked()
1554 * The total number of tasks enqueued (or pick_task-ed) with a
1555 * default time slice (SCX_SLICE_DFL).
1576 * The event counter is organized by a per-CPU variable to minimize the
1577 * accounting overhead without synchronization. A system-wide view on the
1583 * scx_add_event - Increase an event counter for 'name' by 'cnt'
1595 * __scx_add_event - Increase an event counter for 'name' by 'cnt'
1607 * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
1613 (dst_e)->kind += READ_ONCE((src_e)->kind); \
1617 * scx_dump_event - Dump an event 'kind' in 'events' to 's'
1623 dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \
1650 return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); in scx_rq_bypassing()
1654 * wait_ops_state - Busy-wait the specified ops state to end
1658 * Busy-wait for @p to transition out of @opss. This can only be used when the
1667 } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); in wait_ops_state()
1671 * ops_cpu_valid - Verify a cpu number
1691 * ops_sanitize_err - Sanitize a -errno value
1693 * @err: -errno value to sanitize
1695 * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
1696 * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
1704 if (err < 0 && err >= -MAX_ERRNO) in ops_sanitize_err()
1708 return -EPROTO; in ops_sanitize_err()
1733 * schedule_deferred - Schedule execution of deferred actions on an rq
1750 if (rq->scx.flags & SCX_RQ_IN_WAKEUP) in schedule_deferred()
1757 if (rq->scx.flags & SCX_RQ_IN_BALANCE) { in schedule_deferred()
1758 queue_balance_callback(rq, &rq->scx.deferred_bal_cb, in schedule_deferred()
1765 * IRQ re-enable which may take a bit longer than the scheduler hooks. in schedule_deferred()
1767 * the time to IRQ re-enable shouldn't be long. in schedule_deferred()
1769 irq_work_queue(&rq->scx.deferred_irq_work); in schedule_deferred()
1773 * touch_core_sched - Update timestamp used for core-sched task ordering
1777 * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
1778 * implement global or local-DSQ FIFO ordering for core-sched. Should be called
1779 * when a task becomes runnable and its turn on the CPU ends (e.g. slice
1792 * it may be better to use per-core dispatch sequence instead. in touch_core_sched()
1795 p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); in touch_core_sched()
1800 * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
1804 * If the BPF scheduler implements custom core-sched ordering via
1805 * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
1807 * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
1821 struct task_struct *curr = rq->curr; in update_curr_scx()
1828 if (curr->scx.slice != SCX_SLICE_INF) { in update_curr_scx()
1829 curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); in update_curr_scx()
1830 if (!curr->scx.slice) in update_curr_scx()
1843 return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); in scx_dsq_priq_less()
1848 /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ in dsq_mod_nr()
1849 WRITE_ONCE(dsq->nr, dsq->nr + delta); in dsq_mod_nr()
1855 bool is_local = dsq->id == SCX_DSQ_LOCAL; in dispatch_enqueue()
1857 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); in dispatch_enqueue()
1858 WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || in dispatch_enqueue()
1859 !RB_EMPTY_NODE(&p->scx.dsq_priq)); in dispatch_enqueue()
1862 raw_spin_lock(&dsq->lock); in dispatch_enqueue()
1863 if (unlikely(dsq->id == SCX_DSQ_INVALID)) { in dispatch_enqueue()
1866 raw_spin_unlock(&dsq->lock); in dispatch_enqueue()
1868 raw_spin_lock(&dsq->lock); in dispatch_enqueue()
1872 if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && in dispatch_enqueue()
1877 * starving vtime-dispatched tasks by FIFO-dispatched tasks, we in dispatch_enqueue()
1881 scx_ops_error("cannot use vtime ordering for built-in DSQs"); in dispatch_enqueue()
1893 if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && in dispatch_enqueue()
1895 scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", in dispatch_enqueue()
1896 dsq->id); in dispatch_enqueue()
1898 p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; in dispatch_enqueue()
1899 rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); in dispatch_enqueue()
1903 * that @dsq->list is vtime ordered. in dispatch_enqueue()
1905 rbp = rb_prev(&p->scx.dsq_priq); in dispatch_enqueue()
1910 list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); in dispatch_enqueue()
1912 list_add(&p->scx.dsq_list.node, &dsq->list); in dispatch_enqueue()
1916 if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) in dispatch_enqueue()
1917 scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", in dispatch_enqueue()
1918 dsq->id); in dispatch_enqueue()
1921 list_add(&p->scx.dsq_list.node, &dsq->list); in dispatch_enqueue()
1923 list_add_tail(&p->scx.dsq_list.node, &dsq->list); in dispatch_enqueue()
1927 dsq->seq++; in dispatch_enqueue()
1928 p->scx.dsq_seq = dsq->seq; in dispatch_enqueue()
1931 p->scx.dsq = dsq; in dispatch_enqueue()
1939 p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; in dispatch_enqueue()
1940 p->scx.ddsp_enq_flags = 0; in dispatch_enqueue()
1947 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); in dispatch_enqueue()
1953 if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && in dispatch_enqueue()
1954 rq->curr->sched_class == &ext_sched_class) { in dispatch_enqueue()
1955 rq->curr->scx.slice = 0; in dispatch_enqueue()
1960 rq->curr->sched_class)) in dispatch_enqueue()
1963 raw_spin_unlock(&dsq->lock); in dispatch_enqueue()
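The %SCX_ENQ_PREEMPT branch above zeroes the slice of the SCX task currently running on the target CPU and reschedules it. From the BPF side the flag is simply passed when inserting into a local DSQ; a sketch with an illustrative urgency policy:

void BPF_STRUCT_OPS(preempt_enqueue, struct task_struct *p, u64 enq_flags)
{
        /* illustrative policy: boost above-default-weight wakeups */
        bool urgent = (enq_flags & SCX_ENQ_WAKEUP) && p->scx.weight > 100;
        s32 cpu = scx_bpf_task_cpu(p);

        if (urgent)
                /* preempt whatever SCX task @cpu is running and run @p next */
                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL,
                                   enq_flags | SCX_ENQ_PREEMPT);
        else
                scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}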
1970 WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); in task_unlink_from_dsq()
1972 if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { in task_unlink_from_dsq()
1973 rb_erase(&p->scx.dsq_priq, &dsq->priq); in task_unlink_from_dsq()
1974 RB_CLEAR_NODE(&p->scx.dsq_priq); in task_unlink_from_dsq()
1975 p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; in task_unlink_from_dsq()
1978 list_del_init(&p->scx.dsq_list.node); in task_unlink_from_dsq()
1979 dsq_mod_nr(dsq, -1); in task_unlink_from_dsq()
1984 struct scx_dispatch_q *dsq = p->scx.dsq; in dispatch_dequeue()
1985 bool is_local = dsq == &rq->scx.local_dsq; in dispatch_dequeue()
1989 * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. in dispatch_dequeue()
1992 if (unlikely(!list_empty(&p->scx.dsq_list.node))) in dispatch_dequeue()
1993 list_del_init(&p->scx.dsq_list.node); in dispatch_dequeue()
1998 * @p->scx.holding_cpu may be set under the protection of in dispatch_dequeue()
2001 if (p->scx.holding_cpu >= 0) in dispatch_dequeue()
2002 p->scx.holding_cpu = -1; in dispatch_dequeue()
2008 raw_spin_lock(&dsq->lock); in dispatch_dequeue()
2011 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't in dispatch_dequeue()
2014 if (p->scx.holding_cpu < 0) { in dispatch_dequeue()
2020 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the in dispatch_dequeue()
2024 WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); in dispatch_dequeue()
2025 p->scx.holding_cpu = -1; in dispatch_dequeue()
2027 p->scx.dsq = NULL; in dispatch_dequeue()
2030 raw_spin_unlock(&dsq->lock); in dispatch_dequeue()
2039 return &rq->scx.local_dsq; in find_dsq_for_dispatch()
2047 return &cpu_rq(cpu)->scx.local_dsq; in find_dsq_for_dispatch()
2056 scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", in find_dsq_for_dispatch()
2057 dsq_id, p->comm, p->pid); in find_dsq_for_dispatch()
2070 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value in mark_direct_dispatch()
2073 __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); in mark_direct_dispatch()
2078 scx_ops_error("%s[%d] already direct-dispatched", in mark_direct_dispatch()
2079 p->comm, p->pid); in mark_direct_dispatch()
2081 scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", in mark_direct_dispatch()
2082 ddsp_task->comm, ddsp_task->pid, in mark_direct_dispatch()
2083 p->comm, p->pid); in mark_direct_dispatch()
2087 WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); in mark_direct_dispatch()
2088 WARN_ON_ONCE(p->scx.ddsp_enq_flags); in mark_direct_dispatch()
2090 p->scx.ddsp_dsq_id = dsq_id; in mark_direct_dispatch()
2091 p->scx.ddsp_enq_flags = enq_flags; in mark_direct_dispatch()
2098 find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); in direct_dispatch()
2102 p->scx.ddsp_enq_flags |= enq_flags; in direct_dispatch()
2110 if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { in direct_dispatch()
2113 opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; in direct_dispatch()
2123 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); in direct_dispatch()
2127 p->comm, p->pid, opss); in direct_dispatch()
2128 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); in direct_dispatch()
2132 WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); in direct_dispatch()
2133 list_add_tail(&p->scx.dsq_list.node, in direct_dispatch()
2134 &rq->scx.ddsp_deferred_locals); in direct_dispatch()
2139 dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); in direct_dispatch()
2151 return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); in scx_rq_online()
2160 WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); in do_enqueue_task()
2179 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) in do_enqueue_task()
2184 unlikely(p->flags & PF_EXITING)) { in do_enqueue_task()
2200 qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; in do_enqueue_task()
2202 WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); in do_enqueue_task()
2203 atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); in do_enqueue_task()
2212 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) in do_enqueue_task()
2219 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); in do_enqueue_task()
2228 * For task-ordering, slice refill must be treated as implying the end in do_enqueue_task()
2229 * of the current slice. Otherwise, the longer @p stays on the CPU, the in do_enqueue_task()
2233 p->scx.slice = SCX_SLICE_DFL; in do_enqueue_task()
2236 dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); in do_enqueue_task()
2241 p->scx.slice = SCX_SLICE_DFL; in do_enqueue_task()
2248 return !list_empty(&p->scx.runnable_node); in task_runnable()
2255 if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { in set_task_runnable()
2256 p->scx.runnable_at = jiffies; in set_task_runnable()
2257 p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; in set_task_runnable()
2264 list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); in set_task_runnable()
2269 list_del_init(&p->scx.runnable_node); in clr_task_runnable()
2271 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; in clr_task_runnable()
2276 int sticky_cpu = p->scx.sticky_cpu; in enqueue_task_scx()
2279 rq->scx.flags |= SCX_RQ_IN_WAKEUP; in enqueue_task_scx()
2281 enq_flags |= rq->scx.extra_enq_flags; in enqueue_task_scx()
2284 p->scx.sticky_cpu = -1; in enqueue_task_scx()
2290 * direct-dispatch into the local DSQ by setting the sticky_cpu. in enqueue_task_scx()
2295 if (p->scx.flags & SCX_TASK_QUEUED) { in enqueue_task_scx()
2301 p->scx.flags |= SCX_TASK_QUEUED; in enqueue_task_scx()
2302 rq->scx.nr_running++; in enqueue_task_scx()
2313 rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; in enqueue_task_scx()
2316 unlikely(cpu_of(rq) != p->scx.selected_cpu)) in enqueue_task_scx()
2328 opss = atomic_long_read_acquire(&p->scx.ops_state); in ops_dequeue()
2343 if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, in ops_dequeue()
2362 BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); in ops_dequeue()
2369 if (!(p->scx.flags & SCX_TASK_QUEUED)) { in dequeue_task_scx()
2378 * and then stops running. As we want running <-> stopping transitions in dequeue_task_scx()
2379 * to be contained within runnable <-> quiescent transitions, trigger in dequeue_task_scx()
2380 * ->stopping() early here instead of in put_prev_task_scx(). in dequeue_task_scx()
2382 * @p may go through multiple stopping <-> running transitions between in dequeue_task_scx()
2397 p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; in dequeue_task_scx()
2399 p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; in dequeue_task_scx()
2401 p->scx.flags &= ~SCX_TASK_QUEUED; in dequeue_task_scx()
2402 rq->scx.nr_running--; in dequeue_task_scx()
2411 struct task_struct *p = rq->curr; in yield_task_scx()
2416 p->scx.slice = 0; in yield_task_scx()
2421 struct task_struct *from = rq->curr; in yield_to_task_scx()
2433 struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; in move_local_task_to_local_dsq()
2436 lockdep_assert_held(&src_dsq->lock); in move_local_task_to_local_dsq()
2439 WARN_ON_ONCE(p->scx.holding_cpu >= 0); in move_local_task_to_local_dsq()
2442 list_add(&p->scx.dsq_list.node, &dst_dsq->list); in move_local_task_to_local_dsq()
2444 list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); in move_local_task_to_local_dsq()
2447 p->scx.dsq = dst_dsq; in move_local_task_to_local_dsq()
2452 * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
2468 p->scx.sticky_cpu = cpu_of(dst_rq); in move_remote_task_to_local_dsq()
2474 * We want to pass scx-specific enq_flags but activate_task() will in move_remote_task_to_local_dsq()
2476 * @rq->scx.extra_enq_flags instead. in move_remote_task_to_local_dsq()
2478 WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); in move_remote_task_to_local_dsq()
2479 WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); in move_remote_task_to_local_dsq()
2480 dst_rq->scx.extra_enq_flags = enq_flags; in move_remote_task_to_local_dsq()
2482 dst_rq->scx.extra_enq_flags = 0; in move_remote_task_to_local_dsq()
2489 * - is_cpu_allowed() asks "Can this task run on this CPU?" while
2499 * - The BPF scheduler is bypassed while the rq is offline and we can always say
2512 * If @p has migration disabled, @p->cpus_ptr is updated to contain only in task_can_run_on_remote_rq()
2514 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is in task_can_run_on_remote_rq()
2526 p->comm, p->pid, task_cpu(p), cpu); in task_can_run_on_remote_rq()
2539 cpu, p->comm, p->pid); in task_can_run_on_remote_rq()
2553 * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
2561 * non-local DSQ, it's better to use the same mechanism to protect against
2562 * dequeues and maintain the invariant that @p->scx.dsq can only change while
2570 * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
2572 * would be cleared to -1. While other cpus may have updated it to different
2587 lockdep_assert_held(&dsq->lock); in unlink_dsq_and_lock_src_rq()
2589 WARN_ON_ONCE(p->scx.holding_cpu >= 0); in unlink_dsq_and_lock_src_rq()
2591 p->scx.holding_cpu = cpu; in unlink_dsq_and_lock_src_rq()
2593 raw_spin_unlock(&dsq->lock); in unlink_dsq_and_lock_src_rq()
2597 return likely(p->scx.holding_cpu == cpu) && in unlink_dsq_and_lock_src_rq()
2622 * move_task_between_dsqs() - Move a task from one DSQ to another
2642 BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); in move_task_between_dsqs()
2643 lockdep_assert_held(&src_dsq->lock); in move_task_between_dsqs()
2646 if (dst_dsq->id == SCX_DSQ_LOCAL) { in move_task_between_dsqs()
2654 /* no need to migrate if destination is a non-local DSQ */ in move_task_between_dsqs()
2662 if (dst_dsq->id == SCX_DSQ_LOCAL) { in move_task_between_dsqs()
2663 /* @p is going from a non-local DSQ to a local DSQ */ in move_task_between_dsqs()
2668 raw_spin_unlock(&src_dsq->lock); in move_task_between_dsqs()
2670 raw_spin_unlock(&src_dsq->lock); in move_task_between_dsqs()
2676 * @p is going from a non-local DSQ to a non-local DSQ. As in move_task_between_dsqs()
2680 p->scx.dsq = NULL; in move_task_between_dsqs()
2681 raw_spin_unlock(&src_dsq->lock); in move_task_between_dsqs()
2690 * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
2710 while (atomic_read(&scx_ops_breather_depth) && --cnt) in scx_ops_breather()
2725 * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can in consume_dispatch_q()
2726 * live-lock the machine into soft lockups. Give a breather. in consume_dispatch_q()
2733 * @dsq->list without locking and skip if it seems empty. in consume_dispatch_q()
2735 if (list_empty(&dsq->list)) in consume_dispatch_q()
2738 raw_spin_lock(&dsq->lock); in consume_dispatch_q()
2746 raw_spin_unlock(&dsq->lock); in consume_dispatch_q()
2757 raw_spin_unlock(&dsq->lock); in consume_dispatch_q()
2769 * dispatch_to_local_dsq - Dispatch a task to a local dsq
2817 * we're moving from a DSQ and use the same mechanism - mark the task in dispatch_to_local_dsq()
2821 p->scx.holding_cpu = raw_smp_processor_id(); in dispatch_to_local_dsq()
2824 atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); in dispatch_to_local_dsq()
2834 if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && in dispatch_to_local_dsq()
2842 p->scx.holding_cpu = -1; in dispatch_to_local_dsq()
2843 dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); in dispatch_to_local_dsq()
2852 if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) in dispatch_to_local_dsq()
2867 * finish_dispatch - Asynchronously finish dispatching a task
2898 opss = atomic_long_read(&p->scx.ops_state); in finish_dispatch()
2908 * dispatch/dequeue and re-enqueue cycle between in finish_dispatch()
2916 * it - the BPF scheduler is allowed to dispatch tasks in finish_dispatch()
2921 if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, in finish_dispatch()
2936 BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); in finish_dispatch()
2940 if (dsq->id == SCX_DSQ_LOCAL) in finish_dispatch()
2951 for (u = 0; u < dspc->cursor; u++) { in flush_dispatch_buf()
2952 struct scx_dsp_buf_ent *ent = &dspc->buf[u]; in flush_dispatch_buf()
2954 finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id, in flush_dispatch_buf()
2955 ent->enq_flags); in flush_dispatch_buf()
2958 dspc->nr_tasks += dspc->cursor; in flush_dispatch_buf()
2959 dspc->cursor = 0; in flush_dispatch_buf()
2965 bool prev_on_scx = prev->sched_class == &ext_sched_class; in balance_one()
2966 bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; in balance_one()
2970 rq->scx.flags |= SCX_RQ_IN_BALANCE; in balance_one()
2971 rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); in balance_one()
2974 unlikely(rq->scx.cpu_released)) { in balance_one()
2978 * core. This callback complements ->cpu_release(), which is in balance_one()
2983 rq->scx.cpu_released = false; in balance_one()
2990 * If @prev is runnable & has slice left, it has priority and in balance_one()
2994 * implement ->cpu_release(). in balance_one()
2999 if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { in balance_one()
3000 rq->scx.flags |= SCX_RQ_BAL_KEEP; in balance_one()
3006 if (rq->scx.local_dsq.nr) in balance_one()
3015 dspc->rq = rq; in balance_one()
3025 dspc->nr_tasks = 0; in balance_one()
3032 if (prev_on_rq && prev->scx.slice) { in balance_one()
3033 rq->scx.flags |= SCX_RQ_BAL_KEEP; in balance_one()
3036 if (rq->scx.local_dsq.nr) in balance_one()
3050 if (unlikely(!--nr_loops)) { in balance_one()
3054 } while (dspc->nr_tasks); in balance_one()
3063 rq->scx.flags |= SCX_RQ_BAL_KEEP; in balance_one()
3067 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; in balance_one()
3071 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; in balance_one()
3086 * When core-sched is enabled, this ops.balance() call will be followed in balance_scx()
3096 struct task_struct *sprev = srq->curr; in balance_scx()
3122 while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, in process_ddsp_deferred_locals()
3126 list_del_init(&p->scx.dsq_list.node); in process_ddsp_deferred_locals()
3128 dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); in process_ddsp_deferred_locals()
3129 if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) in process_ddsp_deferred_locals()
3130 dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); in process_ddsp_deferred_locals()
3136 if (p->scx.flags & SCX_TASK_QUEUED) { in set_next_task_scx()
3138 * Core-sched might decide to execute @p before it is in set_next_task_scx()
3145 p->se.exec_start = rq_clock_task(rq); in set_next_task_scx()
3148 if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) in set_next_task_scx()
3155 * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). in set_next_task_scx()
3157 if ((p->scx.slice == SCX_SLICE_INF) != in set_next_task_scx()
3158 (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { in set_next_task_scx()
3159 if (p->scx.slice == SCX_SLICE_INF) in set_next_task_scx()
3160 rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; in set_next_task_scx()
3162 rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; in set_next_task_scx()
3170 * tick-stopped CPUs. in set_next_task_scx()
3192 const struct sched_class *next_class = next->sched_class; in switch_class()
3200 smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); in switch_class()
3216 * sched_class, so invoke the ->cpu_release() callback if we have not in switch_class()
3220 * ->cpu_release() complements ->cpu_acquire(), which is emitted the in switch_class()
3223 if (!rq->scx.cpu_released) { in switch_class()
3232 rq->scx.cpu_released = true; in switch_class()
3242 if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) in put_prev_task_scx()
3245 if (p->scx.flags & SCX_TASK_QUEUED) { in put_prev_task_scx()
3249 * If @p has slice left and is being put, @p is getting in put_prev_task_scx()
3250 * preempted by a higher priority scheduler class or core-sched in put_prev_task_scx()
3254 if (p->scx.slice && !scx_rq_bypassing(rq)) { in put_prev_task_scx()
3255 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); in put_prev_task_scx()
3263 * which should trigger an explicit follow-up scheduling event. in put_prev_task_scx()
3265 if (sched_class_above(&ext_sched_class, next->sched_class)) { in put_prev_task_scx()
3267 do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); in put_prev_task_scx()
3269 do_enqueue_task(rq, p, 0, -1); in put_prev_task_scx()
3274 if (next && next->sched_class != &ext_sched_class) in put_prev_task_scx()
3280 return list_first_entry_or_null(&rq->scx.local_dsq.list, in first_local_task()
3286 struct task_struct *prev = rq->curr; in pick_task_scx()
3288 bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; in pick_task_scx()
3306 if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { in pick_task_scx()
3307 if (prev->scx.flags & SCX_TASK_QUEUED) { in pick_task_scx()
3314 prev->sched_class != &ext_sched_class)) { in pick_task_scx()
3324 * If balance_scx() is telling us to keep running @prev, replenish slice in pick_task_scx()
3330 if (!p->scx.slice) { in pick_task_scx()
3331 p->scx.slice = SCX_SLICE_DFL; in pick_task_scx()
3342 if (unlikely(!p->scx.slice)) { in pick_task_scx()
3344 printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", in pick_task_scx()
3345 p->comm, p->pid, __func__); in pick_task_scx()
3348 p->scx.slice = SCX_SLICE_DFL; in pick_task_scx()
3358 * scx_prio_less - Task ordering for core-sched
3363 * Core-sched is implemented as an additional scheduling layer on top of the
3365 * SCX, core-sched calls this function to interrogate the task ordering.
3367 * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
3369 * priority the task - the global FIFO ordering matching the default scheduling
3372 * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
3388 return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); in scx_prio_less()
3422 p->scx.selected_cpu = cpu; in select_task_rq_scx()
3433 p->scx.slice = SCX_SLICE_DFL; in select_task_rq_scx()
3434 p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; in select_task_rq_scx()
3439 p->scx.selected_cpu = cpu; in select_task_rq_scx()
3458 * The effective cpumask is stored in @p->cpus_ptr which may temporarily in set_cpus_allowed_scx()
3459 * differ from the configured one in @p->cpus_mask. Always tell the bpf in set_cpus_allowed_scx()
3462 * Fine-grained memory write control is enforced by BPF making the const in set_cpus_allowed_scx()
3467 p, (struct cpumask *)p->cpus_ptr); in set_cpus_allowed_scx()
3501 rq->scx.flags |= SCX_RQ_ONLINE; in rq_online_scx()
3506 rq->scx.flags &= ~SCX_RQ_ONLINE; in rq_offline_scx()
3518 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { in check_rq_for_timeouts()
3519 unsigned long last_runnable = p->scx.runnable_at; in check_rq_for_timeouts()
3523 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); in check_rq_for_timeouts()
3527 p->comm, p->pid, in check_rq_for_timeouts()
3564 u32 dur_ms = jiffies_to_msecs(jiffies - last_check); in scx_tick()
3579 * While disabling, always resched and refresh core-sched timestamp as in task_tick_scx()
3580 * we can't trust the slice management or ops.core_sched_before(). in task_tick_scx()
3583 curr->scx.slice = 0; in task_tick_scx()
3589 if (!curr->scx.slice) in task_tick_scx()
3598 * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the in tg_cgrp()
3601 if (tg && tg->css.cgroup) in tg_cgrp()
3602 return tg->css.cgroup; in tg_cgrp()
3617 return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; in scx_get_task_state()
3644 WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", in scx_set_task_state()
3645 prev_state, state, p->comm, p->pid); in scx_set_task_state()
3647 p->scx.flags &= ~SCX_TASK_STATE_MASK; in scx_set_task_state()
3648 p->scx.flags |= state << SCX_TASK_STATE_SHIFT; in scx_set_task_state()
3655 p->scx.disallow = false; in scx_ops_init_task()
3672 if (p->scx.disallow) { in scx_ops_init_task()
3680 * We're in the load path and @p->policy will be applied in scx_ops_init_task()
3681 * right after. Reverting @p->policy here and rejecting in scx_ops_init_task()
3683 * guarantees that if ops.init_task() sets @p->disallow, in scx_ops_init_task()
3686 if (p->policy == SCHED_EXT) { in scx_ops_init_task()
3687 p->policy = SCHED_NORMAL; in scx_ops_init_task()
3692 } else if (p->policy == SCHED_EXT) { in scx_ops_init_task()
3693 scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", in scx_ops_init_task()
3694 p->comm, p->pid); in scx_ops_init_task()
3698 p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; in scx_ops_init_task()
3716 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; in scx_ops_enable_task()
3718 p->scx.weight = sched_weight_to_cgroup(weight); in scx_ops_enable_task()
3725 SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight); in scx_ops_enable_task()
3772 INIT_LIST_HEAD(&scx->dsq_list.node); in init_scx_entity()
3773 RB_CLEAR_NODE(&scx->dsq_priq); in init_scx_entity()
3774 scx->sticky_cpu = -1; in init_scx_entity()
3775 scx->holding_cpu = -1; in init_scx_entity()
3776 INIT_LIST_HEAD(&scx->runnable_node); in init_scx_entity()
3777 scx->runnable_at = jiffies; in init_scx_entity()
3778 scx->ddsp_dsq_id = SCX_DSQ_INVALID; in init_scx_entity()
3779 scx->slice = SCX_SLICE_DFL; in init_scx_entity()
3813 if (p->sched_class == &ext_sched_class) { in scx_post_fork()
3824 list_add_tail(&p->scx.tasks_node, &scx_tasks); in scx_post_fork()
3850 list_del_init(&p->scx.tasks_node); in sched_ext_free()
3854 * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> in sched_ext_free()
3872 p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); in reweight_task_scx()
3874 SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight); in reweight_task_scx()
3887 * different scheduler class. Keep the BPF scheduler up-to-date. in switching_to_scx()
3891 p, (struct cpumask *)p->cpus_ptr); in switching_to_scx()
3907 if (scx_enabled() && READ_ONCE(p->scx.disallow) && in scx_check_setscheduler()
3908 p->policy != policy && policy == SCHED_EXT) in scx_check_setscheduler()
3909 return -EACCES; in scx_check_setscheduler()
3917 struct task_struct *p = rq->curr; in scx_can_stop_tick()
3922 if (p->sched_class != &ext_sched_class) in scx_can_stop_tick()
3930 return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; in scx_can_stop_tick()
3943 WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); in scx_tg_online()
3950 { .weight = tg->scx_weight }; in scx_tg_online()
3953 tg->css.cgroup, &args); in scx_tg_online()
3958 tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; in scx_tg_online()
3960 tg->scx_flags |= SCX_TG_ONLINE; in scx_tg_online()
3969 WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); in scx_tg_offline()
3973 if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) in scx_tg_offline()
3974 SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); in scx_tg_offline()
3975 tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); in scx_tg_offline()
3996 WARN_ON_ONCE(p->scx.cgrp_moving_from); in scx_cgroup_can_attach()
4001 * always match one-to-one. in scx_cgroup_can_attach()
4008 p, from, css->cgroup); in scx_cgroup_can_attach()
4013 p->scx.cgrp_moving_from = from; in scx_cgroup_can_attach()
4020 if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) in scx_cgroup_can_attach()
4022 p, p->scx.cgrp_moving_from, css->cgroup); in scx_cgroup_can_attach()
4023 p->scx.cgrp_moving_from = NULL; in scx_cgroup_can_attach()
4039 if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) in scx_cgroup_move_task()
4041 p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); in scx_cgroup_move_task()
4042 p->scx.cgrp_moving_from = NULL; in scx_cgroup_move_task()
4059 if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) in scx_cgroup_cancel_attach()
4061 p, p->scx.cgrp_moving_from, css->cgroup); in scx_cgroup_cancel_attach()
4062 p->scx.cgrp_moving_from = NULL; in scx_cgroup_cancel_attach()
4072 if (scx_cgroup_enabled && tg->scx_weight != weight) { in scx_group_set_weight()
4076 tg->scx_weight = weight; in scx_group_set_weight()
4084 /* TODO: Implement ops->cgroup_set_idle() */ in scx_group_set_idle()
4107 * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
4109 * the victim task's slice to 0 and triggering reschedule on the target CPU.
4111 * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
4113 * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
4158 raw_spin_lock_init(&dsq->lock); in init_dsq()
4159 INIT_LIST_HEAD(&dsq->list); in init_dsq()
4160 dsq->id = dsq_id; in init_dsq()
4169 return ERR_PTR(-EINVAL); in create_dsq()
4173 return ERR_PTR(-ENOMEM); in create_dsq()
4177 ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node, in create_dsq()
4208 raw_spin_lock_irqsave(&dsq->lock, flags); in destroy_dsq()
4210 if (dsq->nr) { in destroy_dsq()
4211 scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", in destroy_dsq()
4212 dsq->id, dsq->nr); in destroy_dsq()
4216 if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) in destroy_dsq()
4220 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from in destroy_dsq()
4225 dsq->id = SCX_DSQ_INVALID; in destroy_dsq()
4226 llist_add(&dsq->free_node, &dsqs_to_free); in destroy_dsq()
4230 raw_spin_unlock_irqrestore(&dsq->lock, flags); in destroy_dsq()
4252 if (!(tg->scx_flags & SCX_TG_INITED)) in scx_cgroup_exit()
4254 tg->scx_flags &= ~SCX_TG_INITED; in scx_cgroup_exit()
4263 SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup); in scx_cgroup_exit()
4285 struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; in scx_cgroup_init()
4287 if ((tg->scx_flags & in scx_cgroup_init()
4292 tg->scx_flags |= SCX_TG_INITED; in scx_cgroup_init()
4301 css->cgroup, &args); in scx_cgroup_init()
4307 tg->scx_flags |= SCX_TG_INITED; in scx_cgroup_init()
4398 sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \
4458 * scx_softlockup - sched_ext softlockup handler
4461 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
4462 * live-lock the system by making many CPUs target the same DSQ to the point
4463 * where soft-lockup detection triggers. This function is called from
4464 * soft-lockup watchdog when the triggering point is close and tries to unjam
4481 printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", in scx_softlockup()
4491 scx_ops_error("soft lockup - CPU#%d stuck for %us", in scx_softlockup()
4502 * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
4514 * - ops.select_cpu() is ignored and the default select_cpu() is used.
4516 * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
4519 * - ops.dispatch() is ignored.
4521 * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
4525 * - pick_next_task() suppresses zero slice warning.
4527 * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
4530 * - scx_prio_less() reverts to the default core_sched_at order.
4549 scx_ops_bypass_depth--; in scx_ops_bypass()
4554 ktime_get_ns() - bypass_timestamp); in scx_ops_bypass()
4561 * queued tasks are re-queued according to the new scx_rq_bypassing() in scx_ops_bypass()
4575 WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); in scx_ops_bypass()
4576 rq->scx.flags |= SCX_RQ_BYPASSING; in scx_ops_bypass()
4578 WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); in scx_ops_bypass()
4579 rq->scx.flags &= ~SCX_RQ_BYPASSING; in scx_ops_bypass()
4599 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, in scx_ops_bypass()
4623 kvfree(ei->dump); in free_exit_info()
4624 kfree(ei->msg); in free_exit_info()
4625 kfree(ei->bt); in free_exit_info()
4637 ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); in alloc_exit_info()
4638 ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); in alloc_exit_info()
4639 ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); in alloc_exit_info()
4641 if (!ei->bt || !ei->msg || !ei->dump) { in alloc_exit_info()
4659 return "disabled by sysrq-S"; in scx_exit_reason()
4684 * disable was scheduled - don't kill the new ops. DONE in scx_ops_disable_workfn()
4692 ei->kind = kind; in scx_ops_disable_workfn()
4693 ei->reason = scx_exit_reason(ei->kind); in scx_ops_disable_workfn()
4704 scx_exit_info->msg); in scx_ops_disable_workfn()
4740 const struct sched_class *old_class = p->sched_class; in scx_ops_disable_workfn()
4742 __setscheduler_class(p->policy, p->prio); in scx_ops_disable_workfn()
4745 if (old_class != new_class && p->se.sched_delayed) in scx_ops_disable_workfn()
4750 p->sched_class = new_class; in scx_ops_disable_workfn()
4755 check_class_changed(task_rq(p), p, old_class, p->prio); in scx_ops_disable_workfn()
4770 /* no task is on scx, turn off all the switches and flush in-progress calls */ in scx_ops_disable_workfn()
4782 if (ei->kind >= SCX_EXIT_ERROR) { in scx_ops_disable_workfn()
4784 scx_ops.name, ei->reason); in scx_ops_disable_workfn()
4786 if (ei->msg[0] != '\0') in scx_ops_disable_workfn()
4787 pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); in scx_ops_disable_workfn()
4789 stack_trace_print(ei->bt, ei->bt_len, 2); in scx_ops_disable_workfn()
4793 scx_ops.name, ei->reason); in scx_ops_disable_workfn()
4818 destroy_dsq(dsq->id); in scx_ops_disable_workfn()
4821 } while (dsq == ERR_PTR(-EAGAIN)); in scx_ops_disable_workfn()
4870 if (s->size) in dump_newline()
4891 if (s->size) { in dump_line()
4915 dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ in ops_dump_init()
4916 dd->first = true; in ops_dump_init()
4917 dd->cursor = 0; in ops_dump_init()
4918 dd->s = s; in ops_dump_init()
4919 dd->prefix = prefix; in ops_dump_init()
4925 char *line = dd->buf.line; in ops_dump_flush() local
4927 if (!dd->cursor) in ops_dump_flush()
4931 * There's something to flush and this is the first line. Insert a blank in ops_dump_flush()
4932 * line to distinguish ops dump. in ops_dump_flush()
4934 if (dd->first) { in ops_dump_flush()
4935 dump_newline(dd->s); in ops_dump_flush()
4936 dd->first = false; in ops_dump_flush()
4940 * There may be multiple lines in $line. Scan and emit each line in ops_dump_flush()
4944 char *end = line; in ops_dump_flush()
4951 * If $line overflowed, it may not have newline at the end. in ops_dump_flush()
4956 dump_line(dd->s, "%s%s", dd->prefix, line); in ops_dump_flush()
4960 /* move to the next line */ in ops_dump_flush()
4964 line = end; in ops_dump_flush()
4967 dd->cursor = 0; in ops_dump_flush()
4973 scx_dump_data.cpu = -1; in ops_dump_exit()
4981 unsigned long ops_state = atomic_long_read(&p->scx.ops_state); in scx_dump_task()
4984 if (p->scx.dsq) in scx_dump_task()
4986 (unsigned long long)p->scx.dsq->id); in scx_dump_task()
4990 marker, task_state_to_char(p), p->comm, p->pid, in scx_dump_task()
4991 jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); in scx_dump_task()
4993 scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, in scx_dump_task()
4994 p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, in scx_dump_task()
4997 p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); in scx_dump_task()
4998 dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", in scx_dump_task()
4999 p->scx.dsq_vtime, p->scx.slice, p->scx.weight); in scx_dump_task()
5000 dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); in scx_dump_task()
5022 .kind = ei->kind, in scx_dump_state()
5023 .exit_code = ei->exit_code, in scx_dump_state()
5024 .reason = ei->reason, in scx_dump_state()
5036 seq_buf_init(&s, ei->dump, dump_len); in scx_dump_state()
5038 if (ei->kind == SCX_EXIT_NONE) { in scx_dump_state()
5039 dump_line(&s, "Debug dump triggered by %s", ei->reason); in scx_dump_state()
5042 current->comm, current->pid, ei->kind); in scx_dump_state()
5043 dump_line(&s, " %s (%s)", ei->reason, ei->msg); in scx_dump_state()
5046 dump_stack_trace(&s, " ", ei->bt, ei->bt_len); in scx_dump_state()
5057 dump_line(&s, "----------"); in scx_dump_state()
5069 idle = list_empty(&rq->scx.runnable_list) && in scx_dump_state()
5070 rq->curr->sched_class == &idle_sched_class; in scx_dump_state()
5085 dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", in scx_dump_state()
5086 cpu, rq->scx.nr_running, rq->scx.flags, in scx_dump_state()
5087 rq->scx.cpu_released, rq->scx.ops_qseq, in scx_dump_state()
5088 rq->scx.pnt_seq); in scx_dump_state()
5090 rq->curr->comm, rq->curr->pid, in scx_dump_state()
5091 rq->curr->sched_class); in scx_dump_state()
5092 if (!cpumask_empty(rq->scx.cpus_to_kick)) in scx_dump_state()
5094 cpumask_pr_args(rq->scx.cpus_to_kick)); in scx_dump_state()
5095 if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) in scx_dump_state()
5097 cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); in scx_dump_state()
5098 if (!cpumask_empty(rq->scx.cpus_to_preempt)) in scx_dump_state()
5100 cpumask_pr_args(rq->scx.cpus_to_preempt)); in scx_dump_state()
5101 if (!cpumask_empty(rq->scx.cpus_to_wait)) in scx_dump_state()
5103 cpumask_pr_args(rq->scx.cpus_to_wait)); in scx_dump_state()
5129 if (rq->curr->sched_class == &ext_sched_class) in scx_dump_state()
5130 scx_dump_task(&s, &dctx, rq->curr, '*'); in scx_dump_state()
5132 list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) in scx_dump_state()
5140 dump_line(&s, "--------------"); in scx_dump_state()
5154 memcpy(ei->dump + dump_len - sizeof(trunc_marker), in scx_dump_state()
5164 if (ei->kind >= SCX_EXIT_ERROR) in scx_ops_error_irq_workfn()
5183 ei->exit_code = exit_code; in scx_ops_exit_kind()
5186 ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); in scx_ops_exit_kind()
5189 vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); in scx_ops_exit_kind()
5193 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again in scx_ops_exit_kind()
5196 ei->kind = kind; in scx_ops_exit_kind()
5197 ei->reason = scx_exit_reason(ei->kind); in scx_ops_exit_kind()
5208 sched_set_fifo(helper->task); in scx_create_rt_helper()
5221 if (ops->hotplug_seq) { in check_hotplug_seq()
5223 if (ops->hotplug_seq != global_hotplug_seq) { in check_hotplug_seq()
5226 ops->hotplug_seq, global_hotplug_seq); in check_hotplug_seq()
5237 if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { in validate_ops()
5239 return -EINVAL; in validate_ops()
5243 * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle in validate_ops()
5246 if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && in validate_ops()
5247 (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { in validate_ops()
5249 return -EINVAL; in validate_ops()
5252 if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) in validate_ops()
5268 return -EINVAL; in scx_ops_enable()
5286 ret = -ENOMEM; in scx_ops_enable()
5296 ret = -ENOMEM; in scx_ops_enable()
5308 ret = -ENOMEM; in scx_ops_enable()
5320 ret = -EBUSY; in scx_ops_enable()
5326 ret = -ENOMEM; in scx_ops_enable()
5330 scx_root_kobj->kset = scx_kset; in scx_ops_enable()
5335 scx_exit_info = alloc_exit_info(ops->exit_dump_len); in scx_ops_enable()
5337 ret = -ENOMEM; in scx_ops_enable()
5356 cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; in scx_ops_enable()
5360 * online CPUs by watching ->on/offline_cpu() after ->init(). in scx_ops_enable()
5390 scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; in scx_ops_enable()
5395 ret = -ENOMEM; in scx_ops_enable()
5399 if (ops->timeout_ms) in scx_ops_enable()
5400 timeout = msecs_to_jiffies(ops->timeout_ms); in scx_ops_enable()
5421 if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) in scx_ops_enable()
5423 if (ops->flags & SCX_OPS_ENQ_LAST) in scx_ops_enable()
5425 if (ops->flags & SCX_OPS_ENQ_EXITING) in scx_ops_enable()
5427 if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) in scx_ops_enable()
5477 ret, p->comm, p->pid); in scx_ops_enable()
5494 WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); in scx_ops_enable()
5498 * We're fully committed and can't fail. The task READY -> ENABLED in scx_ops_enable()
5505 const struct sched_class *old_class = p->sched_class; in scx_ops_enable()
5507 __setscheduler_class(p->policy, p->prio); in scx_ops_enable()
5510 if (old_class != new_class && p->se.sched_delayed) in scx_ops_enable()
5515 p->scx.slice = SCX_SLICE_DFL; in scx_ops_enable()
5516 p->sched_class = new_class; in scx_ops_enable()
5521 check_class_changed(task_rq(p), p, old_class, p->prio); in scx_ops_enable()
5533 if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) in scx_ops_enable()
5609 t = btf_type_by_id(reg->btf, reg->btf_id); in bpf_scx_btf_struct_access()
5611 if (off >= offsetof(struct task_struct, scx.slice) && in bpf_scx_btf_struct_access()
5612 off + size <= offsetofend(struct task_struct, scx.slice)) in bpf_scx_btf_struct_access()
5622 return -EACCES; in bpf_scx_btf_struct_access()
5656 return -E2BIG; in bpf_scx_init_member()
5657 ops->dispatch_max_batch = *(u32 *)(udata + moff); in bpf_scx_init_member()
5661 return -EINVAL; in bpf_scx_init_member()
5662 ops->flags = *(u64 *)(udata + moff); in bpf_scx_init_member()
5665 ret = bpf_obj_name_cpy(ops->name, uops->name, in bpf_scx_init_member()
5666 sizeof(ops->name)); in bpf_scx_init_member()
5670 return -EINVAL; in bpf_scx_init_member()
5675 return -E2BIG; in bpf_scx_init_member()
5676 ops->timeout_ms = *(u32 *)(udata + moff); in bpf_scx_init_member()
5679 ops->exit_dump_len = in bpf_scx_init_member()
5683 ops->hotplug_seq = *(u64 *)(udata + moff); in bpf_scx_init_member()
5709 if (prog->sleepable) in bpf_scx_check_member()
5710 return -EINVAL; in bpf_scx_check_member()
5737 * sched_ext does not support updating the actively-loaded BPF in bpf_scx_update()
5743 return -EOPNOTSUPP; in bpf_scx_update()
5751 … sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } in sched_ext_ops__select_cpu()
5767 …ched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } in sched_ext_ops__init_task()
5772 …ed_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } in sched_ext_ops__cgroup_init()
5774 …cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } in sched_ext_ops__cgroup_prep_move()
5781 static s32 sched_ext_ops__init(void) { return -EINVAL; } in sched_ext_ops__init()
5854 .help_msg = "reset-sched-ext(S)",
5861 struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; in sysrq_handle_sched_ext_dump()
5869 .help_msg = "dump-sched-ext(D)",
5888 return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); in can_skip_idle_kick()
5894 struct scx_rq *this_scx = &this_rq->scx; in kick_one_cpu()
5905 if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { in kick_one_cpu()
5906 if (rq->curr->sched_class == &ext_sched_class) in kick_one_cpu()
5907 rq->curr->scx.slice = 0; in kick_one_cpu()
5908 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); in kick_one_cpu()
5911 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { in kick_one_cpu()
5912 pseqs[cpu] = rq->scx.pnt_seq; in kick_one_cpu()
5918 cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); in kick_one_cpu()
5919 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); in kick_one_cpu()
5944 struct scx_rq *this_scx = &this_rq->scx; in kick_cpus_irq_workfn()
5949 for_each_cpu(cpu, this_scx->cpus_to_kick) { in kick_cpus_irq_workfn()
5951 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); in kick_cpus_irq_workfn()
5952 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); in kick_cpus_irq_workfn()
5955 for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { in kick_cpus_irq_workfn()
5957 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); in kick_cpus_irq_workfn()
5963 for_each_cpu(cpu, this_scx->cpus_to_wait) { in kick_cpus_irq_workfn()
5964 unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; in kick_cpus_irq_workfn()
5971 * We busy-wait here to guarantee that no other task can in kick_cpus_irq_workfn()
5979 cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); in kick_cpus_irq_workfn()
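The cpus_to_kick/cpus_to_preempt/cpus_to_wait bookkeeping above is driven from BPF by scx_bpf_kick_cpu(). A sketch of a helper that wakes an idle CPU allowed for @p after work has been queued on a shared DSQ; %SCX_KICK_PREEMPT and %SCX_KICK_WAIT would feed the preempt/wait masks instead:

static void kick_an_idle_cpu(struct task_struct *p)
{
        /* requires built-in idle tracking, see scx_bpf_pick_idle_cpu() above */
        s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);

        /* SCX_KICK_IDLE only interrupts the target if it is actually idle */
        if (cpu >= 0)
                scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}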
5984 * print_scx_info - print out sched_ext scheduler state
6010 if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || in print_scx_info()
6017 if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, in print_scx_info()
6022 /* print everything onto one line to conserve console space */ in print_scx_info()
6080 init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); in init_sched_ext_class()
6081 INIT_LIST_HEAD(&rq->scx.runnable_list); in init_sched_ext_class()
6082 INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); in init_sched_ext_class()
6084 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); in init_sched_ext_class()
6085 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); in init_sched_ext_class()
6086 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); in init_sched_ext_class()
6087 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); in init_sched_ext_class()
6088 init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); in init_sched_ext_class()
6089 init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); in init_sched_ext_class()
6092 cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; in init_sched_ext_class()
6136 if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { in scx_dsq_insert_commit()
6141 dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ in scx_dsq_insert_commit()
6143 .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, in scx_dsq_insert_commit()
6152 * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
6155 * @slice: duration @p can run for in nsecs, 0 to keep the current value
6180 * @p is allowed to run for @slice. The scheduling path is triggered on slice
6181 * exhaustion. If zero, the current residual slice is maintained. If
6185 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, in scx_bpf_dsq_insert() argument
6191 if (slice) in scx_bpf_dsq_insert()
6192 p->scx.slice = slice; in scx_bpf_dsq_insert()
6194 p->scx.slice = p->scx.slice ?: 1; in scx_bpf_dsq_insert()
6200 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, in scx_bpf_dispatch() argument
6204 scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); in scx_bpf_dispatch()
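A minimal enqueue-side usage sketch. It assumes the BPF-side helpers shipped under tools/sched_ext (scx/common.bpf.h provides BPF_STRUCT_OPS() and SCX_SLICE_DFL); the op name is illustrative.

	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/* queue @p at the tail of the built-in global FIFO DSQ with the default slice */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	}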
6208 * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
6211 * @slice: duration @p can run for in nsecs, 0 to keep the current value
6212 * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
6221 * vice-versa.
6225 * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
6229 u64 slice, u64 vtime, u64 enq_flags) in scx_bpf_dsq_insert_vtime() argument
6234 if (slice) in scx_bpf_dsq_insert_vtime()
6235 p->scx.slice = slice; in scx_bpf_dsq_insert_vtime()
6237 p->scx.slice = p->scx.slice ?: 1; in scx_bpf_dsq_insert_vtime()
6239 p->scx.dsq_vtime = vtime; in scx_bpf_dsq_insert_vtime()
6246 u64 slice, u64 vtime, u64 enq_flags) in scx_bpf_dispatch_vtime() argument
6249 scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); in scx_bpf_dispatch_vtime()
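A vtime-ordered counterpart, assuming SHARED_DSQ is a hypothetical user DSQ created in ops.init(); per the restriction above, the built-in local/global DSQs cannot be used as priority queues.

	void BPF_STRUCT_OPS(example_vtime_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/* smaller dsq_vtime is dispatched earlier from the priority queue */
		scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, p->scx.dsq_vtime, enq_flags);
	}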
6269 struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; in scx_dsq_move()
6287 in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; in scx_dsq_move()
6300 * cause similar live-lock conditions as consume_dispatch_q(). Insert a in scx_dsq_move()
6306 raw_spin_lock(&src_dsq->lock); in scx_dsq_move()
6310 	 * re-enqueued, or be in the process of being consumed by someone else. in scx_dsq_move()
6312 if (unlikely(p->scx.dsq != src_dsq || in scx_dsq_move()
6313 u32_before(kit->cursor.priv, p->scx.dsq_seq) || in scx_dsq_move()
6314 p->scx.holding_cpu >= 0) || in scx_dsq_move()
6316 raw_spin_unlock(&src_dsq->lock); in scx_dsq_move()
6324 * Apply vtime and slice updates before moving so that the new time is in scx_dsq_move()
6328 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) in scx_dsq_move()
6329 p->scx.dsq_vtime = kit->vtime; in scx_dsq_move()
6330 if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) in scx_dsq_move()
6331 p->scx.slice = kit->slice; in scx_dsq_move()
6346 kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | in scx_dsq_move()
6354 * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
6363 return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); in scx_bpf_dispatch_nr_slots()
6367 * scx_bpf_dispatch_cancel - Cancel the latest dispatch
6379 if (dspc->cursor > 0) in scx_bpf_dispatch_cancel()
6380 dspc->cursor--; in scx_bpf_dispatch_cancel()
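A dispatch-side sketch that checks the remaining buffer slots before each insertion (scx_bpf_dispatch_cancel() would undo the most recent one). queued_pids is a hypothetical queue of runnable pids; bpf_for(), bpf_map_pop_elem() and the task kfuncs are the usual BPF helpers.

	struct {
		__uint(type, BPF_MAP_TYPE_QUEUE);
		__uint(max_entries, 4096);
		__type(value, s32);
	} queued_pids SEC(".maps");

	void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
	{
		s32 i, pid;

		bpf_for(i, 0, 8) {
			struct task_struct *p;

			/* stop once this invocation's dispatch buffer is exhausted */
			if (!scx_bpf_dispatch_nr_slots())
				break;
			if (bpf_map_pop_elem(&queued_pids, &pid))
				break;
			p = bpf_task_from_pid(pid);
			if (!p)
				continue;
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
			bpf_task_release(p);
		}
	}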
6386 * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
6389 * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
6392 * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
6407 flush_dispatch_buf(dspc->rq); in scx_bpf_dsq_move_to_local()
6415 if (consume_dispatch_q(dspc->rq, dsq)) { in scx_bpf_dsq_move_to_local()
6422 dspc->nr_tasks++; in scx_bpf_dsq_move_to_local()
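The common consume pattern in ops.dispatch(): refill the local DSQ from a shared user DSQ (SHARED_DSQ is hypothetical, as above).

	void BPF_STRUCT_OPS(example_consume, s32 cpu, struct task_struct *prev)
	{
		/* on success the moved task is now runnable on this CPU's local DSQ */
		scx_bpf_dsq_move_to_local(SHARED_DSQ);
	}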
6437 * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
6439 * @slice: duration the moved task can run for in nsecs
6441 * Override the slice of the next task that will be moved from @it__iter using
6443 * slice duration is kept.
6446 u64 slice) in scx_bpf_dsq_move_set_slice() argument
6450 kit->slice = slice; in scx_bpf_dsq_move_set_slice()
6451 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; in scx_bpf_dsq_move_set_slice()
6456 struct bpf_iter_scx_dsq *it__iter, u64 slice) in scx_bpf_dispatch_from_dsq_set_slice() argument
6459 scx_bpf_dsq_move_set_slice(it__iter, slice); in scx_bpf_dispatch_from_dsq_set_slice()
6463 * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
6465 * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
6468 * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice
6477 kit->vtime = vtime; in scx_bpf_dsq_move_set_vtime()
6478 kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; in scx_bpf_dsq_move_set_vtime()
6490 * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
6497 * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
6505 * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
6531 * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
6541 * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
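A sketch combining the DSQ iterator with the move kfuncs to pull work onto the calling CPU. bpf_for_each(scx_dsq, ...) and BPF_FOR_EACH_ITER are assumed to come from the scx common BPF headers; SHARED_DSQ is hypothetical.

	void BPF_STRUCT_OPS(example_steal, s32 cpu, struct task_struct *prev)
	{
		struct task_struct *p;

		bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
			if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
				continue;
			/* refresh the slice, then move @p to this CPU's local DSQ */
			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, SCX_SLICE_DFL);
			if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL, 0))
				break;
		}
	}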
6589 * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
6592 * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
6610 * @rq->scx.local_dsq. Move all candidate tasks off to a private list in scx_bpf_reenqueue_local()
6613 list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, in scx_bpf_reenqueue_local()
6618 * deactivate and re-activate @p anyway. Skip re-enqueueing. in scx_bpf_reenqueue_local()
6621 * re-enqueue a migrating task while its current CPU and allowed in scx_bpf_reenqueue_local()
6626 * Also skip re-enqueueing tasks that can only run on this in scx_bpf_reenqueue_local()
6627 * CPU, as they would just be re-added to the same local in scx_bpf_reenqueue_local()
6630 if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) in scx_bpf_reenqueue_local()
6634 list_add_tail(&p->scx.dsq_list.node, &tasks); in scx_bpf_reenqueue_local()
6638 list_del_init(&p->scx.dsq_list.node); in scx_bpf_reenqueue_local()
6639 do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); in scx_bpf_reenqueue_local()
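This is typically invoked from ops.cpu_release() so that tasks stranded on the preempted CPU's local DSQ can be routed elsewhere (sketch, scx header macros assumed).

	void BPF_STRUCT_OPS(example_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
	{
		/* a higher-priority sched class took the CPU; re-run ops.enqueue()
		 * for the tasks already queued on its local DSQ
		 */
		scx_bpf_reenqueue_local();
	}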
6660 * scx_bpf_create_dsq - Create a custom DSQ
6671 return -EINVAL; in scx_bpf_create_dsq()
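Custom DSQs are normally created from the sleepable ops.init() path; the id value here is only the convention used by the example schedulers.

	#define SHARED_DSQ 0

	s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
	{
		/* node -1: no NUMA preference for the DSQ's bookkeeping */
		return scx_bpf_create_dsq(SHARED_DSQ, -1);
	}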
6697 * scx_bpf_kick_cpu - Trigger reschedule on a CPU
6744 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); in scx_bpf_kick_cpu()
6746 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); in scx_bpf_kick_cpu()
6749 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); in scx_bpf_kick_cpu()
6751 cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); in scx_bpf_kick_cpu()
6754 irq_work_queue(&this_rq->scx.kick_cpus_irq_work); in scx_bpf_kick_cpu()
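A sketch of forcing a remote CPU back into the scheduling path; as kick_one_cpu() above shows, %SCX_KICK_PREEMPT also zeroes the slice of the task currently running there.

	static void example_preempt_cpu(s32 cpu)
	{
		scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
	}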
6760 * scx_bpf_dsq_nr_queued - Return the number of queued tasks
6764 * -%ENOENT is returned.
6774 ret = READ_ONCE(this_rq()->scx.local_dsq.nr); in scx_bpf_dsq_nr_queued()
6780 ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); in scx_bpf_dsq_nr_queued()
6786 ret = READ_ONCE(dsq->nr); in scx_bpf_dsq_nr_queued()
6790 ret = -ENOENT; in scx_bpf_dsq_nr_queued()
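Occupancy checks commonly pair with an idle kick. SCX_DSQ_LOCAL_ON | cpu addresses a remote CPU's local DSQ as handled above; SHARED_DSQ is hypothetical.

	static void example_kick_if_starved(s32 cpu)
	{
		/* wake @cpu only if its local DSQ is empty while shared work is pending */
		if (!scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) &&
		    scx_bpf_dsq_nr_queued(SHARED_DSQ) > 0)
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
	}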
6797 * scx_bpf_destroy_dsq - Destroy a custom DSQ
6811 * bpf_iter_scx_dsq_new - Create a DSQ iterator
6832 * Always clear $kit->dsq. in bpf_iter_scx_dsq_new()
6834 kit->dsq = NULL; in bpf_iter_scx_dsq_new()
6837 return -EINVAL; in bpf_iter_scx_dsq_new()
6839 kit->dsq = find_user_dsq(dsq_id); in bpf_iter_scx_dsq_new()
6840 if (!kit->dsq) in bpf_iter_scx_dsq_new()
6841 return -ENOENT; in bpf_iter_scx_dsq_new()
6843 INIT_LIST_HEAD(&kit->cursor.node); in bpf_iter_scx_dsq_new()
6844 kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; in bpf_iter_scx_dsq_new()
6845 kit->cursor.priv = READ_ONCE(kit->dsq->seq); in bpf_iter_scx_dsq_new()
6851 * bpf_iter_scx_dsq_next - Progress a DSQ iterator
6859 bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; in bpf_iter_scx_dsq_next()
6863 if (!kit->dsq) in bpf_iter_scx_dsq_next()
6866 raw_spin_lock_irqsave(&kit->dsq->lock, flags); in bpf_iter_scx_dsq_next()
6868 if (list_empty(&kit->cursor.node)) in bpf_iter_scx_dsq_next()
6871 p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); in bpf_iter_scx_dsq_next()
6879 p = nldsq_next_task(kit->dsq, p, rev); in bpf_iter_scx_dsq_next()
6880 } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); in bpf_iter_scx_dsq_next()
6884 list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); in bpf_iter_scx_dsq_next()
6886 list_move(&kit->cursor.node, &p->scx.dsq_list.node); in bpf_iter_scx_dsq_next()
6888 list_del_init(&kit->cursor.node); in bpf_iter_scx_dsq_next()
6891 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); in bpf_iter_scx_dsq_next()
6897 * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
6906 if (!kit->dsq) in bpf_iter_scx_dsq_destroy()
6909 if (!list_empty(&kit->cursor.node)) { in bpf_iter_scx_dsq_destroy()
6912 raw_spin_lock_irqsave(&kit->dsq->lock, flags); in bpf_iter_scx_dsq_destroy()
6913 list_del_init(&kit->cursor.node); in bpf_iter_scx_dsq_destroy()
6914 raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); in bpf_iter_scx_dsq_destroy()
6916 kit->dsq = NULL; in bpf_iter_scx_dsq_destroy()
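From BPF these three kfuncs are usually driven through the bpf_for_each(scx_dsq, ...) wrapper in the scx common headers; a read-only walk might look like the following (SHARED_DSQ hypothetical).

	static u32 example_count_queued(void)
	{
		struct task_struct *p;
		u32 nr = 0;

		/* visits tasks in dispatch order without consuming them */
		bpf_for_each(scx_dsq, p, SHARED_DSQ, 0)
			nr++;
		return nr;
	}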
6931 return -EINVAL; in __bstr_format()
6962 return __bstr_format(buf->data, buf->line, sizeof(buf->line), in bstr_format()
6969 * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
6986 scx_exit_bstr_buf.line); in scx_bpf_exit_bstr()
6991 * scx_bpf_error_bstr - Indicate fatal error
7007 scx_exit_bstr_buf.line); in scx_bpf_error_bstr()
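On the BPF side these are normally reached through the scx_bpf_error() and scx_bpf_exit() convenience macros from the common headers; the messages and exit code below are illustrative.

	static void example_report(s32 ret, bool done)
	{
		if (ret)
			/* fatal: aborts the scheduler and falls back to the fair class */
			scx_bpf_error("unexpected error (%d)", ret);
		else if (done)
			/* graceful: unregisters with a user-defined exit code */
			scx_bpf_exit(0, "work complete, unloading");
	}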
7012 * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
7020 * The extra dump may be multiple lines. A single line may be split over
7021 * multiple calls. The last line is automatically terminated.
7027 struct scx_bstr_buf *buf = &dd->buf; in scx_bpf_dump_bstr()
7030 if (raw_smp_processor_id() != dd->cpu) { in scx_bpf_dump_bstr()
7035 /* append the formatted string to the line buf */ in scx_bpf_dump_bstr()
7036 ret = __bstr_format(buf->data, buf->line + dd->cursor, in scx_bpf_dump_bstr()
7037 sizeof(buf->line) - dd->cursor, fmt, data, data__sz); in scx_bpf_dump_bstr()
7039 dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", in scx_bpf_dump_bstr()
7040 dd->prefix, fmt, data, data__sz, ret); in scx_bpf_dump_bstr()
7044 dd->cursor += ret; in scx_bpf_dump_bstr()
7045 dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); in scx_bpf_dump_bstr()
7047 if (!dd->cursor) in scx_bpf_dump_bstr()
7051 * If the line buf overflowed or ends in a newline, flush it into the in scx_bpf_dump_bstr()
7052 * dump. This is to allow the caller to generate a single line over in scx_bpf_dump_bstr()
7054 * the line buf, the only case which can lead to an unexpected in scx_bpf_dump_bstr()
7058 if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') in scx_bpf_dump_bstr()
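BPF schedulers usually call this through the scx_bpf_dump() macro from ops.dump(); nr_enqueued is a hypothetical counter maintained by the scheduler.

	static u64 nr_enqueued;

	void BPF_STRUCT_OPS(example_dump, struct scx_dump_ctx *dctx)
	{
		/* buffered per the line-buf rules above; flushed on '\n' or overflow */
		scx_bpf_dump("example: nr_enqueued=%llu\n", nr_enqueued);
	}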
7063 * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
7079 * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
7101 * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
7143 rq->scx.cpuperf_target = perf; in scx_bpf_cpuperf_set()
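A sketch of pinning a CPU at full performance; SCX_CPUPERF_ONE (1024) is the top of the scale that scx_bpf_cpuperf_cap() reports against.

	static void example_boost_cpu(s32 cpu)
	{
		scx_bpf_cpuperf_set(cpu, SCX_CPUPERF_ONE);
	}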
7152 * scx_bpf_nr_node_ids - Return the number of possible node IDs
7162 * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
7172 * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
7180 * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
7188 * scx_bpf_put_cpumask - Release a possible/online cpumask
7195 * a reference to a global cpumask, which is read-only in the caller and in scx_bpf_put_cpumask()
7202 * scx_bpf_task_running - Is task currently running?
7207 return task_rq(p)->curr == p; in scx_bpf_task_running()
7211 * scx_bpf_task_cpu - CPU a task is currently associated with
7220 * scx_bpf_cpu_rq - Fetch the rq of a CPU
7232 * scx_bpf_task_cgroup - Return the sched cgroup of a task
7235 * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
7237 * determine @p's current cgroup as, unlike following @p->cgroups,
7238 * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
7239 * rq-locked operations. Can be called on the parameter tasks of rq-locked
7245 struct task_group *tg = p->sched_task_group; in scx_bpf_task_cgroup()
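Because the returned cgroup is acquired, it must be released; a sketch from an rq-locked operation such as ops.enqueue() (helper macros as before).

	void BPF_STRUCT_OPS(example_cgrp_enqueue, struct task_struct *p, u64 enq_flags)
	{
		struct cgroup *cgrp = scx_bpf_task_cgroup(p);

		/* ... make a per-cgroup decision using cgrp here ... */
		bpf_cgroup_release(cgrp);

		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	}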
7260 * scx_bpf_now - Returns a high-performance monotonically non-decreasing
7267 	 * Unfortunately, on some hardware platforms, bpf_ktime_get_ns() -- which in scx_bpf_now()
7268 * eventually reads a hardware timestamp counter -- is neither performant nor
7269 * scalable. scx_bpf_now() aims to provide a high-performance clock by
7279 * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
7282 * is no such guarantee -- the clock can go backward. It provides a
7283 * monotonically *non-decreasing* clock so that it would provide the same
7295 if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { in scx_bpf_now()
7299 * Note that scx_bpf_now() is re-entrant between a process in scx_bpf_now()
7304 clock = READ_ONCE(rq->scx.clock); in scx_bpf_now()
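A timestamping sketch; storing into a single global is only for illustration (a real scheduler would likely use task- or CPU-local storage).

	static u64 example_started_at;

	void BPF_STRUCT_OPS(example_running, struct task_struct *p)
	{
		/* cheap rq-local clock; non-decreasing for consecutive reads on one CPU */
		example_started_at = scx_bpf_now();
	}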
7322 * scx_bpf_events - Get a system-wide event counter to
7332 /* Aggregate per-CPU event counters into the system-wide counters. */ in scx_bpf_events()
7348 * We cannot entirely trust a BPF-provided size since a BPF program in scx_bpf_events()
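Reading the aggregated counters from BPF; passing sizeof() lets the kernel copy only what both sides agree on, which is what the size handling above guards against.

	static void example_read_events(void)
	{
		struct scx_event_stats events;

		scx_bpf_events(&events, sizeof(events));
		/* inspect the system-wide counters aggregated into @events */
	}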
7401 * Some kfuncs are context-sensitive and can only be called from in scx_init()
7449 return -ENOMEM; in scx_init()
7452 ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); in scx_init()