xref: /linux/kernel/time/posix-cpu-timers.c (revision e9ef810dfee7a2227da9d423aecb0ced35faddbe)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Implement CPU time clocks for the POSIX clock interface.
4   */
5  
6  #include <linux/sched/signal.h>
7  #include <linux/sched/cputime.h>
8  #include <linux/posix-timers.h>
9  #include <linux/errno.h>
10  #include <linux/math64.h>
11  #include <linux/uaccess.h>
12  #include <linux/kernel_stat.h>
13  #include <trace/events/timer.h>
14  #include <linux/tick.h>
15  #include <linux/workqueue.h>
16  #include <linux/compat.h>
17  #include <linux/sched/deadline.h>
18  #include <linux/task_work.h>
19  
20  #include "posix-timers.h"
21  
22  static void posix_cpu_timer_rearm(struct k_itimer *timer);
23  
24  void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
25  {
26  	posix_cputimers_init(pct);
27  	if (cpu_limit != RLIM_INFINITY) {
28  		pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
29  		pct->timers_active = true;
30  	}
31  }
32  
33  /*
34   * Called after updating RLIMIT_CPU to run cpu timer and update
35   * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
36   * necessary. Needs siglock protection since other code may update the
37   * expiration cache as well.
38   *
39   * Returns 0 on success, -ESRCH on failure.  Can fail if the task is exiting and
40   * we cannot lock_task_sighand.  Cannot fail if task is current.
41   */
42  int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
43  {
44  	u64 nsecs = rlim_new * NSEC_PER_SEC;
45  	unsigned long irq_fl;
46  
47  	if (!lock_task_sighand(task, &irq_fl))
48  		return -ESRCH;
49  	set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
50  	unlock_task_sighand(task, &irq_fl);
51  	return 0;
52  }
53  
54  /*
55   * Functions for validating access to tasks.
56   */
57  static struct pid *pid_for_clock(const clockid_t clock, bool gettime)
58  {
59  	const bool thread = !!CPUCLOCK_PERTHREAD(clock);
60  	const pid_t upid = CPUCLOCK_PID(clock);
61  	struct pid *pid;
62  
63  	if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
64  		return NULL;
65  
66  	/*
67  	 * If the encoded PID is 0, then the timer is targeted at current
68  	 * or the process to which current belongs.
69  	 */
70  	if (upid == 0)
71  		return thread ? task_pid(current) : task_tgid(current);
72  
73  	pid = find_vpid(upid);
74  	if (!pid)
75  		return NULL;
76  
77  	if (thread) {
78  		struct task_struct *tsk = pid_task(pid, PIDTYPE_PID);
79  		return (tsk && same_thread_group(tsk, current)) ? pid : NULL;
80  	}
81  
82  	/*
83  	 * For clock_gettime(PROCESS) allow finding the process with
84  	 * the pid of the current task.  The code needs the tgid
85  	 * of the process so that pid_task(pid, PIDTYPE_TGID) can be
86  	 * used to find the process.
87  	 */
88  	if (gettime && (pid == task_pid(current)))
89  		return task_tgid(current);
90  
91  	/*
92  	 * For process clocks, require that the pid identifies a process.
93  	 */
94  	return pid_has_task(pid, PIDTYPE_TGID) ? pid : NULL;
95  }
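
/*
 * Illustrative sketch (not part of the original file): how a CPU clock id
 * encodes the target PID, the per-thread flag and the clock type as used by
 * pid_for_clock() above. The layout is assumed to follow the CPUCLOCK_*
 * macros in the posix-timers headers; treat the exact bit positions as an
 * approximation rather than ABI documentation.
 *
 *	// clockid layout (roughly):  bits 31:3 = ~pid (0 means "current"),
 *	//                            bit     2 = per-thread flag,
 *	//                            bits  1:0 = CPUCLOCK_PROF/VIRT/SCHED
 *	clockid_t clk  = (~(clockid_t)pid << 3) | CPUCLOCK_PROF;  // process clock
 *	clockid_t tclk = clk | 4;                                  // thread clock
 *
 *	pid_t target = CPUCLOCK_PID(clk);	// recovers pid
 *	bool  thread = CPUCLOCK_PERTHREAD(tclk);	// true for the thread variant
 *	int   which  = CPUCLOCK_WHICH(clk);	// CPUCLOCK_PROF
 */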
96  
97  static inline int validate_clock_permissions(const clockid_t clock)
98  {
99  	int ret;
100  
101  	rcu_read_lock();
102  	ret = pid_for_clock(clock, false) ? 0 : -EINVAL;
103  	rcu_read_unlock();
104  
105  	return ret;
106  }
107  
108  static inline enum pid_type clock_pid_type(const clockid_t clock)
109  {
110  	return CPUCLOCK_PERTHREAD(clock) ? PIDTYPE_PID : PIDTYPE_TGID;
111  }
112  
113  static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer)
114  {
115  	return pid_task(timer->it.cpu.pid, clock_pid_type(timer->it_clock));
116  }
117  
118  /*
119   * Update expiry time from increment, and increase overrun count,
120   * given the current clock sample.
121   */
122  static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
123  {
124  	u64 delta, incr, expires = timer->it.cpu.node.expires;
125  	int i;
126  
127  	if (!timer->it_interval)
128  		return expires;
129  
130  	if (now < expires)
131  		return expires;
132  
133  	incr = timer->it_interval;
134  	delta = now + incr - expires;
135  
136  	/* Don't use (incr*2 < delta), incr*2 might overflow. */
137  	for (i = 0; incr < delta - incr; i++)
138  		incr = incr << 1;
139  
140  	for (; i >= 0; incr >>= 1, i--) {
141  		if (delta < incr)
142  			continue;
143  
144  		timer->it.cpu.node.expires += incr;
145  		timer->it_overrun += 1LL << i;
146  		delta -= incr;
147  	}
148  	return timer->it.cpu.node.expires;
149  }
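
/*
 * Worked example for the overrun computation above (illustrative values):
 * with it_interval = 10ms, expires = 100ms and now = 173ms, delta becomes
 * 83ms. The first loop doubles incr to 80ms (i = 3); the second loop adds
 * 80ms at i = 3 (it_overrun += 1 << 3 = 8) and skips the smaller steps,
 * leaving expires = 180ms. That matches the 8 whole intervals between the
 * old expiry and the first expiry after @now, computed in O(log2(delta))
 * steps instead of a division.
 */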
150  
151  /* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */
152  static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
153  {
154  	return !(~pct->bases[CPUCLOCK_PROF].nextevt |
155  		 ~pct->bases[CPUCLOCK_VIRT].nextevt |
156  		 ~pct->bases[CPUCLOCK_SCHED].nextevt);
157  }
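
/*
 * Note: ~x is 0 only when x == U64_MAX, so OR-ing the three complements
 * yields 0 exactly when every nextevt is U64_MAX, i.e. no armed timer has
 * cached an earlier expiry.
 */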
158  
159  static int
160  posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
161  {
162  	int error = validate_clock_permissions(which_clock);
163  
164  	if (!error) {
165  		tp->tv_sec = 0;
166  		tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
167  		if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
168  			/*
169  			 * If sched_clock is using a cycle counter, we
170  			 * don't have any idea of its true resolution
171  			 * exported, but it is much finer than 1s/HZ.
172  			 */
173  			tp->tv_nsec = 1;
174  		}
175  	}
176  	return error;
177  }
178  
179  static int
180  posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
181  {
182  	int error = validate_clock_permissions(clock);
183  
184  	/*
185  	 * You can never reset a CPU clock, but we check for other errors
186  	 * in the call before failing with EPERM.
187  	 */
188  	return error ? : -EPERM;
189  }
190  
191  /*
192   * Sample a per-thread clock for the given task. clkid is validated.
193   */
194  static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
195  {
196  	u64 utime, stime;
197  
198  	if (clkid == CPUCLOCK_SCHED)
199  		return task_sched_runtime(p);
200  
201  	task_cputime(p, &utime, &stime);
202  
203  	switch (clkid) {
204  	case CPUCLOCK_PROF:
205  		return utime + stime;
206  	case CPUCLOCK_VIRT:
207  		return utime;
208  	default:
209  		WARN_ON_ONCE(1);
210  	}
211  	return 0;
212  }
213  
214  static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
215  {
216  	samples[CPUCLOCK_PROF] = stime + utime;
217  	samples[CPUCLOCK_VIRT] = utime;
218  	samples[CPUCLOCK_SCHED] = rtime;
219  }
220  
221  static void task_sample_cputime(struct task_struct *p, u64 *samples)
222  {
223  	u64 stime, utime;
224  
225  	task_cputime(p, &utime, &stime);
226  	store_samples(samples, stime, utime, p->se.sum_exec_runtime);
227  }
228  
229  static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
230  				       u64 *samples)
231  {
232  	u64 stime, utime, rtime;
233  
234  	utime = atomic64_read(&at->utime);
235  	stime = atomic64_read(&at->stime);
236  	rtime = atomic64_read(&at->sum_exec_runtime);
237  	store_samples(samples, stime, utime, rtime);
238  }
239  
240  /*
241   * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
242   * to avoid race conditions with concurrent updates to cputime.
243   */
244  static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
245  {
246  	u64 curr_cputime = atomic64_read(cputime);
247  
248  	do {
249  		if (sum_cputime <= curr_cputime)
250  			return;
251  	} while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));
252  }
253  
254  static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
255  			      struct task_cputime *sum)
256  {
257  	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
258  	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
259  	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
260  }
261  
262  /**
263   * thread_group_sample_cputime - Sample cputime for a given task
264   * @tsk:	Task for which the cputime sample is taken
265   * @samples:	Storage for time samples
266   *
267   * Called from sys_getitimer() to calculate the expiry time of an active
268   * timer. That means group cputime accounting is already active. Called
269   * with task sighand lock held.
270   *
271   * Updates @samples with an up-to-date sample of the thread group cputimes.
272   */
273  void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
274  {
275  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
276  	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
277  
278  	WARN_ON_ONCE(!pct->timers_active);
279  
280  	proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
281  }
282  
283  /**
284   * thread_group_start_cputime - Start cputime and return a sample
285   * @tsk:	Task for which cputime needs to be started
286   * @samples:	Storage for time samples
287   *
288   * The thread group cputime accounting is avoided when there are no posix
289   * CPU timers armed. Before starting a timer it's required to check whether
290   * the time accounting is active. If not, a full update of the atomic
291   * accounting store needs to be done and the accounting enabled.
292   *
293   * Updates @samples with an up-to-date sample of the thread group cputimes.
294   */
295  static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
296  {
297  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
298  	struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
299  
300  	lockdep_assert_task_sighand_held(tsk);
301  
302  	/* Check if cputimer isn't running. This is accessed without locking. */
303  	if (!READ_ONCE(pct->timers_active)) {
304  		struct task_cputime sum;
305  
306  		/*
307  		 * The POSIX timer interface allows for absolute time expiry
308  		 * values through the TIMER_ABSTIME flag, therefore we have
309  		 * to synchronize the timer to the clock every time we start it.
310  		 */
311  		thread_group_cputime(tsk, &sum);
312  		update_gt_cputime(&cputimer->cputime_atomic, &sum);
313  
314  		/*
315  		 * We're setting timers_active without a lock. Ensure this
316  		 * only gets written to in one operation. We set it after
317  		 * update_gt_cputime() as a small optimization, but
318  		 * barriers are not required because update_gt_cputime()
319  		 * can handle concurrent updates.
320  		 */
321  		WRITE_ONCE(pct->timers_active, true);
322  	}
323  	proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
324  }
325  
326  static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
327  {
328  	struct task_cputime ct;
329  
330  	thread_group_cputime(tsk, &ct);
331  	store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime);
332  }
333  
334  /*
335   * Sample a process (thread group) clock for the given task clkid. If the
336   * group's cputime accounting is already enabled, read the atomic
337   * store. Otherwise a full update is required.  clkid is already validated.
338   */
339  static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
340  				  bool start)
341  {
342  	struct thread_group_cputimer *cputimer = &p->signal->cputimer;
343  	struct posix_cputimers *pct = &p->signal->posix_cputimers;
344  	u64 samples[CPUCLOCK_MAX];
345  
346  	if (!READ_ONCE(pct->timers_active)) {
347  		if (start)
348  			thread_group_start_cputime(p, samples);
349  		else
350  			__thread_group_cputime(p, samples);
351  	} else {
352  		proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
353  	}
354  
355  	return samples[clkid];
356  }
357  
358  static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
359  {
360  	const clockid_t clkid = CPUCLOCK_WHICH(clock);
361  	struct task_struct *tsk;
362  	u64 t;
363  
364  	rcu_read_lock();
365  	tsk = pid_task(pid_for_clock(clock, true), clock_pid_type(clock));
366  	if (!tsk) {
367  		rcu_read_unlock();
368  		return -EINVAL;
369  	}
370  
371  	if (CPUCLOCK_PERTHREAD(clock))
372  		t = cpu_clock_sample(clkid, tsk);
373  	else
374  		t = cpu_clock_sample_group(clkid, tsk, false);
375  	rcu_read_unlock();
376  
377  	*tp = ns_to_timespec64(t);
378  	return 0;
379  }
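
/*
 * Userspace sketch (standard POSIX/glibc wrappers, not part of this file):
 * reading the clocks served by posix_cpu_clock_get(). CLOCK_PROCESS_CPUTIME_ID
 * refers to the calling process; clock_getcpuclockid() builds a clock id for
 * a process identified by pid.
 *
 *	#include <stdio.h>
 *	#include <time.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *		clockid_t clk;
 *
 *		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);	// this process
 *		printf("proc cpu: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
 *
 *		if (!clock_getcpuclockid(getpid(), &clk)) {	// same clock, by pid
 *			clock_gettime(clk, &ts);
 *			printf("by pid:   %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
 *		}
 *		return 0;
 *	}
 */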
380  
381  /*
382   * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
383   * This is called from sys_timer_create() and do_cpu_nanosleep() with the
384   * new timer already all-zeros initialized.
385   */
386  static int posix_cpu_timer_create(struct k_itimer *new_timer)
387  {
388  	static struct lock_class_key posix_cpu_timers_key;
389  	struct pid *pid;
390  
391  	rcu_read_lock();
392  	pid = pid_for_clock(new_timer->it_clock, false);
393  	if (!pid) {
394  		rcu_read_unlock();
395  		return -EINVAL;
396  	}
397  
398  	/*
399  	 * If posix timer expiry is handled in task work context then
400  	 * timer::it_lock can be taken without disabling interrupts as all
401  	 * other locking happens in task context. This requires a separate
402  	 * lock class key otherwise regular posix timer expiry would record
403  	 * the lock class being taken in interrupt context and generate a
404  	 * false positive warning.
405  	 */
406  	if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
407  		lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
408  
409  	new_timer->kclock = &clock_posix_cpu;
410  	timerqueue_init(&new_timer->it.cpu.node);
411  	new_timer->it.cpu.pid = get_pid(pid);
412  	rcu_read_unlock();
413  	return 0;
414  }
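
/*
 * Userspace sketch (standard POSIX timer API, not part of this file):
 * creating a process CPU-time timer that is set up by
 * posix_cpu_timer_create()/posix_cpu_timer_set() above and delivers SIGPROF
 * after 2s of accumulated CPU time, then every further second of CPU time.
 *
 *	#include <signal.h>
 *	#include <time.h>
 *
 *	static void arm_cpu_timer(void)
 *	{
 *		struct sigevent sev = {
 *			.sigev_notify = SIGEV_SIGNAL,
 *			.sigev_signo  = SIGPROF,
 *		};
 *		struct itimerspec its = {
 *			.it_value    = { .tv_sec = 2 },	// first expiry: 2s of CPU time
 *			.it_interval = { .tv_sec = 1 },	// then every 1s of CPU time
 *		};
 *		timer_t tid;
 *
 *		if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid) == 0)
 *			timer_settime(tid, 0, &its, NULL);
 *	}
 */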
415  
416  static struct posix_cputimer_base *timer_base(struct k_itimer *timer,
417  					      struct task_struct *tsk)
418  {
419  	int clkidx = CPUCLOCK_WHICH(timer->it_clock);
420  
421  	if (CPUCLOCK_PERTHREAD(timer->it_clock))
422  		return tsk->posix_cputimers.bases + clkidx;
423  	else
424  		return tsk->signal->posix_cputimers.bases + clkidx;
425  }
426  
427  /*
428   * Force recalculating the base earliest expiration on the next tick.
429   * This will also re-evaluate the need to keep around the process wide
430   * cputime counter and tick dependency and eventually shut these down
431   * if necessary.
432   */
433  static void trigger_base_recalc_expires(struct k_itimer *timer,
434  					struct task_struct *tsk)
435  {
436  	struct posix_cputimer_base *base = timer_base(timer, tsk);
437  
438  	base->nextevt = 0;
439  }
440  
441  /*
442   * Dequeue the timer and reset the base if it was its earliest expiration.
443   * It makes sure the next tick recalculates the base next expiration so we
444   * don't keep the costly process wide cputime counter around for a random
445   * amount of time, along with the tick dependency.
446   *
447   * If another timer gets queued between this and the next tick, its
448   * expiration will update the base next event if necessary on the next
449   * tick.
450   */
451  static void disarm_timer(struct k_itimer *timer, struct task_struct *p)
452  {
453  	struct cpu_timer *ctmr = &timer->it.cpu;
454  	struct posix_cputimer_base *base;
455  
456  	if (!cpu_timer_dequeue(ctmr))
457  		return;
458  
459  	base = timer_base(timer, p);
460  	if (cpu_timer_getexpires(ctmr) == base->nextevt)
461  		trigger_base_recalc_expires(timer, p);
462  }
463  
464  
465  /*
466   * Clean up a CPU-clock timer that is about to be destroyed.
467   * This is called from timer deletion with the timer already locked.
468   * If we return TIMER_RETRY, it's necessary to release the timer's lock
469   * and try again.  (This happens when the timer is in the middle of firing.)
470   */
471  static int posix_cpu_timer_del(struct k_itimer *timer)
472  {
473  	struct cpu_timer *ctmr = &timer->it.cpu;
474  	struct sighand_struct *sighand;
475  	struct task_struct *p;
476  	unsigned long flags;
477  	int ret = 0;
478  
479  	rcu_read_lock();
480  	p = cpu_timer_task_rcu(timer);
481  	if (!p)
482  		goto out;
483  
484  	/*
485  	 * Protect against sighand release/switch in exit/exec and process/
486  	 * thread timer list entry concurrent read/writes.
487  	 */
488  	sighand = lock_task_sighand(p, &flags);
489  	if (unlikely(sighand == NULL)) {
490  		/*
491  		 * This raced with the reaping of the task. The exit cleanup
492  		 * should have removed this timer from the timer queue.
493  		 */
494  		WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
495  	} else {
496  		if (timer->it.cpu.firing) {
497  			/*
498  			 * Prevent signal delivery. The timer cannot be dequeued
499  			 * because it is on the firing list which is not protected
500  			 * by sighand->lock. The delivery path is waiting for
501  			 * the timer lock. So go back, unlock and retry.
502  			 */
503  			timer->it.cpu.firing = false;
504  			ret = TIMER_RETRY;
505  		} else {
506  			disarm_timer(timer, p);
507  		}
508  		unlock_task_sighand(p, &flags);
509  	}
510  
511  out:
512  	rcu_read_unlock();
513  
514  	if (!ret) {
515  		put_pid(ctmr->pid);
516  		timer->it_status = POSIX_TIMER_DISARMED;
517  	}
518  	return ret;
519  }
520  
521  static void cleanup_timerqueue(struct timerqueue_head *head)
522  {
523  	struct timerqueue_node *node;
524  	struct cpu_timer *ctmr;
525  
526  	while ((node = timerqueue_getnext(head))) {
527  		timerqueue_del(head, node);
528  		ctmr = container_of(node, struct cpu_timer, node);
529  		ctmr->head = NULL;
530  	}
531  }
532  
533  /*
534   * Clean out CPU timers which are still armed when a thread exits. The
535   * timers are only removed from the list. No other updates are done. The
536   * corresponding posix timers are still accessible, but cannot be rearmed.
537   *
538   * This must be called with the siglock held.
539   */
540  static void cleanup_timers(struct posix_cputimers *pct)
541  {
542  	cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead);
543  	cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead);
544  	cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead);
545  }
546  
547  /*
548   * These are both called with the siglock held, when the current thread
549   * is being reaped.  When the final (leader) thread in the group is reaped,
550   * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
551   */
552  void posix_cpu_timers_exit(struct task_struct *tsk)
553  {
554  	cleanup_timers(&tsk->posix_cputimers);
555  }
556  void posix_cpu_timers_exit_group(struct task_struct *tsk)
557  {
558  	cleanup_timers(&tsk->signal->posix_cputimers);
559  }
560  
561  /*
562   * Insert the timer on the appropriate list before any timers that
563   * expire later.  This must be called with the sighand lock held.
564   */
565  static void arm_timer(struct k_itimer *timer, struct task_struct *p)
566  {
567  	struct posix_cputimer_base *base = timer_base(timer, p);
568  	struct cpu_timer *ctmr = &timer->it.cpu;
569  	u64 newexp = cpu_timer_getexpires(ctmr);
570  
571  	timer->it_status = POSIX_TIMER_ARMED;
572  	if (!cpu_timer_enqueue(&base->tqhead, ctmr))
573  		return;
574  
575  	/*
576  	 * We are the new earliest-expiring POSIX 1.b timer, hence
577  	 * need to update expiration cache. Take into account that
578  	 * for process timers we share expiration cache with itimers
579  	 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
580  	 */
581  	if (newexp < base->nextevt)
582  		base->nextevt = newexp;
583  
584  	if (CPUCLOCK_PERTHREAD(timer->it_clock))
585  		tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
586  	else
587  		tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
588  }
589  
590  /*
591   * The timer is locked, fire it and arrange for its reload.
592   */
593  static void cpu_timer_fire(struct k_itimer *timer)
594  {
595  	struct cpu_timer *ctmr = &timer->it.cpu;
596  
597  	timer->it_status = POSIX_TIMER_DISARMED;
598  
599  	if (unlikely(ctmr->nanosleep)) {
600  		/*
601  		 * This is a special case for clock_nanosleep,
602  		 * not a normal timer from sys_timer_create.
603  		 */
604  		wake_up_process(timer->it_process);
605  		cpu_timer_setexpires(ctmr, 0);
606  	} else {
607  		posix_timer_queue_signal(timer);
608  		/* Disable oneshot timers */
609  		if (!timer->it_interval)
610  			cpu_timer_setexpires(ctmr, 0);
611  	}
612  }
613  
614  static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now);
615  
616  /*
617   * Guts of sys_timer_settime for CPU timers.
618   * This is called with the timer locked and interrupts disabled.
619   * If we return TIMER_RETRY, it's necessary to release the timer's lock
620   * and try again.  (This happens when the timer is in the middle of firing.)
621   */
622  static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
623  			       struct itimerspec64 *new, struct itimerspec64 *old)
624  {
625  	bool sigev_none = timer->it_sigev_notify == SIGEV_NONE;
626  	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
627  	struct cpu_timer *ctmr = &timer->it.cpu;
628  	u64 old_expires, new_expires, now;
629  	struct sighand_struct *sighand;
630  	struct task_struct *p;
631  	unsigned long flags;
632  	int ret = 0;
633  
634  	rcu_read_lock();
635  	p = cpu_timer_task_rcu(timer);
636  	if (!p) {
637  		/*
638  		 * If p has just been reaped, we can no
639  		 * longer get any information about it at all.
640  		 */
641  		rcu_read_unlock();
642  		return -ESRCH;
643  	}
644  
645  	/*
646  	 * Use the to_ktime conversion because that clamps the maximum
647  	 * value to KTIME_MAX and avoids multiplication overflows.
648  	 */
649  	new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value));
650  
651  	/*
652  	 * Protect against sighand release/switch in exit/exec and p->cpu_timers
653  	 * and p->signal->cpu_timers read/write in arm_timer()
654  	 */
655  	sighand = lock_task_sighand(p, &flags);
656  	/*
657  	 * If p has just been reaped, we can no
658  	 * longer get any information about it at all.
659  	 */
660  	if (unlikely(sighand == NULL)) {
661  		rcu_read_unlock();
662  		return -ESRCH;
663  	}
664  
665  	/* Retrieve the current expiry time before disarming the timer */
666  	old_expires = cpu_timer_getexpires(ctmr);
667  
668  	if (unlikely(timer->it.cpu.firing)) {
669  		/*
670  		 * Prevent signal delivery. The timer cannot be dequeued
671  		 * because it is on the firing list which is not protected
672  		 * by sighand->lock. The delivery path is waiting for
673  		 * the timer lock. So go back, unlock and retry.
674  		 */
675  		timer->it.cpu.firing = false;
676  		ret = TIMER_RETRY;
677  	} else {
678  		cpu_timer_dequeue(ctmr);
679  		timer->it_status = POSIX_TIMER_DISARMED;
680  	}
681  
682  	/*
683  	 * Sample the current clock for saving the previous setting
684  	 * and for rearming the timer.
685  	 */
686  	if (CPUCLOCK_PERTHREAD(timer->it_clock))
687  		now = cpu_clock_sample(clkid, p);
688  	else
689  		now = cpu_clock_sample_group(clkid, p, !sigev_none);
690  
691  	/* Retrieve the previous expiry value if requested. */
692  	if (old) {
693  		old->it_value = (struct timespec64){ };
694  		if (old_expires)
695  			__posix_cpu_timer_get(timer, old, now);
696  	}
697  
698  	/* Retry if the timer expiry is running concurrently */
699  	if (unlikely(ret)) {
700  		unlock_task_sighand(p, &flags);
701  		goto out;
702  	}
703  
704  	/* Convert relative expiry time to absolute */
705  	if (new_expires && !(timer_flags & TIMER_ABSTIME))
706  		new_expires += now;
707  
708  	/* Set the new expiry time (might be 0) */
709  	cpu_timer_setexpires(ctmr, new_expires);
710  
711  	/*
712  	 * Arm the timer if it is not disabled, the new expiry value has
713  	 * not yet expired and the timer requires signal delivery.
714  	 * SIGEV_NONE timers are never armed. In case the timer is not
715  	 * armed, enforce the reevaluation of the timer base so that the
716  	 * process wide cputime counter can be disabled eventually.
717  	 */
718  	if (likely(!sigev_none)) {
719  		if (new_expires && now < new_expires)
720  			arm_timer(timer, p);
721  		else
722  			trigger_base_recalc_expires(timer, p);
723  	}
724  
725  	unlock_task_sighand(p, &flags);
726  
727  	posix_timer_set_common(timer, new);
728  
729  	/*
730  	 * If the new expiry time was already in the past the timer was not
731  	 * queued. Fire it immediately even if the thread never runs to
732  	 * accumulate more time on this clock.
733  	 */
734  	if (!sigev_none && new_expires && now >= new_expires)
735  		cpu_timer_fire(timer);
736  out:
737  	rcu_read_unlock();
738  	return ret;
739  }
740  
741  static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now)
742  {
743  	bool sigev_none = timer->it_sigev_notify == SIGEV_NONE;
744  	u64 expires, iv = timer->it_interval;
745  
746  	/*
747  	 * Make sure that interval timers are moved forward for the
748  	 * following cases:
749  	 *  - SIGEV_NONE timers which are never armed
750  	 *  - Timers which expired, but the signal has not yet been
751  	 *    delivered
752  	 */
753  	if (iv && timer->it_status != POSIX_TIMER_ARMED)
754  		expires = bump_cpu_timer(timer, now);
755  	else
756  		expires = cpu_timer_getexpires(&timer->it.cpu);
757  
758  	/*
759  	 * Expired interval timers cannot have a remaining time <= 0.
760  	 * The kernel has to move them forward so that the next
761  	 * timer expiry is > @now.
762  	 */
763  	if (now < expires) {
764  		itp->it_value = ns_to_timespec64(expires - now);
765  	} else {
766  		/*
767  		 * A single shot SIGEV_NONE timer must return 0 when it is
768  		 * expired! Timers which have a real signal delivery mode
769  		 * must return a remaining time greater than 0 because the
770  		 * signal has not yet been delivered.
771  		 */
772  		if (!sigev_none)
773  			itp->it_value.tv_nsec = 1;
774  	}
775  }
776  
777  static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
778  {
779  	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
780  	struct task_struct *p;
781  	u64 now;
782  
783  	rcu_read_lock();
784  	p = cpu_timer_task_rcu(timer);
785  	if (p && cpu_timer_getexpires(&timer->it.cpu)) {
786  		itp->it_interval = ktime_to_timespec64(timer->it_interval);
787  
788  		if (CPUCLOCK_PERTHREAD(timer->it_clock))
789  			now = cpu_clock_sample(clkid, p);
790  		else
791  			now = cpu_clock_sample_group(clkid, p, false);
792  
793  		__posix_cpu_timer_get(timer, itp, now);
794  	}
795  	rcu_read_unlock();
796  }
797  
798  #define MAX_COLLECTED	20
799  
800  static u64 collect_timerqueue(struct timerqueue_head *head,
801  			      struct list_head *firing, u64 now)
802  {
803  	struct timerqueue_node *next;
804  	int i = 0;
805  
806  	while ((next = timerqueue_getnext(head))) {
807  		struct cpu_timer *ctmr;
808  		u64 expires;
809  
810  		ctmr = container_of(next, struct cpu_timer, node);
811  		expires = cpu_timer_getexpires(ctmr);
812  		/* Limit the number of timers to expire at once */
813  		if (++i == MAX_COLLECTED || now < expires)
814  			return expires;
815  
816  		ctmr->firing = true;
817  		/* See posix_cpu_timer_wait_running() */
818  		rcu_assign_pointer(ctmr->handling, current);
819  		cpu_timer_dequeue(ctmr);
820  		list_add_tail(&ctmr->elist, firing);
821  	}
822  
823  	return U64_MAX;
824  }
825  
826  static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples,
827  				    struct list_head *firing)
828  {
829  	struct posix_cputimer_base *base = pct->bases;
830  	int i;
831  
832  	for (i = 0; i < CPUCLOCK_MAX; i++, base++) {
833  		base->nextevt = collect_timerqueue(&base->tqhead, firing,
834  						    samples[i]);
835  	}
836  }
837  
838  static inline void check_dl_overrun(struct task_struct *tsk)
839  {
840  	if (tsk->dl.dl_overrun) {
841  		tsk->dl.dl_overrun = 0;
842  		send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
843  	}
844  }
845  
846  static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
847  {
848  	if (time < limit)
849  		return false;
850  
851  	if (print_fatal_signals) {
852  		pr_info("%s Watchdog Timeout (%s): %s[%d]\n",
853  			rt ? "RT" : "CPU", hard ? "hard" : "soft",
854  			current->comm, task_pid_nr(current));
855  	}
856  	send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID);
857  	return true;
858  }
859  
860  /*
861   * Check for any per-thread CPU timers that have fired and move them off
862   * the tsk->cpu_timers[N] list onto the firing list.  Here we update the
863   * tsk->it_*_expires values to reflect the remaining thread CPU timers.
864   */
865  static void check_thread_timers(struct task_struct *tsk,
866  				struct list_head *firing)
867  {
868  	struct posix_cputimers *pct = &tsk->posix_cputimers;
869  	u64 samples[CPUCLOCK_MAX];
870  	unsigned long soft;
871  
872  	if (dl_task(tsk))
873  		check_dl_overrun(tsk);
874  
875  	if (expiry_cache_is_inactive(pct))
876  		return;
877  
878  	task_sample_cputime(tsk, samples);
879  	collect_posix_cputimers(pct, samples, firing);
880  
881  	/*
882  	 * Check for the special case thread timers.
883  	 */
884  	soft = task_rlimit(tsk, RLIMIT_RTTIME);
885  	if (soft != RLIM_INFINITY) {
886  		/* Task RT timeout is accounted in jiffies. RTTIME is usec */
887  		unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ);
888  		unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME);
889  
890  		/* At the hard limit, send SIGKILL. No further action. */
891  		if (hard != RLIM_INFINITY &&
892  		    check_rlimit(rttime, hard, SIGKILL, true, true))
893  			return;
894  
895  		/* At the soft limit, send a SIGXCPU every second */
896  		if (check_rlimit(rttime, soft, SIGXCPU, true, false)) {
897  			soft += USEC_PER_SEC;
898  			tsk->signal->rlim[RLIMIT_RTTIME].rlim_cur = soft;
899  		}
900  	}
901  
902  	if (expiry_cache_is_inactive(pct))
903  		tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
904  }
905  
906  static inline void stop_process_timers(struct signal_struct *sig)
907  {
908  	struct posix_cputimers *pct = &sig->posix_cputimers;
909  
910  	/* Turn off the active flag. This is done without locking. */
911  	WRITE_ONCE(pct->timers_active, false);
912  	tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
913  }
914  
915  static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
916  			     u64 *expires, u64 cur_time, int signo)
917  {
918  	if (!it->expires)
919  		return;
920  
921  	if (cur_time >= it->expires) {
922  		if (it->incr)
923  			it->expires += it->incr;
924  		else
925  			it->expires = 0;
926  
927  		trace_itimer_expire(signo == SIGPROF ?
928  				    ITIMER_PROF : ITIMER_VIRTUAL,
929  				    task_tgid(tsk), cur_time);
930  		send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
931  	}
932  
933  	if (it->expires && it->expires < *expires)
934  		*expires = it->expires;
935  }
936  
937  /*
938   * Check for any process-wide CPU timers that have fired and move them
939   * off the tsk->signal->*_timers list onto the firing list.  Per-thread timers
940   * have already been taken off.
941   */
942  static void check_process_timers(struct task_struct *tsk,
943  				 struct list_head *firing)
944  {
945  	struct signal_struct *const sig = tsk->signal;
946  	struct posix_cputimers *pct = &sig->posix_cputimers;
947  	u64 samples[CPUCLOCK_MAX];
948  	unsigned long soft;
949  
950  	/*
951  	 * If there are no active process wide timers (POSIX 1.b, itimers,
952  	 * RLIMIT_CPU) nothing to check. Also skip the process wide timer
953  	 * processing when there is already another task handling them.
954  	 */
955  	if (!READ_ONCE(pct->timers_active) || pct->expiry_active)
956  		return;
957  
958  	/*
959  	 * Signify that a thread is checking for process timers.
960  	 * Write access to this field is protected by the sighand lock.
961  	 */
962  	pct->expiry_active = true;
963  
964  	/*
965  	 * Collect the current process totals. Group accounting is active
966  	 * so the sample can be taken directly.
967  	 */
968  	proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic, samples);
969  	collect_posix_cputimers(pct, samples, firing);
970  
971  	/*
972  	 * Check for the special case process timers.
973  	 */
974  	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF],
975  			 &pct->bases[CPUCLOCK_PROF].nextevt,
976  			 samples[CPUCLOCK_PROF], SIGPROF);
977  	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT],
978  			 &pct->bases[CPUCLOCK_VIRT].nextevt,
979  			 samples[CPUCLOCK_VIRT], SIGVTALRM);
980  
981  	soft = task_rlimit(tsk, RLIMIT_CPU);
982  	if (soft != RLIM_INFINITY) {
983  		/* RLIMIT_CPU is in seconds. Samples are nanoseconds */
984  		unsigned long hard = task_rlimit_max(tsk, RLIMIT_CPU);
985  		u64 ptime = samples[CPUCLOCK_PROF];
986  		u64 softns = (u64)soft * NSEC_PER_SEC;
987  		u64 hardns = (u64)hard * NSEC_PER_SEC;
988  
989  		/* At the hard limit, send SIGKILL. No further action. */
990  		if (hard != RLIM_INFINITY &&
991  		    check_rlimit(ptime, hardns, SIGKILL, false, true))
992  			return;
993  
994  		/* At the soft limit, send a SIGXCPU every second */
995  		if (check_rlimit(ptime, softns, SIGXCPU, false, false)) {
996  			sig->rlim[RLIMIT_CPU].rlim_cur = soft + 1;
997  			softns += NSEC_PER_SEC;
998  		}
999  
1000  		/* Update the expiry cache */
1001  		if (softns < pct->bases[CPUCLOCK_PROF].nextevt)
1002  			pct->bases[CPUCLOCK_PROF].nextevt = softns;
1003  	}
1004  
1005  	if (expiry_cache_is_inactive(pct))
1006  		stop_process_timers(sig);
1007  
1008  	pct->expiry_active = false;
1009  }
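
/*
 * Userspace sketch (standard rlimit API, not part of this file): the
 * soft/hard RLIMIT_CPU handling above as seen from a process. Once a second
 * of CPU time has been consumed SIGXCPU is delivered and then repeated every
 * further second; when the hard limit is reached the process gets SIGKILL.
 *
 *	#include <signal.h>
 *	#include <sys/resource.h>
 *	#include <unistd.h>
 *
 *	static void on_xcpu(int sig) { (void)sig; write(1, "SIGXCPU\n", 8); }
 *
 *	int main(void)
 *	{
 *		struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };	// seconds
 *
 *		signal(SIGXCPU, on_xcpu);
 *		setrlimit(RLIMIT_CPU, &rl);
 *		for (;;)
 *			;	// burn CPU: SIGXCPU at ~1s and ~2s, SIGKILL at ~3s
 *	}
 */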
1010  
1011  /*
1012   * This is called from the signal code (via posixtimer_rearm)
1013   * when the last timer signal was delivered and we have to reload the timer.
1014   */
1015  static void posix_cpu_timer_rearm(struct k_itimer *timer)
1016  {
1017  	clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock);
1018  	struct task_struct *p;
1019  	struct sighand_struct *sighand;
1020  	unsigned long flags;
1021  	u64 now;
1022  
1023  	rcu_read_lock();
1024  	p = cpu_timer_task_rcu(timer);
1025  	if (!p)
1026  		goto out;
1027  
1028  	/* Protect timer list r/w in arm_timer() */
1029  	sighand = lock_task_sighand(p, &flags);
1030  	if (unlikely(sighand == NULL))
1031  		goto out;
1032  
1033  	/*
1034  	 * Fetch the current sample and update the timer's expiry time.
1035  	 */
1036  	if (CPUCLOCK_PERTHREAD(timer->it_clock))
1037  		now = cpu_clock_sample(clkid, p);
1038  	else
1039  		now = cpu_clock_sample_group(clkid, p, true);
1040  
1041  	bump_cpu_timer(timer, now);
1042  
1043  	/*
1044  	 * Now re-arm for the new expiry time.
1045  	 */
1046  	arm_timer(timer, p);
1047  	unlock_task_sighand(p, &flags);
1048  out:
1049  	rcu_read_unlock();
1050  }
1051  
1052  /**
1053   * task_cputimers_expired - Check whether posix CPU timers are expired
1054   *
1055   * @samples:	Array of current samples for the CPUCLOCK clocks
1056   * @pct:	Pointer to a posix_cputimers container
1057   *
1058   * Returns true if any member of @samples is greater than or equal to the
1059   * corresponding member of @pct->bases[CLK].nextevt. False otherwise.
1060   */
1061  static inline bool
1062  task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct)
1063  {
1064  	int i;
1065  
1066  	for (i = 0; i < CPUCLOCK_MAX; i++) {
1067  		if (samples[i] >= pct->bases[i].nextevt)
1068  			return true;
1069  	}
1070  	return false;
1071  }
1072  
1073  /**
1074   * fastpath_timer_check - POSIX CPU timers fast path.
1075   *
1076   * @tsk:	The task (thread) being checked.
1077   *
1078   * Check the task and thread group timers.  If both are zero (there are no
1079   * timers set) return false.  Otherwise snapshot the task and thread group
1080   * timers and compare them with the corresponding expiration times.  Return
1081   * true if a timer has expired, else return false.
1082   */
1083  static inline bool fastpath_timer_check(struct task_struct *tsk)
1084  {
1085  	struct posix_cputimers *pct = &tsk->posix_cputimers;
1086  	struct signal_struct *sig;
1087  
1088  	if (!expiry_cache_is_inactive(pct)) {
1089  		u64 samples[CPUCLOCK_MAX];
1090  
1091  		task_sample_cputime(tsk, samples);
1092  		if (task_cputimers_expired(samples, pct))
1093  			return true;
1094  	}
1095  
1096  	sig = tsk->signal;
1097  	pct = &sig->posix_cputimers;
1098  	/*
1099  	 * Check if thread group timers expired when timers are active and
1100  	 * no other thread in the group is already handling expiry for
1101  	 * thread group cputimers. These fields are read without the
1102  	 * sighand lock. However, this is fine because this is meant to be
1103  	 * a fastpath heuristic to determine whether we should try to
1104  	 * acquire the sighand lock to handle timer expiry.
1105  	 *
1106  	 * In the worst case scenario, if concurrently timers_active is set
1107  	 * or expiry_active is cleared, but the current thread doesn't see
1108  	 * the change yet, the timer checks are delayed until the next
1109  	 * thread in the group gets a scheduler interrupt to handle the
1110  	 * timer. This isn't an issue in practice because such delays in
1111  	 * signal delivery are expected.
1112  	 */
1113  	if (READ_ONCE(pct->timers_active) && !READ_ONCE(pct->expiry_active)) {
1114  		u64 samples[CPUCLOCK_MAX];
1115  
1116  		proc_sample_cputime_atomic(&sig->cputimer.cputime_atomic,
1117  					   samples);
1118  
1119  		if (task_cputimers_expired(samples, pct))
1120  			return true;
1121  	}
1122  
1123  	if (dl_task(tsk) && tsk->dl.dl_overrun)
1124  		return true;
1125  
1126  	return false;
1127  }
1128  
1129  static void handle_posix_cpu_timers(struct task_struct *tsk);
1130  
1131  #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
1132  static void posix_cpu_timers_work(struct callback_head *work)
1133  {
1134  	struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
1135  
1136  	mutex_lock(&cw->mutex);
1137  	handle_posix_cpu_timers(current);
1138  	mutex_unlock(&cw->mutex);
1139  }
1140  
1141  /*
1142   * Invoked from the posix-timer core when a cancel operation failed because
1143   * the timer is marked firing. The caller holds rcu_read_lock(), which
1144   * protects the timer and the task which is expiring it from being freed.
1145   */
1146  static void posix_cpu_timer_wait_running(struct k_itimer *timr)
1147  {
1148  	struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
1149  
1150  	/* Has the handling task completed expiry already? */
1151  	if (!tsk)
1152  		return;
1153  
1154  	/* Ensure that the task cannot go away */
1155  	get_task_struct(tsk);
1156  	/* Now drop the RCU protection so the mutex can be locked */
1157  	rcu_read_unlock();
1158  	/* Wait on the expiry mutex */
1159  	mutex_lock(&tsk->posix_cputimers_work.mutex);
1160  	/* Release it immediately again. */
1161  	mutex_unlock(&tsk->posix_cputimers_work.mutex);
1162  	/* Drop the task reference. */
1163  	put_task_struct(tsk);
1164  	/* Relock RCU so the callsite is balanced */
1165  	rcu_read_lock();
1166  }
1167  
1168  static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
1169  {
1170  	/* Ensure that timr->it.cpu.handling task cannot go away */
1171  	rcu_read_lock();
1172  	spin_unlock_irq(&timr->it_lock);
1173  	posix_cpu_timer_wait_running(timr);
1174  	rcu_read_unlock();
1175  	/* @timr is on stack and is valid */
1176  	spin_lock_irq(&timr->it_lock);
1177  }
1178  
1179  /*
1180   * Clear existing posix CPU timers task work.
1181   */
1182  void clear_posix_cputimers_work(struct task_struct *p)
1183  {
1184  	/*
1185  	 * A copied work entry from the old task is not meaningful, clear it.
1186  	 * N.B. init_task_work will not do this.
1187  	 */
1188  	memset(&p->posix_cputimers_work.work, 0,
1189  	       sizeof(p->posix_cputimers_work.work));
1190  	init_task_work(&p->posix_cputimers_work.work,
1191  		       posix_cpu_timers_work);
1192  	mutex_init(&p->posix_cputimers_work.mutex);
1193  	p->posix_cputimers_work.scheduled = false;
1194  }
1195  
1196  /*
1197   * Initialize posix CPU timers task work in init task. Out of line to
1198   * keep the callback static and to avoid header recursion hell.
1199   */
1200  void __init posix_cputimers_init_work(void)
1201  {
1202  	clear_posix_cputimers_work(current);
1203  }
1204  
1205  /*
1206   * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
1207   * in hard interrupt context or in task context with interrupts
1208   * disabled. Aside of that the writer/reader interaction is always in the
1209   * context of the current task, which means they are strict per CPU.
1210   */
1211  static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
1212  {
1213  	return tsk->posix_cputimers_work.scheduled;
1214  }
1215  
1216  static inline void __run_posix_cpu_timers(struct task_struct *tsk)
1217  {
1218  	if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
1219  		return;
1220  
1221  	/* Schedule task work to actually expire the timers */
1222  	tsk->posix_cputimers_work.scheduled = true;
1223  	task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
1224  }
1225  
1226  static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
1227  						unsigned long start)
1228  {
1229  	bool ret = true;
1230  
1231  	/*
1232  	 * On !RT kernels interrupts are disabled while collecting expired
1233  	 * timers, so no tick can happen and the fast path check can be
1234  	 * reenabled without further checks.
1235  	 */
1236  	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
1237  		tsk->posix_cputimers_work.scheduled = false;
1238  		return true;
1239  	}
1240  
1241  	/*
1242  	 * On RT enabled kernels ticks can happen while the expired timers
1243  	 * are collected under sighand lock. But any tick which observes
1244  	 * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
1245  	 * checks. So reenabling the tick work has to be done carefully:
1246  	 *
1247  	 * Disable interrupts and run the fast path check if jiffies have
1248  	 * advanced since the collecting of expired timers started. If
1249  	 * jiffies have not advanced or the fast path check did not find
1250  	 * newly expired timers, reenable the fast path check in the timer
1251  	 * interrupt. If there are newly expired timers, return false and
1252  	 * let the collection loop repeat.
1253  	 */
1254  	local_irq_disable();
1255  	if (start != jiffies && fastpath_timer_check(tsk))
1256  		ret = false;
1257  	else
1258  		tsk->posix_cputimers_work.scheduled = false;
1259  	local_irq_enable();
1260  
1261  	return ret;
1262  }
1263  #else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
1264  static inline void __run_posix_cpu_timers(struct task_struct *tsk)
1265  {
1266  	lockdep_posixtimer_enter();
1267  	handle_posix_cpu_timers(tsk);
1268  	lockdep_posixtimer_exit();
1269  }
1270  
1271  static void posix_cpu_timer_wait_running(struct k_itimer *timr)
1272  {
1273  	cpu_relax();
1274  }
1275  
1276  static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
1277  {
1278  	spin_unlock_irq(&timr->it_lock);
1279  	cpu_relax();
1280  	spin_lock_irq(&timr->it_lock);
1281  }
1282  
1283  static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
1284  {
1285  	return false;
1286  }
1287  
1288  static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
1289  						unsigned long start)
1290  {
1291  	return true;
1292  }
1293  #endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
1294  
1295  static void handle_posix_cpu_timers(struct task_struct *tsk)
1296  {
1297  	struct k_itimer *timer, *next;
1298  	unsigned long flags, start;
1299  	LIST_HEAD(firing);
1300  
1301  	if (!lock_task_sighand(tsk, &flags))
1302  		return;
1303  
1304  	do {
1305  		/*
1306  		 * On RT locking sighand lock does not disable interrupts,
1307  		 * so this needs to be careful vs. ticks. Store the current
1308  		 * jiffies value.
1309  		 */
1310  		start = READ_ONCE(jiffies);
1311  		barrier();
1312  
1313  		/*
1314  		 * Here we take off tsk->signal->cpu_timers[N] and
1315  		 * tsk->cpu_timers[N] all the timers that are firing, and
1316  		 * put them on the firing list.
1317  		 */
1318  		check_thread_timers(tsk, &firing);
1319  
1320  		check_process_timers(tsk, &firing);
1321  
1322  		/*
1323  		 * The above timer checks have updated the expiry cache and
1324  		 * because nothing can have queued or modified timers after
1325  		 * sighand lock was taken above it is guaranteed to be
1326  		 * consistent. So the next timer interrupt fastpath check
1327  		 * will find valid data.
1328  		 *
1329  		 * If timer expiry runs in the timer interrupt context then
1330  		 * the loop is not relevant as timers will be directly
1331  		 * expired in interrupt context. The stub function below
1332  		 * always returns true which allows the compiler to
1333  		 * optimize the loop out.
1334  		 *
1335  		 * If timer expiry is deferred to task work context then
1336  		 * the following rules apply:
1337  		 *
1338  		 * - On !RT kernels no tick can have happened on this CPU
1339  		 *   after sighand lock was acquired because interrupts are
1340  		 *   disabled. So reenabling task work before dropping
1341  		 *   sighand lock and reenabling interrupts is race free.
1342  		 *
1343  		 * - On RT kernels ticks might have happened but the tick
1344  		 *   work ignored posix CPU timer handling because the
1345  		 *   CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
1346  		 *   must be done very carefully including a check whether
1347  		 *   ticks have happened since the start of the timer
1348  		 *   expiry checks. posix_cpu_timers_enable_work() takes
1349  		 *   care of that and eventually lets the expiry checks
1350  		 *   run again.
1351  		 */
1352  	} while (!posix_cpu_timers_enable_work(tsk, start));
1353  
1354  	/*
1355  	 * We must release sighand lock before taking any timer's lock.
1356  	 * There is a potential race with timer deletion here, as the
1357  	 * siglock now protects our private firing list.  We have set
1358  	 * the firing flag in each timer, so that a deletion attempt
1359  	 * that gets the timer lock before we do will give it up and
1360  	 * spin until we've taken care of that timer below.
1361  	 */
1362  	unlock_task_sighand(tsk, &flags);
1363  
1364  	/*
1365  	 * Now that all the timers on our list have the firing flag,
1366  	 * no one will touch their list entries but us.  We'll take
1367  	 * each timer's lock before clearing its firing flag, so no
1368  	 * timer call will interfere.
1369  	 */
1370  	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
1371  		bool cpu_firing;
1372  
1373  		/*
1374  		 * spin_lock() is sufficient here even independent of the
1375  		 * expiry context. If expiry happens in hard interrupt
1376  		 * context it's obvious. For task work context it's safe
1377  		 * because all other operations on timer::it_lock happen in
1378  		 * task context (syscall or exit).
1379  		 */
1380  		spin_lock(&timer->it_lock);
1381  		list_del_init(&timer->it.cpu.elist);
1382  		cpu_firing = timer->it.cpu.firing;
1383  		timer->it.cpu.firing = false;
1384  		/*
1385  		 * If the firing flag is cleared then this raced with a
1386  		 * timer rearm/delete operation. So don't generate an
1387  		 * event.
1388  		 */
1389  		if (likely(cpu_firing))
1390  			cpu_timer_fire(timer);
1391  		/* See posix_cpu_timer_wait_running() */
1392  		rcu_assign_pointer(timer->it.cpu.handling, NULL);
1393  		spin_unlock(&timer->it_lock);
1394  	}
1395  }
1396  
1397  /*
1398   * This is called from the timer interrupt handler.  The irq handler has
1399   * already updated our counts.  We need to check if any timers fire now.
1400   * Interrupts are disabled.
1401   */
1402  void run_posix_cpu_timers(void)
1403  {
1404  	struct task_struct *tsk = current;
1405  
1406  	lockdep_assert_irqs_disabled();
1407  
1408  	/*
1409  	 * Ensure that release_task(tsk) can't happen while
1410  	 * handle_posix_cpu_timers() is running. Otherwise, a concurrent
1411  	 * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
1412  	 * miss timer->it.cpu.firing != 0.
1413  	 */
1414  	if (tsk->exit_state)
1415  		return;
1416  
1417  	/*
1418  	 * If the actual expiry is deferred to task work context and the
1419  	 * work is already scheduled there is no point to do anything here.
1420  	 */
1421  	if (posix_cpu_timers_work_scheduled(tsk))
1422  		return;
1423  
1424  	/*
1425  	 * The fast path checks that there are no expired thread or thread
1426  	 * group timers.  If that's so, just return.
1427  	 */
1428  	if (!fastpath_timer_check(tsk))
1429  		return;
1430  
1431  	__run_posix_cpu_timers(tsk);
1432  }
1433  
1434  /*
1435   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1436   * The tsk->sighand->siglock must be held by the caller.
1437   */
1438  void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
1439  			   u64 *newval, u64 *oldval)
1440  {
1441  	u64 now, *nextevt;
1442  
1443  	if (WARN_ON_ONCE(clkid >= CPUCLOCK_SCHED))
1444  		return;
1445  
1446  	nextevt = &tsk->signal->posix_cputimers.bases[clkid].nextevt;
1447  	now = cpu_clock_sample_group(clkid, tsk, true);
1448  
1449  	if (oldval) {
1450  		/*
1451  		 * We are setting itimer. The *oldval is absolute and we update
1452  		 * it to be relative, *newval argument is relative and we update
1453  		 * it to be absolute.
1454  		 */
1455  		if (*oldval) {
1456  			if (*oldval <= now) {
1457  				/* Just about to fire. */
1458  				*oldval = TICK_NSEC;
1459  			} else {
1460  				*oldval -= now;
1461  			}
1462  		}
1463  
1464  		if (*newval)
1465  			*newval += now;
1466  	}
1467  
1468  	/*
1469  	 * Update expiration cache if this is the earliest timer. CPUCLOCK_PROF
1470  	 * expiry cache is also used by RLIMIT_CPU!
1471  	 */
1472  	if (*newval < *nextevt)
1473  		*nextevt = *newval;
1474  
1475  	tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
1476  }
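
/*
 * Userspace sketch (standard itimer API, not part of this file): setitimer()
 * on the CPU-time itimers is roughly what reaches the oldval/newval handling
 * above via the itimer code. ITIMER_PROF accounts user+system time and
 * delivers SIGPROF; ITIMER_VIRTUAL accounts user time only and delivers
 * SIGVTALRM.
 *
 *	#include <signal.h>
 *	#include <sys/time.h>
 *
 *	static void start_profiling_tick(void)
 *	{
 *		struct itimerval itv = {
 *			.it_value    = { .tv_sec = 1 },	// first SIGPROF after 1s of CPU time
 *			.it_interval = { .tv_sec = 1 },	// then every further second
 *		};
 *
 *		setitimer(ITIMER_PROF, &itv, NULL);
 *	}
 */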
1477  
1478  static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1479  			    const struct timespec64 *rqtp)
1480  {
1481  	struct itimerspec64 it;
1482  	struct k_itimer timer;
1483  	u64 expires;
1484  	int error;
1485  
1486  	/*
1487  	 * Set up a temporary timer and then wait for it to go off.
1488  	 */
1489  	memset(&timer, 0, sizeof timer);
1490  	spin_lock_init(&timer.it_lock);
1491  	timer.it_clock = which_clock;
1492  	timer.it_overrun = -1;
1493  	error = posix_cpu_timer_create(&timer);
1494  	timer.it_process = current;
1495  	timer.it.cpu.nanosleep = true;
1496  
1497  	if (!error) {
1498  		static struct itimerspec64 zero_it;
1499  		struct restart_block *restart;
1500  
1501  		memset(&it, 0, sizeof(it));
1502  		it.it_value = *rqtp;
1503  
1504  		spin_lock_irq(&timer.it_lock);
1505  		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
1506  		if (error) {
1507  			spin_unlock_irq(&timer.it_lock);
1508  			return error;
1509  		}
1510  
1511  		while (!signal_pending(current)) {
1512  			if (!cpu_timer_getexpires(&timer.it.cpu)) {
1513  				/*
1514  				 * Our timer fired and was reset; the
1515  				 * deletion below cannot fail.
1516  				 */
1517  				posix_cpu_timer_del(&timer);
1518  				spin_unlock_irq(&timer.it_lock);
1519  				return 0;
1520  			}
1521  
1522  			/*
1523  			 * Block until cpu_timer_fire (or a signal) wakes us.
1524  			 */
1525  			__set_current_state(TASK_INTERRUPTIBLE);
1526  			spin_unlock_irq(&timer.it_lock);
1527  			schedule();
1528  			spin_lock_irq(&timer.it_lock);
1529  		}
1530  
1531  		/*
1532  		 * We were interrupted by a signal.
1533  		 */
1534  		expires = cpu_timer_getexpires(&timer.it.cpu);
1535  		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
1536  		if (!error) {
1537  			/* Timer is now unarmed, deletion can not fail. */
1538  			posix_cpu_timer_del(&timer);
1539  		} else {
1540  			while (error == TIMER_RETRY) {
1541  				posix_cpu_timer_wait_running_nsleep(&timer);
1542  				error = posix_cpu_timer_del(&timer);
1543  			}
1544  		}
1545  
1546  		spin_unlock_irq(&timer.it_lock);
1547  
1548  		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
1549  			/*
1550  			 * It actually did fire already.
1551  			 */
1552  			return 0;
1553  		}
1554  
1555  		error = -ERESTART_RESTARTBLOCK;
1556  		/*
1557  		 * Report back to the user the time still remaining.
1558  		 */
1559  		restart = &current->restart_block;
1560  		restart->nanosleep.expires = expires;
1561  		if (restart->nanosleep.type != TT_NONE)
1562  			error = nanosleep_copyout(restart, &it.it_value);
1563  	}
1564  
1565  	return error;
1566  }
1567  
1568  static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1569  
1570  static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1571  			    const struct timespec64 *rqtp)
1572  {
1573  	struct restart_block *restart_block = &current->restart_block;
1574  	int error;
1575  
1576  	/*
1577  	 * Diagnose required errors first.
1578  	 */
1579  	if (CPUCLOCK_PERTHREAD(which_clock) &&
1580  	    (CPUCLOCK_PID(which_clock) == 0 ||
1581  	     CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
1582  		return -EINVAL;
1583  
1584  	error = do_cpu_nanosleep(which_clock, flags, rqtp);
1585  
1586  	if (error == -ERESTART_RESTARTBLOCK) {
1587  
1588  		if (flags & TIMER_ABSTIME)
1589  			return -ERESTARTNOHAND;
1590  
1591  		restart_block->nanosleep.clockid = which_clock;
1592  		set_restart_fn(restart_block, posix_cpu_nsleep_restart);
1593  	}
1594  	return error;
1595  }
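
/*
 * Userspace sketch (standard POSIX API, not part of this file): a
 * clock_nanosleep() on the process CPU clock, which is serviced by
 * do_cpu_nanosleep() above. The sleep only completes once the process has
 * consumed the requested amount of CPU time, so some thread must keep
 * running; a single-threaded process sleeping on its own CPU clock would
 * block indefinitely.
 *
 *	#include <pthread.h>
 *	#include <time.h>
 *
 *	static void *burn(void *arg) { for (;;) ; return arg; }
 *
 *	int main(void)
 *	{
 *		struct timespec rq = { .tv_nsec = 100 * 1000 * 1000 };	// 100ms of CPU time
 *		pthread_t t;
 *
 *		pthread_create(&t, NULL, burn, NULL);
 *		// returns once the whole process has burned ~100ms more CPU time
 *		clock_nanosleep(CLOCK_PROCESS_CPUTIME_ID, 0, &rq, NULL);
 *		return 0;
 *	}
 */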
1596  
1597  static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1598  {
1599  	clockid_t which_clock = restart_block->nanosleep.clockid;
1600  	struct timespec64 t;
1601  
1602  	t = ns_to_timespec64(restart_block->nanosleep.expires);
1603  
1604  	return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
1605  }
1606  
1607  #define PROCESS_CLOCK	make_process_cpuclock(0, CPUCLOCK_SCHED)
1608  #define THREAD_CLOCK	make_thread_cpuclock(0, CPUCLOCK_SCHED)
1609  
1610  static int process_cpu_clock_getres(const clockid_t which_clock,
1611  				    struct timespec64 *tp)
1612  {
1613  	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
1614  }
1615  static int process_cpu_clock_get(const clockid_t which_clock,
1616  				 struct timespec64 *tp)
1617  {
1618  	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
1619  }
1620  static int process_cpu_timer_create(struct k_itimer *timer)
1621  {
1622  	timer->it_clock = PROCESS_CLOCK;
1623  	return posix_cpu_timer_create(timer);
1624  }
1625  static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1626  			      const struct timespec64 *rqtp)
1627  {
1628  	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
1629  }
1630  static int thread_cpu_clock_getres(const clockid_t which_clock,
1631  				   struct timespec64 *tp)
1632  {
1633  	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
1634  }
1635  static int thread_cpu_clock_get(const clockid_t which_clock,
1636  				struct timespec64 *tp)
1637  {
1638  	return posix_cpu_clock_get(THREAD_CLOCK, tp);
1639  }
1640  static int thread_cpu_timer_create(struct k_itimer *timer)
1641  {
1642  	timer->it_clock = THREAD_CLOCK;
1643  	return posix_cpu_timer_create(timer);
1644  }
1645  
1646  const struct k_clock clock_posix_cpu = {
1647  	.clock_getres		= posix_cpu_clock_getres,
1648  	.clock_set		= posix_cpu_clock_set,
1649  	.clock_get_timespec	= posix_cpu_clock_get,
1650  	.timer_create		= posix_cpu_timer_create,
1651  	.nsleep			= posix_cpu_nsleep,
1652  	.timer_set		= posix_cpu_timer_set,
1653  	.timer_del		= posix_cpu_timer_del,
1654  	.timer_get		= posix_cpu_timer_get,
1655  	.timer_rearm		= posix_cpu_timer_rearm,
1656  	.timer_wait_running	= posix_cpu_timer_wait_running,
1657  };
1658  
1659  const struct k_clock clock_process = {
1660  	.clock_getres		= process_cpu_clock_getres,
1661  	.clock_get_timespec	= process_cpu_clock_get,
1662  	.timer_create		= process_cpu_timer_create,
1663  	.nsleep			= process_cpu_nsleep,
1664  };
1665  
1666  const struct k_clock clock_thread = {
1667  	.clock_getres		= thread_cpu_clock_getres,
1668  	.clock_get_timespec	= thread_cpu_clock_get,
1669  	.timer_create		= thread_cpu_timer_create,
1670  };
1671