/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include "sched.h"

#include <linux/kthread.h>
#include <linux/nospec.h>

#include <linux/kcov.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>

#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT
#endif

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock, the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock, the acquire will
		 * pair with the WMB to ensure we must then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
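
/*
 * Typical usage (illustrative sketch): task_rq_lock() is paired with
 * task_rq_unlock(), the matching helper declared in kernel/sched/sched.h,
 * so both p->pi_lock and rq->lock are held across the critical section:
 *
 *	struct rq_flags rf;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &rf);
 *	... operate on @p and its runqueue ...
 *	task_rq_unlock(rq, p, &rf);
 */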

/*
 * RQ-clock updating methods:
 */

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight misattribution of {soft,}irq
	 * time; a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	if (sched_feat(WARN_DOUBLE_CLOCK))
		SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
	rq->clock_update_flags |= RQCF_UPDATED;
#endif

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}


#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
	struct rq_flags rf;

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	rq_unlock(rq, &rf);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;

	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;
	struct rq_flags rf;

	rq_lock(rq, &rf);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	rq_unlock(rq, &rf);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	time = ktime_add_ns(timer->base->get_time(), delta);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED);
}
#endif /* CONFIG_SMP */

static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _old, _val = *_ptr;			\
									\
		for (;;) {						\
			_old = cmpxchg(_ptr, _val, _val | _mask);	\
			if (_old == _val)				\
				break;					\
			_val = _old;					\
		}							\
	_old;								\
})
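
/*
 * Example (sketch of how the macro is used right below): set_nr_and_not_polling()
 * relies on fetch_or() returning the *previous* flags word, so it can both set
 * _TIF_NEED_RESCHED and learn whether _TIF_POLLING_NRFLAG was already set in a
 * single atomic cmpxchg() loop, without a separate racy read.
 */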

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * This cmpxchg() implies a full barrier, which pairs with the write
	 * barrier implied by the wakeup in wake_up_q().
	 */
	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
		return;

	get_task_struct(task);

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		BUG_ON(!task);
		/* Task can safely be re-inserted now: */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() implies a wmb() to pair with the queueing
		 * in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
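
/*
 * Typical usage (illustrative sketch; "some_lock" is a placeholder): collect
 * wakeups while holding a lock and issue them after dropping it, using
 * DEFINE_WAKE_Q() from <linux/sched/wake_q.h>:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	raw_spin_lock(&some_lock);
 *	wake_q_add(&wake_q, task);
 *	raw_spin_unlock(&some_lock);
 *	wake_up_q(&wake_q);
 */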

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	if (cpu_online(cpu) || cpu == smp_processor_id())
		resched_curr(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU.  This is good for power-savings.
 *
 * We don't do similar optimization for a completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be up to date wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
		return cpu;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
		cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there is more than one RR task, we need the tick to effect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there are no RR tasks, but there are FIFO tasks, we can skip the
	 * tick: there is no forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif
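
/*
 * Usage sketch (assumes the walk_tg_tree() wrapper in kernel/sched/sched.h,
 * which starts the walk at root_task_group; "down_fn" is a placeholder
 * visitor). Callers that only need the down-pass hand in tg_nop as @up:
 *
 *	rcu_read_lock();
 *	ret = walk_tg_tree(down_fn, tg_nop, &data);
 *	rcu_read_unlock();
 */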

static void set_load_weight(struct task_struct *p, bool update_load)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (idle_policy(p->policy)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight
	 */
	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, prio);
	} else {
		load->weight = scale_load(sched_prio_to_weight[prio]);
		load->inv_weight = sched_prio_to_wmult[prio];
	}
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & ENQUEUE_RESTORE))
		sched_info_queued(rq, p);

	p->sched_class->enqueue_task(rq, p, flags);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & DEQUEUE_SAVE))
		sched_info_dequeued(rq, p);

	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq);
}

#ifdef CONFIG_SMP

static inline bool is_per_cpu_kthread(struct task_struct *p)
{
	if (!(p->flags & PF_KTHREAD))
		return false;

	if (p->nr_cpus_allowed != 1)
		return false;

	return true;
}

/*
 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
 * __set_cpus_allowed_ptr() and select_fallback_rq().
 */
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
		return false;

	if (is_per_cpu_kthread(p))
		return cpu_online(cpu);

	return cpu_active(cpu);
}

/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
				   struct task_struct *p, int new_cpu)
{
	lockdep_assert_held(&rq->lock);

	p->on_rq = TASK_ON_RQ_MIGRATING;
	dequeue_task(rq, p, DEQUEUE_NOCLOCK);
	set_task_cpu(p, new_cpu);
	rq_unlock(rq, rf);

	rq = cpu_rq(new_cpu);

	rq_lock(rq, rf);
	BUG_ON(task_cpu(p) != new_cpu);
	enqueue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};
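
/*
 * Sketch of how the stopper gets kicked (the pattern used by callers such as
 * __set_cpus_allowed_ptr() further down in this file):
 *
 *	struct migration_arg arg = { p, dest_cpu };
 *
 *	task_rq_unlock(rq, p, &rf);
 *	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 */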

/*
 * Move (not current) task off this CPU, onto the destination CPU. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
				 struct task_struct *p, int dest_cpu)
{
	/* Affinity changed (again). */
	if (!is_cpu_allowed(p, dest_cpu))
		return rq;

	update_rq_clock(rq);
	rq = move_queued_task(rq, rf, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();
	struct rq_flags rf;

	/*
	 * The original target CPU might have gone down and we might
	 * be on another CPU but it doesn't matter.
	 */
	local_irq_disable();
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_allowed
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	sched_ttwu_pending();

	raw_spin_lock(&p->pi_lock);
	rq_lock(rq, &rf);
	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (task_on_rq_queued(p))
			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
		else
			p->wake_cpu = arg->dest_cpu;
	}
	rq_unlock(rq, &rf);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_curr_task(rq, p);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	unsigned int dest_cpu;
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	if (p->flags & PF_KTHREAD) {
		/*
		 * Kernel threads are allowed on online && !active CPUs
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if (check && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (cpumask_equal(&p->cpus_allowed, new_mask))
		goto out;

	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	if (p->flags & PF_KTHREAD) {
		/*
		 * For kernel threads that do indeed end up on online &&
		 * !active we want to ensure they are strict per-CPU threads.
		 */
		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
			!cpumask_intersects(new_mask, cpu_active_mask) &&
			p->nr_cpus_allowed != 1);
	}